{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3520179920307038, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00039113110225633753, "grad_norm": 73.33983612060547, "learning_rate": 0.0, "loss": 9022.7637, "step": 1 }, { "epoch": 0.0007822622045126751, "grad_norm": 341.7502746582031, "learning_rate": 5.555555555555555e-07, "loss": 13173.7656, "step": 2 }, { "epoch": 0.0011733933067690125, "grad_norm": 87.00965881347656, "learning_rate": 1.111111111111111e-06, "loss": 9796.1172, "step": 3 }, { "epoch": 0.0015645244090253501, "grad_norm": 17.399965286254883, "learning_rate": 1.6666666666666667e-06, "loss": 11499.0176, "step": 4 }, { "epoch": 0.0019556555112816877, "grad_norm": 13.876103401184082, "learning_rate": 2.222222222222222e-06, "loss": 8862.1523, "step": 5 }, { "epoch": 0.002346786613538025, "grad_norm": 79.78038787841797, "learning_rate": 2.7777777777777783e-06, "loss": 9754.3584, "step": 6 }, { "epoch": 0.002737917715794363, "grad_norm": 32.10177230834961, "learning_rate": 3.3333333333333333e-06, "loss": 6731.0986, "step": 7 }, { "epoch": 0.0031290488180507003, "grad_norm": 35.9862174987793, "learning_rate": 3.88888888888889e-06, "loss": 11245.0312, "step": 8 }, { "epoch": 0.003520179920307038, "grad_norm": 28.863813400268555, "learning_rate": 4.444444444444444e-06, "loss": 6245.1196, "step": 9 }, { "epoch": 0.003911311022563375, "grad_norm": 42.70909881591797, "learning_rate": 5e-06, "loss": 6062.3369, "step": 10 }, { "epoch": 0.004302442124819713, "grad_norm": 8.477494239807129, "learning_rate": 5.555555555555557e-06, "loss": 9045.1113, "step": 11 }, { "epoch": 0.00469357322707605, "grad_norm": 17.350603103637695, "learning_rate": 6.111111111111112e-06, "loss": 4973.7451, "step": 12 }, { "epoch": 0.005084704329332388, "grad_norm": 10729.576171875, "learning_rate": 6.666666666666667e-06, "loss": 2556.2378, "step": 13 }, { "epoch": 0.005475835431588726, "grad_norm": 12796.4111328125, "learning_rate": 7.222222222222223e-06, "loss": 2469.0479, "step": 14 }, { "epoch": 0.005866966533845063, "grad_norm": 11690.9521484375, "learning_rate": 7.77777777777778e-06, "loss": 1900.1685, "step": 15 }, { "epoch": 0.0062580976361014005, "grad_norm": 10036.77734375, "learning_rate": 8.333333333333334e-06, "loss": 2600.8911, "step": 16 }, { "epoch": 0.006649228738357738, "grad_norm": 10.607966423034668, "learning_rate": 8.888888888888888e-06, "loss": 871.7581, "step": 17 }, { "epoch": 0.007040359840614076, "grad_norm": 19968.548828125, "learning_rate": 9.444444444444445e-06, "loss": 472.9429, "step": 18 }, { "epoch": 0.0074314909428704135, "grad_norm": 15.735692024230957, "learning_rate": 1e-05, "loss": 2482.8877, "step": 19 }, { "epoch": 0.00782262204512675, "grad_norm": 6.307197570800781, "learning_rate": 9.999968282268043e-06, "loss": 538.6899, "step": 20 }, { "epoch": 0.008213753147383089, "grad_norm": 7844.2470703125, "learning_rate": 9.999873129474573e-06, "loss": 1317.9186, "step": 21 }, { "epoch": 0.008604884249639426, "grad_norm": 1.590285062789917, "learning_rate": 9.999714542826806e-06, "loss": 641.1613, "step": 22 }, { "epoch": 0.008996015351895764, "grad_norm": 7149.05322265625, "learning_rate": 9.999492524336743e-06, "loss": 821.1104, "step": 23 }, { "epoch": 0.0093871464541521, "grad_norm": 3.6406304836273193, "learning_rate": 9.999207076821155e-06, "loss": 1000.1948, "step": 24 }, { "epoch": 0.009778277556408439, "grad_norm": 3.2047431468963623, "learning_rate": 9.99885820390154e-06, "loss": 418.2515, "step": 25 }, { "epoch": 0.010169408658664777, "grad_norm": 2.1247236728668213, "learning_rate": 9.998445910004082e-06, "loss": 206.9073, "step": 26 }, { "epoch": 0.010560539760921113, "grad_norm": 6176.64404296875, "learning_rate": 9.997970200359592e-06, "loss": 277.8287, "step": 27 }, { "epoch": 0.010951670863177452, "grad_norm": 0.7870343327522278, "learning_rate": 9.99743108100344e-06, "loss": 28.0133, "step": 28 }, { "epoch": 0.011342801965433788, "grad_norm": 2.914349317550659, "learning_rate": 9.996828558775486e-06, "loss": 528.6738, "step": 29 }, { "epoch": 0.011733933067690126, "grad_norm": 5678.888671875, "learning_rate": 9.996162641319985e-06, "loss": 173.2077, "step": 30 }, { "epoch": 0.012125064169946465, "grad_norm": 1.8666399717330933, "learning_rate": 9.995433337085492e-06, "loss": 227.239, "step": 31 }, { "epoch": 0.012516195272202801, "grad_norm": 2.4892313480377197, "learning_rate": 9.994640655324758e-06, "loss": 32.1116, "step": 32 }, { "epoch": 0.01290732637445914, "grad_norm": 1591.641357421875, "learning_rate": 9.993784606094612e-06, "loss": 211.6248, "step": 33 }, { "epoch": 0.013298457476715476, "grad_norm": 10072.65234375, "learning_rate": 9.992865200255829e-06, "loss": 374.1296, "step": 34 }, { "epoch": 0.013689588578971814, "grad_norm": 4825.43408203125, "learning_rate": 9.991882449472994e-06, "loss": 444.3439, "step": 35 }, { "epoch": 0.014080719681228152, "grad_norm": 30619.453125, "learning_rate": 9.99083636621436e-06, "loss": 1014.899, "step": 36 }, { "epoch": 0.014471850783484489, "grad_norm": 0.7788718342781067, "learning_rate": 9.989726963751683e-06, "loss": 637.2323, "step": 37 }, { "epoch": 0.014862981885740827, "grad_norm": 0.37763819098472595, "learning_rate": 9.988554256160052e-06, "loss": 338.0694, "step": 38 }, { "epoch": 0.015254112987997163, "grad_norm": 5639.20166015625, "learning_rate": 9.987318258317718e-06, "loss": 280.443, "step": 39 }, { "epoch": 0.0156452440902535, "grad_norm": 0.34363701939582825, "learning_rate": 9.986018985905901e-06, "loss": 127.4894, "step": 40 }, { "epoch": 0.01603637519250984, "grad_norm": 1.3454755544662476, "learning_rate": 9.984656455408591e-06, "loss": 227.2196, "step": 41 }, { "epoch": 0.016427506294766178, "grad_norm": 1.1212941408157349, "learning_rate": 9.983230684112338e-06, "loss": 388.5173, "step": 42 }, { "epoch": 0.016818637397022513, "grad_norm": 4135.07958984375, "learning_rate": 9.981741690106035e-06, "loss": 277.9473, "step": 43 }, { "epoch": 0.01720976849927885, "grad_norm": 0.3415931761264801, "learning_rate": 9.980189492280688e-06, "loss": 141.4491, "step": 44 }, { "epoch": 0.01760089960153519, "grad_norm": 0.3497358560562134, "learning_rate": 9.978574110329174e-06, "loss": 125.8424, "step": 45 }, { "epoch": 0.017992030703791528, "grad_norm": 5.21222448348999, "learning_rate": 9.976895564745993e-06, "loss": 693.0908, "step": 46 }, { "epoch": 0.018383161806047866, "grad_norm": 6.805148124694824, "learning_rate": 9.975153876827008e-06, "loss": 237.5693, "step": 47 }, { "epoch": 0.0187742929083042, "grad_norm": 0.32790857553482056, "learning_rate": 9.973349068669178e-06, "loss": 231.3147, "step": 48 }, { "epoch": 0.01916542401056054, "grad_norm": 1.088236927986145, "learning_rate": 9.97148116317027e-06, "loss": 59.4532, "step": 49 }, { "epoch": 0.019556555112816877, "grad_norm": 3.897584915161133, "learning_rate": 9.969550184028572e-06, "loss": 181.5394, "step": 50 }, { "epoch": 0.019947686215073215, "grad_norm": 0.36891791224479675, "learning_rate": 9.9675561557426e-06, "loss": 274.0787, "step": 51 }, { "epoch": 0.020338817317329554, "grad_norm": 4463.36474609375, "learning_rate": 9.965499103610775e-06, "loss": 334.0236, "step": 52 }, { "epoch": 0.02072994841958589, "grad_norm": 1.5177428722381592, "learning_rate": 9.963379053731104e-06, "loss": 116.3086, "step": 53 }, { "epoch": 0.021121079521842227, "grad_norm": 0.948329746723175, "learning_rate": 9.961196033000862e-06, "loss": 167.3062, "step": 54 }, { "epoch": 0.021512210624098565, "grad_norm": 6488.12744140625, "learning_rate": 9.95895006911623e-06, "loss": 280.1733, "step": 55 }, { "epoch": 0.021903341726354903, "grad_norm": 1.1784950494766235, "learning_rate": 9.956641190571967e-06, "loss": 211.0869, "step": 56 }, { "epoch": 0.02229447282861124, "grad_norm": 1.0954192876815796, "learning_rate": 9.954269426661023e-06, "loss": 108.0197, "step": 57 }, { "epoch": 0.022685603930867576, "grad_norm": 2431.99755859375, "learning_rate": 9.951834807474191e-06, "loss": 182.8621, "step": 58 }, { "epoch": 0.023076735033123914, "grad_norm": 0.5318559408187866, "learning_rate": 9.949337363899709e-06, "loss": 203.5584, "step": 59 }, { "epoch": 0.023467866135380253, "grad_norm": 611.1079711914062, "learning_rate": 9.946777127622874e-06, "loss": 304.5495, "step": 60 }, { "epoch": 0.02385899723763659, "grad_norm": 1.1814780235290527, "learning_rate": 9.944154131125643e-06, "loss": 305.7924, "step": 61 }, { "epoch": 0.02425012833989293, "grad_norm": 0.3324390947818756, "learning_rate": 9.941468407686216e-06, "loss": 133.2165, "step": 62 }, { "epoch": 0.024641259442149264, "grad_norm": 0.3267706036567688, "learning_rate": 9.938719991378614e-06, "loss": 234.4715, "step": 63 }, { "epoch": 0.025032390544405602, "grad_norm": 1.3336584568023682, "learning_rate": 9.935908917072253e-06, "loss": 44.1555, "step": 64 }, { "epoch": 0.02542352164666194, "grad_norm": 0.33313995599746704, "learning_rate": 9.933035220431489e-06, "loss": 199.4248, "step": 65 }, { "epoch": 0.02581465274891828, "grad_norm": 3202.915771484375, "learning_rate": 9.930098937915177e-06, "loss": 142.3462, "step": 66 }, { "epoch": 0.026205783851174617, "grad_norm": 1.4841117858886719, "learning_rate": 9.927100106776213e-06, "loss": 334.0991, "step": 67 }, { "epoch": 0.02659691495343095, "grad_norm": 3751.06689453125, "learning_rate": 9.924038765061042e-06, "loss": 254.8563, "step": 68 }, { "epoch": 0.02698804605568729, "grad_norm": 0.25357913970947266, "learning_rate": 9.920914951609189e-06, "loss": 173.2204, "step": 69 }, { "epoch": 0.027379177157943628, "grad_norm": 619.5060424804688, "learning_rate": 9.917728706052765e-06, "loss": 158.5274, "step": 70 }, { "epoch": 0.027770308260199966, "grad_norm": 0.3924911320209503, "learning_rate": 9.914480068815964e-06, "loss": 101.5475, "step": 71 }, { "epoch": 0.028161439362456304, "grad_norm": 477.5389404296875, "learning_rate": 9.91116908111455e-06, "loss": 150.1975, "step": 72 }, { "epoch": 0.02855257046471264, "grad_norm": 0.28718748688697815, "learning_rate": 9.907795784955327e-06, "loss": 209.6969, "step": 73 }, { "epoch": 0.028943701566968977, "grad_norm": 0.29111266136169434, "learning_rate": 9.90436022313562e-06, "loss": 153.0687, "step": 74 }, { "epoch": 0.029334832669225316, "grad_norm": 323.9737243652344, "learning_rate": 9.900862439242719e-06, "loss": 74.2796, "step": 75 }, { "epoch": 0.029725963771481654, "grad_norm": 2.6136021614074707, "learning_rate": 9.897302477653334e-06, "loss": 291.2503, "step": 76 }, { "epoch": 0.030117094873737992, "grad_norm": 0.22756816446781158, "learning_rate": 9.893680383533027e-06, "loss": 96.8319, "step": 77 }, { "epoch": 0.030508225975994327, "grad_norm": 0.9840016961097717, "learning_rate": 9.889996202835642e-06, "loss": 309.5101, "step": 78 }, { "epoch": 0.030899357078250665, "grad_norm": 0.275417685508728, "learning_rate": 9.88624998230272e-06, "loss": 234.1608, "step": 79 }, { "epoch": 0.031290488180507, "grad_norm": 0.7580969929695129, "learning_rate": 9.882441769462911e-06, "loss": 237.8195, "step": 80 }, { "epoch": 0.03168161928276334, "grad_norm": 0.76526939868927, "learning_rate": 9.878571612631364e-06, "loss": 84.615, "step": 81 }, { "epoch": 0.03207275038501968, "grad_norm": 373.3411560058594, "learning_rate": 9.874639560909118e-06, "loss": 300.4078, "step": 82 }, { "epoch": 0.03246388148727602, "grad_norm": 1.3133848905563354, "learning_rate": 9.870645664182478e-06, "loss": 131.5505, "step": 83 }, { "epoch": 0.032855012589532356, "grad_norm": 311.3302917480469, "learning_rate": 9.86658997312238e-06, "loss": 196.0266, "step": 84 }, { "epoch": 0.033246143691788695, "grad_norm": 0.6496802568435669, "learning_rate": 9.862472539183757e-06, "loss": 73.9141, "step": 85 }, { "epoch": 0.033637274794045026, "grad_norm": 459.7312927246094, "learning_rate": 9.858293414604871e-06, "loss": 152.4336, "step": 86 }, { "epoch": 0.034028405896301364, "grad_norm": 335.05426025390625, "learning_rate": 9.854052652406666e-06, "loss": 239.4406, "step": 87 }, { "epoch": 0.0344195369985577, "grad_norm": 1.7906296253204346, "learning_rate": 9.849750306392085e-06, "loss": 129.0228, "step": 88 }, { "epoch": 0.03481066810081404, "grad_norm": 0.2897918224334717, "learning_rate": 9.84538643114539e-06, "loss": 150.6289, "step": 89 }, { "epoch": 0.03520179920307038, "grad_norm": 486.94952392578125, "learning_rate": 9.840961082031473e-06, "loss": 98.7655, "step": 90 }, { "epoch": 0.03559293030532672, "grad_norm": 1.4039978981018066, "learning_rate": 9.836474315195148e-06, "loss": 153.2002, "step": 91 }, { "epoch": 0.035984061407583055, "grad_norm": 2591.096435546875, "learning_rate": 9.831926187560441e-06, "loss": 345.5845, "step": 92 }, { "epoch": 0.036375192509839394, "grad_norm": 2290.58740234375, "learning_rate": 9.827316756829871e-06, "loss": 95.79, "step": 93 }, { "epoch": 0.03676632361209573, "grad_norm": 2.362971067428589, "learning_rate": 9.822646081483713e-06, "loss": 177.6802, "step": 94 }, { "epoch": 0.03715745471435207, "grad_norm": 0.3684510886669159, "learning_rate": 9.817914220779258e-06, "loss": 364.415, "step": 95 }, { "epoch": 0.0375485858166084, "grad_norm": 3289.91162109375, "learning_rate": 9.81312123475006e-06, "loss": 353.144, "step": 96 }, { "epoch": 0.03793971691886474, "grad_norm": 0.3244830369949341, "learning_rate": 9.808267184205182e-06, "loss": 189.2679, "step": 97 }, { "epoch": 0.03833084802112108, "grad_norm": 1.0287890434265137, "learning_rate": 9.80335213072841e-06, "loss": 139.2987, "step": 98 }, { "epoch": 0.038721979123377416, "grad_norm": 1508.3958740234375, "learning_rate": 9.798376136677486e-06, "loss": 187.994, "step": 99 }, { "epoch": 0.039113110225633754, "grad_norm": 2.16009521484375, "learning_rate": 9.793339265183303e-06, "loss": 142.5293, "step": 100 }, { "epoch": 0.03950424132789009, "grad_norm": 0.4904083013534546, "learning_rate": 9.788241580149123e-06, "loss": 147.5625, "step": 101 }, { "epoch": 0.03989537243014643, "grad_norm": 1.0540492534637451, "learning_rate": 9.783083146249749e-06, "loss": 195.4026, "step": 102 }, { "epoch": 0.04028650353240277, "grad_norm": 602.712158203125, "learning_rate": 9.777864028930705e-06, "loss": 101.4185, "step": 103 }, { "epoch": 0.04067763463465911, "grad_norm": 2.976310968399048, "learning_rate": 9.77258429440742e-06, "loss": 218.7537, "step": 104 }, { "epoch": 0.041068765736915445, "grad_norm": 2.811608076095581, "learning_rate": 9.767244009664376e-06, "loss": 96.5371, "step": 105 }, { "epoch": 0.04145989683917178, "grad_norm": 0.34032607078552246, "learning_rate": 9.761843242454261e-06, "loss": 191.7003, "step": 106 }, { "epoch": 0.041851027941428115, "grad_norm": 5016.841796875, "learning_rate": 9.75638206129711e-06, "loss": 112.3467, "step": 107 }, { "epoch": 0.04224215904368445, "grad_norm": 0.2334691882133484, "learning_rate": 9.750860535479434e-06, "loss": 153.987, "step": 108 }, { "epoch": 0.04263329014594079, "grad_norm": 0.16604840755462646, "learning_rate": 9.745278735053345e-06, "loss": 164.6346, "step": 109 }, { "epoch": 0.04302442124819713, "grad_norm": 2.011397361755371, "learning_rate": 9.73963673083566e-06, "loss": 96.3216, "step": 110 }, { "epoch": 0.04341555235045347, "grad_norm": 216.0008544921875, "learning_rate": 9.733934594407012e-06, "loss": 97.9669, "step": 111 }, { "epoch": 0.043806683452709806, "grad_norm": 781.0093383789062, "learning_rate": 9.728172398110935e-06, "loss": 124.8754, "step": 112 }, { "epoch": 0.044197814554966144, "grad_norm": 1206.0252685546875, "learning_rate": 9.722350215052946e-06, "loss": 126.4322, "step": 113 }, { "epoch": 0.04458894565722248, "grad_norm": 1861.3472900390625, "learning_rate": 9.716468119099626e-06, "loss": 111.0165, "step": 114 }, { "epoch": 0.04498007675947882, "grad_norm": 0.7697561979293823, "learning_rate": 9.710526184877667e-06, "loss": 1.3695, "step": 115 }, { "epoch": 0.04537120786173515, "grad_norm": 0.5405284762382507, "learning_rate": 9.704524487772944e-06, "loss": 126.1109, "step": 116 }, { "epoch": 0.04576233896399149, "grad_norm": 0.6306818723678589, "learning_rate": 9.698463103929542e-06, "loss": 21.7656, "step": 117 }, { "epoch": 0.04615347006624783, "grad_norm": 3005.80712890625, "learning_rate": 9.692342110248802e-06, "loss": 127.39, "step": 118 }, { "epoch": 0.04654460116850417, "grad_norm": 392.67950439453125, "learning_rate": 9.68616158438834e-06, "loss": 110.3292, "step": 119 }, { "epoch": 0.046935732270760505, "grad_norm": 3.171126365661621, "learning_rate": 9.679921604761056e-06, "loss": 132.2241, "step": 120 }, { "epoch": 0.04732686337301684, "grad_norm": 0.4518907368183136, "learning_rate": 9.673622250534155e-06, "loss": 136.119, "step": 121 }, { "epoch": 0.04771799447527318, "grad_norm": 2589.1015625, "learning_rate": 9.66726360162813e-06, "loss": 381.2232, "step": 122 }, { "epoch": 0.04810912557752952, "grad_norm": 0.1722714751958847, "learning_rate": 9.660845738715743e-06, "loss": 97.7159, "step": 123 }, { "epoch": 0.04850025667978586, "grad_norm": 0.5003955364227295, "learning_rate": 9.654368743221022e-06, "loss": 50.7664, "step": 124 }, { "epoch": 0.048891387782042196, "grad_norm": 0.28121402859687805, "learning_rate": 9.647832697318207e-06, "loss": 93.6975, "step": 125 }, { "epoch": 0.04928251888429853, "grad_norm": 2588.393310546875, "learning_rate": 9.641237683930722e-06, "loss": 54.1281, "step": 126 }, { "epoch": 0.049673649986554866, "grad_norm": 522.7401123046875, "learning_rate": 9.63458378673011e-06, "loss": 21.376, "step": 127 }, { "epoch": 0.050064781088811204, "grad_norm": 1337.2923583984375, "learning_rate": 9.627871090134984e-06, "loss": 97.5455, "step": 128 }, { "epoch": 0.05045591219106754, "grad_norm": 0.17062804102897644, "learning_rate": 9.621099679309948e-06, "loss": 66.8805, "step": 129 }, { "epoch": 0.05084704329332388, "grad_norm": 1.8228886127471924, "learning_rate": 9.61426964016452e-06, "loss": 90.8759, "step": 130 }, { "epoch": 0.05123817439558022, "grad_norm": 0.9226913452148438, "learning_rate": 9.60738105935204e-06, "loss": 57.6078, "step": 131 }, { "epoch": 0.05162930549783656, "grad_norm": 0.695069432258606, "learning_rate": 9.60043402426857e-06, "loss": 125.0028, "step": 132 }, { "epoch": 0.052020436600092895, "grad_norm": 1.5678439140319824, "learning_rate": 9.593428623051793e-06, "loss": 94.1563, "step": 133 }, { "epoch": 0.052411567702349234, "grad_norm": 1.5755531787872314, "learning_rate": 9.58636494457988e-06, "loss": 83.3838, "step": 134 }, { "epoch": 0.05280269880460557, "grad_norm": 0.2734808623790741, "learning_rate": 9.57924307847038e-06, "loss": 31.6734, "step": 135 }, { "epoch": 0.0531938299068619, "grad_norm": 1.4930920600891113, "learning_rate": 9.572063115079063e-06, "loss": 126.7836, "step": 136 }, { "epoch": 0.05358496100911824, "grad_norm": 0.25422215461730957, "learning_rate": 9.564825145498795e-06, "loss": 76.5774, "step": 137 }, { "epoch": 0.05397609211137458, "grad_norm": 163.52911376953125, "learning_rate": 9.557529261558367e-06, "loss": 53.7238, "step": 138 }, { "epoch": 0.05436722321363092, "grad_norm": 553.9047241210938, "learning_rate": 9.550175555821333e-06, "loss": 76.2747, "step": 139 }, { "epoch": 0.054758354315887256, "grad_norm": 0.18982116878032684, "learning_rate": 9.542764121584845e-06, "loss": 289.7023, "step": 140 }, { "epoch": 0.055149485418143594, "grad_norm": 1.4099926948547363, "learning_rate": 9.53529505287845e-06, "loss": 43.1031, "step": 141 }, { "epoch": 0.05554061652039993, "grad_norm": 1.5430524349212646, "learning_rate": 9.527768444462922e-06, "loss": 4.821, "step": 142 }, { "epoch": 0.05593174762265627, "grad_norm": 0.40151554346084595, "learning_rate": 9.520184391829037e-06, "loss": 67.8287, "step": 143 }, { "epoch": 0.05632287872491261, "grad_norm": 0.22733083367347717, "learning_rate": 9.512542991196377e-06, "loss": 162.4188, "step": 144 }, { "epoch": 0.05671400982716895, "grad_norm": 0.44649437069892883, "learning_rate": 9.504844339512096e-06, "loss": 80.723, "step": 145 }, { "epoch": 0.05710514092942528, "grad_norm": 4683.439453125, "learning_rate": 9.497088534449707e-06, "loss": 362.2365, "step": 146 }, { "epoch": 0.05749627203168162, "grad_norm": 1.031704306602478, "learning_rate": 9.489275674407826e-06, "loss": 164.8387, "step": 147 }, { "epoch": 0.057887403133937955, "grad_norm": 0.1672774702310562, "learning_rate": 9.481405858508935e-06, "loss": 117.6841, "step": 148 }, { "epoch": 0.05827853423619429, "grad_norm": 292.0991516113281, "learning_rate": 9.473479186598115e-06, "loss": 86.3267, "step": 149 }, { "epoch": 0.05866966533845063, "grad_norm": 2.811321496963501, "learning_rate": 9.465495759241793e-06, "loss": 291.6378, "step": 150 }, { "epoch": 0.05906079644070697, "grad_norm": 3673.74951171875, "learning_rate": 9.457455677726447e-06, "loss": 118.6411, "step": 151 }, { "epoch": 0.05945192754296331, "grad_norm": 3.3059866428375244, "learning_rate": 9.449359044057344e-06, "loss": 152.2512, "step": 152 }, { "epoch": 0.059843058645219646, "grad_norm": 1.5615016222000122, "learning_rate": 9.441205960957221e-06, "loss": 75.4238, "step": 153 }, { "epoch": 0.060234189747475984, "grad_norm": 129.04693603515625, "learning_rate": 9.432996531865001e-06, "loss": 98.264, "step": 154 }, { "epoch": 0.06062532084973232, "grad_norm": 1.3238544464111328, "learning_rate": 9.424730860934474e-06, "loss": 138.5685, "step": 155 }, { "epoch": 0.061016451951988654, "grad_norm": 0.6455661058425903, "learning_rate": 9.416409053032971e-06, "loss": 76.155, "step": 156 }, { "epoch": 0.06140758305424499, "grad_norm": 3.3306515216827393, "learning_rate": 9.408031213740045e-06, "loss": 193.0163, "step": 157 }, { "epoch": 0.06179871415650133, "grad_norm": 363.2554626464844, "learning_rate": 9.399597449346119e-06, "loss": 71.2268, "step": 158 }, { "epoch": 0.06218984525875767, "grad_norm": 1775.3311767578125, "learning_rate": 9.391107866851143e-06, "loss": 102.3627, "step": 159 }, { "epoch": 0.062580976361014, "grad_norm": 0.23126842081546783, "learning_rate": 9.382562573963238e-06, "loss": 33.1838, "step": 160 }, { "epoch": 0.06297210746327034, "grad_norm": 425.8428955078125, "learning_rate": 9.37396167909733e-06, "loss": 122.2196, "step": 161 }, { "epoch": 0.06336323856552668, "grad_norm": 2.3558237552642822, "learning_rate": 9.365305291373769e-06, "loss": 91.5736, "step": 162 }, { "epoch": 0.06375436966778301, "grad_norm": 0.19831174612045288, "learning_rate": 9.356593520616948e-06, "loss": 126.7139, "step": 163 }, { "epoch": 0.06414550077003936, "grad_norm": 2583.596435546875, "learning_rate": 9.347826477353911e-06, "loss": 114.9668, "step": 164 }, { "epoch": 0.06453663187229569, "grad_norm": 468.4013977050781, "learning_rate": 9.33900427281295e-06, "loss": 58.1353, "step": 165 }, { "epoch": 0.06492776297455204, "grad_norm": 0.607861340045929, "learning_rate": 9.330127018922195e-06, "loss": 89.4309, "step": 166 }, { "epoch": 0.06531889407680837, "grad_norm": 1.087489128112793, "learning_rate": 9.321194828308185e-06, "loss": 57.7468, "step": 167 }, { "epoch": 0.06571002517906471, "grad_norm": 1.4299119710922241, "learning_rate": 9.312207814294454e-06, "loss": 131.9059, "step": 168 }, { "epoch": 0.06610115628132104, "grad_norm": 0.32067033648490906, "learning_rate": 9.303166090900082e-06, "loss": 113.9032, "step": 169 }, { "epoch": 0.06649228738357739, "grad_norm": 2900.03857421875, "learning_rate": 9.294069772838253e-06, "loss": 62.0634, "step": 170 }, { "epoch": 0.06688341848583372, "grad_norm": 2043.025634765625, "learning_rate": 9.284918975514798e-06, "loss": 128.1048, "step": 171 }, { "epoch": 0.06727454958809005, "grad_norm": 763.0787963867188, "learning_rate": 9.275713815026732e-06, "loss": 100.8595, "step": 172 }, { "epoch": 0.0676656806903464, "grad_norm": 316.2015380859375, "learning_rate": 9.266454408160779e-06, "loss": 83.9507, "step": 173 }, { "epoch": 0.06805681179260273, "grad_norm": 1.8424168825149536, "learning_rate": 9.257140872391895e-06, "loss": 150.7857, "step": 174 }, { "epoch": 0.06844794289485907, "grad_norm": 3.162083387374878, "learning_rate": 9.24777332588177e-06, "loss": 109.7346, "step": 175 }, { "epoch": 0.0688390739971154, "grad_norm": 0.48428380489349365, "learning_rate": 9.238351887477338e-06, "loss": 198.0632, "step": 176 }, { "epoch": 0.06923020509937175, "grad_norm": 283.169921875, "learning_rate": 9.22887667670926e-06, "loss": 133.4527, "step": 177 }, { "epoch": 0.06962133620162808, "grad_norm": 0.24236121773719788, "learning_rate": 9.219347813790416e-06, "loss": 134.3827, "step": 178 }, { "epoch": 0.07001246730388443, "grad_norm": 1.1005642414093018, "learning_rate": 9.209765419614375e-06, "loss": 95.9749, "step": 179 }, { "epoch": 0.07040359840614076, "grad_norm": 0.2186761498451233, "learning_rate": 9.200129615753858e-06, "loss": 99.3441, "step": 180 }, { "epoch": 0.07079472950839709, "grad_norm": 1.1894441843032837, "learning_rate": 9.190440524459203e-06, "loss": 64.4476, "step": 181 }, { "epoch": 0.07118586061065343, "grad_norm": 1906.7816162109375, "learning_rate": 9.180698268656814e-06, "loss": 76.6484, "step": 182 }, { "epoch": 0.07157699171290977, "grad_norm": 1806.0740966796875, "learning_rate": 9.170902971947589e-06, "loss": 124.0878, "step": 183 }, { "epoch": 0.07196812281516611, "grad_norm": 0.9422973394393921, "learning_rate": 9.16105475860537e-06, "loss": 61.4217, "step": 184 }, { "epoch": 0.07235925391742244, "grad_norm": 0.22520415484905243, "learning_rate": 9.151153753575351e-06, "loss": 76.0462, "step": 185 }, { "epoch": 0.07275038501967879, "grad_norm": 0.5669053792953491, "learning_rate": 9.141200082472503e-06, "loss": 66.0641, "step": 186 }, { "epoch": 0.07314151612193512, "grad_norm": 4875.95751953125, "learning_rate": 9.131193871579975e-06, "loss": 363.5605, "step": 187 }, { "epoch": 0.07353264722419146, "grad_norm": 1.5607740879058838, "learning_rate": 9.121135247847492e-06, "loss": 163.8841, "step": 188 }, { "epoch": 0.0739237783264478, "grad_norm": 0.15883424878120422, "learning_rate": 9.111024338889748e-06, "loss": 99.735, "step": 189 }, { "epoch": 0.07431490942870414, "grad_norm": 1.4388748407363892, "learning_rate": 9.10086127298478e-06, "loss": 37.5176, "step": 190 }, { "epoch": 0.07470604053096047, "grad_norm": 5.2772040367126465, "learning_rate": 9.090646179072352e-06, "loss": 107.7335, "step": 191 }, { "epoch": 0.0750971716332168, "grad_norm": 1.8319369554519653, "learning_rate": 9.080379186752304e-06, "loss": 71.8792, "step": 192 }, { "epoch": 0.07548830273547315, "grad_norm": 0.7392552495002747, "learning_rate": 9.070060426282924e-06, "loss": 113.0879, "step": 193 }, { "epoch": 0.07587943383772948, "grad_norm": 1.0116955041885376, "learning_rate": 9.059690028579285e-06, "loss": 37.4901, "step": 194 }, { "epoch": 0.07627056493998582, "grad_norm": 0.36633720993995667, "learning_rate": 9.049268125211577e-06, "loss": 33.619, "step": 195 }, { "epoch": 0.07666169604224216, "grad_norm": 0.25814440846443176, "learning_rate": 9.038794848403463e-06, "loss": 73.8944, "step": 196 }, { "epoch": 0.0770528271444985, "grad_norm": 0.9375834465026855, "learning_rate": 9.028270331030373e-06, "loss": 130.3545, "step": 197 }, { "epoch": 0.07744395824675483, "grad_norm": 0.18285518884658813, "learning_rate": 9.017694706617836e-06, "loss": 52.1208, "step": 198 }, { "epoch": 0.07783508934901118, "grad_norm": 3008.265869140625, "learning_rate": 9.007068109339783e-06, "loss": 67.7978, "step": 199 }, { "epoch": 0.07822622045126751, "grad_norm": 0.1591944545507431, "learning_rate": 8.996390674016839e-06, "loss": 56.9001, "step": 200 }, { "epoch": 0.07861735155352384, "grad_norm": 0.13576161861419678, "learning_rate": 8.985662536114614e-06, "loss": 136.3152, "step": 201 }, { "epoch": 0.07900848265578019, "grad_norm": 411.30450439453125, "learning_rate": 8.97488383174199e-06, "loss": 125.2447, "step": 202 }, { "epoch": 0.07939961375803652, "grad_norm": 0.5793523788452148, "learning_rate": 8.964054697649389e-06, "loss": 179.7917, "step": 203 }, { "epoch": 0.07979074486029286, "grad_norm": 1.1756185293197632, "learning_rate": 8.953175271227042e-06, "loss": 208.6852, "step": 204 }, { "epoch": 0.08018187596254919, "grad_norm": 0.4390352666378021, "learning_rate": 8.94224569050324e-06, "loss": 78.6966, "step": 205 }, { "epoch": 0.08057300706480554, "grad_norm": 0.20057456195354462, "learning_rate": 8.931266094142588e-06, "loss": 79.7404, "step": 206 }, { "epoch": 0.08096413816706187, "grad_norm": 2346.697509765625, "learning_rate": 8.920236621444243e-06, "loss": 162.8069, "step": 207 }, { "epoch": 0.08135526926931821, "grad_norm": 0.19323071837425232, "learning_rate": 8.90915741234015e-06, "loss": 182.8168, "step": 208 }, { "epoch": 0.08174640037157455, "grad_norm": 301.8100891113281, "learning_rate": 8.89802860739326e-06, "loss": 75.6532, "step": 209 }, { "epoch": 0.08213753147383089, "grad_norm": 1862.041748046875, "learning_rate": 8.88685034779576e-06, "loss": 147.3607, "step": 210 }, { "epoch": 0.08252866257608722, "grad_norm": 0.15555168688297272, "learning_rate": 8.87562277536726e-06, "loss": 8.7277, "step": 211 }, { "epoch": 0.08291979367834355, "grad_norm": 0.22444282472133636, "learning_rate": 8.864346032553016e-06, "loss": 168.385, "step": 212 }, { "epoch": 0.0833109247805999, "grad_norm": 1.4300802946090698, "learning_rate": 8.853020262422111e-06, "loss": 170.4068, "step": 213 }, { "epoch": 0.08370205588285623, "grad_norm": 0.5101105570793152, "learning_rate": 8.84164560866564e-06, "loss": 322.9848, "step": 214 }, { "epoch": 0.08409318698511258, "grad_norm": 1.2417665719985962, "learning_rate": 8.83022221559489e-06, "loss": 265.6229, "step": 215 }, { "epoch": 0.0844843180873689, "grad_norm": 0.5343866944313049, "learning_rate": 8.818750228139513e-06, "loss": 146.003, "step": 216 }, { "epoch": 0.08487544918962525, "grad_norm": 0.30504119396209717, "learning_rate": 8.807229791845673e-06, "loss": 37.3566, "step": 217 }, { "epoch": 0.08526658029188158, "grad_norm": 0.8993078470230103, "learning_rate": 8.795661052874217e-06, "loss": 83.1912, "step": 218 }, { "epoch": 0.08565771139413793, "grad_norm": 520.2239379882812, "learning_rate": 8.78404415799881e-06, "loss": 6.4627, "step": 219 }, { "epoch": 0.08604884249639426, "grad_norm": 1.5063972473144531, "learning_rate": 8.772379254604074e-06, "loss": 59.2478, "step": 220 }, { "epoch": 0.0864399735986506, "grad_norm": 0.9105270504951477, "learning_rate": 8.76066649068372e-06, "loss": 30.8765, "step": 221 }, { "epoch": 0.08683110470090694, "grad_norm": 1.8939876556396484, "learning_rate": 8.748906014838672e-06, "loss": 147.7755, "step": 222 }, { "epoch": 0.08722223580316327, "grad_norm": 0.8894091844558716, "learning_rate": 8.737097976275177e-06, "loss": 229.9513, "step": 223 }, { "epoch": 0.08761336690541961, "grad_norm": 0.28113725781440735, "learning_rate": 8.725242524802919e-06, "loss": 185.9432, "step": 224 }, { "epoch": 0.08800449800767594, "grad_norm": 2170.448974609375, "learning_rate": 8.713339810833105e-06, "loss": 86.6734, "step": 225 }, { "epoch": 0.08839562910993229, "grad_norm": 0.9475433826446533, "learning_rate": 8.701389985376578e-06, "loss": 117.112, "step": 226 }, { "epoch": 0.08878676021218862, "grad_norm": 0.6161375641822815, "learning_rate": 8.689393200041878e-06, "loss": 75.888, "step": 227 }, { "epoch": 0.08917789131444497, "grad_norm": 1.6829807758331299, "learning_rate": 8.677349607033336e-06, "loss": 23.0701, "step": 228 }, { "epoch": 0.0895690224167013, "grad_norm": 0.21339260041713715, "learning_rate": 8.665259359149132e-06, "loss": 28.2332, "step": 229 }, { "epoch": 0.08996015351895764, "grad_norm": 2635.137451171875, "learning_rate": 8.653122609779365e-06, "loss": 124.7725, "step": 230 }, { "epoch": 0.09035128462121397, "grad_norm": 0.34213894605636597, "learning_rate": 8.640939512904097e-06, "loss": 96.0887, "step": 231 }, { "epoch": 0.0907424157234703, "grad_norm": 0.23164384067058563, "learning_rate": 8.62871022309141e-06, "loss": 69.6245, "step": 232 }, { "epoch": 0.09113354682572665, "grad_norm": 1.0450925827026367, "learning_rate": 8.61643489549544e-06, "loss": 60.8622, "step": 233 }, { "epoch": 0.09152467792798298, "grad_norm": 928.1207275390625, "learning_rate": 8.604113685854407e-06, "loss": 200.9607, "step": 234 }, { "epoch": 0.09191580903023933, "grad_norm": 0.23424312472343445, "learning_rate": 8.591746750488639e-06, "loss": 37.2375, "step": 235 }, { "epoch": 0.09230694013249566, "grad_norm": 0.17486761510372162, "learning_rate": 8.579334246298593e-06, "loss": 92.1142, "step": 236 }, { "epoch": 0.092698071234752, "grad_norm": 0.2754494845867157, "learning_rate": 8.566876330762861e-06, "loss": 146.6022, "step": 237 }, { "epoch": 0.09308920233700833, "grad_norm": 0.6534192562103271, "learning_rate": 8.554373161936176e-06, "loss": 152.2259, "step": 238 }, { "epoch": 0.09348033343926468, "grad_norm": 0.19244331121444702, "learning_rate": 8.541824898447399e-06, "loss": 285.4724, "step": 239 }, { "epoch": 0.09387146454152101, "grad_norm": 0.6852802038192749, "learning_rate": 8.529231699497512e-06, "loss": 170.299, "step": 240 }, { "epoch": 0.09426259564377736, "grad_norm": 3.1934654712677, "learning_rate": 8.516593724857598e-06, "loss": 167.2633, "step": 241 }, { "epoch": 0.09465372674603369, "grad_norm": 0.19524431228637695, "learning_rate": 8.503911134866819e-06, "loss": 103.9347, "step": 242 }, { "epoch": 0.09504485784829002, "grad_norm": 316.60107421875, "learning_rate": 8.491184090430365e-06, "loss": 82.532, "step": 243 }, { "epoch": 0.09543598895054636, "grad_norm": 1987.078125, "learning_rate": 8.478412753017433e-06, "loss": 205.8121, "step": 244 }, { "epoch": 0.0958271200528027, "grad_norm": 4.607306957244873, "learning_rate": 8.465597284659163e-06, "loss": 28.8357, "step": 245 }, { "epoch": 0.09621825115505904, "grad_norm": 317.93145751953125, "learning_rate": 8.452737847946597e-06, "loss": 96.3762, "step": 246 }, { "epoch": 0.09660938225731537, "grad_norm": 1839.33740234375, "learning_rate": 8.439834606028594e-06, "loss": 81.6538, "step": 247 }, { "epoch": 0.09700051335957172, "grad_norm": 1.588068962097168, "learning_rate": 8.426887722609787e-06, "loss": 86.3677, "step": 248 }, { "epoch": 0.09739164446182805, "grad_norm": 0.27696385979652405, "learning_rate": 8.413897361948484e-06, "loss": 37.9441, "step": 249 }, { "epoch": 0.09778277556408439, "grad_norm": 0.5097156167030334, "learning_rate": 8.400863688854598e-06, "loss": 46.0221, "step": 250 }, { "epoch": 0.09817390666634072, "grad_norm": 0.16560612618923187, "learning_rate": 8.387786868687549e-06, "loss": 63.0729, "step": 251 }, { "epoch": 0.09856503776859706, "grad_norm": 0.4086504280567169, "learning_rate": 8.374667067354164e-06, "loss": 123.6349, "step": 252 }, { "epoch": 0.0989561688708534, "grad_norm": 2121.2421875, "learning_rate": 8.361504451306585e-06, "loss": 79.6353, "step": 253 }, { "epoch": 0.09934729997310973, "grad_norm": 0.5328729152679443, "learning_rate": 8.34829918754014e-06, "loss": 57.2165, "step": 254 }, { "epoch": 0.09973843107536608, "grad_norm": 0.5634379386901855, "learning_rate": 8.335051443591236e-06, "loss": 80.4085, "step": 255 }, { "epoch": 0.10012956217762241, "grad_norm": 1125.6824951171875, "learning_rate": 8.321761387535231e-06, "loss": 95.3526, "step": 256 }, { "epoch": 0.10052069327987875, "grad_norm": 3538.38525390625, "learning_rate": 8.308429187984298e-06, "loss": 142.8311, "step": 257 }, { "epoch": 0.10091182438213508, "grad_norm": 0.32026898860931396, "learning_rate": 8.295055014085289e-06, "loss": 33.7843, "step": 258 }, { "epoch": 0.10130295548439143, "grad_norm": 3.7499899864196777, "learning_rate": 8.281639035517591e-06, "loss": 64.4205, "step": 259 }, { "epoch": 0.10169408658664776, "grad_norm": 0.6029073596000671, "learning_rate": 8.268181422490969e-06, "loss": 91.6323, "step": 260 }, { "epoch": 0.1020852176889041, "grad_norm": 481.76361083984375, "learning_rate": 8.254682345743406e-06, "loss": 92.7615, "step": 261 }, { "epoch": 0.10247634879116044, "grad_norm": 0.34256768226623535, "learning_rate": 8.241141976538944e-06, "loss": 104.3669, "step": 262 }, { "epoch": 0.10286747989341677, "grad_norm": 1.014054775238037, "learning_rate": 8.227560486665498e-06, "loss": 30.8866, "step": 263 }, { "epoch": 0.10325861099567311, "grad_norm": 0.3670375943183899, "learning_rate": 8.213938048432697e-06, "loss": 89.6379, "step": 264 }, { "epoch": 0.10364974209792945, "grad_norm": 0.2565741539001465, "learning_rate": 8.200274834669675e-06, "loss": 81.031, "step": 265 }, { "epoch": 0.10404087320018579, "grad_norm": 0.15868939459323883, "learning_rate": 8.186571018722894e-06, "loss": 184.6763, "step": 266 }, { "epoch": 0.10443200430244212, "grad_norm": 1.7641242742538452, "learning_rate": 8.172826774453937e-06, "loss": 156.6617, "step": 267 }, { "epoch": 0.10482313540469847, "grad_norm": 2.6388673782348633, "learning_rate": 8.159042276237308e-06, "loss": 36.3769, "step": 268 }, { "epoch": 0.1052142665069548, "grad_norm": 0.3692081868648529, "learning_rate": 8.145217698958213e-06, "loss": 94.0488, "step": 269 }, { "epoch": 0.10560539760921114, "grad_norm": 1.1230888366699219, "learning_rate": 8.131353218010347e-06, "loss": 182.0155, "step": 270 }, { "epoch": 0.10599652871146747, "grad_norm": 2172.541259765625, "learning_rate": 8.117449009293668e-06, "loss": 144.515, "step": 271 }, { "epoch": 0.1063876598137238, "grad_norm": 1854.9971923828125, "learning_rate": 8.10350524921216e-06, "loss": 135.7373, "step": 272 }, { "epoch": 0.10677879091598015, "grad_norm": 0.37005868554115295, "learning_rate": 8.089522114671603e-06, "loss": 71.444, "step": 273 }, { "epoch": 0.10716992201823648, "grad_norm": 1958.9207763671875, "learning_rate": 8.075499783077321e-06, "loss": 129.372, "step": 274 }, { "epoch": 0.10756105312049283, "grad_norm": 0.5390088558197021, "learning_rate": 8.061438432331935e-06, "loss": 205.9629, "step": 275 }, { "epoch": 0.10795218422274916, "grad_norm": 1.4290964603424072, "learning_rate": 8.047338240833108e-06, "loss": 107.3386, "step": 276 }, { "epoch": 0.1083433153250055, "grad_norm": 0.30491700768470764, "learning_rate": 8.033199387471278e-06, "loss": 126.0688, "step": 277 }, { "epoch": 0.10873444642726184, "grad_norm": 0.6881037950515747, "learning_rate": 8.019022051627387e-06, "loss": 137.38, "step": 278 }, { "epoch": 0.10912557752951818, "grad_norm": 0.5975064039230347, "learning_rate": 8.004806413170613e-06, "loss": 49.5408, "step": 279 }, { "epoch": 0.10951670863177451, "grad_norm": 2138.67041015625, "learning_rate": 7.99055265245608e-06, "loss": 87.7601, "step": 280 }, { "epoch": 0.10990783973403086, "grad_norm": 0.19253171980381012, "learning_rate": 7.976260950322572e-06, "loss": 76.5956, "step": 281 }, { "epoch": 0.11029897083628719, "grad_norm": 1.348046898841858, "learning_rate": 7.96193148809024e-06, "loss": 47.8194, "step": 282 }, { "epoch": 0.11069010193854352, "grad_norm": 0.4207615554332733, "learning_rate": 7.9475644475583e-06, "loss": 59.7233, "step": 283 }, { "epoch": 0.11108123304079986, "grad_norm": 1892.4029541015625, "learning_rate": 7.933160011002729e-06, "loss": 147.8456, "step": 284 }, { "epoch": 0.1114723641430562, "grad_norm": 1.315190076828003, "learning_rate": 7.918718361173951e-06, "loss": 93.2988, "step": 285 }, { "epoch": 0.11186349524531254, "grad_norm": 1.3171827793121338, "learning_rate": 7.904239681294515e-06, "loss": 44.8592, "step": 286 }, { "epoch": 0.11225462634756887, "grad_norm": 314.198486328125, "learning_rate": 7.889724155056776e-06, "loss": 87.7935, "step": 287 }, { "epoch": 0.11264575744982522, "grad_norm": 0.28883472084999084, "learning_rate": 7.875171966620567e-06, "loss": 101.7658, "step": 288 }, { "epoch": 0.11303688855208155, "grad_norm": 0.49919599294662476, "learning_rate": 7.860583300610849e-06, "loss": 132.276, "step": 289 }, { "epoch": 0.1134280196543379, "grad_norm": 0.644845187664032, "learning_rate": 7.84595834211538e-06, "loss": 88.4937, "step": 290 }, { "epoch": 0.11381915075659423, "grad_norm": 0.21764078736305237, "learning_rate": 7.83129727668237e-06, "loss": 106.7949, "step": 291 }, { "epoch": 0.11421028185885056, "grad_norm": 0.6382555961608887, "learning_rate": 7.81660029031811e-06, "loss": 86.3284, "step": 292 }, { "epoch": 0.1146014129611069, "grad_norm": 3383.526123046875, "learning_rate": 7.801867569484635e-06, "loss": 163.3528, "step": 293 }, { "epoch": 0.11499254406336323, "grad_norm": 1.0071310997009277, "learning_rate": 7.78709930109734e-06, "loss": 55.8198, "step": 294 }, { "epoch": 0.11538367516561958, "grad_norm": 472.02276611328125, "learning_rate": 7.772295672522615e-06, "loss": 68.5582, "step": 295 }, { "epoch": 0.11577480626787591, "grad_norm": 3507.451904296875, "learning_rate": 7.75745687157547e-06, "loss": 200.0802, "step": 296 }, { "epoch": 0.11616593737013226, "grad_norm": 2918.291748046875, "learning_rate": 7.742583086517151e-06, "loss": 235.3087, "step": 297 }, { "epoch": 0.11655706847238859, "grad_norm": 264.2690124511719, "learning_rate": 7.727674506052744e-06, "loss": 46.5808, "step": 298 }, { "epoch": 0.11694819957464493, "grad_norm": 0.6071652173995972, "learning_rate": 7.712731319328798e-06, "loss": 66.332, "step": 299 }, { "epoch": 0.11733933067690126, "grad_norm": 415.38214111328125, "learning_rate": 7.697753715930906e-06, "loss": 55.6848, "step": 300 }, { "epoch": 0.11773046177915761, "grad_norm": 0.16611334681510925, "learning_rate": 7.682741885881314e-06, "loss": 176.503, "step": 301 }, { "epoch": 0.11812159288141394, "grad_norm": 0.9599153995513916, "learning_rate": 7.667696019636504e-06, "loss": 12.792, "step": 302 }, { "epoch": 0.11851272398367027, "grad_norm": 1961.807373046875, "learning_rate": 7.652616308084774e-06, "loss": 144.7594, "step": 303 }, { "epoch": 0.11890385508592662, "grad_norm": 1706.922119140625, "learning_rate": 7.637502942543825e-06, "loss": 77.7838, "step": 304 }, { "epoch": 0.11929498618818295, "grad_norm": 3011.58349609375, "learning_rate": 7.622356114758328e-06, "loss": 66.3472, "step": 305 }, { "epoch": 0.11968611729043929, "grad_norm": 0.18727894127368927, "learning_rate": 7.607176016897491e-06, "loss": 20.0101, "step": 306 }, { "epoch": 0.12007724839269562, "grad_norm": 747.8145141601562, "learning_rate": 7.591962841552627e-06, "loss": 124.6628, "step": 307 }, { "epoch": 0.12046837949495197, "grad_norm": 2174.573486328125, "learning_rate": 7.576716781734699e-06, "loss": 122.6966, "step": 308 }, { "epoch": 0.1208595105972083, "grad_norm": 1657.6822509765625, "learning_rate": 7.561438030871886e-06, "loss": 90.9553, "step": 309 }, { "epoch": 0.12125064169946465, "grad_norm": 409.1562194824219, "learning_rate": 7.546126782807117e-06, "loss": 39.3561, "step": 310 }, { "epoch": 0.12164177280172098, "grad_norm": 0.1211930438876152, "learning_rate": 7.530783231795615e-06, "loss": 1.299, "step": 311 }, { "epoch": 0.12203290390397731, "grad_norm": 2307.365234375, "learning_rate": 7.515407572502438e-06, "loss": 200.4622, "step": 312 }, { "epoch": 0.12242403500623365, "grad_norm": 0.3849581778049469, "learning_rate": 7.500000000000001e-06, "loss": 126.054, "step": 313 }, { "epoch": 0.12281516610848998, "grad_norm": 2264.787109375, "learning_rate": 7.484560709765605e-06, "loss": 172.344, "step": 314 }, { "epoch": 0.12320629721074633, "grad_norm": 1.2760062217712402, "learning_rate": 7.469089897678958e-06, "loss": 107.9826, "step": 315 }, { "epoch": 0.12359742831300266, "grad_norm": 347.2667236328125, "learning_rate": 7.453587760019691e-06, "loss": 123.0772, "step": 316 }, { "epoch": 0.123988559415259, "grad_norm": 0.7116885781288147, "learning_rate": 7.438054493464859e-06, "loss": 53.4364, "step": 317 }, { "epoch": 0.12437969051751534, "grad_norm": 5167.86669921875, "learning_rate": 7.422490295086457e-06, "loss": 314.3017, "step": 318 }, { "epoch": 0.12477082161977168, "grad_norm": 405.8331604003906, "learning_rate": 7.406895362348916e-06, "loss": 157.4734, "step": 319 }, { "epoch": 0.125161952722028, "grad_norm": 289.1830749511719, "learning_rate": 7.391269893106592e-06, "loss": 86.5292, "step": 320 }, { "epoch": 0.12555308382428434, "grad_norm": 1.3253761529922485, "learning_rate": 7.375614085601265e-06, "loss": 62.8788, "step": 321 }, { "epoch": 0.12594421492654068, "grad_norm": 3807.062744140625, "learning_rate": 7.359928138459615e-06, "loss": 193.2301, "step": 322 }, { "epoch": 0.12633534602879704, "grad_norm": 0.38906916975975037, "learning_rate": 7.344212250690712e-06, "loss": 87.5832, "step": 323 }, { "epoch": 0.12672647713105337, "grad_norm": 1681.7589111328125, "learning_rate": 7.328466621683481e-06, "loss": 167.7496, "step": 324 }, { "epoch": 0.1271176082333097, "grad_norm": 0.22865992784500122, "learning_rate": 7.312691451204178e-06, "loss": 42.7028, "step": 325 }, { "epoch": 0.12750873933556603, "grad_norm": 0.14315825700759888, "learning_rate": 7.296886939393852e-06, "loss": 41.7926, "step": 326 }, { "epoch": 0.1278998704378224, "grad_norm": 0.7125481963157654, "learning_rate": 7.281053286765816e-06, "loss": 150.8858, "step": 327 }, { "epoch": 0.12829100154007872, "grad_norm": 1.3114720582962036, "learning_rate": 7.265190694203086e-06, "loss": 200.4679, "step": 328 }, { "epoch": 0.12868213264233505, "grad_norm": 308.97198486328125, "learning_rate": 7.249299362955846e-06, "loss": 56.5048, "step": 329 }, { "epoch": 0.12907326374459138, "grad_norm": 337.1923522949219, "learning_rate": 7.233379494638891e-06, "loss": 43.4137, "step": 330 }, { "epoch": 0.12946439484684774, "grad_norm": 0.3174319565296173, "learning_rate": 7.217431291229068e-06, "loss": 57.2986, "step": 331 }, { "epoch": 0.12985552594910407, "grad_norm": 4877.64306640625, "learning_rate": 7.201454955062712e-06, "loss": 295.9178, "step": 332 }, { "epoch": 0.1302466570513604, "grad_norm": 2004.761474609375, "learning_rate": 7.185450688833083e-06, "loss": 175.5556, "step": 333 }, { "epoch": 0.13063778815361674, "grad_norm": 0.5494270920753479, "learning_rate": 7.169418695587791e-06, "loss": 95.1294, "step": 334 }, { "epoch": 0.13102891925587307, "grad_norm": 0.4806520938873291, "learning_rate": 7.153359178726222e-06, "loss": 40.1013, "step": 335 }, { "epoch": 0.13142005035812943, "grad_norm": 0.6679964065551758, "learning_rate": 7.137272341996958e-06, "loss": 73.998, "step": 336 }, { "epoch": 0.13181118146038576, "grad_norm": 0.3463609516620636, "learning_rate": 7.121158389495187e-06, "loss": 55.503, "step": 337 }, { "epoch": 0.1322023125626421, "grad_norm": 0.4972734749317169, "learning_rate": 7.10501752566012e-06, "loss": 93.7803, "step": 338 }, { "epoch": 0.13259344366489842, "grad_norm": 0.16640667617321014, "learning_rate": 7.088849955272396e-06, "loss": 118.0719, "step": 339 }, { "epoch": 0.13298457476715478, "grad_norm": 0.20604762434959412, "learning_rate": 7.072655883451478e-06, "loss": 135.9177, "step": 340 }, { "epoch": 0.1333757058694111, "grad_norm": 0.49932876229286194, "learning_rate": 7.056435515653059e-06, "loss": 161.2835, "step": 341 }, { "epoch": 0.13376683697166744, "grad_norm": 0.6121784448623657, "learning_rate": 7.040189057666449e-06, "loss": 12.1418, "step": 342 }, { "epoch": 0.13415796807392377, "grad_norm": 1.3118720054626465, "learning_rate": 7.023916715611969e-06, "loss": 122.6702, "step": 343 }, { "epoch": 0.1345490991761801, "grad_norm": 0.12975674867630005, "learning_rate": 7.007618695938334e-06, "loss": 165.243, "step": 344 }, { "epoch": 0.13494023027843646, "grad_norm": 1.6560322046279907, "learning_rate": 6.991295205420028e-06, "loss": 38.0507, "step": 345 }, { "epoch": 0.1353313613806928, "grad_norm": 0.20794105529785156, "learning_rate": 6.974946451154694e-06, "loss": 22.9494, "step": 346 }, { "epoch": 0.13572249248294913, "grad_norm": 0.2563984990119934, "learning_rate": 6.9585726405604915e-06, "loss": 109.0126, "step": 347 }, { "epoch": 0.13611362358520546, "grad_norm": 2.3350882530212402, "learning_rate": 6.942173981373474e-06, "loss": 25.5488, "step": 348 }, { "epoch": 0.13650475468746182, "grad_norm": 0.6753470301628113, "learning_rate": 6.925750681644954e-06, "loss": 81.2919, "step": 349 }, { "epoch": 0.13689588578971815, "grad_norm": 1.4040716886520386, "learning_rate": 6.90930294973886e-06, "loss": 140.9022, "step": 350 }, { "epoch": 0.13728701689197448, "grad_norm": 0.4277321696281433, "learning_rate": 6.892830994329089e-06, "loss": 86.2413, "step": 351 }, { "epoch": 0.1376781479942308, "grad_norm": 1191.952880859375, "learning_rate": 6.876335024396872e-06, "loss": 152.5764, "step": 352 }, { "epoch": 0.13806927909648714, "grad_norm": 0.21168895065784454, "learning_rate": 6.859815249228106e-06, "loss": 32.4788, "step": 353 }, { "epoch": 0.1384604101987435, "grad_norm": 352.44512939453125, "learning_rate": 6.8432718784107145e-06, "loss": 42.3575, "step": 354 }, { "epoch": 0.13885154130099983, "grad_norm": 0.7770540118217468, "learning_rate": 6.8267051218319766e-06, "loss": 177.1307, "step": 355 }, { "epoch": 0.13924267240325616, "grad_norm": 1410.75146484375, "learning_rate": 6.81011518967587e-06, "loss": 55.0934, "step": 356 }, { "epoch": 0.1396338035055125, "grad_norm": 0.3775579631328583, "learning_rate": 6.793502292420402e-06, "loss": 34.0766, "step": 357 }, { "epoch": 0.14002493460776885, "grad_norm": 3233.5927734375, "learning_rate": 6.7768666408349445e-06, "loss": 149.3327, "step": 358 }, { "epoch": 0.14041606571002518, "grad_norm": 1.1655175685882568, "learning_rate": 6.760208445977551e-06, "loss": 78.598, "step": 359 }, { "epoch": 0.14080719681228152, "grad_norm": 3.140843629837036, "learning_rate": 6.743527919192285e-06, "loss": 51.5391, "step": 360 }, { "epoch": 0.14119832791453785, "grad_norm": 0.5811576843261719, "learning_rate": 6.726825272106539e-06, "loss": 56.4923, "step": 361 }, { "epoch": 0.14158945901679418, "grad_norm": 0.12908467650413513, "learning_rate": 6.710100716628345e-06, "loss": 57.4381, "step": 362 }, { "epoch": 0.14198059011905054, "grad_norm": 573.8088989257812, "learning_rate": 6.693354464943689e-06, "loss": 98.4893, "step": 363 }, { "epoch": 0.14237172122130687, "grad_norm": 117.73070526123047, "learning_rate": 6.676586729513823e-06, "loss": 96.0484, "step": 364 }, { "epoch": 0.1427628523235632, "grad_norm": 1419.90625, "learning_rate": 6.659797723072558e-06, "loss": 95.183, "step": 365 }, { "epoch": 0.14315398342581953, "grad_norm": 542.6209106445312, "learning_rate": 6.642987658623581e-06, "loss": 65.6222, "step": 366 }, { "epoch": 0.1435451145280759, "grad_norm": 1.1538621187210083, "learning_rate": 6.626156749437736e-06, "loss": 120.1217, "step": 367 }, { "epoch": 0.14393624563033222, "grad_norm": 398.84796142578125, "learning_rate": 6.609305209050332e-06, "loss": 74.3819, "step": 368 }, { "epoch": 0.14432737673258855, "grad_norm": 268.26251220703125, "learning_rate": 6.592433251258423e-06, "loss": 76.2617, "step": 369 }, { "epoch": 0.14471850783484488, "grad_norm": 321.3377990722656, "learning_rate": 6.575541090118105e-06, "loss": 40.9459, "step": 370 }, { "epoch": 0.14510963893710124, "grad_norm": 0.22645658254623413, "learning_rate": 6.558628939941792e-06, "loss": 40.5776, "step": 371 }, { "epoch": 0.14550077003935757, "grad_norm": 0.5002371072769165, "learning_rate": 6.541697015295503e-06, "loss": 75.5995, "step": 372 }, { "epoch": 0.1458919011416139, "grad_norm": 0.16194933652877808, "learning_rate": 6.524745530996137e-06, "loss": 84.202, "step": 373 }, { "epoch": 0.14628303224387024, "grad_norm": 0.37668415904045105, "learning_rate": 6.507774702108748e-06, "loss": 83.8723, "step": 374 }, { "epoch": 0.14667416334612657, "grad_norm": 288.6870422363281, "learning_rate": 6.490784743943819e-06, "loss": 128.4052, "step": 375 }, { "epoch": 0.14706529444838293, "grad_norm": 0.13677971065044403, "learning_rate": 6.473775872054522e-06, "loss": 57.1975, "step": 376 }, { "epoch": 0.14745642555063926, "grad_norm": 0.3166632652282715, "learning_rate": 6.456748302233995e-06, "loss": 48.1106, "step": 377 }, { "epoch": 0.1478475566528956, "grad_norm": 1.3575718402862549, "learning_rate": 6.439702250512596e-06, "loss": 129.3716, "step": 378 }, { "epoch": 0.14823868775515192, "grad_norm": 0.21969862282276154, "learning_rate": 6.4226379331551625e-06, "loss": 113.5939, "step": 379 }, { "epoch": 0.14862981885740828, "grad_norm": 0.13174912333488464, "learning_rate": 6.405555566658276e-06, "loss": 32.1963, "step": 380 }, { "epoch": 0.1490209499596646, "grad_norm": 0.863881528377533, "learning_rate": 6.388455367747503e-06, "loss": 29.8159, "step": 381 }, { "epoch": 0.14941208106192094, "grad_norm": 0.6778990030288696, "learning_rate": 6.3713375533746525e-06, "loss": 128.0998, "step": 382 }, { "epoch": 0.14980321216417727, "grad_norm": 1.4469739198684692, "learning_rate": 6.354202340715027e-06, "loss": 52.4234, "step": 383 }, { "epoch": 0.1501943432664336, "grad_norm": 0.18584023416042328, "learning_rate": 6.337049947164656e-06, "loss": 70.7733, "step": 384 }, { "epoch": 0.15058547436868996, "grad_norm": 2002.4371337890625, "learning_rate": 6.319880590337549e-06, "loss": 84.5797, "step": 385 }, { "epoch": 0.1509766054709463, "grad_norm": 0.19816723465919495, "learning_rate": 6.302694488062931e-06, "loss": 76.0305, "step": 386 }, { "epoch": 0.15136773657320263, "grad_norm": 0.5410847067832947, "learning_rate": 6.2854918583824745e-06, "loss": 74.2372, "step": 387 }, { "epoch": 0.15175886767545896, "grad_norm": 1.376298427581787, "learning_rate": 6.268272919547537e-06, "loss": 24.7493, "step": 388 }, { "epoch": 0.15214999877771532, "grad_norm": 142.1651611328125, "learning_rate": 6.251037890016396e-06, "loss": 34.4196, "step": 389 }, { "epoch": 0.15254112987997165, "grad_norm": 2054.256103515625, "learning_rate": 6.233786988451468e-06, "loss": 86.9907, "step": 390 }, { "epoch": 0.15293226098222798, "grad_norm": 3574.6181640625, "learning_rate": 6.216520433716544e-06, "loss": 162.4229, "step": 391 }, { "epoch": 0.1533233920844843, "grad_norm": 1714.3194580078125, "learning_rate": 6.199238444874005e-06, "loss": 124.9201, "step": 392 }, { "epoch": 0.15371452318674064, "grad_norm": 1639.366455078125, "learning_rate": 6.181941241182044e-06, "loss": 76.7506, "step": 393 }, { "epoch": 0.154105654288997, "grad_norm": 1059.76123046875, "learning_rate": 6.164629042091894e-06, "loss": 34.25, "step": 394 }, { "epoch": 0.15449678539125333, "grad_norm": 1.3340238332748413, "learning_rate": 6.1473020672450275e-06, "loss": 85.7469, "step": 395 }, { "epoch": 0.15488791649350966, "grad_norm": 1639.532958984375, "learning_rate": 6.1299605364703826e-06, "loss": 93.6497, "step": 396 }, { "epoch": 0.155279047595766, "grad_norm": 0.20548588037490845, "learning_rate": 6.112604669781572e-06, "loss": 83.9102, "step": 397 }, { "epoch": 0.15567017869802235, "grad_norm": 1.0026606321334839, "learning_rate": 6.095234687374085e-06, "loss": 74.3395, "step": 398 }, { "epoch": 0.15606130980027869, "grad_norm": 453.576171875, "learning_rate": 6.0778508096224985e-06, "loss": 57.6422, "step": 399 }, { "epoch": 0.15645244090253502, "grad_norm": 0.14913234114646912, "learning_rate": 6.060453257077686e-06, "loss": 26.2558, "step": 400 }, { "epoch": 0.15684357200479135, "grad_norm": 0.1819409877061844, "learning_rate": 6.043042250464005e-06, "loss": 57.1572, "step": 401 }, { "epoch": 0.15723470310704768, "grad_norm": 0.3527912497520447, "learning_rate": 6.025618010676516e-06, "loss": 78.8062, "step": 402 }, { "epoch": 0.15762583420930404, "grad_norm": 1.7945302724838257, "learning_rate": 6.008180758778167e-06, "loss": 34.1183, "step": 403 }, { "epoch": 0.15801696531156037, "grad_norm": 0.3905615508556366, "learning_rate": 5.990730715996989e-06, "loss": 38.1853, "step": 404 }, { "epoch": 0.1584080964138167, "grad_norm": 1662.2044677734375, "learning_rate": 5.973268103723293e-06, "loss": 90.833, "step": 405 }, { "epoch": 0.15879922751607303, "grad_norm": 894.9415283203125, "learning_rate": 5.955793143506863e-06, "loss": 54.7208, "step": 406 }, { "epoch": 0.1591903586183294, "grad_norm": 0.2516638934612274, "learning_rate": 5.938306057054139e-06, "loss": 49.8002, "step": 407 }, { "epoch": 0.15958148972058572, "grad_norm": 0.16948434710502625, "learning_rate": 5.920807066225409e-06, "loss": 119.8379, "step": 408 }, { "epoch": 0.15997262082284205, "grad_norm": 0.480121374130249, "learning_rate": 5.903296393031996e-06, "loss": 57.558, "step": 409 }, { "epoch": 0.16036375192509839, "grad_norm": 467.2088623046875, "learning_rate": 5.885774259633432e-06, "loss": 108.7793, "step": 410 }, { "epoch": 0.16075488302735474, "grad_norm": 0.16975107789039612, "learning_rate": 5.8682408883346535e-06, "loss": 23.7654, "step": 411 }, { "epoch": 0.16114601412961108, "grad_norm": 1.1074670553207397, "learning_rate": 5.850696501583164e-06, "loss": 35.2543, "step": 412 }, { "epoch": 0.1615371452318674, "grad_norm": 2842.92529296875, "learning_rate": 5.8331413219662295e-06, "loss": 78.9185, "step": 413 }, { "epoch": 0.16192827633412374, "grad_norm": 0.22508656978607178, "learning_rate": 5.815575572208042e-06, "loss": 25.2656, "step": 414 }, { "epoch": 0.16231940743638007, "grad_norm": 0.14979542791843414, "learning_rate": 5.797999475166897e-06, "loss": 43.0969, "step": 415 }, { "epoch": 0.16271053853863643, "grad_norm": 1.3583056926727295, "learning_rate": 5.78041325383237e-06, "loss": 111.1471, "step": 416 }, { "epoch": 0.16310166964089276, "grad_norm": 0.3384321630001068, "learning_rate": 5.762817131322482e-06, "loss": 168.3297, "step": 417 }, { "epoch": 0.1634928007431491, "grad_norm": 0.4203161895275116, "learning_rate": 5.745211330880872e-06, "loss": 227.7544, "step": 418 }, { "epoch": 0.16388393184540542, "grad_norm": 0.2434893250465393, "learning_rate": 5.7275960758739655e-06, "loss": 188.506, "step": 419 }, { "epoch": 0.16427506294766178, "grad_norm": 374.4583740234375, "learning_rate": 5.709971589788136e-06, "loss": 103.9844, "step": 420 }, { "epoch": 0.1646661940499181, "grad_norm": 0.3567257523536682, "learning_rate": 5.69233809622687e-06, "loss": 45.0813, "step": 421 }, { "epoch": 0.16505732515217444, "grad_norm": 0.3658342659473419, "learning_rate": 5.674695818907943e-06, "loss": 79.1331, "step": 422 }, { "epoch": 0.16544845625443078, "grad_norm": 1.0847684144973755, "learning_rate": 5.65704498166056e-06, "loss": 89.7686, "step": 423 }, { "epoch": 0.1658395873566871, "grad_norm": 1.6579034328460693, "learning_rate": 5.6393858084225305e-06, "loss": 36.3624, "step": 424 }, { "epoch": 0.16623071845894347, "grad_norm": 136.94493103027344, "learning_rate": 5.621718523237427e-06, "loss": 86.6906, "step": 425 }, { "epoch": 0.1666218495611998, "grad_norm": 1437.6072998046875, "learning_rate": 5.604043350251733e-06, "loss": 47.5283, "step": 426 }, { "epoch": 0.16701298066345613, "grad_norm": 1.1629807949066162, "learning_rate": 5.586360513712011e-06, "loss": 69.8016, "step": 427 }, { "epoch": 0.16740411176571246, "grad_norm": 0.08627960830926895, "learning_rate": 5.568670237962045e-06, "loss": 41.2346, "step": 428 }, { "epoch": 0.16779524286796882, "grad_norm": 307.7284851074219, "learning_rate": 5.550972747440007e-06, "loss": 64.6287, "step": 429 }, { "epoch": 0.16818637397022515, "grad_norm": 275.1657409667969, "learning_rate": 5.533268266675601e-06, "loss": 37.8621, "step": 430 }, { "epoch": 0.16857750507248148, "grad_norm": 321.581298828125, "learning_rate": 5.515557020287219e-06, "loss": 21.3013, "step": 431 }, { "epoch": 0.1689686361747378, "grad_norm": 107.34349822998047, "learning_rate": 5.497839232979084e-06, "loss": 22.1243, "step": 432 }, { "epoch": 0.16935976727699414, "grad_norm": 1374.56396484375, "learning_rate": 5.480115129538409e-06, "loss": 36.583, "step": 433 }, { "epoch": 0.1697508983792505, "grad_norm": 0.42376813292503357, "learning_rate": 5.4623849348325396e-06, "loss": 44.8171, "step": 434 }, { "epoch": 0.17014202948150683, "grad_norm": 1.2415663003921509, "learning_rate": 5.444648873806101e-06, "loss": 61.8365, "step": 435 }, { "epoch": 0.17053316058376317, "grad_norm": 0.8514006733894348, "learning_rate": 5.426907171478143e-06, "loss": 46.8816, "step": 436 }, { "epoch": 0.1709242916860195, "grad_norm": 0.2142760455608368, "learning_rate": 5.409160052939292e-06, "loss": 35.3242, "step": 437 }, { "epoch": 0.17131542278827586, "grad_norm": 0.15164269506931305, "learning_rate": 5.391407743348884e-06, "loss": 11.6772, "step": 438 }, { "epoch": 0.1717065538905322, "grad_norm": 1661.5631103515625, "learning_rate": 5.373650467932122e-06, "loss": 53.8261, "step": 439 }, { "epoch": 0.17209768499278852, "grad_norm": 1743.482421875, "learning_rate": 5.355888451977204e-06, "loss": 35.92, "step": 440 }, { "epoch": 0.17248881609504485, "grad_norm": 0.20529299974441528, "learning_rate": 5.3381219208324755e-06, "loss": 33.3962, "step": 441 }, { "epoch": 0.1728799471973012, "grad_norm": 2.2511260509490967, "learning_rate": 5.320351099903565e-06, "loss": 55.9249, "step": 442 }, { "epoch": 0.17327107829955754, "grad_norm": 0.11457404494285583, "learning_rate": 5.302576214650527e-06, "loss": 73.4981, "step": 443 }, { "epoch": 0.17366220940181387, "grad_norm": 138.77809143066406, "learning_rate": 5.284797490584979e-06, "loss": 41.5857, "step": 444 }, { "epoch": 0.1740533405040702, "grad_norm": 259.2716064453125, "learning_rate": 5.267015153267246e-06, "loss": 63.9765, "step": 445 }, { "epoch": 0.17444447160632653, "grad_norm": 1.0777804851531982, "learning_rate": 5.249229428303486e-06, "loss": 11.9817, "step": 446 }, { "epoch": 0.1748356027085829, "grad_norm": 2536.879150390625, "learning_rate": 5.231440541342846e-06, "loss": 57.9686, "step": 447 }, { "epoch": 0.17522673381083922, "grad_norm": 0.2679949700832367, "learning_rate": 5.213648718074584e-06, "loss": 31.2022, "step": 448 }, { "epoch": 0.17561786491309556, "grad_norm": 0.14789415895938873, "learning_rate": 5.1958541842252145e-06, "loss": 46.2886, "step": 449 }, { "epoch": 0.1760089960153519, "grad_norm": 3.2639567852020264, "learning_rate": 5.178057165555636e-06, "loss": 84.1918, "step": 450 }, { "epoch": 0.17640012711760825, "grad_norm": 372.91387939453125, "learning_rate": 5.160257887858278e-06, "loss": 20.9396, "step": 451 }, { "epoch": 0.17679125821986458, "grad_norm": 0.30689093470573425, "learning_rate": 5.142456576954225e-06, "loss": 15.7738, "step": 452 }, { "epoch": 0.1771823893221209, "grad_norm": 247.07366943359375, "learning_rate": 5.1246534586903655e-06, "loss": 19.9637, "step": 453 }, { "epoch": 0.17757352042437724, "grad_norm": 0.7007215619087219, "learning_rate": 5.106848758936508e-06, "loss": 33.0597, "step": 454 }, { "epoch": 0.17796465152663357, "grad_norm": 290.64202880859375, "learning_rate": 5.089042703582533e-06, "loss": 65.3849, "step": 455 }, { "epoch": 0.17835578262888993, "grad_norm": 0.4069232940673828, "learning_rate": 5.071235518535516e-06, "loss": 25.4287, "step": 456 }, { "epoch": 0.17874691373114626, "grad_norm": 0.6326934099197388, "learning_rate": 5.053427429716867e-06, "loss": 43.9984, "step": 457 }, { "epoch": 0.1791380448334026, "grad_norm": 0.269971638917923, "learning_rate": 5.0356186630594585e-06, "loss": 46.612, "step": 458 }, { "epoch": 0.17952917593565892, "grad_norm": 0.21015766263008118, "learning_rate": 5.017809444504768e-06, "loss": 48.1048, "step": 459 }, { "epoch": 0.17992030703791528, "grad_norm": 357.07867431640625, "learning_rate": 5e-06, "loss": 25.4451, "step": 460 }, { "epoch": 0.18031143814017161, "grad_norm": 476.7673645019531, "learning_rate": 4.982190555495236e-06, "loss": 35.3956, "step": 461 }, { "epoch": 0.18070256924242795, "grad_norm": 0.19846689701080322, "learning_rate": 4.964381336940542e-06, "loss": 76.3696, "step": 462 }, { "epoch": 0.18109370034468428, "grad_norm": 0.3459748923778534, "learning_rate": 4.946572570283135e-06, "loss": 28.662, "step": 463 }, { "epoch": 0.1814848314469406, "grad_norm": 2.347224712371826, "learning_rate": 4.928764481464485e-06, "loss": 43.5744, "step": 464 }, { "epoch": 0.18187596254919697, "grad_norm": 1.0240310430526733, "learning_rate": 4.910957296417467e-06, "loss": 38.6815, "step": 465 }, { "epoch": 0.1822670936514533, "grad_norm": 2363.06884765625, "learning_rate": 4.893151241063493e-06, "loss": 97.8371, "step": 466 }, { "epoch": 0.18265822475370963, "grad_norm": 0.7521228790283203, "learning_rate": 4.875346541309637e-06, "loss": 87.3877, "step": 467 }, { "epoch": 0.18304935585596596, "grad_norm": 186.02557373046875, "learning_rate": 4.857543423045775e-06, "loss": 61.2668, "step": 468 }, { "epoch": 0.18344048695822232, "grad_norm": 0.3884137272834778, "learning_rate": 4.839742112141725e-06, "loss": 57.2652, "step": 469 }, { "epoch": 0.18383161806047865, "grad_norm": 1904.343994140625, "learning_rate": 4.821942834444367e-06, "loss": 44.9092, "step": 470 }, { "epoch": 0.18422274916273498, "grad_norm": 0.933085024356842, "learning_rate": 4.804145815774787e-06, "loss": 45.1724, "step": 471 }, { "epoch": 0.18461388026499131, "grad_norm": 0.17478938400745392, "learning_rate": 4.786351281925417e-06, "loss": 63.2364, "step": 472 }, { "epoch": 0.18500501136724765, "grad_norm": 0.3920159637928009, "learning_rate": 4.768559458657156e-06, "loss": 9.0603, "step": 473 }, { "epoch": 0.185396142469504, "grad_norm": 0.20027987658977509, "learning_rate": 4.750770571696514e-06, "loss": 67.3941, "step": 474 }, { "epoch": 0.18578727357176034, "grad_norm": 4896.18603515625, "learning_rate": 4.732984846732755e-06, "loss": 110.2459, "step": 475 }, { "epoch": 0.18617840467401667, "grad_norm": 0.6982368230819702, "learning_rate": 4.7152025094150214e-06, "loss": 60.1346, "step": 476 }, { "epoch": 0.186569535776273, "grad_norm": 0.21346724033355713, "learning_rate": 4.697423785349475e-06, "loss": 86.276, "step": 477 }, { "epoch": 0.18696066687852936, "grad_norm": 0.5096214413642883, "learning_rate": 4.679648900096436e-06, "loss": 47.5857, "step": 478 }, { "epoch": 0.1873517979807857, "grad_norm": 0.5563175678253174, "learning_rate": 4.661878079167527e-06, "loss": 49.3372, "step": 479 }, { "epoch": 0.18774292908304202, "grad_norm": 1.7563621997833252, "learning_rate": 4.644111548022798e-06, "loss": 40.9886, "step": 480 }, { "epoch": 0.18813406018529835, "grad_norm": 112.32270050048828, "learning_rate": 4.626349532067879e-06, "loss": 47.8111, "step": 481 }, { "epoch": 0.1885251912875547, "grad_norm": 1.5125802755355835, "learning_rate": 4.608592256651117e-06, "loss": 32.0425, "step": 482 }, { "epoch": 0.18891632238981104, "grad_norm": 0.3377935588359833, "learning_rate": 4.5908399470607106e-06, "loss": 41.984, "step": 483 }, { "epoch": 0.18930745349206737, "grad_norm": 262.73553466796875, "learning_rate": 4.573092828521857e-06, "loss": 15.6826, "step": 484 }, { "epoch": 0.1896985845943237, "grad_norm": 1222.323974609375, "learning_rate": 4.555351126193901e-06, "loss": 82.0725, "step": 485 }, { "epoch": 0.19008971569658004, "grad_norm": 0.4867708384990692, "learning_rate": 4.537615065167461e-06, "loss": 68.1378, "step": 486 }, { "epoch": 0.1904808467988364, "grad_norm": 0.24868200719356537, "learning_rate": 4.5198848704615915e-06, "loss": 34.2792, "step": 487 }, { "epoch": 0.19087197790109273, "grad_norm": 1565.045654296875, "learning_rate": 4.502160767020918e-06, "loss": 61.3013, "step": 488 }, { "epoch": 0.19126310900334906, "grad_norm": 0.12846969068050385, "learning_rate": 4.484442979712783e-06, "loss": 101.128, "step": 489 }, { "epoch": 0.1916542401056054, "grad_norm": 0.2493712157011032, "learning_rate": 4.466731733324399e-06, "loss": 16.2675, "step": 490 }, { "epoch": 0.19204537120786175, "grad_norm": 0.8139014840126038, "learning_rate": 4.449027252559994e-06, "loss": 56.7364, "step": 491 }, { "epoch": 0.19243650231011808, "grad_norm": 0.44511955976486206, "learning_rate": 4.431329762037958e-06, "loss": 22.7369, "step": 492 }, { "epoch": 0.1928276334123744, "grad_norm": 334.3827819824219, "learning_rate": 4.413639486287992e-06, "loss": 8.3813, "step": 493 }, { "epoch": 0.19321876451463074, "grad_norm": 1712.8160400390625, "learning_rate": 4.395956649748269e-06, "loss": 57.2132, "step": 494 }, { "epoch": 0.19360989561688707, "grad_norm": 1017.40771484375, "learning_rate": 4.3782814767625755e-06, "loss": 39.137, "step": 495 }, { "epoch": 0.19400102671914343, "grad_norm": 2.061763286590576, "learning_rate": 4.3606141915774695e-06, "loss": 29.4071, "step": 496 }, { "epoch": 0.19439215782139976, "grad_norm": 0.351924329996109, "learning_rate": 4.342955018339442e-06, "loss": 71.3075, "step": 497 }, { "epoch": 0.1947832889236561, "grad_norm": 0.5345131754875183, "learning_rate": 4.3253041810920595e-06, "loss": 53.7062, "step": 498 }, { "epoch": 0.19517442002591243, "grad_norm": 0.4997957944869995, "learning_rate": 4.307661903773129e-06, "loss": 52.0548, "step": 499 }, { "epoch": 0.19556555112816879, "grad_norm": 0.646457850933075, "learning_rate": 4.290028410211866e-06, "loss": 92.2717, "step": 500 }, { "epoch": 0.19595668223042512, "grad_norm": 0.3356407880783081, "learning_rate": 4.272403924126035e-06, "loss": 89.8288, "step": 501 }, { "epoch": 0.19634781333268145, "grad_norm": 1.2155108451843262, "learning_rate": 4.254788669119127e-06, "loss": 42.697, "step": 502 }, { "epoch": 0.19673894443493778, "grad_norm": 0.8059905171394348, "learning_rate": 4.237182868677519e-06, "loss": 45.4195, "step": 503 }, { "epoch": 0.1971300755371941, "grad_norm": 0.19161798059940338, "learning_rate": 4.219586746167632e-06, "loss": 5.6987, "step": 504 }, { "epoch": 0.19752120663945047, "grad_norm": 1.4473623037338257, "learning_rate": 4.2020005248331056e-06, "loss": 83.2729, "step": 505 }, { "epoch": 0.1979123377417068, "grad_norm": 3267.076904296875, "learning_rate": 4.18442442779196e-06, "loss": 64.1532, "step": 506 }, { "epoch": 0.19830346884396313, "grad_norm": 0.3686378002166748, "learning_rate": 4.166858678033771e-06, "loss": 5.8373, "step": 507 }, { "epoch": 0.19869459994621946, "grad_norm": 0.7481865882873535, "learning_rate": 4.149303498416838e-06, "loss": 30.1228, "step": 508 }, { "epoch": 0.19908573104847582, "grad_norm": 1.0720148086547852, "learning_rate": 4.131759111665349e-06, "loss": 16.1707, "step": 509 }, { "epoch": 0.19947686215073215, "grad_norm": 253.39622497558594, "learning_rate": 4.114225740366569e-06, "loss": 11.6924, "step": 510 }, { "epoch": 0.19986799325298849, "grad_norm": 1.3426392078399658, "learning_rate": 4.096703606968007e-06, "loss": 18.6361, "step": 511 }, { "epoch": 0.20025912435524482, "grad_norm": 242.68768310546875, "learning_rate": 4.079192933774592e-06, "loss": 60.1703, "step": 512 }, { "epoch": 0.20065025545750115, "grad_norm": 0.12895415723323822, "learning_rate": 4.061693942945863e-06, "loss": 39.873, "step": 513 }, { "epoch": 0.2010413865597575, "grad_norm": 446.8625793457031, "learning_rate": 4.04420685649314e-06, "loss": 61.0251, "step": 514 }, { "epoch": 0.20143251766201384, "grad_norm": 0.19806069135665894, "learning_rate": 4.026731896276708e-06, "loss": 45.4137, "step": 515 }, { "epoch": 0.20182364876427017, "grad_norm": 0.357597678899765, "learning_rate": 4.009269284003014e-06, "loss": 110.9987, "step": 516 }, { "epoch": 0.2022147798665265, "grad_norm": 0.15286563336849213, "learning_rate": 3.991819241221836e-06, "loss": 52.1221, "step": 517 }, { "epoch": 0.20260591096878286, "grad_norm": 0.1778210997581482, "learning_rate": 3.974381989323484e-06, "loss": 57.3214, "step": 518 }, { "epoch": 0.2029970420710392, "grad_norm": 1.1525286436080933, "learning_rate": 3.956957749535997e-06, "loss": 26.8482, "step": 519 }, { "epoch": 0.20338817317329552, "grad_norm": 1.2070550918579102, "learning_rate": 3.939546742922318e-06, "loss": 68.4404, "step": 520 }, { "epoch": 0.20377930427555185, "grad_norm": 1.1822954416275024, "learning_rate": 3.9221491903775014e-06, "loss": 20.8915, "step": 521 }, { "epoch": 0.2041704353778082, "grad_norm": 0.27339041233062744, "learning_rate": 3.904765312625916e-06, "loss": 38.3124, "step": 522 }, { "epoch": 0.20456156648006454, "grad_norm": 251.54371643066406, "learning_rate": 3.887395330218429e-06, "loss": 75.3845, "step": 523 }, { "epoch": 0.20495269758232088, "grad_norm": 0.2657800316810608, "learning_rate": 3.8700394635296166e-06, "loss": 50.7794, "step": 524 }, { "epoch": 0.2053438286845772, "grad_norm": 0.9252436757087708, "learning_rate": 3.852697932754974e-06, "loss": 80.2724, "step": 525 }, { "epoch": 0.20573495978683354, "grad_norm": 0.18160250782966614, "learning_rate": 3.835370957908108e-06, "loss": 18.6159, "step": 526 }, { "epoch": 0.2061260908890899, "grad_norm": 1.437393069267273, "learning_rate": 3.818058758817956e-06, "loss": 62.4316, "step": 527 }, { "epoch": 0.20651722199134623, "grad_norm": 2809.828369140625, "learning_rate": 3.800761555125997e-06, "loss": 25.8665, "step": 528 }, { "epoch": 0.20690835309360256, "grad_norm": 952.0728149414062, "learning_rate": 3.783479566283457e-06, "loss": 22.2021, "step": 529 }, { "epoch": 0.2072994841958589, "grad_norm": 2.9290764331817627, "learning_rate": 3.7662130115485317e-06, "loss": 62.186, "step": 530 }, { "epoch": 0.20769061529811525, "grad_norm": 0.4081960618495941, "learning_rate": 3.748962109983605e-06, "loss": 36.3092, "step": 531 }, { "epoch": 0.20808174640037158, "grad_norm": 0.33659136295318604, "learning_rate": 3.731727080452464e-06, "loss": 19.2433, "step": 532 }, { "epoch": 0.2084728775026279, "grad_norm": 1739.4482421875, "learning_rate": 3.714508141617527e-06, "loss": 76.853, "step": 533 }, { "epoch": 0.20886400860488424, "grad_norm": 3326.531494140625, "learning_rate": 3.69730551193707e-06, "loss": 99.9301, "step": 534 }, { "epoch": 0.20925513970714057, "grad_norm": 220.89852905273438, "learning_rate": 3.6801194096624515e-06, "loss": 41.4909, "step": 535 }, { "epoch": 0.20964627080939693, "grad_norm": 106.39971160888672, "learning_rate": 3.6629500528353464e-06, "loss": 28.8744, "step": 536 }, { "epoch": 0.21003740191165327, "grad_norm": 1.0691938400268555, "learning_rate": 3.6457976592849753e-06, "loss": 20.5078, "step": 537 }, { "epoch": 0.2104285330139096, "grad_norm": 0.6214406490325928, "learning_rate": 3.6286624466253496e-06, "loss": 45.2672, "step": 538 }, { "epoch": 0.21081966411616593, "grad_norm": 1.1435602903366089, "learning_rate": 3.6115446322525007e-06, "loss": 62.7016, "step": 539 }, { "epoch": 0.2112107952184223, "grad_norm": 0.2811889052391052, "learning_rate": 3.594444433341725e-06, "loss": 73.1093, "step": 540 }, { "epoch": 0.21160192632067862, "grad_norm": 1629.179443359375, "learning_rate": 3.5773620668448384e-06, "loss": 70.3456, "step": 541 }, { "epoch": 0.21199305742293495, "grad_norm": 0.6402444839477539, "learning_rate": 3.560297749487407e-06, "loss": 100.5996, "step": 542 }, { "epoch": 0.21238418852519128, "grad_norm": 0.20082825422286987, "learning_rate": 3.543251697766006e-06, "loss": 45.9456, "step": 543 }, { "epoch": 0.2127753196274476, "grad_norm": 0.2259850949048996, "learning_rate": 3.526224127945479e-06, "loss": 81.2139, "step": 544 }, { "epoch": 0.21316645072970397, "grad_norm": 0.5259697437286377, "learning_rate": 3.5092152560561833e-06, "loss": 29.0951, "step": 545 }, { "epoch": 0.2135575818319603, "grad_norm": 0.2201029360294342, "learning_rate": 3.4922252978912523e-06, "loss": 39.3512, "step": 546 }, { "epoch": 0.21394871293421663, "grad_norm": 0.6921319365501404, "learning_rate": 3.475254469003865e-06, "loss": 74.4222, "step": 547 }, { "epoch": 0.21433984403647297, "grad_norm": 0.3210639953613281, "learning_rate": 3.4583029847044996e-06, "loss": 78.7619, "step": 548 }, { "epoch": 0.21473097513872932, "grad_norm": 0.49791640043258667, "learning_rate": 3.4413710600582096e-06, "loss": 41.5078, "step": 549 }, { "epoch": 0.21512210624098566, "grad_norm": 1.5678467750549316, "learning_rate": 3.424458909881897e-06, "loss": 25.0795, "step": 550 }, { "epoch": 0.215513237343242, "grad_norm": 0.4146921634674072, "learning_rate": 3.4075667487415785e-06, "loss": 53.2567, "step": 551 }, { "epoch": 0.21590436844549832, "grad_norm": 0.3883844017982483, "learning_rate": 3.3906947909496696e-06, "loss": 40.2185, "step": 552 }, { "epoch": 0.21629549954775465, "grad_norm": 1.1996833086013794, "learning_rate": 3.3738432505622653e-06, "loss": 43.9982, "step": 553 }, { "epoch": 0.216686630650011, "grad_norm": 0.203975647687912, "learning_rate": 3.357012341376421e-06, "loss": 29.5247, "step": 554 }, { "epoch": 0.21707776175226734, "grad_norm": 0.3972192108631134, "learning_rate": 3.3402022769274422e-06, "loss": 12.6732, "step": 555 }, { "epoch": 0.21746889285452367, "grad_norm": 0.3920489251613617, "learning_rate": 3.3234132704861786e-06, "loss": 10.7088, "step": 556 }, { "epoch": 0.21786002395678, "grad_norm": 1.8879612684249878, "learning_rate": 3.306645535056312e-06, "loss": 78.3652, "step": 557 }, { "epoch": 0.21825115505903636, "grad_norm": 1339.1312255859375, "learning_rate": 3.289899283371657e-06, "loss": 115.3514, "step": 558 }, { "epoch": 0.2186422861612927, "grad_norm": 0.34053364396095276, "learning_rate": 3.273174727893463e-06, "loss": 30.0332, "step": 559 }, { "epoch": 0.21903341726354902, "grad_norm": 1.0308630466461182, "learning_rate": 3.2564720808077167e-06, "loss": 45.353, "step": 560 }, { "epoch": 0.21942454836580536, "grad_norm": 0.11198209971189499, "learning_rate": 3.2397915540224493e-06, "loss": 21.2855, "step": 561 }, { "epoch": 0.21981567946806171, "grad_norm": 0.1384800672531128, "learning_rate": 3.2231333591650567e-06, "loss": 1.2242, "step": 562 }, { "epoch": 0.22020681057031805, "grad_norm": 0.2989153563976288, "learning_rate": 3.2064977075795988e-06, "loss": 25.7958, "step": 563 }, { "epoch": 0.22059794167257438, "grad_norm": 1617.5919189453125, "learning_rate": 3.189884810324133e-06, "loss": 41.6185, "step": 564 }, { "epoch": 0.2209890727748307, "grad_norm": 0.4813622236251831, "learning_rate": 3.173294878168025e-06, "loss": 54.3557, "step": 565 }, { "epoch": 0.22138020387708704, "grad_norm": 207.3912811279297, "learning_rate": 3.1567281215892868e-06, "loss": 23.3589, "step": 566 }, { "epoch": 0.2217713349793434, "grad_norm": 0.6962729096412659, "learning_rate": 3.140184750771895e-06, "loss": 53.3044, "step": 567 }, { "epoch": 0.22216246608159973, "grad_norm": 0.3524315059185028, "learning_rate": 3.12366497560313e-06, "loss": 44.0667, "step": 568 }, { "epoch": 0.22255359718385606, "grad_norm": 0.2957899868488312, "learning_rate": 3.1071690056709125e-06, "loss": 61.0361, "step": 569 }, { "epoch": 0.2229447282861124, "grad_norm": 0.13717715442180634, "learning_rate": 3.090697050261143e-06, "loss": 48.0218, "step": 570 }, { "epoch": 0.22333585938836875, "grad_norm": 1.4488298892974854, "learning_rate": 3.074249318355046e-06, "loss": 42.8739, "step": 571 }, { "epoch": 0.22372699049062508, "grad_norm": 0.19119593501091003, "learning_rate": 3.057826018626527e-06, "loss": 131.6781, "step": 572 }, { "epoch": 0.22411812159288141, "grad_norm": 0.4780034124851227, "learning_rate": 3.0414273594395106e-06, "loss": 68.4404, "step": 573 }, { "epoch": 0.22450925269513775, "grad_norm": 0.25417017936706543, "learning_rate": 3.0250535488453077e-06, "loss": 78.3688, "step": 574 }, { "epoch": 0.22490038379739408, "grad_norm": 0.2417258769273758, "learning_rate": 3.008704794579973e-06, "loss": 116.528, "step": 575 }, { "epoch": 0.22529151489965044, "grad_norm": 0.19122564792633057, "learning_rate": 2.9923813040616685e-06, "loss": 29.6101, "step": 576 }, { "epoch": 0.22568264600190677, "grad_norm": 0.13005802035331726, "learning_rate": 2.976083284388031e-06, "loss": 35.3624, "step": 577 }, { "epoch": 0.2260737771041631, "grad_norm": 381.5791320800781, "learning_rate": 2.959810942333552e-06, "loss": 90.369, "step": 578 }, { "epoch": 0.22646490820641943, "grad_norm": 0.19559843838214874, "learning_rate": 2.9435644843469434e-06, "loss": 51.9091, "step": 579 }, { "epoch": 0.2268560393086758, "grad_norm": 0.3959593176841736, "learning_rate": 2.9273441165485227e-06, "loss": 38.9128, "step": 580 }, { "epoch": 0.22724717041093212, "grad_norm": 0.18184183537960052, "learning_rate": 2.9111500447276053e-06, "loss": 51.5855, "step": 581 }, { "epoch": 0.22763830151318845, "grad_norm": 0.6553506851196289, "learning_rate": 2.8949824743398804e-06, "loss": 30.2534, "step": 582 }, { "epoch": 0.22802943261544478, "grad_norm": 0.18349502980709076, "learning_rate": 2.8788416105048124e-06, "loss": 15.0662, "step": 583 }, { "epoch": 0.22842056371770111, "grad_norm": 0.57335364818573, "learning_rate": 2.862727658003042e-06, "loss": 48.4215, "step": 584 }, { "epoch": 0.22881169481995747, "grad_norm": 0.36635109782218933, "learning_rate": 2.8466408212737777e-06, "loss": 4.5718, "step": 585 }, { "epoch": 0.2292028259222138, "grad_norm": 0.2556889057159424, "learning_rate": 2.83058130441221e-06, "loss": 43.9021, "step": 586 }, { "epoch": 0.22959395702447014, "grad_norm": 1374.17138671875, "learning_rate": 2.8145493111669186e-06, "loss": 61.402, "step": 587 }, { "epoch": 0.22998508812672647, "grad_norm": 0.1971081793308258, "learning_rate": 2.79854504493729e-06, "loss": 49.1368, "step": 588 }, { "epoch": 0.23037621922898283, "grad_norm": 1701.60693359375, "learning_rate": 2.782568708770933e-06, "loss": 53.8848, "step": 589 }, { "epoch": 0.23076735033123916, "grad_norm": 789.6815185546875, "learning_rate": 2.7666205053611097e-06, "loss": 17.3218, "step": 590 }, { "epoch": 0.2311584814334955, "grad_norm": 0.25564226508140564, "learning_rate": 2.7507006370441557e-06, "loss": 8.3916, "step": 591 }, { "epoch": 0.23154961253575182, "grad_norm": 1354.3128662109375, "learning_rate": 2.734809305796915e-06, "loss": 48.9245, "step": 592 }, { "epoch": 0.23194074363800815, "grad_norm": 161.26748657226562, "learning_rate": 2.718946713234185e-06, "loss": 56.0846, "step": 593 }, { "epoch": 0.2323318747402645, "grad_norm": 0.32130080461502075, "learning_rate": 2.7031130606061486e-06, "loss": 38.3063, "step": 594 }, { "epoch": 0.23272300584252084, "grad_norm": 1.8923166990280151, "learning_rate": 2.687308548795825e-06, "loss": 47.5562, "step": 595 }, { "epoch": 0.23311413694477717, "grad_norm": 0.13152983784675598, "learning_rate": 2.67153337831652e-06, "loss": 26.3863, "step": 596 }, { "epoch": 0.2335052680470335, "grad_norm": 0.1482762098312378, "learning_rate": 2.6557877493092885e-06, "loss": 79.9195, "step": 597 }, { "epoch": 0.23389639914928986, "grad_norm": 0.4109286069869995, "learning_rate": 2.6400718615403852e-06, "loss": 50.5231, "step": 598 }, { "epoch": 0.2342875302515462, "grad_norm": 0.8140275478363037, "learning_rate": 2.624385914398737e-06, "loss": 55.21, "step": 599 }, { "epoch": 0.23467866135380253, "grad_norm": 1361.185302734375, "learning_rate": 2.608730106893411e-06, "loss": 51.1663, "step": 600 }, { "epoch": 0.23506979245605886, "grad_norm": 0.4616069793701172, "learning_rate": 2.5931046376510875e-06, "loss": 14.65, "step": 601 }, { "epoch": 0.23546092355831522, "grad_norm": 0.43858256936073303, "learning_rate": 2.5775097049135445e-06, "loss": 21.6647, "step": 602 }, { "epoch": 0.23585205466057155, "grad_norm": 0.2774844765663147, "learning_rate": 2.561945506535144e-06, "loss": 37.901, "step": 603 }, { "epoch": 0.23624318576282788, "grad_norm": 1338.126220703125, "learning_rate": 2.5464122399803126e-06, "loss": 66.4596, "step": 604 }, { "epoch": 0.2366343168650842, "grad_norm": 0.4828522801399231, "learning_rate": 2.5309101023210426e-06, "loss": 11.7746, "step": 605 }, { "epoch": 0.23702544796734054, "grad_norm": 1.3893892765045166, "learning_rate": 2.5154392902343966e-06, "loss": 38.6192, "step": 606 }, { "epoch": 0.2374165790695969, "grad_norm": 2.981203556060791, "learning_rate": 2.5000000000000015e-06, "loss": 15.9302, "step": 607 }, { "epoch": 0.23780771017185323, "grad_norm": 597.11279296875, "learning_rate": 2.4845924274975625e-06, "loss": 45.9918, "step": 608 }, { "epoch": 0.23819884127410956, "grad_norm": 1.3516067266464233, "learning_rate": 2.4692167682043855e-06, "loss": 46.3228, "step": 609 }, { "epoch": 0.2385899723763659, "grad_norm": 0.1671498566865921, "learning_rate": 2.4538732171928847e-06, "loss": 43.2643, "step": 610 }, { "epoch": 0.23898110347862225, "grad_norm": 0.3257511854171753, "learning_rate": 2.4385619691281144e-06, "loss": 44.6005, "step": 611 }, { "epoch": 0.23937223458087858, "grad_norm": 0.4877566993236542, "learning_rate": 2.4232832182653014e-06, "loss": 72.7035, "step": 612 }, { "epoch": 0.23976336568313492, "grad_norm": 1470.4984130859375, "learning_rate": 2.408037158447375e-06, "loss": 50.2886, "step": 613 }, { "epoch": 0.24015449678539125, "grad_norm": 0.78384929895401, "learning_rate": 2.39282398310251e-06, "loss": 19.3894, "step": 614 }, { "epoch": 0.24054562788764758, "grad_norm": 0.2225428968667984, "learning_rate": 2.3776438852416743e-06, "loss": 53.8646, "step": 615 }, { "epoch": 0.24093675898990394, "grad_norm": 0.996082603931427, "learning_rate": 2.3624970574561773e-06, "loss": 2.6708, "step": 616 }, { "epoch": 0.24132789009216027, "grad_norm": 0.38895338773727417, "learning_rate": 2.3473836919152267e-06, "loss": 50.9521, "step": 617 }, { "epoch": 0.2417190211944166, "grad_norm": 1.0213991403579712, "learning_rate": 2.332303980363497e-06, "loss": 44.1723, "step": 618 }, { "epoch": 0.24211015229667293, "grad_norm": 0.5459339022636414, "learning_rate": 2.317258114118686e-06, "loss": 1.9167, "step": 619 }, { "epoch": 0.2425012833989293, "grad_norm": 428.5580139160156, "learning_rate": 2.3022462840690933e-06, "loss": 60.3034, "step": 620 }, { "epoch": 0.24289241450118562, "grad_norm": 1972.2655029296875, "learning_rate": 2.2872686806712037e-06, "loss": 53.7348, "step": 621 }, { "epoch": 0.24328354560344195, "grad_norm": 0.9528247714042664, "learning_rate": 2.272325493947257e-06, "loss": 34.783, "step": 622 }, { "epoch": 0.24367467670569828, "grad_norm": 0.456136554479599, "learning_rate": 2.257416913482853e-06, "loss": 32.2854, "step": 623 }, { "epoch": 0.24406580780795462, "grad_norm": 0.22959932684898376, "learning_rate": 2.2425431284245302e-06, "loss": 27.3752, "step": 624 }, { "epoch": 0.24445693891021097, "grad_norm": 0.48872700333595276, "learning_rate": 2.2277043274773856e-06, "loss": 22.8773, "step": 625 }, { "epoch": 0.2448480700124673, "grad_norm": 1.4068603515625, "learning_rate": 2.2129006989026612e-06, "loss": 26.3108, "step": 626 }, { "epoch": 0.24523920111472364, "grad_norm": 2.354417085647583, "learning_rate": 2.1981324305153644e-06, "loss": 60.9464, "step": 627 }, { "epoch": 0.24563033221697997, "grad_norm": 1.7362618446350098, "learning_rate": 2.1833997096818897e-06, "loss": 18.6702, "step": 628 }, { "epoch": 0.24602146331923633, "grad_norm": 471.421875, "learning_rate": 2.168702723317632e-06, "loss": 13.0568, "step": 629 }, { "epoch": 0.24641259442149266, "grad_norm": 0.3530268669128418, "learning_rate": 2.1540416578846207e-06, "loss": 14.6524, "step": 630 }, { "epoch": 0.246803725523749, "grad_norm": 0.7190971970558167, "learning_rate": 2.139416699389153e-06, "loss": 12.7703, "step": 631 }, { "epoch": 0.24719485662600532, "grad_norm": 2144.53125, "learning_rate": 2.1248280333794347e-06, "loss": 24.8344, "step": 632 }, { "epoch": 0.24758598772826168, "grad_norm": 1532.8892822265625, "learning_rate": 2.1102758449432233e-06, "loss": 48.1845, "step": 633 }, { "epoch": 0.247977118830518, "grad_norm": 0.18892136216163635, "learning_rate": 2.095760318705487e-06, "loss": 41.4411, "step": 634 }, { "epoch": 0.24836824993277434, "grad_norm": 0.43188950419425964, "learning_rate": 2.081281638826052e-06, "loss": 15.6649, "step": 635 }, { "epoch": 0.24875938103503067, "grad_norm": 0.247590571641922, "learning_rate": 2.0668399889972717e-06, "loss": 52.5673, "step": 636 }, { "epoch": 0.249150512137287, "grad_norm": 0.7205409407615662, "learning_rate": 2.0524355524417017e-06, "loss": 16.6089, "step": 637 }, { "epoch": 0.24954164323954336, "grad_norm": 0.30670174956321716, "learning_rate": 2.038068511909762e-06, "loss": 25.2432, "step": 638 }, { "epoch": 0.2499327743417997, "grad_norm": 1323.96875, "learning_rate": 2.0237390496774284e-06, "loss": 47.6592, "step": 639 }, { "epoch": 0.250323905444056, "grad_norm": 0.4727286696434021, "learning_rate": 2.00944734754392e-06, "loss": 26.6763, "step": 640 }, { "epoch": 0.2507150365463124, "grad_norm": 0.2739315927028656, "learning_rate": 1.995193586829387e-06, "loss": 60.4231, "step": 641 }, { "epoch": 0.2511061676485687, "grad_norm": 0.31088632345199585, "learning_rate": 1.980977948372612e-06, "loss": 12.4899, "step": 642 }, { "epoch": 0.25149729875082505, "grad_norm": 0.2787396013736725, "learning_rate": 1.966800612528723e-06, "loss": 52.5619, "step": 643 }, { "epoch": 0.25188842985308135, "grad_norm": 0.15130284428596497, "learning_rate": 1.952661759166893e-06, "loss": 27.9581, "step": 644 }, { "epoch": 0.2522795609553377, "grad_norm": 1.2856502532958984, "learning_rate": 1.9385615676680663e-06, "loss": 19.7204, "step": 645 }, { "epoch": 0.25267069205759407, "grad_norm": 772.9840087890625, "learning_rate": 1.9245002169226814e-06, "loss": 47.7722, "step": 646 }, { "epoch": 0.2530618231598504, "grad_norm": 342.9947509765625, "learning_rate": 1.910477885328399e-06, "loss": 20.2267, "step": 647 }, { "epoch": 0.25345295426210673, "grad_norm": 0.65385502576828, "learning_rate": 1.8964947507878401e-06, "loss": 24.3055, "step": 648 }, { "epoch": 0.2538440853643631, "grad_norm": 2241.752197265625, "learning_rate": 1.8825509907063328e-06, "loss": 61.2555, "step": 649 }, { "epoch": 0.2542352164666194, "grad_norm": 1303.792724609375, "learning_rate": 1.8686467819896542e-06, "loss": 28.9474, "step": 650 }, { "epoch": 0.25462634756887575, "grad_norm": 1.3625749349594116, "learning_rate": 1.8547823010417876e-06, "loss": 39.3853, "step": 651 }, { "epoch": 0.25501747867113206, "grad_norm": 0.34665849804878235, "learning_rate": 1.8409577237626935e-06, "loss": 35.6182, "step": 652 }, { "epoch": 0.2554086097733884, "grad_norm": 4.089036464691162, "learning_rate": 1.8271732255460644e-06, "loss": 4.3633, "step": 653 }, { "epoch": 0.2557997408756448, "grad_norm": 1152.780517578125, "learning_rate": 1.8134289812771077e-06, "loss": 49.1108, "step": 654 }, { "epoch": 0.2561908719779011, "grad_norm": 0.6453447341918945, "learning_rate": 1.7997251653303249e-06, "loss": 40.7015, "step": 655 }, { "epoch": 0.25658200308015744, "grad_norm": 93.7674789428711, "learning_rate": 1.7860619515673034e-06, "loss": 30.0199, "step": 656 }, { "epoch": 0.25697313418241374, "grad_norm": 0.4583728313446045, "learning_rate": 1.7724395133345025e-06, "loss": 67.6765, "step": 657 }, { "epoch": 0.2573642652846701, "grad_norm": 0.5361828207969666, "learning_rate": 1.7588580234610592e-06, "loss": 17.3098, "step": 658 }, { "epoch": 0.25775539638692646, "grad_norm": 0.22744417190551758, "learning_rate": 1.7453176542565958e-06, "loss": 38.1001, "step": 659 }, { "epoch": 0.25814652748918276, "grad_norm": 0.23170587420463562, "learning_rate": 1.7318185775090336e-06, "loss": 10.4951, "step": 660 }, { "epoch": 0.2585376585914391, "grad_norm": 153.93882751464844, "learning_rate": 1.7183609644824096e-06, "loss": 52.739, "step": 661 }, { "epoch": 0.2589287896936955, "grad_norm": 0.1890573650598526, "learning_rate": 1.7049449859147121e-06, "loss": 29.1403, "step": 662 }, { "epoch": 0.2593199207959518, "grad_norm": 1.4989005327224731, "learning_rate": 1.6915708120157042e-06, "loss": 44.5519, "step": 663 }, { "epoch": 0.25971105189820815, "grad_norm": 1.5247461795806885, "learning_rate": 1.67823861246477e-06, "loss": 40.8943, "step": 664 }, { "epoch": 0.26010218300046445, "grad_norm": 1.0584818124771118, "learning_rate": 1.6649485564087646e-06, "loss": 29.1124, "step": 665 }, { "epoch": 0.2604933141027208, "grad_norm": 1.388893485069275, "learning_rate": 1.6517008124598622e-06, "loss": 20.1575, "step": 666 }, { "epoch": 0.26088444520497717, "grad_norm": 0.27823394536972046, "learning_rate": 1.6384955486934157e-06, "loss": 19.5398, "step": 667 }, { "epoch": 0.26127557630723347, "grad_norm": 0.1498628854751587, "learning_rate": 1.6253329326458367e-06, "loss": 43.7116, "step": 668 }, { "epoch": 0.26166670740948983, "grad_norm": 5067.41357421875, "learning_rate": 1.612213131312454e-06, "loss": 62.5826, "step": 669 }, { "epoch": 0.26205783851174613, "grad_norm": 0.3824155926704407, "learning_rate": 1.5991363111454023e-06, "loss": 43.7866, "step": 670 }, { "epoch": 0.2624489696140025, "grad_norm": 0.30052921175956726, "learning_rate": 1.5861026380515165e-06, "loss": 80.1734, "step": 671 }, { "epoch": 0.26284010071625885, "grad_norm": 0.14637216925621033, "learning_rate": 1.5731122773902147e-06, "loss": 17.5782, "step": 672 }, { "epoch": 0.26323123181851515, "grad_norm": 0.14098778367042542, "learning_rate": 1.5601653939714073e-06, "loss": 52.0247, "step": 673 }, { "epoch": 0.2636223629207715, "grad_norm": 331.9844055175781, "learning_rate": 1.547262152053406e-06, "loss": 32.7034, "step": 674 }, { "epoch": 0.2640134940230278, "grad_norm": 0.5405360460281372, "learning_rate": 1.5344027153408375e-06, "loss": 49.8961, "step": 675 }, { "epoch": 0.2644046251252842, "grad_norm": 0.21132820844650269, "learning_rate": 1.5215872469825682e-06, "loss": 4.232, "step": 676 }, { "epoch": 0.26479575622754054, "grad_norm": 0.930949330329895, "learning_rate": 1.5088159095696365e-06, "loss": 43.6584, "step": 677 }, { "epoch": 0.26518688732979684, "grad_norm": 0.6677309274673462, "learning_rate": 1.4960888651331833e-06, "loss": 40.9175, "step": 678 }, { "epoch": 0.2655780184320532, "grad_norm": 0.35741525888442993, "learning_rate": 1.4834062751424018e-06, "loss": 22.68, "step": 679 }, { "epoch": 0.26596914953430956, "grad_norm": 1111.9866943359375, "learning_rate": 1.4707683005024898e-06, "loss": 41.6105, "step": 680 }, { "epoch": 0.26636028063656586, "grad_norm": 0.6054497957229614, "learning_rate": 1.4581751015526035e-06, "loss": 27.9255, "step": 681 }, { "epoch": 0.2667514117388222, "grad_norm": 0.14572077989578247, "learning_rate": 1.4456268380638262e-06, "loss": 4.4698, "step": 682 }, { "epoch": 0.2671425428410785, "grad_norm": 0.28749310970306396, "learning_rate": 1.4331236692371386e-06, "loss": 24.8405, "step": 683 }, { "epoch": 0.2675336739433349, "grad_norm": 0.3515958786010742, "learning_rate": 1.4206657537014078e-06, "loss": 14.6068, "step": 684 }, { "epoch": 0.26792480504559124, "grad_norm": 554.953857421875, "learning_rate": 1.4082532495113627e-06, "loss": 43.7761, "step": 685 }, { "epoch": 0.26831593614784754, "grad_norm": 0.34047558903694153, "learning_rate": 1.3958863141455937e-06, "loss": 69.1389, "step": 686 }, { "epoch": 0.2687070672501039, "grad_norm": 0.31358927488327026, "learning_rate": 1.38356510450456e-06, "loss": 48.2586, "step": 687 }, { "epoch": 0.2690981983523602, "grad_norm": 0.18278853595256805, "learning_rate": 1.3712897769085903e-06, "loss": 72.0719, "step": 688 }, { "epoch": 0.26948932945461657, "grad_norm": 1582.910888671875, "learning_rate": 1.3590604870959046e-06, "loss": 32.1289, "step": 689 }, { "epoch": 0.2698804605568729, "grad_norm": 0.1720167100429535, "learning_rate": 1.3468773902206378e-06, "loss": 51.6635, "step": 690 }, { "epoch": 0.27027159165912923, "grad_norm": 0.14479337632656097, "learning_rate": 1.3347406408508695e-06, "loss": 34.0947, "step": 691 }, { "epoch": 0.2706627227613856, "grad_norm": 0.16732390224933624, "learning_rate": 1.322650392966665e-06, "loss": 15.8947, "step": 692 }, { "epoch": 0.27105385386364195, "grad_norm": 0.9922708868980408, "learning_rate": 1.3106067999581224e-06, "loss": 23.1571, "step": 693 }, { "epoch": 0.27144498496589825, "grad_norm": 971.8731689453125, "learning_rate": 1.298610014623423e-06, "loss": 52.2057, "step": 694 }, { "epoch": 0.2718361160681546, "grad_norm": 824.1338500976562, "learning_rate": 1.2866601891668945e-06, "loss": 24.8422, "step": 695 }, { "epoch": 0.2722272471704109, "grad_norm": 0.136683851480484, "learning_rate": 1.2747574751970826e-06, "loss": 11.3418, "step": 696 }, { "epoch": 0.27261837827266727, "grad_norm": 0.8831171989440918, "learning_rate": 1.2629020237248241e-06, "loss": 27.774, "step": 697 }, { "epoch": 0.27300950937492363, "grad_norm": 0.16165444254875183, "learning_rate": 1.2510939851613285e-06, "loss": 43.9631, "step": 698 }, { "epoch": 0.27340064047717993, "grad_norm": 1.4575397968292236, "learning_rate": 1.239333509316281e-06, "loss": 60.9957, "step": 699 }, { "epoch": 0.2737917715794363, "grad_norm": 0.35259395837783813, "learning_rate": 1.2276207453959283e-06, "loss": 50.4399, "step": 700 }, { "epoch": 0.2741829026816926, "grad_norm": 0.22403627634048462, "learning_rate": 1.2159558420011907e-06, "loss": 19.9137, "step": 701 }, { "epoch": 0.27457403378394896, "grad_norm": 1029.162353515625, "learning_rate": 1.2043389471257833e-06, "loss": 31.7524, "step": 702 }, { "epoch": 0.2749651648862053, "grad_norm": 900.6036987304688, "learning_rate": 1.1927702081543279e-06, "loss": 11.9903, "step": 703 }, { "epoch": 0.2753562959884616, "grad_norm": 0.16628578305244446, "learning_rate": 1.1812497718604887e-06, "loss": 11.8731, "step": 704 }, { "epoch": 0.275747427090718, "grad_norm": 0.20883895456790924, "learning_rate": 1.1697777844051105e-06, "loss": 2.1834, "step": 705 }, { "epoch": 0.2761385581929743, "grad_norm": 1.7050410509109497, "learning_rate": 1.158354391334362e-06, "loss": 18.1528, "step": 706 }, { "epoch": 0.27652968929523064, "grad_norm": 1408.4571533203125, "learning_rate": 1.1469797375778902e-06, "loss": 37.1046, "step": 707 }, { "epoch": 0.276920820397487, "grad_norm": 0.5376846790313721, "learning_rate": 1.1356539674469852e-06, "loss": 32.4544, "step": 708 }, { "epoch": 0.2773119514997433, "grad_norm": 0.30538272857666016, "learning_rate": 1.1243772246327416e-06, "loss": 26.5578, "step": 709 }, { "epoch": 0.27770308260199966, "grad_norm": 0.1639033704996109, "learning_rate": 1.1131496522042424e-06, "loss": 24.755, "step": 710 }, { "epoch": 0.278094213704256, "grad_norm": 296.2695007324219, "learning_rate": 1.1019713926067394e-06, "loss": 37.8495, "step": 711 }, { "epoch": 0.2784853448065123, "grad_norm": 0.9918115735054016, "learning_rate": 1.0908425876598512e-06, "loss": 32.8862, "step": 712 }, { "epoch": 0.2788764759087687, "grad_norm": 803.0256958007812, "learning_rate": 1.0797633785557582e-06, "loss": 17.9585, "step": 713 }, { "epoch": 0.279267607011025, "grad_norm": 0.4314124584197998, "learning_rate": 1.068733905857413e-06, "loss": 11.3658, "step": 714 }, { "epoch": 0.27965873811328135, "grad_norm": 1402.580078125, "learning_rate": 1.0577543094967613e-06, "loss": 62.5657, "step": 715 }, { "epoch": 0.2800498692155377, "grad_norm": 1.8656405210494995, "learning_rate": 1.0468247287729593e-06, "loss": 33.3876, "step": 716 }, { "epoch": 0.280441000317794, "grad_norm": 0.4445607662200928, "learning_rate": 1.0359453023506123e-06, "loss": 9.0196, "step": 717 }, { "epoch": 0.28083213142005037, "grad_norm": 0.5976535081863403, "learning_rate": 1.0251161682580125e-06, "loss": 21.402, "step": 718 }, { "epoch": 0.28122326252230667, "grad_norm": 0.54698646068573, "learning_rate": 1.0143374638853892e-06, "loss": 26.4295, "step": 719 }, { "epoch": 0.28161439362456303, "grad_norm": 0.8513966798782349, "learning_rate": 1.0036093259831624e-06, "loss": 32.0522, "step": 720 }, { "epoch": 0.2820055247268194, "grad_norm": 0.45537883043289185, "learning_rate": 9.929318906602176e-07, "loss": 39.6447, "step": 721 }, { "epoch": 0.2823966558290757, "grad_norm": 1060.1263427734375, "learning_rate": 9.823052933821643e-07, "loss": 37.912, "step": 722 }, { "epoch": 0.28278778693133205, "grad_norm": 0.5889149308204651, "learning_rate": 9.717296689696283e-07, "loss": 33.4835, "step": 723 }, { "epoch": 0.28317891803358836, "grad_norm": 1.4640785455703735, "learning_rate": 9.612051515965388e-07, "loss": 24.0596, "step": 724 }, { "epoch": 0.2835700491358447, "grad_norm": 0.16718535125255585, "learning_rate": 9.507318747884243e-07, "loss": 30.542, "step": 725 }, { "epoch": 0.2839611802381011, "grad_norm": 0.5365858674049377, "learning_rate": 9.403099714207175e-07, "loss": 43.143, "step": 726 }, { "epoch": 0.2843523113403574, "grad_norm": 1128.4521484375, "learning_rate": 9.299395737170758e-07, "loss": 41.8304, "step": 727 }, { "epoch": 0.28474344244261374, "grad_norm": 917.1222534179688, "learning_rate": 9.196208132476963e-07, "loss": 27.3092, "step": 728 }, { "epoch": 0.2851345735448701, "grad_norm": 0.3612130582332611, "learning_rate": 9.093538209276487e-07, "loss": 22.8086, "step": 729 }, { "epoch": 0.2855257046471264, "grad_norm": 0.26056137681007385, "learning_rate": 8.991387270152202e-07, "loss": 22.6093, "step": 730 }, { "epoch": 0.28591683574938276, "grad_norm": 0.23313690721988678, "learning_rate": 8.88975661110254e-07, "loss": 20.9189, "step": 731 }, { "epoch": 0.28630796685163906, "grad_norm": 0.1749650090932846, "learning_rate": 8.78864752152509e-07, "loss": 17.4111, "step": 732 }, { "epoch": 0.2866990979538954, "grad_norm": 1060.3902587890625, "learning_rate": 8.688061284200266e-07, "loss": 78.8863, "step": 733 }, { "epoch": 0.2870902290561518, "grad_norm": 0.22554989159107208, "learning_rate": 8.587999175274986e-07, "loss": 44.0863, "step": 734 }, { "epoch": 0.2874813601584081, "grad_norm": 1059.94580078125, "learning_rate": 8.488462464246495e-07, "loss": 29.8665, "step": 735 }, { "epoch": 0.28787249126066444, "grad_norm": 949.2442626953125, "learning_rate": 8.389452413946314e-07, "loss": 26.6799, "step": 736 }, { "epoch": 0.28826362236292075, "grad_norm": 789.4210205078125, "learning_rate": 8.290970280524124e-07, "loss": 25.6707, "step": 737 }, { "epoch": 0.2886547534651771, "grad_norm": 2.4044387340545654, "learning_rate": 8.193017313431872e-07, "loss": 10.1112, "step": 738 }, { "epoch": 0.28904588456743346, "grad_norm": 324.33489990234375, "learning_rate": 8.095594755407971e-07, "loss": 11.7497, "step": 739 }, { "epoch": 0.28943701566968977, "grad_norm": 1.1166508197784424, "learning_rate": 7.99870384246143e-07, "loss": 34.0988, "step": 740 }, { "epoch": 0.2898281467719461, "grad_norm": 0.18654634058475494, "learning_rate": 7.902345803856265e-07, "loss": 45.7333, "step": 741 }, { "epoch": 0.2902192778742025, "grad_norm": 0.3777371346950531, "learning_rate": 7.806521862095834e-07, "loss": 35.6395, "step": 742 }, { "epoch": 0.2906104089764588, "grad_norm": 0.24813219904899597, "learning_rate": 7.711233232907401e-07, "loss": 29.0974, "step": 743 }, { "epoch": 0.29100154007871515, "grad_norm": 0.2076377123594284, "learning_rate": 7.616481125226632e-07, "loss": 46.9587, "step": 744 }, { "epoch": 0.29139267118097145, "grad_norm": 0.7038260102272034, "learning_rate": 7.522266741182305e-07, "loss": 21.8313, "step": 745 }, { "epoch": 0.2917838022832278, "grad_norm": 217.7123565673828, "learning_rate": 7.42859127608106e-07, "loss": 6.3769, "step": 746 }, { "epoch": 0.29217493338548417, "grad_norm": 0.12917295098304749, "learning_rate": 7.33545591839222e-07, "loss": 47.6111, "step": 747 }, { "epoch": 0.2925660644877405, "grad_norm": 0.20550723373889923, "learning_rate": 7.242861849732696e-07, "loss": 64.3405, "step": 748 }, { "epoch": 0.29295719558999683, "grad_norm": 1334.2445068359375, "learning_rate": 7.150810244852036e-07, "loss": 26.1011, "step": 749 }, { "epoch": 0.29334832669225314, "grad_norm": 0.3111410439014435, "learning_rate": 7.059302271617485e-07, "loss": 35.9932, "step": 750 }, { "epoch": 0.2937394577945095, "grad_norm": 942.8177490234375, "learning_rate": 6.968339090999188e-07, "loss": 31.1454, "step": 751 }, { "epoch": 0.29413058889676585, "grad_norm": 810.7886962890625, "learning_rate": 6.877921857055476e-07, "loss": 37.078, "step": 752 }, { "epoch": 0.29452171999902216, "grad_norm": 306.9100036621094, "learning_rate": 6.78805171691817e-07, "loss": 34.2818, "step": 753 }, { "epoch": 0.2949128511012785, "grad_norm": 0.9455443620681763, "learning_rate": 6.698729810778065e-07, "loss": 13.8228, "step": 754 }, { "epoch": 0.2953039822035348, "grad_norm": 0.8926202654838562, "learning_rate": 6.609957271870505e-07, "loss": 48.3806, "step": 755 }, { "epoch": 0.2956951133057912, "grad_norm": 217.76356506347656, "learning_rate": 6.521735226460901e-07, "loss": 61.1602, "step": 756 }, { "epoch": 0.29608624440804754, "grad_norm": 0.5974970459938049, "learning_rate": 6.43406479383053e-07, "loss": 14.1005, "step": 757 }, { "epoch": 0.29647737551030384, "grad_norm": 1.2981594800949097, "learning_rate": 6.346947086262323e-07, "loss": 24.2989, "step": 758 }, { "epoch": 0.2968685066125602, "grad_norm": 0.1865173578262329, "learning_rate": 6.260383209026704e-07, "loss": 44.4224, "step": 759 }, { "epoch": 0.29725963771481656, "grad_norm": 0.39853161573410034, "learning_rate": 6.174374260367611e-07, "loss": 37.0987, "step": 760 }, { "epoch": 0.29765076881707286, "grad_norm": 0.5360068678855896, "learning_rate": 6.088921331488568e-07, "loss": 19.2739, "step": 761 }, { "epoch": 0.2980418999193292, "grad_norm": 840.9296875, "learning_rate": 6.004025506538813e-07, "loss": 24.1344, "step": 762 }, { "epoch": 0.2984330310215855, "grad_norm": 0.38888782262802124, "learning_rate": 5.919687862599549e-07, "loss": 37.5647, "step": 763 }, { "epoch": 0.2988241621238419, "grad_norm": 1.1136953830718994, "learning_rate": 5.835909469670292e-07, "loss": 39.4938, "step": 764 }, { "epoch": 0.29921529322609824, "grad_norm": 0.21624325215816498, "learning_rate": 5.752691390655279e-07, "loss": 58.6849, "step": 765 }, { "epoch": 0.29960642432835455, "grad_norm": 0.725331723690033, "learning_rate": 5.670034681349995e-07, "loss": 15.0299, "step": 766 }, { "epoch": 0.2999975554306109, "grad_norm": 0.4943119287490845, "learning_rate": 5.587940390427804e-07, "loss": 22.3662, "step": 767 }, { "epoch": 0.3003886865328672, "grad_norm": 0.3493794798851013, "learning_rate": 5.506409559426573e-07, "loss": 5.3336, "step": 768 }, { "epoch": 0.30077981763512357, "grad_norm": 0.16925600171089172, "learning_rate": 5.425443222735527e-07, "loss": 24.5783, "step": 769 }, { "epoch": 0.30117094873737993, "grad_norm": 0.42793917655944824, "learning_rate": 5.345042407582079e-07, "loss": 22.6456, "step": 770 }, { "epoch": 0.30156207983963623, "grad_norm": 1158.593994140625, "learning_rate": 5.265208134018851e-07, "loss": 28.3467, "step": 771 }, { "epoch": 0.3019532109418926, "grad_norm": 1398.79248046875, "learning_rate": 5.185941414910673e-07, "loss": 13.612, "step": 772 }, { "epoch": 0.30234434204414895, "grad_norm": 0.3084481954574585, "learning_rate": 5.107243255921746e-07, "loss": 40.542, "step": 773 }, { "epoch": 0.30273547314640525, "grad_norm": 0.6440997123718262, "learning_rate": 5.029114655502937e-07, "loss": 23.8275, "step": 774 }, { "epoch": 0.3031266042486616, "grad_norm": 582.021484375, "learning_rate": 4.951556604879049e-07, "loss": 52.9339, "step": 775 }, { "epoch": 0.3035177353509179, "grad_norm": 297.0395202636719, "learning_rate": 4.874570088036252e-07, "loss": 14.6142, "step": 776 }, { "epoch": 0.3039088664531743, "grad_norm": 0.25378304719924927, "learning_rate": 4.798156081709638e-07, "loss": 4.0975, "step": 777 }, { "epoch": 0.30429999755543063, "grad_norm": 0.7152448892593384, "learning_rate": 4.722315555370793e-07, "loss": 14.747, "step": 778 }, { "epoch": 0.30469112865768694, "grad_norm": 750.841064453125, "learning_rate": 4.647049471215498e-07, "loss": 10.1092, "step": 779 }, { "epoch": 0.3050822597599433, "grad_norm": 0.16996781527996063, "learning_rate": 4.5723587841515707e-07, "loss": 8.5861, "step": 780 }, { "epoch": 0.3054733908621996, "grad_norm": 0.2573756277561188, "learning_rate": 4.4982444417866753e-07, "loss": 28.2554, "step": 781 }, { "epoch": 0.30586452196445596, "grad_norm": 0.8471802473068237, "learning_rate": 4.4247073844163434e-07, "loss": 23.4402, "step": 782 }, { "epoch": 0.3062556530667123, "grad_norm": 711.4560546875, "learning_rate": 4.351748545012058e-07, "loss": 35.54, "step": 783 }, { "epoch": 0.3066467841689686, "grad_norm": 660.20458984375, "learning_rate": 4.279368849209381e-07, "loss": 32.8363, "step": 784 }, { "epoch": 0.307037915271225, "grad_norm": 60.09021759033203, "learning_rate": 4.2075692152962145e-07, "loss": 25.1056, "step": 785 }, { "epoch": 0.3074290463734813, "grad_norm": 0.995664119720459, "learning_rate": 4.136350554201196e-07, "loss": 18.2107, "step": 786 }, { "epoch": 0.30782017747573764, "grad_norm": 1.327789545059204, "learning_rate": 4.0657137694820826e-07, "loss": 31.5689, "step": 787 }, { "epoch": 0.308211308577994, "grad_norm": 0.5742402076721191, "learning_rate": 3.9956597573142966e-07, "loss": 9.6179, "step": 788 }, { "epoch": 0.3086024396802503, "grad_norm": 0.27793997526168823, "learning_rate": 3.9261894064796136e-07, "loss": 32.6346, "step": 789 }, { "epoch": 0.30899357078250667, "grad_norm": 1185.6458740234375, "learning_rate": 3.8573035983548167e-07, "loss": 38.3132, "step": 790 }, { "epoch": 0.309384701884763, "grad_norm": 1311.600830078125, "learning_rate": 3.789003206900538e-07, "loss": 44.8882, "step": 791 }, { "epoch": 0.30977583298701933, "grad_norm": 0.7463202476501465, "learning_rate": 3.7212890986501773e-07, "loss": 33.5164, "step": 792 }, { "epoch": 0.3101669640892757, "grad_norm": 0.1619306206703186, "learning_rate": 3.6541621326989183e-07, "loss": 8.8827, "step": 793 }, { "epoch": 0.310558095191532, "grad_norm": 0.290446937084198, "learning_rate": 3.5876231606927936e-07, "loss": 22.0263, "step": 794 }, { "epoch": 0.31094922629378835, "grad_norm": 0.5374624729156494, "learning_rate": 3.5216730268179346e-07, "loss": 54.9825, "step": 795 }, { "epoch": 0.3113403573960447, "grad_norm": 0.45844340324401855, "learning_rate": 3.4563125677897936e-07, "loss": 34.1724, "step": 796 }, { "epoch": 0.311731488498301, "grad_norm": 0.21980485320091248, "learning_rate": 3.3915426128425744e-07, "loss": 23.5609, "step": 797 }, { "epoch": 0.31212261960055737, "grad_norm": 0.1387357860803604, "learning_rate": 3.327363983718723e-07, "loss": 3.3226, "step": 798 }, { "epoch": 0.3125137507028137, "grad_norm": 703.876708984375, "learning_rate": 3.263777494658449e-07, "loss": 14.627, "step": 799 }, { "epoch": 0.31290488180507003, "grad_norm": 0.3398139178752899, "learning_rate": 3.200783952389447e-07, "loss": 23.2432, "step": 800 }, { "epoch": 0.3132960129073264, "grad_norm": 0.2651851177215576, "learning_rate": 3.138384156116614e-07, "loss": 22.8204, "step": 801 }, { "epoch": 0.3136871440095827, "grad_norm": 0.7338706254959106, "learning_rate": 3.076578897511978e-07, "loss": 8.9847, "step": 802 }, { "epoch": 0.31407827511183906, "grad_norm": 674.1719360351562, "learning_rate": 3.015368960704584e-07, "loss": 42.1909, "step": 803 }, { "epoch": 0.31446940621409536, "grad_norm": 0.5109131932258606, "learning_rate": 2.954755122270564e-07, "loss": 16.502, "step": 804 }, { "epoch": 0.3148605373163517, "grad_norm": 38.51890563964844, "learning_rate": 2.894738151223331e-07, "loss": 22.2199, "step": 805 }, { "epoch": 0.3152516684186081, "grad_norm": 236.9777069091797, "learning_rate": 2.835318809003751e-07, "loss": 61.7173, "step": 806 }, { "epoch": 0.3156427995208644, "grad_norm": 0.13343866169452667, "learning_rate": 2.776497849470544e-07, "loss": 14.8311, "step": 807 }, { "epoch": 0.31603393062312074, "grad_norm": 0.2250102311372757, "learning_rate": 2.71827601889067e-07, "loss": 28.0406, "step": 808 }, { "epoch": 0.3164250617253771, "grad_norm": 0.3997911810874939, "learning_rate": 2.6606540559298956e-07, "loss": 32.8142, "step": 809 }, { "epoch": 0.3168161928276334, "grad_norm": 947.511474609375, "learning_rate": 2.6036326916434153e-07, "loss": 32.0284, "step": 810 }, { "epoch": 0.31720732392988976, "grad_norm": 0.18809598684310913, "learning_rate": 2.547212649466568e-07, "loss": 46.4845, "step": 811 }, { "epoch": 0.31759845503214607, "grad_norm": 105.57866668701172, "learning_rate": 2.491394645205669e-07, "loss": 36.195, "step": 812 }, { "epoch": 0.3179895861344024, "grad_norm": 700.2158203125, "learning_rate": 2.436179387028903e-07, "loss": 38.9557, "step": 813 }, { "epoch": 0.3183807172366588, "grad_norm": 778.9248657226562, "learning_rate": 2.3815675754573885e-07, "loss": 22.1128, "step": 814 }, { "epoch": 0.3187718483389151, "grad_norm": 0.24114812910556793, "learning_rate": 2.3275599033562414e-07, "loss": 40.5305, "step": 815 }, { "epoch": 0.31916297944117145, "grad_norm": 539.972900390625, "learning_rate": 2.274157055925802e-07, "loss": 38.5275, "step": 816 }, { "epoch": 0.31955411054342775, "grad_norm": 0.5195923447608948, "learning_rate": 2.2213597106929608e-07, "loss": 31.244, "step": 817 }, { "epoch": 0.3199452416456841, "grad_norm": 242.19908142089844, "learning_rate": 2.1691685375025362e-07, "loss": 39.8784, "step": 818 }, { "epoch": 0.32033637274794047, "grad_norm": 0.1984694004058838, "learning_rate": 2.117584198508771e-07, "loss": 30.9005, "step": 819 }, { "epoch": 0.32072750385019677, "grad_norm": 987.247802734375, "learning_rate": 2.0666073481669714e-07, "loss": 32.3119, "step": 820 }, { "epoch": 0.32111863495245313, "grad_norm": 1.042695164680481, "learning_rate": 2.016238633225165e-07, "loss": 41.9533, "step": 821 }, { "epoch": 0.3215097660547095, "grad_norm": 0.18694262206554413, "learning_rate": 1.9664786927159064e-07, "loss": 55.5923, "step": 822 }, { "epoch": 0.3219008971569658, "grad_norm": 0.35913291573524475, "learning_rate": 1.9173281579481896e-07, "loss": 17.7606, "step": 823 }, { "epoch": 0.32229202825922215, "grad_norm": 0.1270519196987152, "learning_rate": 1.8687876524993987e-07, "loss": 6.9452, "step": 824 }, { "epoch": 0.32268315936147846, "grad_norm": 0.25847911834716797, "learning_rate": 1.820857792207431e-07, "loss": 11.1015, "step": 825 }, { "epoch": 0.3230742904637348, "grad_norm": 507.70074462890625, "learning_rate": 1.7735391851628814e-07, "loss": 32.347, "step": 826 }, { "epoch": 0.3234654215659912, "grad_norm": 90.63712310791016, "learning_rate": 1.7268324317012974e-07, "loss": 16.0917, "step": 827 }, { "epoch": 0.3238565526682475, "grad_norm": 0.3610641658306122, "learning_rate": 1.680738124395598e-07, "loss": 10.4453, "step": 828 }, { "epoch": 0.32424768377050384, "grad_norm": 1.101544976234436, "learning_rate": 1.6352568480485277e-07, "loss": 3.3177, "step": 829 }, { "epoch": 0.32463881487276014, "grad_norm": 1.0258104801177979, "learning_rate": 1.5903891796852756e-07, "loss": 30.9836, "step": 830 }, { "epoch": 0.3250299459750165, "grad_norm": 0.22194243967533112, "learning_rate": 1.5461356885461077e-07, "loss": 11.0591, "step": 831 }, { "epoch": 0.32542107707727286, "grad_norm": 527.8275146484375, "learning_rate": 1.5024969360791564e-07, "loss": 48.5131, "step": 832 }, { "epoch": 0.32581220817952916, "grad_norm": 0.20000404119491577, "learning_rate": 1.4594734759333484e-07, "loss": 33.9883, "step": 833 }, { "epoch": 0.3262033392817855, "grad_norm": 0.5714410543441772, "learning_rate": 1.4170658539512993e-07, "loss": 19.7609, "step": 834 }, { "epoch": 0.3265944703840418, "grad_norm": 1.373691439628601, "learning_rate": 1.375274608162447e-07, "loss": 33.721, "step": 835 }, { "epoch": 0.3269856014862982, "grad_norm": 0.5083162188529968, "learning_rate": 1.3341002687762062e-07, "loss": 21.9135, "step": 836 }, { "epoch": 0.32737673258855454, "grad_norm": 1.2598671913146973, "learning_rate": 1.2935433581752365e-07, "loss": 24.6308, "step": 837 }, { "epoch": 0.32776786369081085, "grad_norm": 615.4888305664062, "learning_rate": 1.253604390908819e-07, "loss": 33.5934, "step": 838 }, { "epoch": 0.3281589947930672, "grad_norm": 0.8651353716850281, "learning_rate": 1.2142838736863562e-07, "loss": 79.5808, "step": 839 }, { "epoch": 0.32855012589532356, "grad_norm": 0.21397706866264343, "learning_rate": 1.175582305370887e-07, "loss": 30.6943, "step": 840 }, { "epoch": 0.32894125699757987, "grad_norm": 748.9775390625, "learning_rate": 1.1375001769728e-07, "loss": 30.7896, "step": 841 }, { "epoch": 0.3293323880998362, "grad_norm": 0.20578625798225403, "learning_rate": 1.1000379716435916e-07, "loss": 28.9501, "step": 842 }, { "epoch": 0.32972351920209253, "grad_norm": 0.34230169653892517, "learning_rate": 1.0631961646697387e-07, "loss": 6.3882, "step": 843 }, { "epoch": 0.3301146503043489, "grad_norm": 136.2682342529297, "learning_rate": 1.0269752234666642e-07, "loss": 9.9568, "step": 844 }, { "epoch": 0.33050578140660525, "grad_norm": 536.7844848632812, "learning_rate": 9.913756075728088e-08, "loss": 42.5157, "step": 845 }, { "epoch": 0.33089691250886155, "grad_norm": 0.6589470505714417, "learning_rate": 9.563977686438019e-08, "loss": 15.0413, "step": 846 }, { "epoch": 0.3312880436111179, "grad_norm": 0.44913989305496216, "learning_rate": 9.22042150446728e-08, "loss": 22.3091, "step": 847 }, { "epoch": 0.3316791747133742, "grad_norm": 0.11731698364019394, "learning_rate": 8.883091888545136e-08, "loss": 26.5784, "step": 848 }, { "epoch": 0.3320703058156306, "grad_norm": 0.27501413226127625, "learning_rate": 8.551993118403656e-08, "loss": 18.7753, "step": 849 }, { "epoch": 0.33246143691788693, "grad_norm": 325.61627197265625, "learning_rate": 8.227129394723643e-08, "loss": 22.7822, "step": 850 }, { "epoch": 0.33285256802014324, "grad_norm": 0.2187097817659378, "learning_rate": 7.908504839081343e-08, "loss": 21.9867, "step": 851 }, { "epoch": 0.3332436991223996, "grad_norm": 476.84649658203125, "learning_rate": 7.59612349389599e-08, "loss": 34.4312, "step": 852 }, { "epoch": 0.33363483022465595, "grad_norm": 0.8119810223579407, "learning_rate": 7.289989322378732e-08, "loss": 9.815, "step": 853 }, { "epoch": 0.33402596132691226, "grad_norm": 0.5497637987136841, "learning_rate": 6.990106208482227e-08, "loss": 25.6327, "step": 854 }, { "epoch": 0.3344170924291686, "grad_norm": 0.14091427624225616, "learning_rate": 6.696477956851356e-08, "loss": 17.0462, "step": 855 }, { "epoch": 0.3348082235314249, "grad_norm": 0.36118146777153015, "learning_rate": 6.409108292774912e-08, "loss": 29.5031, "step": 856 }, { "epoch": 0.3351993546336813, "grad_norm": 0.4133693277835846, "learning_rate": 6.12800086213866e-08, "loss": 23.7123, "step": 857 }, { "epoch": 0.33559048573593764, "grad_norm": 1.1946247816085815, "learning_rate": 5.853159231378469e-08, "loss": 46.5555, "step": 858 }, { "epoch": 0.33598161683819394, "grad_norm": 0.21554416418075562, "learning_rate": 5.584586887435739e-08, "loss": 17.6013, "step": 859 }, { "epoch": 0.3363727479404503, "grad_norm": 0.16092292964458466, "learning_rate": 5.322287237712664e-08, "loss": 10.5376, "step": 860 }, { "epoch": 0.3367638790427066, "grad_norm": 464.0669860839844, "learning_rate": 5.0662636100292094e-08, "loss": 36.5366, "step": 861 }, { "epoch": 0.33715501014496296, "grad_norm": 0.7112561464309692, "learning_rate": 4.8165192525809754e-08, "loss": 32.8205, "step": 862 }, { "epoch": 0.3375461412472193, "grad_norm": 0.18869797885417938, "learning_rate": 4.573057333897679e-08, "loss": 11.1364, "step": 863 }, { "epoch": 0.3379372723494756, "grad_norm": 0.11357381194829941, "learning_rate": 4.335880942803405e-08, "loss": 15.2307, "step": 864 }, { "epoch": 0.338328403451732, "grad_norm": 0.2633415162563324, "learning_rate": 4.104993088376974e-08, "loss": 11.5664, "step": 865 }, { "epoch": 0.3387195345539883, "grad_norm": 1.018863320350647, "learning_rate": 3.8803966999139686e-08, "loss": 8.2019, "step": 866 }, { "epoch": 0.33911066565624465, "grad_norm": 1.0014698505401611, "learning_rate": 3.662094626889656e-08, "loss": 14.888, "step": 867 }, { "epoch": 0.339501796758501, "grad_norm": 0.7487742900848389, "learning_rate": 3.450089638922738e-08, "loss": 51.2638, "step": 868 }, { "epoch": 0.3398929278607573, "grad_norm": 532.7721557617188, "learning_rate": 3.2443844257400434e-08, "loss": 23.042, "step": 869 }, { "epoch": 0.34028405896301367, "grad_norm": 0.3149930536746979, "learning_rate": 3.044981597142837e-08, "loss": 4.3578, "step": 870 }, { "epoch": 0.34067519006527003, "grad_norm": 1.0493534803390503, "learning_rate": 2.8518836829732332e-08, "loss": 25.666, "step": 871 }, { "epoch": 0.34106632116752633, "grad_norm": 582.3631591796875, "learning_rate": 2.6650931330823305e-08, "loss": 31.6531, "step": 872 }, { "epoch": 0.3414574522697827, "grad_norm": 680.9072265625, "learning_rate": 2.4846123172992953e-08, "loss": 22.6484, "step": 873 }, { "epoch": 0.341848583372039, "grad_norm": 0.22280123829841614, "learning_rate": 2.3104435254008852e-08, "loss": 14.0636, "step": 874 }, { "epoch": 0.34223971447429535, "grad_norm": 0.26472118496894836, "learning_rate": 2.1425889670827483e-08, "loss": 19.032, "step": 875 }, { "epoch": 0.3426308455765517, "grad_norm": 238.65916442871094, "learning_rate": 1.981050771931281e-08, "loss": 10.7904, "step": 876 }, { "epoch": 0.343021976678808, "grad_norm": 0.1643354743719101, "learning_rate": 1.8258309893965375e-08, "loss": 14.425, "step": 877 }, { "epoch": 0.3434131077810644, "grad_norm": 0.3684662878513336, "learning_rate": 1.6769315887662508e-08, "loss": 33.7941, "step": 878 }, { "epoch": 0.3438042388833207, "grad_norm": 0.7127506136894226, "learning_rate": 1.5343544591409632e-08, "loss": 21.4788, "step": 879 }, { "epoch": 0.34419536998557704, "grad_norm": 0.591823160648346, "learning_rate": 1.3981014094099354e-08, "loss": 1.5929, "step": 880 }, { "epoch": 0.3445865010878334, "grad_norm": 0.8882282972335815, "learning_rate": 1.2681741682282755e-08, "loss": 16.2899, "step": 881 }, { "epoch": 0.3449776321900897, "grad_norm": 0.4288715720176697, "learning_rate": 1.1445743839949008e-08, "loss": 11.78, "step": 882 }, { "epoch": 0.34536876329234606, "grad_norm": 479.3231506347656, "learning_rate": 1.0273036248318325e-08, "loss": 27.5783, "step": 883 }, { "epoch": 0.3457598943946024, "grad_norm": 0.7523378133773804, "learning_rate": 9.163633785639892e-09, "loss": 31.4348, "step": 884 }, { "epoch": 0.3461510254968587, "grad_norm": 0.4846894145011902, "learning_rate": 8.117550527005913e-09, "loss": 2.143, "step": 885 }, { "epoch": 0.3465421565991151, "grad_norm": 229.96755981445312, "learning_rate": 7.13479974417175e-09, "loss": 30.5446, "step": 886 }, { "epoch": 0.3469332877013714, "grad_norm": 0.1374731957912445, "learning_rate": 6.215393905388278e-09, "loss": 22.3179, "step": 887 }, { "epoch": 0.34732441880362774, "grad_norm": 0.76992267370224, "learning_rate": 5.359344675242018e-09, "loss": 14.0893, "step": 888 }, { "epoch": 0.3477155499058841, "grad_norm": 476.39459228515625, "learning_rate": 4.56666291450858e-09, "loss": 52.4586, "step": 889 }, { "epoch": 0.3481066810081404, "grad_norm": 71.57829284667969, "learning_rate": 3.837358680016112e-09, "loss": 19.4455, "step": 890 }, { "epoch": 0.34849781211039677, "grad_norm": 0.3923113942146301, "learning_rate": 3.1714412245148486e-09, "loss": 43.8042, "step": 891 }, { "epoch": 0.34888894321265307, "grad_norm": 0.11082535982131958, "learning_rate": 2.568918996560532e-09, "loss": 21.0263, "step": 892 }, { "epoch": 0.34928007431490943, "grad_norm": 0.5106098651885986, "learning_rate": 2.029799640409502e-09, "loss": 21.4105, "step": 893 }, { "epoch": 0.3496712054171658, "grad_norm": 0.2212674468755722, "learning_rate": 1.5540899959187727e-09, "loss": 29.8404, "step": 894 }, { "epoch": 0.3500623365194221, "grad_norm": 0.4330773949623108, "learning_rate": 1.1417960984605459e-09, "loss": 28.9654, "step": 895 }, { "epoch": 0.35045346762167845, "grad_norm": 0.1814635843038559, "learning_rate": 7.92923178845606e-10, "loss": 25.6656, "step": 896 }, { "epoch": 0.35084459872393475, "grad_norm": 538.5513916015625, "learning_rate": 5.07475663257262e-10, "loss": 32.209, "step": 897 }, { "epoch": 0.3512357298261911, "grad_norm": 536.6629028320312, "learning_rate": 2.854571731947253e-10, "loss": 13.3711, "step": 898 }, { "epoch": 0.35162686092844747, "grad_norm": 598.1375732421875, "learning_rate": 1.2687052542759148e-10, "loss": 27.3026, "step": 899 }, { "epoch": 0.3520179920307038, "grad_norm": 0.18695542216300964, "learning_rate": 3.171773195809191e-11, "loss": 31.2123, "step": 900 }, { "epoch": 0.3520179920307038, "step": 900, "total_flos": 9.027346976391299e+18, "train_loss": 213.8077490248945, "train_runtime": 76099.1347, "train_samples_per_second": 3.028, "train_steps_per_second": 0.012 } ], "logging_steps": 1.0, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.027346976391299e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }