regup006 / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
fba85c7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0625,
"eval_steps": 250,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000125,
"grad_norm": 4.097814559936523,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.1655,
"loss/crossentropy": 2.343535900115967,
"loss/hidden": 0.9296875,
"loss/logits": 0.17379230260849,
"loss/reg": 0.006198255345225334,
"step": 1
},
{
"epoch": 0.00025,
"grad_norm": 3.662576913833618,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4973,
"loss/crossentropy": 2.318769931793213,
"loss/hidden": 1.1875,
"loss/logits": 0.24786217510700226,
"loss/reg": 0.006198255345225334,
"step": 2
},
{
"epoch": 0.000375,
"grad_norm": 2.8296749591827393,
"learning_rate": 3e-06,
"loss": 1.2258,
"loss/crossentropy": 2.4907937049865723,
"loss/hidden": 0.97265625,
"loss/logits": 0.19112952053546906,
"loss/reg": 0.006198245566338301,
"step": 3
},
{
"epoch": 0.0005,
"grad_norm": 3.057624578475952,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1136,
"loss/crossentropy": 2.744520902633667,
"loss/hidden": 0.890625,
"loss/logits": 0.16101403534412384,
"loss/reg": 0.006198232993483543,
"step": 4
},
{
"epoch": 0.000625,
"grad_norm": 2.7055587768554688,
"learning_rate": 5e-06,
"loss": 1.1943,
"loss/crossentropy": 2.5722062587738037,
"loss/hidden": 0.94921875,
"loss/logits": 0.18310005962848663,
"loss/reg": 0.0061982134357094765,
"step": 5
},
{
"epoch": 0.00075,
"grad_norm": 3.789276361465454,
"learning_rate": 6e-06,
"loss": 1.247,
"loss/crossentropy": 2.613312005996704,
"loss/hidden": 1.0078125,
"loss/logits": 0.17725251615047455,
"loss/reg": 0.006198191549628973,
"step": 6
},
{
"epoch": 0.000875,
"grad_norm": 3.997910499572754,
"learning_rate": 7.000000000000001e-06,
"loss": 1.4206,
"loss/crossentropy": 2.4207534790039062,
"loss/hidden": 1.125,
"loss/logits": 0.2336406409740448,
"loss/reg": 0.006198164541274309,
"step": 7
},
{
"epoch": 0.001,
"grad_norm": 2.5986244678497314,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0878,
"loss/crossentropy": 2.536424160003662,
"loss/hidden": 0.8671875,
"loss/logits": 0.1585812270641327,
"loss/reg": 0.006198132876306772,
"step": 8
},
{
"epoch": 0.001125,
"grad_norm": 2.2757976055145264,
"learning_rate": 9e-06,
"loss": 1.1175,
"loss/crossentropy": 2.745281219482422,
"loss/hidden": 0.89453125,
"loss/logits": 0.16094230115413666,
"loss/reg": 0.006198094692081213,
"step": 9
},
{
"epoch": 0.00125,
"grad_norm": 2.261094808578491,
"learning_rate": 1e-05,
"loss": 1.0803,
"loss/crossentropy": 2.3173577785491943,
"loss/hidden": 0.8671875,
"loss/logits": 0.15108685195446014,
"loss/reg": 0.0061980499885976315,
"step": 10
},
{
"epoch": 0.001375,
"grad_norm": 21.777265548706055,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.0501,
"loss/crossentropy": 3.2122714519500732,
"loss/hidden": 1.7109375,
"loss/logits": 0.27713608741760254,
"loss/reg": 0.006198008079081774,
"step": 11
},
{
"epoch": 0.0015,
"grad_norm": 2.5655505657196045,
"learning_rate": 1.2e-05,
"loss": 1.151,
"loss/crossentropy": 2.706430196762085,
"loss/hidden": 0.8984375,
"loss/logits": 0.19056561589241028,
"loss/reg": 0.0061979577876627445,
"step": 12
},
{
"epoch": 0.001625,
"grad_norm": 2.403053045272827,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.0719,
"loss/crossentropy": 2.0466296672821045,
"loss/hidden": 0.88671875,
"loss/logits": 0.12316589802503586,
"loss/reg": 0.0061978911980986595,
"step": 13
},
{
"epoch": 0.00175,
"grad_norm": 3.840881586074829,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.5441,
"loss/crossentropy": 2.3191423416137695,
"loss/hidden": 1.234375,
"loss/logits": 0.24779079854488373,
"loss/reg": 0.00619781669229269,
"step": 14
},
{
"epoch": 0.001875,
"grad_norm": 2.557331085205078,
"learning_rate": 1.5e-05,
"loss": 0.9444,
"loss/crossentropy": 2.6370084285736084,
"loss/hidden": 0.76953125,
"loss/logits": 0.11287336051464081,
"loss/reg": 0.006197733338922262,
"step": 15
},
{
"epoch": 0.002,
"grad_norm": 3.1850404739379883,
"grad_norm_var": 22.31061335402559,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3213,
"loss/crossentropy": 2.676577091217041,
"loss/hidden": 1.0546875,
"loss/logits": 0.2046227753162384,
"loss/reg": 0.006197639741003513,
"step": 16
},
{
"epoch": 0.002125,
"grad_norm": 2.2587289810180664,
"grad_norm_var": 22.553268201402446,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.0312,
"loss/crossentropy": 2.4961040019989014,
"loss/hidden": 0.8203125,
"loss/logits": 0.148894801735878,
"loss/reg": 0.006197560112923384,
"step": 17
},
{
"epoch": 0.00225,
"grad_norm": 3.3259811401367188,
"grad_norm_var": 22.58044614452358,
"learning_rate": 1.8e-05,
"loss": 1.3626,
"loss/crossentropy": 2.5914387702941895,
"loss/hidden": 1.046875,
"loss/logits": 0.25370728969573975,
"loss/reg": 0.006197475362569094,
"step": 18
},
{
"epoch": 0.002375,
"grad_norm": 2.468914747238159,
"grad_norm_var": 22.649171856957494,
"learning_rate": 1.9e-05,
"loss": 1.1683,
"loss/crossentropy": 2.6096584796905518,
"loss/hidden": 0.921875,
"loss/logits": 0.18447336554527283,
"loss/reg": 0.00619738781824708,
"step": 19
},
{
"epoch": 0.0025,
"grad_norm": 2.3097646236419678,
"grad_norm_var": 22.784756315801523,
"learning_rate": 2e-05,
"loss": 1.1605,
"loss/crossentropy": 2.299048662185669,
"loss/hidden": 0.9375,
"loss/logits": 0.16106057167053223,
"loss/reg": 0.006197274662554264,
"step": 20
},
{
"epoch": 0.002625,
"grad_norm": 2.1111207008361816,
"grad_norm_var": 22.911025462198744,
"learning_rate": 2.1e-05,
"loss": 0.939,
"loss/crossentropy": 2.547258138656616,
"loss/hidden": 0.75,
"loss/logits": 0.12698382139205933,
"loss/reg": 0.006197154987603426,
"step": 21
},
{
"epoch": 0.00275,
"grad_norm": 2.4918222427368164,
"grad_norm_var": 23.049732177187614,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.2047,
"loss/crossentropy": 2.2802374362945557,
"loss/hidden": 0.953125,
"loss/logits": 0.18965375423431396,
"loss/reg": 0.006197045091539621,
"step": 22
},
{
"epoch": 0.002875,
"grad_norm": 3.3273494243621826,
"grad_norm_var": 23.069242834486193,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.2554,
"loss/crossentropy": 2.3062734603881836,
"loss/hidden": 1.0078125,
"loss/logits": 0.18566077947616577,
"loss/reg": 0.006196921691298485,
"step": 23
},
{
"epoch": 0.003,
"grad_norm": 2.5644068717956543,
"grad_norm_var": 23.075070365271714,
"learning_rate": 2.4e-05,
"loss": 1.2266,
"loss/crossentropy": 2.460878372192383,
"loss/hidden": 0.98046875,
"loss/logits": 0.18418912589550018,
"loss/reg": 0.006196786183863878,
"step": 24
},
{
"epoch": 0.003125,
"grad_norm": 2.3506264686584473,
"grad_norm_var": 23.059636834121356,
"learning_rate": 2.5e-05,
"loss": 1.0205,
"loss/crossentropy": 2.4281811714172363,
"loss/hidden": 0.82421875,
"loss/logits": 0.13434948027133942,
"loss/reg": 0.0061966474168002605,
"step": 25
},
{
"epoch": 0.00325,
"grad_norm": 2.25004506111145,
"grad_norm_var": 23.062003716592635,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.1133,
"loss/crossentropy": 2.326843500137329,
"loss/hidden": 0.9140625,
"loss/logits": 0.13725802302360535,
"loss/reg": 0.006196498870849609,
"step": 26
},
{
"epoch": 0.003375,
"grad_norm": 2.283770799636841,
"grad_norm_var": 0.2469546323472817,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.1459,
"loss/crossentropy": 2.3002493381500244,
"loss/hidden": 0.9140625,
"loss/logits": 0.16987068951129913,
"loss/reg": 0.006196335889399052,
"step": 27
},
{
"epoch": 0.0035,
"grad_norm": 2.805088758468628,
"grad_norm_var": 0.24805442740468303,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.0272,
"loss/crossentropy": 2.510472536087036,
"loss/hidden": 0.8359375,
"loss/logits": 0.12927240133285522,
"loss/reg": 0.006196176633238792,
"step": 28
},
{
"epoch": 0.003625,
"grad_norm": 2.0331132411956787,
"grad_norm_var": 0.2692014993258605,
"learning_rate": 2.9e-05,
"loss": 1.0913,
"loss/crossentropy": 2.51584529876709,
"loss/hidden": 0.87109375,
"loss/logits": 0.15820594131946564,
"loss/reg": 0.006195997819304466,
"step": 29
},
{
"epoch": 0.00375,
"grad_norm": 2.1523566246032715,
"grad_norm_var": 0.17596421900176604,
"learning_rate": 3e-05,
"loss": 1.0026,
"loss/crossentropy": 2.704220771789551,
"loss/hidden": 0.796875,
"loss/logits": 0.14372289180755615,
"loss/reg": 0.0061958180740475655,
"step": 30
},
{
"epoch": 0.003875,
"grad_norm": 2.6658694744110107,
"grad_norm_var": 0.1771001402109505,
"learning_rate": 3.1e-05,
"loss": 1.122,
"loss/crossentropy": 2.4840426445007324,
"loss/hidden": 0.89453125,
"loss/logits": 0.1655040979385376,
"loss/reg": 0.006195634603500366,
"step": 31
},
{
"epoch": 0.004,
"grad_norm": 2.813079595565796,
"grad_norm_var": 0.153583095436327,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.0653,
"loss/crossentropy": 2.442962646484375,
"loss/hidden": 0.859375,
"loss/logits": 0.14400474727153778,
"loss/reg": 0.00619542459025979,
"step": 32
},
{
"epoch": 0.004125,
"grad_norm": 2.4273953437805176,
"grad_norm_var": 0.1496371777315666,
"learning_rate": 3.3e-05,
"loss": 1.1025,
"loss/crossentropy": 2.515721559524536,
"loss/hidden": 0.89453125,
"loss/logits": 0.1460331827402115,
"loss/reg": 0.006195210851728916,
"step": 33
},
{
"epoch": 0.00425,
"grad_norm": 2.0594100952148438,
"grad_norm_var": 0.11442956053255457,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.118,
"loss/crossentropy": 2.5347506999969482,
"loss/hidden": 0.8984375,
"loss/logits": 0.15760375559329987,
"loss/reg": 0.006195001769810915,
"step": 34
},
{
"epoch": 0.004375,
"grad_norm": 2.497893810272217,
"grad_norm_var": 0.11457586733464495,
"learning_rate": 3.5e-05,
"loss": 1.2359,
"loss/crossentropy": 1.7681002616882324,
"loss/hidden": 1.0390625,
"loss/logits": 0.13490143418312073,
"loss/reg": 0.006194803398102522,
"step": 35
},
{
"epoch": 0.0045,
"grad_norm": 3.3231709003448486,
"grad_norm_var": 0.16029457606237638,
"learning_rate": 3.6e-05,
"loss": 1.3588,
"loss/crossentropy": 2.729518175125122,
"loss/hidden": 1.09375,
"loss/logits": 0.20313453674316406,
"loss/reg": 0.00619460316374898,
"step": 36
},
{
"epoch": 0.004625,
"grad_norm": 2.5542962551116943,
"grad_norm_var": 0.14901290879942408,
"learning_rate": 3.7e-05,
"loss": 1.1671,
"loss/crossentropy": 2.3359429836273193,
"loss/hidden": 0.9296875,
"loss/logits": 0.17546769976615906,
"loss/reg": 0.006194361485540867,
"step": 37
},
{
"epoch": 0.00475,
"grad_norm": 3.5138309001922607,
"grad_norm_var": 0.2080724542279834,
"learning_rate": 3.8e-05,
"loss": 1.2044,
"loss/crossentropy": 2.447890520095825,
"loss/hidden": 0.96484375,
"loss/logits": 0.17756858468055725,
"loss/reg": 0.0061941081658005714,
"step": 38
},
{
"epoch": 0.004875,
"grad_norm": 3.813410758972168,
"grad_norm_var": 0.2698887106917669,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.0819,
"loss/crossentropy": 2.766765832901001,
"loss/hidden": 0.88671875,
"loss/logits": 0.13325469195842743,
"loss/reg": 0.006193886045366526,
"step": 39
},
{
"epoch": 0.005,
"grad_norm": 3.1502718925476074,
"grad_norm_var": 0.2860816910243668,
"learning_rate": 4e-05,
"loss": 1.3622,
"loss/crossentropy": 2.3325388431549072,
"loss/hidden": 1.109375,
"loss/logits": 0.19087004661560059,
"loss/reg": 0.006193609442561865,
"step": 40
},
{
"epoch": 0.005125,
"grad_norm": 2.422366142272949,
"grad_norm_var": 0.28336421674108553,
"learning_rate": 4.1e-05,
"loss": 1.2212,
"loss/crossentropy": 2.3002498149871826,
"loss/hidden": 0.96875,
"loss/logits": 0.19054222106933594,
"loss/reg": 0.00619333703070879,
"step": 41
},
{
"epoch": 0.00525,
"grad_norm": 2.7353622913360596,
"grad_norm_var": 0.2707266796228128,
"learning_rate": 4.2e-05,
"loss": 1.0549,
"loss/crossentropy": 2.0319221019744873,
"loss/hidden": 0.87890625,
"loss/logits": 0.1140664741396904,
"loss/reg": 0.006193041335791349,
"step": 42
},
{
"epoch": 0.005375,
"grad_norm": 1.9425387382507324,
"grad_norm_var": 0.2970857034274398,
"learning_rate": 4.3e-05,
"loss": 1.0366,
"loss/crossentropy": 2.431666374206543,
"loss/hidden": 0.83203125,
"loss/logits": 0.1426728069782257,
"loss/reg": 0.006192733999341726,
"step": 43
},
{
"epoch": 0.0055,
"grad_norm": 2.7009642124176025,
"grad_norm_var": 0.2960522402202514,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.9824,
"loss/crossentropy": 2.391608476638794,
"loss/hidden": 0.78515625,
"loss/logits": 0.13533324003219604,
"loss/reg": 0.006192411296069622,
"step": 44
},
{
"epoch": 0.005625,
"grad_norm": 2.6632983684539795,
"grad_norm_var": 0.2669107471214488,
"learning_rate": 4.5e-05,
"loss": 1.1067,
"loss/crossentropy": 2.7733116149902344,
"loss/hidden": 0.87109375,
"loss/logits": 0.1736893653869629,
"loss/reg": 0.006192059256136417,
"step": 45
},
{
"epoch": 0.00575,
"grad_norm": 2.1037468910217285,
"grad_norm_var": 0.2707032714108967,
"learning_rate": 4.600000000000001e-05,
"loss": 0.9831,
"loss/crossentropy": 2.4606895446777344,
"loss/hidden": 0.7890625,
"loss/logits": 0.13213258981704712,
"loss/reg": 0.006191718857735395,
"step": 46
},
{
"epoch": 0.005875,
"grad_norm": 2.1911983489990234,
"grad_norm_var": 0.28768473978113296,
"learning_rate": 4.7e-05,
"loss": 0.9509,
"loss/crossentropy": 2.6825270652770996,
"loss/hidden": 0.76953125,
"loss/logits": 0.11942489445209503,
"loss/reg": 0.006191306747496128,
"step": 47
},
{
"epoch": 0.006,
"grad_norm": 3.2640700340270996,
"grad_norm_var": 0.30827796768009724,
"learning_rate": 4.8e-05,
"loss": 1.0346,
"loss/crossentropy": 2.3665199279785156,
"loss/hidden": 0.83203125,
"loss/logits": 0.14068934321403503,
"loss/reg": 0.0061909533105790615,
"step": 48
},
{
"epoch": 0.006125,
"grad_norm": 2.259894847869873,
"grad_norm_var": 0.3163475179157634,
"learning_rate": 4.9e-05,
"loss": 0.9647,
"loss/crossentropy": 2.4414587020874023,
"loss/hidden": 0.79296875,
"loss/logits": 0.10987477004528046,
"loss/reg": 0.0061905342154204845,
"step": 49
},
{
"epoch": 0.00625,
"grad_norm": 2.7616565227508545,
"grad_norm_var": 0.28721415330329,
"learning_rate": 5e-05,
"loss": 1.019,
"loss/crossentropy": 2.0829460620880127,
"loss/hidden": 0.83984375,
"loss/logits": 0.11724002659320831,
"loss/reg": 0.0061900559812784195,
"step": 50
},
{
"epoch": 0.006375,
"grad_norm": 2.7897861003875732,
"grad_norm_var": 0.28297568806904866,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.853,
"loss/crossentropy": 2.5636909008026123,
"loss/hidden": 0.6953125,
"loss/logits": 0.09577471762895584,
"loss/reg": 0.00618965458124876,
"step": 51
},
{
"epoch": 0.0065,
"grad_norm": 2.3134403228759766,
"grad_norm_var": 0.2711290924819705,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.0497,
"loss/crossentropy": 2.440258026123047,
"loss/hidden": 0.83984375,
"loss/logits": 0.14791719615459442,
"loss/reg": 0.006189141888171434,
"step": 52
},
{
"epoch": 0.006625,
"grad_norm": 2.2032997608184814,
"grad_norm_var": 0.2855897568404882,
"learning_rate": 5.300000000000001e-05,
"loss": 0.9934,
"loss/crossentropy": 2.4747955799102783,
"loss/hidden": 0.796875,
"loss/logits": 0.13461169600486755,
"loss/reg": 0.006188610102981329,
"step": 53
},
{
"epoch": 0.00675,
"grad_norm": 2.267400026321411,
"grad_norm_var": 0.24358579758792467,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.1149,
"loss/crossentropy": 2.705127477645874,
"loss/hidden": 0.89453125,
"loss/logits": 0.1585235595703125,
"loss/reg": 0.0061880191788077354,
"step": 54
},
{
"epoch": 0.006875,
"grad_norm": 2.281036853790283,
"grad_norm_var": 0.14220569464836952,
"learning_rate": 5.500000000000001e-05,
"loss": 0.9642,
"loss/crossentropy": 2.545010805130005,
"loss/hidden": 0.78515625,
"loss/logits": 0.11717304587364197,
"loss/reg": 0.006187579594552517,
"step": 55
},
{
"epoch": 0.007,
"grad_norm": 4.942420959472656,
"grad_norm_var": 0.4975759650139497,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.1237,
"loss/crossentropy": 2.7698795795440674,
"loss/hidden": 0.91796875,
"loss/logits": 0.14385326206684113,
"loss/reg": 0.006187067367136478,
"step": 56
},
{
"epoch": 0.007125,
"grad_norm": 2.4213955402374268,
"grad_norm_var": 0.4976009733976563,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.0386,
"loss/crossentropy": 2.572023868560791,
"loss/hidden": 0.84765625,
"loss/logits": 0.12909512221813202,
"loss/reg": 0.006186594720929861,
"step": 57
},
{
"epoch": 0.00725,
"grad_norm": 2.15891695022583,
"grad_norm_var": 0.5091253321428854,
"learning_rate": 5.8e-05,
"loss": 0.961,
"loss/crossentropy": 2.283557415008545,
"loss/hidden": 0.7734375,
"loss/logits": 0.12568500638008118,
"loss/reg": 0.006185955833643675,
"step": 58
},
{
"epoch": 0.007375,
"grad_norm": 2.36811900138855,
"grad_norm_var": 0.48432608682591366,
"learning_rate": 5.9e-05,
"loss": 0.8386,
"loss/crossentropy": 2.453810453414917,
"loss/hidden": 0.6796875,
"loss/logits": 0.09709502756595612,
"loss/reg": 0.0061853062361478806,
"step": 59
},
{
"epoch": 0.0075,
"grad_norm": 2.591327667236328,
"grad_norm_var": 0.4836842483889178,
"learning_rate": 6e-05,
"loss": 1.033,
"loss/crossentropy": 2.8110511302948,
"loss/hidden": 0.81640625,
"loss/logits": 0.1547423005104065,
"loss/reg": 0.006184632424265146,
"step": 60
},
{
"epoch": 0.007625,
"grad_norm": 2.0103816986083984,
"grad_norm_var": 0.5047142009615214,
"learning_rate": 6.1e-05,
"loss": 0.9296,
"loss/crossentropy": 2.15134334564209,
"loss/hidden": 0.7578125,
"loss/logits": 0.1099701076745987,
"loss/reg": 0.0061841062270104885,
"step": 61
},
{
"epoch": 0.00775,
"grad_norm": 1.80124831199646,
"grad_norm_var": 0.5287549745746596,
"learning_rate": 6.2e-05,
"loss": 0.9266,
"loss/crossentropy": 2.7054479122161865,
"loss/hidden": 0.7421875,
"loss/logits": 0.12253857403993607,
"loss/reg": 0.0061835781671106815,
"step": 62
},
{
"epoch": 0.007875,
"grad_norm": 2.277440309524536,
"grad_norm_var": 0.5252193383179133,
"learning_rate": 6.3e-05,
"loss": 0.914,
"loss/crossentropy": 2.6631381511688232,
"loss/hidden": 0.734375,
"loss/logits": 0.1177992895245552,
"loss/reg": 0.0061830319464206696,
"step": 63
},
{
"epoch": 0.008,
"grad_norm": 3.3314151763916016,
"grad_norm_var": 0.531964164332922,
"learning_rate": 6.400000000000001e-05,
"loss": 1.29,
"loss/crossentropy": 2.1269633769989014,
"loss/hidden": 1.0625,
"loss/logits": 0.16565865278244019,
"loss/reg": 0.006182366982102394,
"step": 64
},
{
"epoch": 0.008125,
"grad_norm": 4.333358287811279,
"grad_norm_var": 0.7208240839518936,
"learning_rate": 6.500000000000001e-05,
"loss": 1.1615,
"loss/crossentropy": 2.714442491531372,
"loss/hidden": 0.94140625,
"loss/logits": 0.15825161337852478,
"loss/reg": 0.006181675940752029,
"step": 65
},
{
"epoch": 0.00825,
"grad_norm": 2.853740930557251,
"grad_norm_var": 0.7223776199927481,
"learning_rate": 6.6e-05,
"loss": 1.062,
"loss/crossentropy": 2.2147135734558105,
"loss/hidden": 0.8515625,
"loss/logits": 0.14859826862812042,
"loss/reg": 0.006180979777127504,
"step": 66
},
{
"epoch": 0.008375,
"grad_norm": 2.8853657245635986,
"grad_norm_var": 0.7242961395218184,
"learning_rate": 6.7e-05,
"loss": 0.9533,
"loss/crossentropy": 2.619598388671875,
"loss/hidden": 0.7734375,
"loss/logits": 0.11804014444351196,
"loss/reg": 0.006180332973599434,
"step": 67
},
{
"epoch": 0.0085,
"grad_norm": 2.725229501724243,
"grad_norm_var": 0.7142181363616674,
"learning_rate": 6.800000000000001e-05,
"loss": 1.1308,
"loss/crossentropy": 2.4091367721557617,
"loss/hidden": 0.90234375,
"loss/logits": 0.16662752628326416,
"loss/reg": 0.006179714575409889,
"step": 68
},
{
"epoch": 0.008625,
"grad_norm": 2.93643856048584,
"grad_norm_var": 0.6977178730278022,
"learning_rate": 6.9e-05,
"loss": 1.1414,
"loss/crossentropy": 2.509793281555176,
"loss/hidden": 0.90234375,
"loss/logits": 0.17730477452278137,
"loss/reg": 0.0061789220198988914,
"step": 69
},
{
"epoch": 0.00875,
"grad_norm": 2.4086973667144775,
"grad_norm_var": 0.6896555586144653,
"learning_rate": 7e-05,
"loss": 0.9852,
"loss/crossentropy": 2.7080371379852295,
"loss/hidden": 0.7890625,
"loss/logits": 0.1343374401330948,
"loss/reg": 0.0061781019903719425,
"step": 70
},
{
"epoch": 0.008875,
"grad_norm": 1.9355547428131104,
"grad_norm_var": 0.7196579708330165,
"learning_rate": 7.1e-05,
"loss": 0.9176,
"loss/crossentropy": 2.451488494873047,
"loss/hidden": 0.7421875,
"loss/logits": 0.11365102231502533,
"loss/reg": 0.006177456583827734,
"step": 71
},
{
"epoch": 0.009,
"grad_norm": 2.273902654647827,
"grad_norm_var": 0.38422972669649574,
"learning_rate": 7.2e-05,
"loss": 1.0112,
"loss/crossentropy": 2.4479947090148926,
"loss/hidden": 0.8125,
"loss/logits": 0.13690924644470215,
"loss/reg": 0.006176764145493507,
"step": 72
},
{
"epoch": 0.009125,
"grad_norm": 3.385849952697754,
"grad_norm_var": 0.4217084598233742,
"learning_rate": 7.3e-05,
"loss": 1.3992,
"loss/crossentropy": 2.3916804790496826,
"loss/hidden": 1.1484375,
"loss/logits": 0.18896484375,
"loss/reg": 0.006176079623401165,
"step": 73
},
{
"epoch": 0.00925,
"grad_norm": 1.893932580947876,
"grad_norm_var": 0.44317594415441114,
"learning_rate": 7.4e-05,
"loss": 0.9357,
"loss/crossentropy": 2.3809518814086914,
"loss/hidden": 0.74609375,
"loss/logits": 0.12787015736103058,
"loss/reg": 0.00617539556697011,
"step": 74
},
{
"epoch": 0.009375,
"grad_norm": 2.431032657623291,
"grad_norm_var": 0.4412621914582907,
"learning_rate": 7.500000000000001e-05,
"loss": 1.0796,
"loss/crossentropy": 2.5346295833587646,
"loss/hidden": 0.86328125,
"loss/logits": 0.1545613557100296,
"loss/reg": 0.006174764130264521,
"step": 75
},
{
"epoch": 0.0095,
"grad_norm": 2.2421321868896484,
"grad_norm_var": 0.45066905079875685,
"learning_rate": 7.6e-05,
"loss": 0.9869,
"loss/crossentropy": 2.756843090057373,
"loss/hidden": 0.796875,
"loss/logits": 0.1282375454902649,
"loss/reg": 0.006174163427203894,
"step": 76
},
{
"epoch": 0.009625,
"grad_norm": 2.7022979259490967,
"grad_norm_var": 0.4254703741989109,
"learning_rate": 7.7e-05,
"loss": 1.2503,
"loss/crossentropy": 2.0696699619293213,
"loss/hidden": 1.015625,
"loss/logits": 0.1729813814163208,
"loss/reg": 0.006173421163111925,
"step": 77
},
{
"epoch": 0.00975,
"grad_norm": 2.501106023788452,
"grad_norm_var": 0.37677934250983375,
"learning_rate": 7.800000000000001e-05,
"loss": 1.0516,
"loss/crossentropy": 2.629380941390991,
"loss/hidden": 0.83984375,
"loss/logits": 0.15003597736358643,
"loss/reg": 0.006172672379761934,
"step": 78
},
{
"epoch": 0.009875,
"grad_norm": 2.137601137161255,
"grad_norm_var": 0.3857841035513881,
"learning_rate": 7.900000000000001e-05,
"loss": 0.9388,
"loss/crossentropy": 2.6841280460357666,
"loss/hidden": 0.75,
"loss/logits": 0.12706515192985535,
"loss/reg": 0.006171974819153547,
"step": 79
},
{
"epoch": 0.01,
"grad_norm": 4.655951976776123,
"grad_norm_var": 0.6093991769416703,
"learning_rate": 8e-05,
"loss": 1.2659,
"loss/crossentropy": 2.4634439945220947,
"loss/hidden": 1.0390625,
"loss/logits": 0.16511483490467072,
"loss/reg": 0.006171175744384527,
"step": 80
},
{
"epoch": 0.010125,
"grad_norm": 2.2418179512023926,
"grad_norm_var": 0.44652068466097317,
"learning_rate": 8.1e-05,
"loss": 1.0773,
"loss/crossentropy": 2.479743480682373,
"loss/hidden": 0.87890625,
"loss/logits": 0.1366729438304901,
"loss/reg": 0.006170437205582857,
"step": 81
},
{
"epoch": 0.01025,
"grad_norm": 2.0470192432403564,
"grad_norm_var": 0.4640077865797357,
"learning_rate": 8.2e-05,
"loss": 0.8599,
"loss/crossentropy": 2.440803050994873,
"loss/hidden": 0.68359375,
"loss/logits": 0.11458206921815872,
"loss/reg": 0.0061697582714259624,
"step": 82
},
{
"epoch": 0.010375,
"grad_norm": 2.0131125450134277,
"grad_norm_var": 0.47694604476552793,
"learning_rate": 8.3e-05,
"loss": 0.8585,
"loss/crossentropy": 2.480877637863159,
"loss/hidden": 0.6875,
"loss/logits": 0.10927767306566238,
"loss/reg": 0.006169027183204889,
"step": 83
},
{
"epoch": 0.0105,
"grad_norm": 2.2644267082214355,
"grad_norm_var": 0.47842071328175656,
"learning_rate": 8.4e-05,
"loss": 0.8351,
"loss/crossentropy": 2.693246841430664,
"loss/hidden": 0.67578125,
"loss/logits": 0.09764716029167175,
"loss/reg": 0.006168315652757883,
"step": 84
},
{
"epoch": 0.010625,
"grad_norm": 3.1729207038879395,
"grad_norm_var": 0.4955376038232837,
"learning_rate": 8.5e-05,
"loss": 1.2314,
"loss/crossentropy": 2.3339309692382812,
"loss/hidden": 1.015625,
"loss/logits": 0.15408015251159668,
"loss/reg": 0.006167604587972164,
"step": 85
},
{
"epoch": 0.01075,
"grad_norm": 2.281872510910034,
"grad_norm_var": 0.4984116504809473,
"learning_rate": 8.6e-05,
"loss": 1.1113,
"loss/crossentropy": 2.410794258117676,
"loss/hidden": 0.8828125,
"loss/logits": 0.16686803102493286,
"loss/reg": 0.0061669000424444675,
"step": 86
},
{
"epoch": 0.010875,
"grad_norm": 2.701244354248047,
"grad_norm_var": 0.4762769450482454,
"learning_rate": 8.7e-05,
"loss": 0.9115,
"loss/crossentropy": 2.5270962715148926,
"loss/hidden": 0.73046875,
"loss/logits": 0.11935658752918243,
"loss/reg": 0.0061660343781113625,
"step": 87
},
{
"epoch": 0.011,
"grad_norm": 2.0738677978515625,
"grad_norm_var": 0.4863854399313406,
"learning_rate": 8.800000000000001e-05,
"loss": 0.9634,
"loss/crossentropy": 2.625903844833374,
"loss/hidden": 0.7734375,
"loss/logits": 0.12826378643512726,
"loss/reg": 0.006165289785712957,
"step": 88
},
{
"epoch": 0.011125,
"grad_norm": 2.827744245529175,
"grad_norm_var": 0.44340376520124375,
"learning_rate": 8.900000000000001e-05,
"loss": 1.0134,
"loss/crossentropy": 2.2436654567718506,
"loss/hidden": 0.80078125,
"loss/logits": 0.15097512304782867,
"loss/reg": 0.006164397578686476,
"step": 89
},
{
"epoch": 0.01125,
"grad_norm": 2.412203788757324,
"grad_norm_var": 0.4174983019540292,
"learning_rate": 9e-05,
"loss": 0.9541,
"loss/crossentropy": 2.4847052097320557,
"loss/hidden": 0.78515625,
"loss/logits": 0.10735376924276352,
"loss/reg": 0.006163434591144323,
"step": 90
},
{
"epoch": 0.011375,
"grad_norm": 2.385309934616089,
"grad_norm_var": 0.41831854842319344,
"learning_rate": 9.1e-05,
"loss": 1.0455,
"loss/crossentropy": 2.1011688709259033,
"loss/hidden": 0.828125,
"loss/logits": 0.15577414631843567,
"loss/reg": 0.0061626131646335125,
"step": 91
},
{
"epoch": 0.0115,
"grad_norm": 2.779266595840454,
"grad_norm_var": 0.4149256226543306,
"learning_rate": 9.200000000000001e-05,
"loss": 0.9782,
"loss/crossentropy": 2.770954132080078,
"loss/hidden": 0.78125,
"loss/logits": 0.13530117273330688,
"loss/reg": 0.006161784287542105,
"step": 92
},
{
"epoch": 0.011625,
"grad_norm": 2.816206216812134,
"grad_norm_var": 0.41767206123470924,
"learning_rate": 9.300000000000001e-05,
"loss": 1.2584,
"loss/crossentropy": 2.4919488430023193,
"loss/hidden": 1.0234375,
"loss/logits": 0.17335021495819092,
"loss/reg": 0.006160792429000139,
"step": 93
},
{
"epoch": 0.01175,
"grad_norm": 2.1000349521636963,
"grad_norm_var": 0.4320504871954351,
"learning_rate": 9.4e-05,
"loss": 0.9293,
"loss/crossentropy": 2.6951355934143066,
"loss/hidden": 0.7421875,
"loss/logits": 0.12551091611385345,
"loss/reg": 0.006159830838441849,
"step": 94
},
{
"epoch": 0.011875,
"grad_norm": 2.6696228981018066,
"grad_norm_var": 0.4199965621062515,
"learning_rate": 9.5e-05,
"loss": 1.0491,
"loss/crossentropy": 2.6532485485076904,
"loss/hidden": 0.83984375,
"loss/logits": 0.14771661162376404,
"loss/reg": 0.006158801261335611,
"step": 95
},
{
"epoch": 0.012,
"grad_norm": 2.308758020401001,
"grad_norm_var": 0.11782165750081125,
"learning_rate": 9.6e-05,
"loss": 1.1178,
"loss/crossentropy": 2.38185977935791,
"loss/hidden": 0.90625,
"loss/logits": 0.1499352604150772,
"loss/reg": 0.006157839670777321,
"step": 96
},
{
"epoch": 0.012125,
"grad_norm": 2.4204304218292236,
"grad_norm_var": 0.11501335190634426,
"learning_rate": 9.7e-05,
"loss": 1.092,
"loss/crossentropy": 2.4358534812927246,
"loss/hidden": 0.86328125,
"loss/logits": 0.16712763905525208,
"loss/reg": 0.006156752817332745,
"step": 97
},
{
"epoch": 0.01225,
"grad_norm": 3.7184524536132812,
"grad_norm_var": 0.198780236272727,
"learning_rate": 9.8e-05,
"loss": 1.4311,
"loss/crossentropy": 2.1283679008483887,
"loss/hidden": 1.171875,
"loss/logits": 0.1976230889558792,
"loss/reg": 0.006155804730951786,
"step": 98
},
{
"epoch": 0.012375,
"grad_norm": 3.2656571865081787,
"grad_norm_var": 0.20565265002658914,
"learning_rate": 9.900000000000001e-05,
"loss": 1.017,
"loss/crossentropy": 2.6715664863586426,
"loss/hidden": 0.80078125,
"loss/logits": 0.15465494990348816,
"loss/reg": 0.006154791917651892,
"step": 99
},
{
"epoch": 0.0125,
"grad_norm": 2.915663719177246,
"grad_norm_var": 0.19977570339779593,
"learning_rate": 0.0001,
"loss": 0.98,
"loss/crossentropy": 2.5455305576324463,
"loss/hidden": 0.77734375,
"loss/logits": 0.1410846710205078,
"loss/reg": 0.0061536673456430435,
"step": 100
},
{
"epoch": 0.012625,
"grad_norm": 3.3153059482574463,
"grad_norm_var": 0.2104372314148539,
"learning_rate": 0.0001,
"loss": 1.1039,
"loss/crossentropy": 2.455479621887207,
"loss/hidden": 0.90625,
"loss/logits": 0.13615351915359497,
"loss/reg": 0.0061526307836174965,
"step": 101
},
{
"epoch": 0.01275,
"grad_norm": 2.40315318107605,
"grad_norm_var": 0.20480568897691,
"learning_rate": 0.0001,
"loss": 0.9588,
"loss/crossentropy": 2.6359853744506836,
"loss/hidden": 0.76953125,
"loss/logits": 0.1277719885110855,
"loss/reg": 0.006151493173092604,
"step": 102
},
{
"epoch": 0.012875,
"grad_norm": 3.625624895095825,
"grad_norm_var": 0.25903479701245613,
"learning_rate": 0.0001,
"loss": 1.2481,
"loss/crossentropy": 2.0148656368255615,
"loss/hidden": 1.046875,
"loss/logits": 0.13969773054122925,
"loss/reg": 0.006150420755147934,
"step": 103
},
{
"epoch": 0.013,
"grad_norm": 2.497906446456909,
"grad_norm_var": 0.23191354079432358,
"learning_rate": 0.0001,
"loss": 1.0603,
"loss/crossentropy": 2.3493525981903076,
"loss/hidden": 0.86328125,
"loss/logits": 0.13548779487609863,
"loss/reg": 0.006149281747639179,
"step": 104
},
{
"epoch": 0.013125,
"grad_norm": 3.258059501647949,
"grad_norm_var": 0.24629299643454275,
"learning_rate": 0.0001,
"loss": 0.9497,
"loss/crossentropy": 2.6988418102264404,
"loss/hidden": 0.7734375,
"loss/logits": 0.11473990976810455,
"loss/reg": 0.006148339249193668,
"step": 105
},
{
"epoch": 0.01325,
"grad_norm": 3.1279666423797607,
"grad_norm_var": 0.24075672502018505,
"learning_rate": 0.0001,
"loss": 1.1195,
"loss/crossentropy": 2.578716278076172,
"loss/hidden": 0.875,
"loss/logits": 0.18304204940795898,
"loss/reg": 0.006147205363959074,
"step": 106
},
{
"epoch": 0.013375,
"grad_norm": 2.760901927947998,
"grad_norm_var": 0.22627915570051277,
"learning_rate": 0.0001,
"loss": 0.9369,
"loss/crossentropy": 2.5835328102111816,
"loss/hidden": 0.75,
"loss/logits": 0.12544697523117065,
"loss/reg": 0.006146106868982315,
"step": 107
},
{
"epoch": 0.0135,
"grad_norm": 3.2917559146881104,
"grad_norm_var": 0.23622539643692994,
"learning_rate": 0.0001,
"loss": 1.1437,
"loss/crossentropy": 2.6001460552215576,
"loss/hidden": 0.91796875,
"loss/logits": 0.16428819298744202,
"loss/reg": 0.006144997663795948,
"step": 108
},
{
"epoch": 0.013625,
"grad_norm": 3.3908517360687256,
"grad_norm_var": 0.2499864352593607,
"learning_rate": 0.0001,
"loss": 1.0747,
"loss/crossentropy": 2.6003377437591553,
"loss/hidden": 0.87109375,
"loss/logits": 0.14213082194328308,
"loss/reg": 0.00614393362775445,
"step": 109
},
{
"epoch": 0.01375,
"grad_norm": 2.7455620765686035,
"grad_norm_var": 0.2035723185991922,
"learning_rate": 0.0001,
"loss": 1.1844,
"loss/crossentropy": 2.446432113647461,
"loss/hidden": 0.94921875,
"loss/logits": 0.17372827231884003,
"loss/reg": 0.00614282488822937,
"step": 110
},
{
"epoch": 0.013875,
"grad_norm": 2.899392604827881,
"grad_norm_var": 0.1972949454934593,
"learning_rate": 0.0001,
"loss": 1.0314,
"loss/crossentropy": 2.4233920574188232,
"loss/hidden": 0.83984375,
"loss/logits": 0.13018067181110382,
"loss/reg": 0.00614172825589776,
"step": 111
},
{
"epoch": 0.014,
"grad_norm": 2.204866647720337,
"grad_norm_var": 0.20749751086427656,
"learning_rate": 0.0001,
"loss": 0.9867,
"loss/crossentropy": 2.4006736278533936,
"loss/hidden": 0.79296875,
"loss/logits": 0.13233302533626556,
"loss/reg": 0.006140332669019699,
"step": 112
},
{
"epoch": 0.014125,
"grad_norm": 2.5094263553619385,
"grad_norm_var": 0.20123279411857975,
"learning_rate": 0.0001,
"loss": 1.2429,
"loss/crossentropy": 2.2730560302734375,
"loss/hidden": 1.0078125,
"loss/logits": 0.1737476885318756,
"loss/reg": 0.006138913799077272,
"step": 113
},
{
"epoch": 0.01425,
"grad_norm": 2.590543031692505,
"grad_norm_var": 0.17204464736018749,
"learning_rate": 0.0001,
"loss": 1.0086,
"loss/crossentropy": 2.5709896087646484,
"loss/hidden": 0.79296875,
"loss/logits": 0.1542350947856903,
"loss/reg": 0.0061377594247460365,
"step": 114
},
{
"epoch": 0.014375,
"grad_norm": 2.5024876594543457,
"grad_norm_var": 0.17379926494707643,
"learning_rate": 0.0001,
"loss": 1.0309,
"loss/crossentropy": 2.539165496826172,
"loss/hidden": 0.828125,
"loss/logits": 0.14142319560050964,
"loss/reg": 0.006136584095656872,
"step": 115
},
{
"epoch": 0.0145,
"grad_norm": 3.2216732501983643,
"grad_norm_var": 0.18121036366206128,
"learning_rate": 0.0001,
"loss": 0.9404,
"loss/crossentropy": 2.7685325145721436,
"loss/hidden": 0.765625,
"loss/logits": 0.1133967787027359,
"loss/reg": 0.006135319825261831,
"step": 116
},
{
"epoch": 0.014625,
"grad_norm": 2.3834009170532227,
"grad_norm_var": 0.18346146088524526,
"learning_rate": 0.0001,
"loss": 1.1432,
"loss/crossentropy": 2.4507999420166016,
"loss/hidden": 0.92578125,
"loss/logits": 0.1561031937599182,
"loss/reg": 0.006133983377367258,
"step": 117
},
{
"epoch": 0.01475,
"grad_norm": 2.4703636169433594,
"grad_norm_var": 0.17984383474256424,
"learning_rate": 0.0001,
"loss": 1.0541,
"loss/crossentropy": 2.3506076335906982,
"loss/hidden": 0.84765625,
"loss/logits": 0.14511807262897491,
"loss/reg": 0.006132753100246191,
"step": 118
},
{
"epoch": 0.014875,
"grad_norm": 2.5960817337036133,
"grad_norm_var": 0.13859654880591943,
"learning_rate": 0.0001,
"loss": 1.2156,
"loss/crossentropy": 2.427006244659424,
"loss/hidden": 0.96875,
"loss/logits": 0.1855170726776123,
"loss/reg": 0.006131566129624844,
"step": 119
},
{
"epoch": 0.015,
"grad_norm": 2.908734083175659,
"grad_norm_var": 0.13379147574996655,
"learning_rate": 0.0001,
"loss": 1.0136,
"loss/crossentropy": 2.4075210094451904,
"loss/hidden": 0.81640625,
"loss/logits": 0.13592825829982758,
"loss/reg": 0.006130332592874765,
"step": 120
},
{
"epoch": 0.015125,
"grad_norm": 3.450002670288086,
"grad_norm_var": 0.147717685364636,
"learning_rate": 0.0001,
"loss": 1.1584,
"loss/crossentropy": 2.446925640106201,
"loss/hidden": 0.92578125,
"loss/logits": 0.17129938304424286,
"loss/reg": 0.0061291721649467945,
"step": 121
},
{
"epoch": 0.01525,
"grad_norm": 2.941195011138916,
"grad_norm_var": 0.14212594790061886,
"learning_rate": 0.0001,
"loss": 1.0996,
"loss/crossentropy": 2.5499086380004883,
"loss/hidden": 0.87109375,
"loss/logits": 0.1672220528125763,
"loss/reg": 0.006127914879471064,
"step": 122
},
{
"epoch": 0.015375,
"grad_norm": 2.951799154281616,
"grad_norm_var": 0.14330143067309015,
"learning_rate": 0.0001,
"loss": 1.0862,
"loss/crossentropy": 2.654383420944214,
"loss/hidden": 0.87109375,
"loss/logits": 0.15379250049591064,
"loss/reg": 0.006126696243882179,
"step": 123
},
{
"epoch": 0.0155,
"grad_norm": 2.5093131065368652,
"grad_norm_var": 0.13194533540905293,
"learning_rate": 0.0001,
"loss": 1.0905,
"loss/crossentropy": 2.4646618366241455,
"loss/hidden": 0.87890625,
"loss/logits": 0.15029752254486084,
"loss/reg": 0.006125394720584154,
"step": 124
},
{
"epoch": 0.015625,
"grad_norm": 2.357142448425293,
"grad_norm_var": 0.11277765633995311,
"learning_rate": 0.0001,
"loss": 1.0794,
"loss/crossentropy": 2.4590322971343994,
"loss/hidden": 0.87109375,
"loss/logits": 0.1471107453107834,
"loss/reg": 0.0061240773648023605,
"step": 125
},
{
"epoch": 0.01575,
"grad_norm": 2.0443954467773438,
"grad_norm_var": 0.13949059079901172,
"learning_rate": 0.0001,
"loss": 1.0064,
"loss/crossentropy": 2.6105568408966064,
"loss/hidden": 0.80859375,
"loss/logits": 0.13658249378204346,
"loss/reg": 0.006122750695794821,
"step": 126
},
{
"epoch": 0.015875,
"grad_norm": 2.334003448486328,
"grad_norm_var": 0.1413326038540049,
"learning_rate": 0.0001,
"loss": 1.128,
"loss/crossentropy": 2.3226428031921387,
"loss/hidden": 0.8984375,
"loss/logits": 0.16836631298065186,
"loss/reg": 0.006121381651610136,
"step": 127
},
{
"epoch": 0.016,
"grad_norm": 2.6693766117095947,
"grad_norm_var": 0.12889249481462456,
"learning_rate": 0.0001,
"loss": 1.0478,
"loss/crossentropy": 2.5844597816467285,
"loss/hidden": 0.84765625,
"loss/logits": 0.1388963758945465,
"loss/reg": 0.006120136007666588,
"step": 128
},
{
"epoch": 0.016125,
"grad_norm": 3.935439348220825,
"grad_norm_var": 0.22878447427120438,
"learning_rate": 0.0001,
"loss": 1.1726,
"loss/crossentropy": 2.7213780879974365,
"loss/hidden": 0.9375,
"loss/logits": 0.1738772690296173,
"loss/reg": 0.006118897348642349,
"step": 129
},
{
"epoch": 0.01625,
"grad_norm": 3.463432788848877,
"grad_norm_var": 0.25882213944617144,
"learning_rate": 0.0001,
"loss": 1.0898,
"loss/crossentropy": 2.3635873794555664,
"loss/hidden": 0.8828125,
"loss/logits": 0.1457763910293579,
"loss/reg": 0.006117486394941807,
"step": 130
},
{
"epoch": 0.016375,
"grad_norm": 3.779526948928833,
"grad_norm_var": 0.31074183113488135,
"learning_rate": 0.0001,
"loss": 1.2078,
"loss/crossentropy": 2.316762924194336,
"loss/hidden": 0.98046875,
"loss/logits": 0.16614478826522827,
"loss/reg": 0.006116243079304695,
"step": 131
},
{
"epoch": 0.0165,
"grad_norm": 2.7554008960723877,
"grad_norm_var": 0.3028391023812749,
"learning_rate": 0.0001,
"loss": 0.9769,
"loss/crossentropy": 2.458954095840454,
"loss/hidden": 0.7890625,
"loss/logits": 0.12667913734912872,
"loss/reg": 0.006114880088716745,
"step": 132
},
{
"epoch": 0.016625,
"grad_norm": 2.342526435852051,
"grad_norm_var": 0.30546929082944035,
"learning_rate": 0.0001,
"loss": 1.1137,
"loss/crossentropy": 2.6329517364501953,
"loss/hidden": 0.890625,
"loss/logits": 0.161947563290596,
"loss/reg": 0.0061136274598538876,
"step": 133
},
{
"epoch": 0.01675,
"grad_norm": 2.2754058837890625,
"grad_norm_var": 0.31756495416411024,
"learning_rate": 0.0001,
"loss": 1.1703,
"loss/crossentropy": 2.2747550010681152,
"loss/hidden": 0.94921875,
"loss/logits": 0.15994513034820557,
"loss/reg": 0.006112351547926664,
"step": 134
},
{
"epoch": 0.016875,
"grad_norm": 3.1313912868499756,
"grad_norm_var": 0.3186282278045513,
"learning_rate": 0.0001,
"loss": 1.2333,
"loss/crossentropy": 2.4932894706726074,
"loss/hidden": 0.99609375,
"loss/logits": 0.17612434923648834,
"loss/reg": 0.006111042574048042,
"step": 135
},
{
"epoch": 0.017,
"grad_norm": 3.960482358932495,
"grad_norm_var": 0.39381746513703864,
"learning_rate": 0.0001,
"loss": 1.3101,
"loss/crossentropy": 2.581660747528076,
"loss/hidden": 1.0625,
"loss/logits": 0.18646802008152008,
"loss/reg": 0.006109676789492369,
"step": 136
},
{
"epoch": 0.017125,
"grad_norm": 2.7605810165405273,
"grad_norm_var": 0.37584340109069647,
"learning_rate": 0.0001,
"loss": 0.8792,
"loss/crossentropy": 2.6490936279296875,
"loss/hidden": 0.703125,
"loss/logits": 0.1150316372513771,
"loss/reg": 0.006108277477324009,
"step": 137
},
{
"epoch": 0.01725,
"grad_norm": 2.6196203231811523,
"grad_norm_var": 0.38003486499210315,
"learning_rate": 0.0001,
"loss": 0.955,
"loss/crossentropy": 2.633441209793091,
"loss/hidden": 0.76953125,
"loss/logits": 0.1244344562292099,
"loss/reg": 0.006106934975832701,
"step": 138
},
{
"epoch": 0.017375,
"grad_norm": 4.534512519836426,
"grad_norm_var": 0.554255985026353,
"learning_rate": 0.0001,
"loss": 1.4104,
"loss/crossentropy": 2.2204151153564453,
"loss/hidden": 1.1796875,
"loss/logits": 0.1696874350309372,
"loss/reg": 0.0061056241393089294,
"step": 139
},
{
"epoch": 0.0175,
"grad_norm": 2.192370653152466,
"grad_norm_var": 0.5798771099829023,
"learning_rate": 0.0001,
"loss": 1.1299,
"loss/crossentropy": 2.375506639480591,
"loss/hidden": 0.921875,
"loss/logits": 0.14694982767105103,
"loss/reg": 0.0061043244786560535,
"step": 140
},
{
"epoch": 0.017625,
"grad_norm": 4.368403911590576,
"grad_norm_var": 0.6744588881998081,
"learning_rate": 0.0001,
"loss": 1.278,
"loss/crossentropy": 2.3692545890808105,
"loss/hidden": 1.03125,
"loss/logits": 0.18568292260169983,
"loss/reg": 0.006102937273681164,
"step": 141
},
{
"epoch": 0.01775,
"grad_norm": 2.2753779888153076,
"grad_norm_var": 0.6461169960118004,
"learning_rate": 0.0001,
"loss": 1.0276,
"loss/crossentropy": 2.470676898956299,
"loss/hidden": 0.82421875,
"loss/logits": 0.14231771230697632,
"loss/reg": 0.006101653911173344,
"step": 142
},
{
"epoch": 0.017875,
"grad_norm": 2.6550562381744385,
"grad_norm_var": 0.6203099666067883,
"learning_rate": 0.0001,
"loss": 0.8712,
"loss/crossentropy": 2.8198063373565674,
"loss/hidden": 0.69921875,
"loss/logits": 0.11099085956811905,
"loss/reg": 0.006100376136600971,
"step": 143
},
{
"epoch": 0.018,
"grad_norm": 2.8701858520507812,
"grad_norm_var": 0.6111015072729884,
"learning_rate": 0.0001,
"loss": 1.1794,
"loss/crossentropy": 2.413463830947876,
"loss/hidden": 0.96484375,
"loss/logits": 0.15351834893226624,
"loss/reg": 0.006099053658545017,
"step": 144
},
{
"epoch": 0.018125,
"grad_norm": 2.2347958087921143,
"grad_norm_var": 0.6069563505613275,
"learning_rate": 0.0001,
"loss": 1.0832,
"loss/crossentropy": 2.446056604385376,
"loss/hidden": 0.8671875,
"loss/logits": 0.1550455242395401,
"loss/reg": 0.006097796373069286,
"step": 145
},
{
"epoch": 0.01825,
"grad_norm": 2.60143780708313,
"grad_norm_var": 0.6017061449507364,
"learning_rate": 0.0001,
"loss": 1.1216,
"loss/crossentropy": 2.2890260219573975,
"loss/hidden": 0.8984375,
"loss/logits": 0.16223573684692383,
"loss/reg": 0.006096460856497288,
"step": 146
},
{
"epoch": 0.018375,
"grad_norm": 3.656100273132324,
"grad_norm_var": 0.5891684064627459,
"learning_rate": 0.0001,
"loss": 1.2759,
"loss/crossentropy": 2.2077646255493164,
"loss/hidden": 1.0546875,
"loss/logits": 0.16024138033390045,
"loss/reg": 0.006095105782151222,
"step": 147
},
{
"epoch": 0.0185,
"grad_norm": 2.8190999031066895,
"grad_norm_var": 0.5877513730221795,
"learning_rate": 0.0001,
"loss": 1.1416,
"loss/crossentropy": 2.4892842769622803,
"loss/hidden": 0.9140625,
"loss/logits": 0.1665700376033783,
"loss/reg": 0.0060938019305467606,
"step": 148
},
{
"epoch": 0.018625,
"grad_norm": 2.6578848361968994,
"grad_norm_var": 0.568168306773175,
"learning_rate": 0.0001,
"loss": 1.1443,
"loss/crossentropy": 2.3138527870178223,
"loss/hidden": 0.93359375,
"loss/logits": 0.14977282285690308,
"loss/reg": 0.006092346739023924,
"step": 149
},
{
"epoch": 0.01875,
"grad_norm": 2.656559944152832,
"grad_norm_var": 0.5416540961853636,
"learning_rate": 0.0001,
"loss": 0.9868,
"loss/crossentropy": 2.7701377868652344,
"loss/hidden": 0.796875,
"loss/logits": 0.12901648879051208,
"loss/reg": 0.006090943701565266,
"step": 150
},
{
"epoch": 0.018875,
"grad_norm": 1.9359983205795288,
"grad_norm_var": 0.6099613145708634,
"learning_rate": 0.0001,
"loss": 0.9127,
"loss/crossentropy": 2.55560040473938,
"loss/hidden": 0.73828125,
"loss/logits": 0.11351295560598373,
"loss/reg": 0.00608965614810586,
"step": 151
},
{
"epoch": 0.019,
"grad_norm": 3.7978732585906982,
"grad_norm_var": 0.5891613317586338,
"learning_rate": 0.0001,
"loss": 1.2275,
"loss/crossentropy": 2.4227731227874756,
"loss/hidden": 0.98828125,
"loss/logits": 0.17836451530456543,
"loss/reg": 0.006088252179324627,
"step": 152
},
{
"epoch": 0.019125,
"grad_norm": 2.8193647861480713,
"grad_norm_var": 0.588169020521083,
"learning_rate": 0.0001,
"loss": 0.9739,
"loss/crossentropy": 2.474368095397949,
"loss/hidden": 0.80078125,
"loss/logits": 0.11225409805774689,
"loss/reg": 0.006086937617510557,
"step": 153
},
{
"epoch": 0.01925,
"grad_norm": 2.2882325649261475,
"grad_norm_var": 0.6082348956957436,
"learning_rate": 0.0001,
"loss": 1.0395,
"loss/crossentropy": 2.3776350021362305,
"loss/hidden": 0.82421875,
"loss/logits": 0.15443992614746094,
"loss/reg": 0.0060854703187942505,
"step": 154
},
{
"epoch": 0.019375,
"grad_norm": 2.006150245666504,
"grad_norm_var": 0.4559805309993303,
"learning_rate": 0.0001,
"loss": 0.9762,
"loss/crossentropy": 2.7556076049804688,
"loss/hidden": 0.78515625,
"loss/logits": 0.13019207119941711,
"loss/reg": 0.006084186024963856,
"step": 155
},
{
"epoch": 0.0195,
"grad_norm": 2.8143231868743896,
"grad_norm_var": 0.43477030174237014,
"learning_rate": 0.0001,
"loss": 1.1927,
"loss/crossentropy": 2.652045249938965,
"loss/hidden": 0.94140625,
"loss/logits": 0.19042611122131348,
"loss/reg": 0.00608274107798934,
"step": 156
},
{
"epoch": 0.019625,
"grad_norm": 2.957540988922119,
"grad_norm_var": 0.2601037584282233,
"learning_rate": 0.0001,
"loss": 1.0641,
"loss/crossentropy": 2.546213150024414,
"loss/hidden": 0.86328125,
"loss/logits": 0.14000022411346436,
"loss/reg": 0.006081291940063238,
"step": 157
},
{
"epoch": 0.01975,
"grad_norm": 2.625493288040161,
"grad_norm_var": 0.24839219907499052,
"learning_rate": 0.0001,
"loss": 1.012,
"loss/crossentropy": 2.5120432376861572,
"loss/hidden": 0.81640625,
"loss/logits": 0.13474689424037933,
"loss/reg": 0.006079958751797676,
"step": 158
},
{
"epoch": 0.019875,
"grad_norm": 2.6614878177642822,
"grad_norm_var": 0.2483457330217589,
"learning_rate": 0.0001,
"loss": 0.9873,
"loss/crossentropy": 2.312061071395874,
"loss/hidden": 0.80859375,
"loss/logits": 0.11790065467357635,
"loss/reg": 0.006078665144741535,
"step": 159
},
{
"epoch": 0.02,
"grad_norm": 2.6204919815063477,
"grad_norm_var": 0.24699792562249925,
"learning_rate": 0.0001,
"loss": 1.0488,
"loss/crossentropy": 2.505072593688965,
"loss/hidden": 0.84375,
"loss/logits": 0.14428117871284485,
"loss/reg": 0.006077310536056757,
"step": 160
},
{
"epoch": 0.020125,
"grad_norm": 3.107072591781616,
"grad_norm_var": 0.24079003208151678,
"learning_rate": 0.0001,
"loss": 1.1736,
"loss/crossentropy": 2.6514599323272705,
"loss/hidden": 0.96484375,
"loss/logits": 0.1480400413274765,
"loss/reg": 0.006076075602322817,
"step": 161
},
{
"epoch": 0.02025,
"grad_norm": 2.669001817703247,
"grad_norm_var": 0.23972287159530806,
"learning_rate": 0.0001,
"loss": 1.1966,
"loss/crossentropy": 2.4616479873657227,
"loss/hidden": 0.9765625,
"loss/logits": 0.15933012962341309,
"loss/reg": 0.006074720993638039,
"step": 162
},
{
"epoch": 0.020375,
"grad_norm": 2.5872421264648438,
"grad_norm_var": 0.1828196031273113,
"learning_rate": 0.0001,
"loss": 1.0551,
"loss/crossentropy": 2.5483999252319336,
"loss/hidden": 0.83984375,
"loss/logits": 0.1544739305973053,
"loss/reg": 0.006073469761759043,
"step": 163
},
{
"epoch": 0.0205,
"grad_norm": 2.3342509269714355,
"grad_norm_var": 0.1891007671877621,
"learning_rate": 0.0001,
"loss": 1.1418,
"loss/crossentropy": 2.610344171524048,
"loss/hidden": 0.90234375,
"loss/logits": 0.17876723408699036,
"loss/reg": 0.006072178483009338,
"step": 164
},
{
"epoch": 0.020625,
"grad_norm": 2.548274278640747,
"grad_norm_var": 0.18986337395058156,
"learning_rate": 0.0001,
"loss": 0.9512,
"loss/crossentropy": 2.747725009918213,
"loss/hidden": 0.7734375,
"loss/logits": 0.11706214398145676,
"loss/reg": 0.00607073912397027,
"step": 165
},
{
"epoch": 0.02075,
"grad_norm": 2.666066884994507,
"grad_norm_var": 0.18987501227134793,
"learning_rate": 0.0001,
"loss": 1.0557,
"loss/crossentropy": 2.3086578845977783,
"loss/hidden": 0.83984375,
"loss/logits": 0.1551416665315628,
"loss/reg": 0.006069260183721781,
"step": 166
},
{
"epoch": 0.020875,
"grad_norm": 3.363084554672241,
"grad_norm_var": 0.18083982986582872,
"learning_rate": 0.0001,
"loss": 0.9886,
"loss/crossentropy": 2.7422661781311035,
"loss/hidden": 0.79296875,
"loss/logits": 0.13497118651866913,
"loss/reg": 0.006067754700779915,
"step": 167
},
{
"epoch": 0.021,
"grad_norm": 2.717400550842285,
"grad_norm_var": 0.10163689874761227,
"learning_rate": 0.0001,
"loss": 1.2413,
"loss/crossentropy": 2.341296672821045,
"loss/hidden": 1.0078125,
"loss/logits": 0.17277640104293823,
"loss/reg": 0.006066245958209038,
"step": 168
},
{
"epoch": 0.021125,
"grad_norm": 2.2773897647857666,
"grad_norm_var": 0.10949759007257095,
"learning_rate": 0.0001,
"loss": 0.9531,
"loss/crossentropy": 2.492532968521118,
"loss/hidden": 0.76953125,
"loss/logits": 0.12295819818973541,
"loss/reg": 0.006064848508685827,
"step": 169
},
{
"epoch": 0.02125,
"grad_norm": 2.7625067234039307,
"grad_norm_var": 0.1012976809853086,
"learning_rate": 0.0001,
"loss": 1.0102,
"loss/crossentropy": 2.3799381256103516,
"loss/hidden": 0.80859375,
"loss/logits": 0.140989288687706,
"loss/reg": 0.0060633583925664425,
"step": 170
},
{
"epoch": 0.021375,
"grad_norm": 3.713162899017334,
"grad_norm_var": 0.1323542313667114,
"learning_rate": 0.0001,
"loss": 1.0173,
"loss/crossentropy": 2.7296385765075684,
"loss/hidden": 0.80078125,
"loss/logits": 0.1559314727783203,
"loss/reg": 0.006062004715204239,
"step": 171
},
{
"epoch": 0.0215,
"grad_norm": 2.8448026180267334,
"grad_norm_var": 0.13256580340874963,
"learning_rate": 0.0001,
"loss": 1.0945,
"loss/crossentropy": 2.211848497390747,
"loss/hidden": 0.87890625,
"loss/logits": 0.15503031015396118,
"loss/reg": 0.006060663145035505,
"step": 172
},
{
"epoch": 0.021625,
"grad_norm": 2.951566696166992,
"grad_norm_var": 0.13242537871232402,
"learning_rate": 0.0001,
"loss": 1.243,
"loss/crossentropy": 2.6379833221435547,
"loss/hidden": 0.96484375,
"loss/logits": 0.21754613518714905,
"loss/reg": 0.00605935649946332,
"step": 173
},
{
"epoch": 0.02175,
"grad_norm": 2.6862404346466064,
"grad_norm_var": 0.13142011502921586,
"learning_rate": 0.0001,
"loss": 1.0053,
"loss/crossentropy": 2.3807766437530518,
"loss/hidden": 0.80078125,
"loss/logits": 0.14393460750579834,
"loss/reg": 0.006058130878955126,
"step": 174
},
{
"epoch": 0.021875,
"grad_norm": 2.5145609378814697,
"grad_norm_var": 0.13512780159794507,
"learning_rate": 0.0001,
"loss": 1.0609,
"loss/crossentropy": 2.4608380794525146,
"loss/hidden": 0.85546875,
"loss/logits": 0.14485566318035126,
"loss/reg": 0.006056922487914562,
"step": 175
},
{
"epoch": 0.022,
"grad_norm": 3.23178768157959,
"grad_norm_var": 0.14607750168249728,
"learning_rate": 0.0001,
"loss": 1.1294,
"loss/crossentropy": 2.9791719913482666,
"loss/hidden": 0.91796875,
"loss/logits": 0.1508345603942871,
"loss/reg": 0.006055623292922974,
"step": 176
},
{
"epoch": 0.022125,
"grad_norm": 2.7397234439849854,
"grad_norm_var": 0.14000512423072375,
"learning_rate": 0.0001,
"loss": 1.0578,
"loss/crossentropy": 2.4559919834136963,
"loss/hidden": 0.86328125,
"loss/logits": 0.1340080350637436,
"loss/reg": 0.0060544307343661785,
"step": 177
},
{
"epoch": 0.02225,
"grad_norm": 2.6637048721313477,
"grad_norm_var": 0.14009088002925954,
"learning_rate": 0.0001,
"loss": 1.076,
"loss/crossentropy": 2.3794586658477783,
"loss/hidden": 0.86328125,
"loss/logits": 0.15214313566684723,
"loss/reg": 0.0060530174523591995,
"step": 178
},
{
"epoch": 0.022375,
"grad_norm": 2.0105221271514893,
"grad_norm_var": 0.17628626628935157,
"learning_rate": 0.0001,
"loss": 0.9703,
"loss/crossentropy": 2.3926336765289307,
"loss/hidden": 0.77734375,
"loss/logits": 0.13244566321372986,
"loss/reg": 0.0060517978854477406,
"step": 179
},
{
"epoch": 0.0225,
"grad_norm": 2.571902275085449,
"grad_norm_var": 0.16659277386996318,
"learning_rate": 0.0001,
"loss": 1.0739,
"loss/crossentropy": 2.7502923011779785,
"loss/hidden": 0.8515625,
"loss/logits": 0.16181406378746033,
"loss/reg": 0.006050686351954937,
"step": 180
},
{
"epoch": 0.022625,
"grad_norm": 2.700366973876953,
"grad_norm_var": 0.1636147823311904,
"learning_rate": 0.0001,
"loss": 1.0113,
"loss/crossentropy": 2.502389669418335,
"loss/hidden": 0.8125,
"loss/logits": 0.138347327709198,
"loss/reg": 0.006049246061593294,
"step": 181
},
{
"epoch": 0.02275,
"grad_norm": 2.7259435653686523,
"grad_norm_var": 0.1629618050893432,
"learning_rate": 0.0001,
"loss": 1.0192,
"loss/crossentropy": 2.2493560314178467,
"loss/hidden": 0.82421875,
"loss/logits": 0.1344609260559082,
"loss/reg": 0.006048021838068962,
"step": 182
},
{
"epoch": 0.022875,
"grad_norm": 4.930091857910156,
"grad_norm_var": 0.43832731745023895,
"learning_rate": 0.0001,
"loss": 1.1874,
"loss/crossentropy": 2.649231433868408,
"loss/hidden": 0.94140625,
"loss/logits": 0.1855432242155075,
"loss/reg": 0.006046844646334648,
"step": 183
},
{
"epoch": 0.023,
"grad_norm": 2.288604259490967,
"grad_norm_var": 0.4589782783160859,
"learning_rate": 0.0001,
"loss": 1.0354,
"loss/crossentropy": 3.0482568740844727,
"loss/hidden": 0.8203125,
"loss/logits": 0.15461647510528564,
"loss/reg": 0.006045445334166288,
"step": 184
},
{
"epoch": 0.023125,
"grad_norm": 2.7902991771698,
"grad_norm_var": 0.4362058684835667,
"learning_rate": 0.0001,
"loss": 1.0744,
"loss/crossentropy": 2.726069211959839,
"loss/hidden": 0.8359375,
"loss/logits": 0.17799492180347443,
"loss/reg": 0.006044231820851564,
"step": 185
},
{
"epoch": 0.02325,
"grad_norm": 3.597017526626587,
"grad_norm_var": 0.46633972017124825,
"learning_rate": 0.0001,
"loss": 1.0985,
"loss/crossentropy": 2.200692892074585,
"loss/hidden": 0.8984375,
"loss/logits": 0.13961729407310486,
"loss/reg": 0.006042772904038429,
"step": 186
},
{
"epoch": 0.023375,
"grad_norm": 2.969062566757202,
"grad_norm_var": 0.42374272593361867,
"learning_rate": 0.0001,
"loss": 1.2314,
"loss/crossentropy": 2.3744540214538574,
"loss/hidden": 0.96875,
"loss/logits": 0.20225511491298676,
"loss/reg": 0.006041594315320253,
"step": 187
},
{
"epoch": 0.0235,
"grad_norm": 3.2257020473480225,
"grad_norm_var": 0.4305906329857976,
"learning_rate": 0.0001,
"loss": 1.0982,
"loss/crossentropy": 2.442505121231079,
"loss/hidden": 0.875,
"loss/logits": 0.16284233331680298,
"loss/reg": 0.006040407810360193,
"step": 188
},
{
"epoch": 0.023625,
"grad_norm": 3.670443058013916,
"grad_norm_var": 0.4666515285365591,
"learning_rate": 0.0001,
"loss": 1.2391,
"loss/crossentropy": 2.533158540725708,
"loss/hidden": 0.98046875,
"loss/logits": 0.19827201962471008,
"loss/reg": 0.0060392809100449085,
"step": 189
},
{
"epoch": 0.02375,
"grad_norm": 7.53206729888916,
"grad_norm_var": 1.7591779439754056,
"learning_rate": 0.0001,
"loss": 1.1689,
"loss/crossentropy": 2.3104734420776367,
"loss/hidden": 0.96875,
"loss/logits": 0.13976144790649414,
"loss/reg": 0.006038178689777851,
"step": 190
},
{
"epoch": 0.023875,
"grad_norm": 4.658889293670654,
"grad_norm_var": 1.833400975261701,
"learning_rate": 0.0001,
"loss": 1.3266,
"loss/crossentropy": 2.286229133605957,
"loss/hidden": 1.1015625,
"loss/logits": 0.16465552151203156,
"loss/reg": 0.006036726757884026,
"step": 191
},
{
"epoch": 0.024,
"grad_norm": 3.2109904289245605,
"grad_norm_var": 1.8338781863373583,
"learning_rate": 0.0001,
"loss": 1.278,
"loss/crossentropy": 2.5849151611328125,
"loss/hidden": 1.0078125,
"loss/logits": 0.20983844995498657,
"loss/reg": 0.006035543512552977,
"step": 192
},
{
"epoch": 0.024125,
"grad_norm": 2.556408643722534,
"grad_norm_var": 1.8519417466969637,
"learning_rate": 0.0001,
"loss": 1.0335,
"loss/crossentropy": 2.635669231414795,
"loss/hidden": 0.8359375,
"loss/logits": 0.13721294701099396,
"loss/reg": 0.006034051068127155,
"step": 193
},
{
"epoch": 0.02425,
"grad_norm": 3.4185855388641357,
"grad_norm_var": 1.8153229069184569,
"learning_rate": 0.0001,
"loss": 1.0115,
"loss/crossentropy": 2.3127341270446777,
"loss/hidden": 0.828125,
"loss/logits": 0.12303752452135086,
"loss/reg": 0.00603274954482913,
"step": 194
},
{
"epoch": 0.024375,
"grad_norm": 3.639681816101074,
"grad_norm_var": 1.6731808292397734,
"learning_rate": 0.0001,
"loss": 1.2374,
"loss/crossentropy": 2.4363749027252197,
"loss/hidden": 0.98046875,
"loss/logits": 0.19659578800201416,
"loss/reg": 0.006031363736838102,
"step": 195
},
{
"epoch": 0.0245,
"grad_norm": 3.266385078430176,
"grad_norm_var": 1.614572274352353,
"learning_rate": 0.0001,
"loss": 1.19,
"loss/crossentropy": 2.2824337482452393,
"loss/hidden": 0.9609375,
"loss/logits": 0.16878634691238403,
"loss/reg": 0.006029782351106405,
"step": 196
},
{
"epoch": 0.024625,
"grad_norm": 3.0692105293273926,
"grad_norm_var": 1.5801212385016838,
"learning_rate": 0.0001,
"loss": 1.1495,
"loss/crossentropy": 2.518056631088257,
"loss/hidden": 0.921875,
"loss/logits": 0.16731634736061096,
"loss/reg": 0.006028252653777599,
"step": 197
},
{
"epoch": 0.02475,
"grad_norm": 3.390202283859253,
"grad_norm_var": 1.530565626963321,
"learning_rate": 0.0001,
"loss": 1.1783,
"loss/crossentropy": 2.3565316200256348,
"loss/hidden": 0.9375,
"loss/logits": 0.18055224418640137,
"loss/reg": 0.006026738323271275,
"step": 198
},
{
"epoch": 0.024875,
"grad_norm": 2.524461030960083,
"grad_norm_var": 1.4779304822181976,
"learning_rate": 0.0001,
"loss": 1.095,
"loss/crossentropy": 2.3489255905151367,
"loss/hidden": 0.88671875,
"loss/logits": 0.1480264812707901,
"loss/reg": 0.006025230046361685,
"step": 199
},
{
"epoch": 0.025,
"grad_norm": 2.8753433227539062,
"grad_norm_var": 1.4056158732497617,
"learning_rate": 0.0001,
"loss": 1.1396,
"loss/crossentropy": 2.379971504211426,
"loss/hidden": 0.90625,
"loss/logits": 0.17312359809875488,
"loss/reg": 0.0060236188583076,
"step": 200
},
{
"epoch": 0.025125,
"grad_norm": 2.2297983169555664,
"grad_norm_var": 1.4801331513155804,
"learning_rate": 0.0001,
"loss": 1.1642,
"loss/crossentropy": 2.401499032974243,
"loss/hidden": 0.9296875,
"loss/logits": 0.1743072271347046,
"loss/reg": 0.006021994166076183,
"step": 201
},
{
"epoch": 0.02525,
"grad_norm": 2.7430193424224854,
"grad_norm_var": 1.5134885749372204,
"learning_rate": 0.0001,
"loss": 1.3503,
"loss/crossentropy": 2.3397345542907715,
"loss/hidden": 1.09375,
"loss/logits": 0.1963859498500824,
"loss/reg": 0.006020485423505306,
"step": 202
},
{
"epoch": 0.025375,
"grad_norm": 3.3862688541412354,
"grad_norm_var": 1.4983780502999742,
"learning_rate": 0.0001,
"loss": 1.3154,
"loss/crossentropy": 2.3259048461914062,
"loss/hidden": 1.09375,
"loss/logits": 0.1614416241645813,
"loss/reg": 0.0060190120711922646,
"step": 203
},
{
"epoch": 0.0255,
"grad_norm": 2.554938316345215,
"grad_norm_var": 1.547662147741073,
"learning_rate": 0.0001,
"loss": 1.1147,
"loss/crossentropy": 2.559544801712036,
"loss/hidden": 0.890625,
"loss/logits": 0.16388913989067078,
"loss/reg": 0.006017730105668306,
"step": 204
},
{
"epoch": 0.025625,
"grad_norm": 2.6290361881256104,
"grad_norm_var": 1.5807281675134672,
"learning_rate": 0.0001,
"loss": 1.049,
"loss/crossentropy": 2.7080090045928955,
"loss/hidden": 0.828125,
"loss/logits": 0.16068041324615479,
"loss/reg": 0.006016433704644442,
"step": 205
},
{
"epoch": 0.02575,
"grad_norm": 2.234259605407715,
"grad_norm_var": 0.38456120947827777,
"learning_rate": 0.0001,
"loss": 1.0392,
"loss/crossentropy": 2.3816347122192383,
"loss/hidden": 0.8359375,
"loss/logits": 0.14315146207809448,
"loss/reg": 0.0060149249620735645,
"step": 206
},
{
"epoch": 0.025875,
"grad_norm": 2.810352325439453,
"grad_norm_var": 0.19522907990381644,
"learning_rate": 0.0001,
"loss": 1.1385,
"loss/crossentropy": 2.6245384216308594,
"loss/hidden": 0.90625,
"loss/logits": 0.17206540703773499,
"loss/reg": 0.006013684440404177,
"step": 207
},
{
"epoch": 0.026,
"grad_norm": 2.198707342147827,
"grad_norm_var": 0.21847125065788287,
"learning_rate": 0.0001,
"loss": 0.9762,
"loss/crossentropy": 2.3812787532806396,
"loss/hidden": 0.796875,
"loss/logits": 0.119233138859272,
"loss/reg": 0.006012204568833113,
"step": 208
},
{
"epoch": 0.026125,
"grad_norm": 2.5001378059387207,
"grad_norm_var": 0.22083751043745087,
"learning_rate": 0.0001,
"loss": 1.2526,
"loss/crossentropy": 2.5999109745025635,
"loss/hidden": 0.984375,
"loss/logits": 0.20815744996070862,
"loss/reg": 0.006010920740664005,
"step": 209
},
{
"epoch": 0.02625,
"grad_norm": 3.175185203552246,
"grad_norm_var": 0.20582482438127556,
"learning_rate": 0.0001,
"loss": 1.239,
"loss/crossentropy": 2.3893682956695557,
"loss/hidden": 1.0234375,
"loss/logits": 0.15550163388252258,
"loss/reg": 0.006009369157254696,
"step": 210
},
{
"epoch": 0.026375,
"grad_norm": 3.482342481613159,
"grad_norm_var": 0.19031657232839597,
"learning_rate": 0.0001,
"loss": 1.1572,
"loss/crossentropy": 2.382542848587036,
"loss/hidden": 0.94921875,
"loss/logits": 0.14788678288459778,
"loss/reg": 0.006007815711200237,
"step": 211
},
{
"epoch": 0.0265,
"grad_norm": 2.285135507583618,
"grad_norm_var": 0.19168098803167197,
"learning_rate": 0.0001,
"loss": 0.9667,
"loss/crossentropy": 2.552724838256836,
"loss/hidden": 0.78125,
"loss/logits": 0.1254206746816635,
"loss/reg": 0.006006232462823391,
"step": 212
},
{
"epoch": 0.026625,
"grad_norm": 2.991971969604492,
"grad_norm_var": 0.1888233667670041,
"learning_rate": 0.0001,
"loss": 1.1472,
"loss/crossentropy": 2.472437620162964,
"loss/hidden": 0.9296875,
"loss/logits": 0.15750399231910706,
"loss/reg": 0.0060045006684958935,
"step": 213
},
{
"epoch": 0.02675,
"grad_norm": 2.3775179386138916,
"grad_norm_var": 0.1665701003974154,
"learning_rate": 0.0001,
"loss": 1.1938,
"loss/crossentropy": 2.294337749481201,
"loss/hidden": 0.95703125,
"loss/logits": 0.17671090364456177,
"loss/reg": 0.006002978887408972,
"step": 214
},
{
"epoch": 0.026875,
"grad_norm": 2.2992701530456543,
"grad_norm_var": 0.17463199132661936,
"learning_rate": 0.0001,
"loss": 1.2097,
"loss/crossentropy": 2.3843300342559814,
"loss/hidden": 0.9609375,
"loss/logits": 0.18876615166664124,
"loss/reg": 0.006001432426273823,
"step": 215
},
{
"epoch": 0.027,
"grad_norm": 2.4926228523254395,
"grad_norm_var": 0.17347807328228151,
"learning_rate": 0.0001,
"loss": 1.3156,
"loss/crossentropy": 2.326836585998535,
"loss/hidden": 1.0625,
"loss/logits": 0.19308596849441528,
"loss/reg": 0.005999880842864513,
"step": 216
},
{
"epoch": 0.027125,
"grad_norm": 2.552459478378296,
"grad_norm_var": 0.16193263198218044,
"learning_rate": 0.0001,
"loss": 1.1424,
"loss/crossentropy": 2.6629388332366943,
"loss/hidden": 0.91015625,
"loss/logits": 0.1722826063632965,
"loss/reg": 0.005998372100293636,
"step": 217
},
{
"epoch": 0.02725,
"grad_norm": 2.866387128829956,
"grad_norm_var": 0.16409192036900605,
"learning_rate": 0.0001,
"loss": 1.0142,
"loss/crossentropy": 2.8154890537261963,
"loss/hidden": 0.80078125,
"loss/logits": 0.15349115431308746,
"loss/reg": 0.005996840540319681,
"step": 218
},
{
"epoch": 0.027375,
"grad_norm": 2.77524471282959,
"grad_norm_var": 0.12966566207502767,
"learning_rate": 0.0001,
"loss": 1.4111,
"loss/crossentropy": 2.4509928226470947,
"loss/hidden": 1.1015625,
"loss/logits": 0.249616801738739,
"loss/reg": 0.005995343904942274,
"step": 219
},
{
"epoch": 0.0275,
"grad_norm": 2.887923240661621,
"grad_norm_var": 0.13285907347625023,
"learning_rate": 0.0001,
"loss": 1.2886,
"loss/crossentropy": 2.4280507564544678,
"loss/hidden": 1.0234375,
"loss/logits": 0.20519307255744934,
"loss/reg": 0.005993579979985952,
"step": 220
},
{
"epoch": 0.027625,
"grad_norm": 2.5383920669555664,
"grad_norm_var": 0.1337457284607846,
"learning_rate": 0.0001,
"loss": 1.3292,
"loss/crossentropy": 2.0803585052490234,
"loss/hidden": 1.09375,
"loss/logits": 0.17551109194755554,
"loss/reg": 0.005991705227643251,
"step": 221
},
{
"epoch": 0.02775,
"grad_norm": 2.639490842819214,
"grad_norm_var": 0.12131687494494538,
"learning_rate": 0.0001,
"loss": 1.0593,
"loss/crossentropy": 2.293325901031494,
"loss/hidden": 0.8515625,
"loss/logits": 0.14782238006591797,
"loss/reg": 0.005989882629364729,
"step": 222
},
{
"epoch": 0.027875,
"grad_norm": 2.4396984577178955,
"grad_norm_var": 0.12344012810124999,
"learning_rate": 0.0001,
"loss": 1.0587,
"loss/crossentropy": 2.7268667221069336,
"loss/hidden": 0.84765625,
"loss/logits": 0.15114662051200867,
"loss/reg": 0.0059883627109229565,
"step": 223
},
{
"epoch": 0.028,
"grad_norm": 2.227886438369751,
"grad_norm_var": 0.12171264621671582,
"learning_rate": 0.0001,
"loss": 1.0087,
"loss/crossentropy": 2.4431943893432617,
"loss/hidden": 0.81640625,
"loss/logits": 0.13243696093559265,
"loss/reg": 0.005986812058836222,
"step": 224
},
{
"epoch": 0.028125,
"grad_norm": 3.690627098083496,
"grad_norm_var": 0.18519755428341872,
"learning_rate": 0.0001,
"loss": 1.0732,
"loss/crossentropy": 2.4630942344665527,
"loss/hidden": 0.875,
"loss/logits": 0.13830721378326416,
"loss/reg": 0.005985158029943705,
"step": 225
},
{
"epoch": 0.02825,
"grad_norm": 3.377890110015869,
"grad_norm_var": 0.19972658805784155,
"learning_rate": 0.0001,
"loss": 1.1848,
"loss/crossentropy": 2.2899203300476074,
"loss/hidden": 0.9609375,
"loss/logits": 0.16401749849319458,
"loss/reg": 0.005983633920550346,
"step": 226
},
{
"epoch": 0.028375,
"grad_norm": 2.7600386142730713,
"grad_norm_var": 0.16135214723361363,
"learning_rate": 0.0001,
"loss": 1.0223,
"loss/crossentropy": 2.8077659606933594,
"loss/hidden": 0.8203125,
"loss/logits": 0.14218226075172424,
"loss/reg": 0.005982026923447847,
"step": 227
},
{
"epoch": 0.0285,
"grad_norm": 2.3397345542907715,
"grad_norm_var": 0.15851713921701366,
"learning_rate": 0.0001,
"loss": 1.077,
"loss/crossentropy": 2.438030958175659,
"loss/hidden": 0.875,
"loss/logits": 0.14217695593833923,
"loss/reg": 0.005980519577860832,
"step": 228
},
{
"epoch": 0.028625,
"grad_norm": 2.744401216506958,
"grad_norm_var": 0.15282793193407448,
"learning_rate": 0.0001,
"loss": 1.1967,
"loss/crossentropy": 2.557457447052002,
"loss/hidden": 0.97265625,
"loss/logits": 0.16425767540931702,
"loss/reg": 0.005979116074740887,
"step": 229
},
{
"epoch": 0.02875,
"grad_norm": 2.4241418838500977,
"grad_norm_var": 0.15103305834679168,
"learning_rate": 0.0001,
"loss": 1.0402,
"loss/crossentropy": 2.743885040283203,
"loss/hidden": 0.828125,
"loss/logits": 0.15231972932815552,
"loss/reg": 0.005977709777653217,
"step": 230
},
{
"epoch": 0.028875,
"grad_norm": 2.0828442573547363,
"grad_norm_var": 0.16526500993595217,
"learning_rate": 0.0001,
"loss": 0.9747,
"loss/crossentropy": 2.719327688217163,
"loss/hidden": 0.78125,
"loss/logits": 0.133681058883667,
"loss/reg": 0.005976095795631409,
"step": 231
},
{
"epoch": 0.029,
"grad_norm": 2.127495527267456,
"grad_norm_var": 0.18259721536013085,
"learning_rate": 0.0001,
"loss": 1.0588,
"loss/crossentropy": 2.8147058486938477,
"loss/hidden": 0.85546875,
"loss/logits": 0.14354225993156433,
"loss/reg": 0.005974431522190571,
"step": 232
},
{
"epoch": 0.029125,
"grad_norm": 4.263195991516113,
"grad_norm_var": 0.34219781045772657,
"learning_rate": 0.0001,
"loss": 1.1724,
"loss/crossentropy": 2.5414481163024902,
"loss/hidden": 0.96484375,
"loss/logits": 0.1478062868118286,
"loss/reg": 0.005972826853394508,
"step": 233
},
{
"epoch": 0.02925,
"grad_norm": 2.9974324703216553,
"grad_norm_var": 0.34510225788824467,
"learning_rate": 0.0001,
"loss": 1.3152,
"loss/crossentropy": 2.697648763656616,
"loss/hidden": 1.0546875,
"loss/logits": 0.20080995559692383,
"loss/reg": 0.005971227772533894,
"step": 234
},
{
"epoch": 0.029375,
"grad_norm": 3.4798855781555176,
"grad_norm_var": 0.37664835069757197,
"learning_rate": 0.0001,
"loss": 1.2096,
"loss/crossentropy": 2.3990559577941895,
"loss/hidden": 0.95703125,
"loss/logits": 0.19287389516830444,
"loss/reg": 0.005969603545963764,
"step": 235
},
{
"epoch": 0.0295,
"grad_norm": 2.43911075592041,
"grad_norm_var": 0.3848032740432508,
"learning_rate": 0.0001,
"loss": 1.0658,
"loss/crossentropy": 1.966374158859253,
"loss/hidden": 0.875,
"loss/logits": 0.13115233182907104,
"loss/reg": 0.005967943929135799,
"step": 236
},
{
"epoch": 0.029625,
"grad_norm": 3.7423646450042725,
"grad_norm_var": 0.4356891905379257,
"learning_rate": 0.0001,
"loss": 1.2397,
"loss/crossentropy": 2.718675374984741,
"loss/hidden": 0.9921875,
"loss/logits": 0.18789833784103394,
"loss/reg": 0.00596608454361558,
"step": 237
},
{
"epoch": 0.02975,
"grad_norm": 3.328033924102783,
"grad_norm_var": 0.4449827328026664,
"learning_rate": 0.0001,
"loss": 1.5581,
"loss/crossentropy": 2.272303819656372,
"loss/hidden": 1.2421875,
"loss/logits": 0.2562662661075592,
"loss/reg": 0.005964066833257675,
"step": 238
},
{
"epoch": 0.029875,
"grad_norm": 2.8761045932769775,
"grad_norm_var": 0.42986649641521024,
"learning_rate": 0.0001,
"loss": 1.1392,
"loss/crossentropy": 2.6973013877868652,
"loss/hidden": 0.91796875,
"loss/logits": 0.16159963607788086,
"loss/reg": 0.005962541792541742,
"step": 239
},
{
"epoch": 0.03,
"grad_norm": 2.4458563327789307,
"grad_norm_var": 0.4123921579785623,
"learning_rate": 0.0001,
"loss": 1.178,
"loss/crossentropy": 2.5731561183929443,
"loss/hidden": 0.9375,
"loss/logits": 0.18093177676200867,
"loss/reg": 0.005961006972938776,
"step": 240
},
{
"epoch": 0.030125,
"grad_norm": 2.4645614624023438,
"grad_norm_var": 0.3844441578530656,
"learning_rate": 0.0001,
"loss": 1.0932,
"loss/crossentropy": 2.648738145828247,
"loss/hidden": 0.890625,
"loss/logits": 0.14302745461463928,
"loss/reg": 0.005959144793450832,
"step": 241
},
{
"epoch": 0.03025,
"grad_norm": 3.0715034008026123,
"grad_norm_var": 0.3694944025754277,
"learning_rate": 0.0001,
"loss": 1.1916,
"loss/crossentropy": 2.4820139408111572,
"loss/hidden": 0.94921875,
"loss/logits": 0.18278783559799194,
"loss/reg": 0.005957332905381918,
"step": 242
},
{
"epoch": 0.030375,
"grad_norm": 2.479677677154541,
"grad_norm_var": 0.37773887013444374,
"learning_rate": 0.0001,
"loss": 1.0787,
"loss/crossentropy": 2.614309549331665,
"loss/hidden": 0.87109375,
"loss/logits": 0.14808647334575653,
"loss/reg": 0.005955492611974478,
"step": 243
},
{
"epoch": 0.0305,
"grad_norm": 3.0970399379730225,
"grad_norm_var": 0.36391299171458796,
"learning_rate": 0.0001,
"loss": 1.1987,
"loss/crossentropy": 2.2731809616088867,
"loss/hidden": 0.95703125,
"loss/logits": 0.18210504949092865,
"loss/reg": 0.00595364673063159,
"step": 244
},
{
"epoch": 0.030625,
"grad_norm": 2.388214588165283,
"grad_norm_var": 0.37823356386532864,
"learning_rate": 0.0001,
"loss": 1.1283,
"loss/crossentropy": 2.532259225845337,
"loss/hidden": 0.91015625,
"loss/logits": 0.15858401358127594,
"loss/reg": 0.005952049978077412,
"step": 245
},
{
"epoch": 0.03075,
"grad_norm": 2.97310733795166,
"grad_norm_var": 0.36540629077152076,
"learning_rate": 0.0001,
"loss": 1.1177,
"loss/crossentropy": 2.5206258296966553,
"loss/hidden": 0.89453125,
"loss/logits": 0.16365137696266174,
"loss/reg": 0.005950110498815775,
"step": 246
},
{
"epoch": 0.030875,
"grad_norm": 2.15498423576355,
"grad_norm_var": 0.3579579158371985,
"learning_rate": 0.0001,
"loss": 1.1046,
"loss/crossentropy": 2.478773832321167,
"loss/hidden": 0.8828125,
"loss/logits": 0.162343829870224,
"loss/reg": 0.005948282778263092,
"step": 247
},
{
"epoch": 0.031,
"grad_norm": 2.3404128551483154,
"grad_norm_var": 0.338987407645584,
"learning_rate": 0.0001,
"loss": 1.1555,
"loss/crossentropy": 2.1949751377105713,
"loss/hidden": 0.93359375,
"loss/logits": 0.1624409407377243,
"loss/reg": 0.005946675315499306,
"step": 248
},
{
"epoch": 0.031125,
"grad_norm": 2.8813085556030273,
"grad_norm_var": 0.20879640313171802,
"learning_rate": 0.0001,
"loss": 1.1599,
"loss/crossentropy": 2.556128978729248,
"loss/hidden": 0.9296875,
"loss/logits": 0.1707805097103119,
"loss/reg": 0.005944731179624796,
"step": 249
},
{
"epoch": 0.03125,
"grad_norm": 3.309937000274658,
"grad_norm_var": 0.22219010027481143,
"learning_rate": 0.0001,
"loss": 1.0939,
"loss/crossentropy": 2.4590022563934326,
"loss/hidden": 0.88671875,
"loss/logits": 0.14774294197559357,
"loss/reg": 0.005942681338638067,
"step": 250
},
{
"epoch": 0.031375,
"grad_norm": 3.1676676273345947,
"grad_norm_var": 0.201728293925846,
"learning_rate": 0.0001,
"loss": 1.3162,
"loss/crossentropy": 2.419811487197876,
"loss/hidden": 1.015625,
"loss/logits": 0.24120670557022095,
"loss/reg": 0.005940672475844622,
"step": 251
},
{
"epoch": 0.0315,
"grad_norm": 2.6006832122802734,
"grad_norm_var": 0.1951007002723287,
"learning_rate": 0.0001,
"loss": 1.3903,
"loss/crossentropy": 2.170666456222534,
"loss/hidden": 1.140625,
"loss/logits": 0.19024603068828583,
"loss/reg": 0.005938523914664984,
"step": 252
},
{
"epoch": 0.031625,
"grad_norm": 2.4954755306243896,
"grad_norm_var": 0.14101991304577552,
"learning_rate": 0.0001,
"loss": 1.1465,
"loss/crossentropy": 2.262831449508667,
"loss/hidden": 0.93359375,
"loss/logits": 0.1535283327102661,
"loss/reg": 0.00593681400641799,
"step": 253
},
{
"epoch": 0.03175,
"grad_norm": 2.339406728744507,
"grad_norm_var": 0.12652605714113535,
"learning_rate": 0.0001,
"loss": 0.984,
"loss/crossentropy": 2.2793617248535156,
"loss/hidden": 0.796875,
"loss/logits": 0.12778240442276,
"loss/reg": 0.005935273133218288,
"step": 254
},
{
"epoch": 0.031875,
"grad_norm": 2.3391647338867188,
"grad_norm_var": 0.131427049667937,
"learning_rate": 0.0001,
"loss": 1.0622,
"loss/crossentropy": 2.4579379558563232,
"loss/hidden": 0.83984375,
"loss/logits": 0.16299216449260712,
"loss/reg": 0.0059331608936190605,
"step": 255
},
{
"epoch": 0.032,
"grad_norm": 2.3896231651306152,
"grad_norm_var": 0.13322512800125588,
"learning_rate": 0.0001,
"loss": 1.057,
"loss/crossentropy": 2.8022475242614746,
"loss/hidden": 0.85546875,
"loss/logits": 0.14219465851783752,
"loss/reg": 0.005931555759161711,
"step": 256
},
{
"epoch": 0.032125,
"grad_norm": 2.125249147415161,
"grad_norm_var": 0.14907278605534582,
"learning_rate": 0.0001,
"loss": 1.0611,
"loss/crossentropy": 2.33644700050354,
"loss/hidden": 0.8515625,
"loss/logits": 0.15020999312400818,
"loss/reg": 0.005930029321461916,
"step": 257
},
{
"epoch": 0.03225,
"grad_norm": 2.521933078765869,
"grad_norm_var": 0.13593429417580463,
"learning_rate": 0.0001,
"loss": 1.0436,
"loss/crossentropy": 2.512619733810425,
"loss/hidden": 0.8203125,
"loss/logits": 0.16396166384220123,
"loss/reg": 0.00592817785218358,
"step": 258
},
{
"epoch": 0.032375,
"grad_norm": 2.5966317653656006,
"grad_norm_var": 0.13490910688263208,
"learning_rate": 0.0001,
"loss": 1.1331,
"loss/crossentropy": 2.248013734817505,
"loss/hidden": 0.91015625,
"loss/logits": 0.16364812850952148,
"loss/reg": 0.00592625979334116,
"step": 259
},
{
"epoch": 0.0325,
"grad_norm": 2.2045137882232666,
"grad_norm_var": 0.12644607438415487,
"learning_rate": 0.0001,
"loss": 1.0015,
"loss/crossentropy": 2.3253698348999023,
"loss/hidden": 0.796875,
"loss/logits": 0.14540287852287292,
"loss/reg": 0.005924653727561235,
"step": 260
},
{
"epoch": 0.032625,
"grad_norm": 2.4450156688690186,
"grad_norm_var": 0.1254090419850094,
"learning_rate": 0.0001,
"loss": 0.9932,
"loss/crossentropy": 2.2374210357666016,
"loss/hidden": 0.80078125,
"loss/logits": 0.13316848874092102,
"loss/reg": 0.005922792013734579,
"step": 261
},
{
"epoch": 0.03275,
"grad_norm": 7.747511863708496,
"grad_norm_var": 1.8160510254643325,
"learning_rate": 0.0001,
"loss": 1.2542,
"loss/crossentropy": 2.8747429847717285,
"loss/hidden": 1.0234375,
"loss/logits": 0.17151576280593872,
"loss/reg": 0.005921173375099897,
"step": 262
},
{
"epoch": 0.032875,
"grad_norm": 2.1854233741760254,
"grad_norm_var": 1.8132730792650582,
"learning_rate": 0.0001,
"loss": 1.0069,
"loss/crossentropy": 2.4989960193634033,
"loss/hidden": 0.8125,
"loss/logits": 0.13518914580345154,
"loss/reg": 0.005919379647821188,
"step": 263
},
{
"epoch": 0.033,
"grad_norm": 3.5132219791412354,
"grad_norm_var": 1.8186749991604263,
"learning_rate": 0.0001,
"loss": 1.054,
"loss/crossentropy": 2.497178316116333,
"loss/hidden": 0.84765625,
"loss/logits": 0.1471494734287262,
"loss/reg": 0.005917761009186506,
"step": 264
},
{
"epoch": 0.033125,
"grad_norm": 4.302145481109619,
"grad_norm_var": 1.9358282916849012,
"learning_rate": 0.0001,
"loss": 1.3123,
"loss/crossentropy": 2.1725542545318604,
"loss/hidden": 1.0859375,
"loss/logits": 0.16722658276557922,
"loss/reg": 0.0059160212986171246,
"step": 265
},
{
"epoch": 0.03325,
"grad_norm": 2.3225510120391846,
"grad_norm_var": 1.9582913809461102,
"learning_rate": 0.0001,
"loss": 1.0153,
"loss/crossentropy": 2.6670029163360596,
"loss/hidden": 0.80859375,
"loss/logits": 0.1475904881954193,
"loss/reg": 0.0059142098762094975,
"step": 266
},
{
"epoch": 0.033375,
"grad_norm": 5.196990013122559,
"grad_norm_var": 2.27294427304937,
"learning_rate": 0.0001,
"loss": 1.1665,
"loss/crossentropy": 2.6792731285095215,
"loss/hidden": 0.94140625,
"loss/logits": 0.1659836769104004,
"loss/reg": 0.00591221172362566,
"step": 267
},
{
"epoch": 0.0335,
"grad_norm": 3.5144336223602295,
"grad_norm_var": 2.26638445070385,
"learning_rate": 0.0001,
"loss": 1.2502,
"loss/crossentropy": 2.2949023246765137,
"loss/hidden": 1.0234375,
"loss/logits": 0.1677004098892212,
"loss/reg": 0.005910532083362341,
"step": 268
},
{
"epoch": 0.033625,
"grad_norm": 2.861222267150879,
"grad_norm_var": 2.2433162495019436,
"learning_rate": 0.0001,
"loss": 1.3308,
"loss/crossentropy": 2.5955142974853516,
"loss/hidden": 1.0703125,
"loss/logits": 0.2013990730047226,
"loss/reg": 0.005908492021262646,
"step": 269
},
{
"epoch": 0.03375,
"grad_norm": 2.964390754699707,
"grad_norm_var": 2.1991134738974947,
"learning_rate": 0.0001,
"loss": 1.0975,
"loss/crossentropy": 2.483924150466919,
"loss/hidden": 0.8828125,
"loss/logits": 0.15562227368354797,
"loss/reg": 0.005906403064727783,
"step": 270
},
{
"epoch": 0.033875,
"grad_norm": 2.75604510307312,
"grad_norm_var": 2.1620222961988325,
"learning_rate": 0.0001,
"loss": 1.2196,
"loss/crossentropy": 2.39125394821167,
"loss/hidden": 0.9765625,
"loss/logits": 0.18403753638267517,
"loss/reg": 0.00590470340102911,
"step": 271
},
{
"epoch": 0.034,
"grad_norm": 2.360309362411499,
"grad_norm_var": 2.165352535939727,
"learning_rate": 0.0001,
"loss": 1.0194,
"loss/crossentropy": 2.530670404434204,
"loss/hidden": 0.8046875,
"loss/logits": 0.15565866231918335,
"loss/reg": 0.005902664735913277,
"step": 272
},
{
"epoch": 0.034125,
"grad_norm": 2.496027946472168,
"grad_norm_var": 2.1195219252368287,
"learning_rate": 0.0001,
"loss": 1.2228,
"loss/crossentropy": 2.7535252571105957,
"loss/hidden": 0.9609375,
"loss/logits": 0.20284873247146606,
"loss/reg": 0.005900639574974775,
"step": 273
},
{
"epoch": 0.03425,
"grad_norm": 2.854250431060791,
"grad_norm_var": 2.0941964139517344,
"learning_rate": 0.0001,
"loss": 1.1387,
"loss/crossentropy": 2.134964942932129,
"loss/hidden": 0.9296875,
"loss/logits": 0.15002194046974182,
"loss/reg": 0.005898929201066494,
"step": 274
},
{
"epoch": 0.034375,
"grad_norm": 4.497798442840576,
"grad_norm_var": 2.149396374832277,
"learning_rate": 0.0001,
"loss": 1.2312,
"loss/crossentropy": 2.3270835876464844,
"loss/hidden": 0.99609375,
"loss/logits": 0.17617599666118622,
"loss/reg": 0.0058972095139324665,
"step": 275
},
{
"epoch": 0.0345,
"grad_norm": 2.321152448654175,
"grad_norm_var": 2.1318278315927155,
"learning_rate": 0.0001,
"loss": 1.1523,
"loss/crossentropy": 1.858445644378662,
"loss/hidden": 0.94921875,
"loss/logits": 0.14408603310585022,
"loss/reg": 0.005895303096622229,
"step": 276
},
{
"epoch": 0.034625,
"grad_norm": 2.4426257610321045,
"grad_norm_var": 2.1321312734782243,
"learning_rate": 0.0001,
"loss": 1.0267,
"loss/crossentropy": 2.4483628273010254,
"loss/hidden": 0.82421875,
"loss/logits": 0.1435263752937317,
"loss/reg": 0.005893299821764231,
"step": 277
},
{
"epoch": 0.03475,
"grad_norm": 2.144637107849121,
"grad_norm_var": 0.843351985629086,
"learning_rate": 0.0001,
"loss": 1.0517,
"loss/crossentropy": 2.237915277481079,
"loss/hidden": 0.8515625,
"loss/logits": 0.14119011163711548,
"loss/reg": 0.005891298409551382,
"step": 278
},
{
"epoch": 0.034875,
"grad_norm": 2.32000732421875,
"grad_norm_var": 0.8290445100225684,
"learning_rate": 0.0001,
"loss": 1.0462,
"loss/crossentropy": 2.6588850021362305,
"loss/hidden": 0.83203125,
"loss/logits": 0.1552983820438385,
"loss/reg": 0.0058892290107905865,
"step": 279
},
{
"epoch": 0.035,
"grad_norm": 3.3390939235687256,
"grad_norm_var": 0.820283282746707,
"learning_rate": 0.0001,
"loss": 1.1937,
"loss/crossentropy": 2.5243186950683594,
"loss/hidden": 0.953125,
"loss/logits": 0.1817275732755661,
"loss/reg": 0.00588742271065712,
"step": 280
},
{
"epoch": 0.035125,
"grad_norm": 3.1800894737243652,
"grad_norm_var": 0.7106469411621028,
"learning_rate": 0.0001,
"loss": 1.1937,
"loss/crossentropy": 2.556126832962036,
"loss/hidden": 0.953125,
"loss/logits": 0.18167603015899658,
"loss/reg": 0.005885709077119827,
"step": 281
},
{
"epoch": 0.03525,
"grad_norm": 4.466390132904053,
"grad_norm_var": 0.8119073339313209,
"learning_rate": 0.0001,
"loss": 1.27,
"loss/crossentropy": 2.5671539306640625,
"loss/hidden": 0.984375,
"loss/logits": 0.2267427146434784,
"loss/reg": 0.0058837407268583775,
"step": 282
},
{
"epoch": 0.035375,
"grad_norm": 3.2809953689575195,
"grad_norm_var": 0.5074810718943117,
"learning_rate": 0.0001,
"loss": 1.1245,
"loss/crossentropy": 2.1554338932037354,
"loss/hidden": 0.9140625,
"loss/logits": 0.1516391634941101,
"loss/reg": 0.005881770513951778,
"step": 283
},
{
"epoch": 0.0355,
"grad_norm": 2.9982316493988037,
"grad_norm_var": 0.48786559613454966,
"learning_rate": 0.0001,
"loss": 1.1286,
"loss/crossentropy": 2.6773006916046143,
"loss/hidden": 0.90625,
"loss/logits": 0.1635606288909912,
"loss/reg": 0.005880062934011221,
"step": 284
},
{
"epoch": 0.035625,
"grad_norm": 2.387657880783081,
"grad_norm_var": 0.5078162485774572,
"learning_rate": 0.0001,
"loss": 1.1214,
"loss/crossentropy": 2.4741320610046387,
"loss/hidden": 0.8984375,
"loss/logits": 0.1641697734594345,
"loss/reg": 0.0058782072737813,
"step": 285
},
{
"epoch": 0.03575,
"grad_norm": 271.6628112792969,
"grad_norm_var": 4514.324895160767,
"learning_rate": 0.0001,
"loss": 1.6171,
"loss/crossentropy": 2.5766143798828125,
"loss/hidden": 1.375,
"loss/logits": 0.1833469420671463,
"loss/reg": 0.005876271054148674,
"step": 286
},
{
"epoch": 0.035875,
"grad_norm": 3.545677900314331,
"grad_norm_var": 4512.577903953303,
"learning_rate": 0.0001,
"loss": 1.1466,
"loss/crossentropy": 2.5389881134033203,
"loss/hidden": 0.88671875,
"loss/logits": 0.20117658376693726,
"loss/reg": 0.005874336697161198,
"step": 287
},
{
"epoch": 0.036,
"grad_norm": 2.9219233989715576,
"grad_norm_var": 4511.294050983276,
"learning_rate": 0.0001,
"loss": 1.1121,
"loss/crossentropy": 2.3270509243011475,
"loss/hidden": 0.8828125,
"loss/logits": 0.17058232426643372,
"loss/reg": 0.005872361361980438,
"step": 288
},
{
"epoch": 0.036125,
"grad_norm": 2.831878423690796,
"grad_norm_var": 4510.526061571783,
"learning_rate": 0.0001,
"loss": 1.148,
"loss/crossentropy": 2.4853744506835938,
"loss/hidden": 0.91796875,
"loss/logits": 0.17128118872642517,
"loss/reg": 0.005870639346539974,
"step": 289
},
{
"epoch": 0.03625,
"grad_norm": 2.284134864807129,
"grad_norm_var": 4511.83639181831,
"learning_rate": 0.0001,
"loss": 1.0599,
"loss/crossentropy": 2.3107759952545166,
"loss/hidden": 0.8515625,
"loss/logits": 0.14969472587108612,
"loss/reg": 0.005868903826922178,
"step": 290
},
{
"epoch": 0.036375,
"grad_norm": 2.2008161544799805,
"grad_norm_var": 4516.84932017332,
"learning_rate": 0.0001,
"loss": 1.0902,
"loss/crossentropy": 2.4265358448028564,
"loss/hidden": 0.86328125,
"loss/logits": 0.1682073473930359,
"loss/reg": 0.0058671231381595135,
"step": 291
},
{
"epoch": 0.0365,
"grad_norm": 2.6285743713378906,
"grad_norm_var": 4516.145108725088,
"learning_rate": 0.0001,
"loss": 1.2494,
"loss/crossentropy": 2.372230291366577,
"loss/hidden": 0.98046875,
"loss/logits": 0.2102714478969574,
"loss/reg": 0.005865375977009535,
"step": 292
},
{
"epoch": 0.036625,
"grad_norm": 2.6784040927886963,
"grad_norm_var": 4515.607170253259,
"learning_rate": 0.0001,
"loss": 1.0752,
"loss/crossentropy": 2.6276440620422363,
"loss/hidden": 0.875,
"loss/logits": 0.14159329235553741,
"loss/reg": 0.005863656289875507,
"step": 293
},
{
"epoch": 0.03675,
"grad_norm": 2.6373047828674316,
"grad_norm_var": 4514.470495103465,
"learning_rate": 0.0001,
"loss": 1.1694,
"loss/crossentropy": 2.70892333984375,
"loss/hidden": 0.9453125,
"loss/logits": 0.16546514630317688,
"loss/reg": 0.005862091202288866,
"step": 294
},
{
"epoch": 0.036875,
"grad_norm": 2.384430170059204,
"grad_norm_var": 4514.321377312488,
"learning_rate": 0.0001,
"loss": 1.2472,
"loss/crossentropy": 2.1273090839385986,
"loss/hidden": 1.0,
"loss/logits": 0.18860690295696259,
"loss/reg": 0.005860424134880304,
"step": 295
},
{
"epoch": 0.037,
"grad_norm": 2.5959692001342773,
"grad_norm_var": 4515.978398966678,
"learning_rate": 0.0001,
"loss": 1.0376,
"loss/crossentropy": 2.7293522357940674,
"loss/hidden": 0.8203125,
"loss/logits": 0.1587076485157013,
"loss/reg": 0.0058588446117937565,
"step": 296
},
{
"epoch": 0.037125,
"grad_norm": 2.2753238677978516,
"grad_norm_var": 4518.0185669920775,
"learning_rate": 0.0001,
"loss": 1.0063,
"loss/crossentropy": 2.4602949619293213,
"loss/hidden": 0.8125,
"loss/logits": 0.13525693118572235,
"loss/reg": 0.005857320036739111,
"step": 297
},
{
"epoch": 0.03725,
"grad_norm": 3.009300708770752,
"grad_norm_var": 4521.093589717446,
"learning_rate": 0.0001,
"loss": 1.2573,
"loss/crossentropy": 2.8883349895477295,
"loss/hidden": 0.9921875,
"loss/logits": 0.20657645165920258,
"loss/reg": 0.005855792202055454,
"step": 298
},
{
"epoch": 0.037375,
"grad_norm": 2.700221538543701,
"grad_norm_var": 4522.372179334166,
"learning_rate": 0.0001,
"loss": 1.1557,
"loss/crossentropy": 2.5446314811706543,
"loss/hidden": 0.90234375,
"loss/logits": 0.19479964673519135,
"loss/reg": 0.005854278337210417,
"step": 299
},
{
"epoch": 0.0375,
"grad_norm": 2.3786559104919434,
"grad_norm_var": 4523.758055495688,
"learning_rate": 0.0001,
"loss": 1.1224,
"loss/crossentropy": 2.469960927963257,
"loss/hidden": 0.90234375,
"loss/logits": 0.16156738996505737,
"loss/reg": 0.00585273839533329,
"step": 300
},
{
"epoch": 0.037625,
"grad_norm": 2.7032158374786377,
"grad_norm_var": 4523.046593599144,
"learning_rate": 0.0001,
"loss": 1.1947,
"loss/crossentropy": 2.7451162338256836,
"loss/hidden": 0.94140625,
"loss/logits": 0.19476984441280365,
"loss/reg": 0.0058509958907961845,
"step": 301
},
{
"epoch": 0.03775,
"grad_norm": 2.507664442062378,
"grad_norm_var": 0.11250867537391755,
"learning_rate": 0.0001,
"loss": 0.9899,
"loss/crossentropy": 2.53341007232666,
"loss/hidden": 0.796875,
"loss/logits": 0.1345081329345703,
"loss/reg": 0.005849248263984919,
"step": 302
},
{
"epoch": 0.037875,
"grad_norm": 3.027892589569092,
"grad_norm_var": 0.06692647718721882,
"learning_rate": 0.0001,
"loss": 1.0973,
"loss/crossentropy": 2.7899296283721924,
"loss/hidden": 0.890625,
"loss/logits": 0.1482122391462326,
"loss/reg": 0.005847662687301636,
"step": 303
},
{
"epoch": 0.038,
"grad_norm": 2.1617183685302734,
"grad_norm_var": 0.07146536810277529,
"learning_rate": 0.0001,
"loss": 0.969,
"loss/crossentropy": 2.4700305461883545,
"loss/hidden": 0.78125,
"loss/logits": 0.12925508618354797,
"loss/reg": 0.005846073850989342,
"step": 304
},
{
"epoch": 0.038125,
"grad_norm": 2.3791332244873047,
"grad_norm_var": 0.06803597239225306,
"learning_rate": 0.0001,
"loss": 1.1912,
"loss/crossentropy": 2.4171202182769775,
"loss/hidden": 0.9453125,
"loss/logits": 0.18739524483680725,
"loss/reg": 0.005844476167112589,
"step": 305
},
{
"epoch": 0.03825,
"grad_norm": 2.7622976303100586,
"grad_norm_var": 0.06636088237049004,
"learning_rate": 0.0001,
"loss": 1.0808,
"loss/crossentropy": 2.5030367374420166,
"loss/hidden": 0.8359375,
"loss/logits": 0.18643516302108765,
"loss/reg": 0.005842759273946285,
"step": 306
},
{
"epoch": 0.038375,
"grad_norm": 2.4079246520996094,
"grad_norm_var": 0.059000676657357566,
"learning_rate": 0.0001,
"loss": 1.0359,
"loss/crossentropy": 2.381542682647705,
"loss/hidden": 0.828125,
"loss/logits": 0.1493588387966156,
"loss/reg": 0.0058412267826497555,
"step": 307
},
{
"epoch": 0.0385,
"grad_norm": 2.5356478691101074,
"grad_norm_var": 0.058906038923372726,
"learning_rate": 0.0001,
"loss": 1.087,
"loss/crossentropy": 2.4928808212280273,
"loss/hidden": 0.875,
"loss/logits": 0.15363982319831848,
"loss/reg": 0.0058394852094352245,
"step": 308
},
{
"epoch": 0.038625,
"grad_norm": 2.4036688804626465,
"grad_norm_var": 0.0597099908353601,
"learning_rate": 0.0001,
"loss": 0.986,
"loss/crossentropy": 2.5816946029663086,
"loss/hidden": 0.7890625,
"loss/logits": 0.13851355016231537,
"loss/reg": 0.005837727338075638,
"step": 309
},
{
"epoch": 0.03875,
"grad_norm": 2.630572557449341,
"grad_norm_var": 0.05963840398777146,
"learning_rate": 0.0001,
"loss": 1.0333,
"loss/crossentropy": 2.140015125274658,
"loss/hidden": 0.828125,
"loss/logits": 0.14680367708206177,
"loss/reg": 0.005835913587361574,
"step": 310
},
{
"epoch": 0.038875,
"grad_norm": 2.3641905784606934,
"grad_norm_var": 0.06012154861927167,
"learning_rate": 0.0001,
"loss": 1.0947,
"loss/crossentropy": 2.3300833702087402,
"loss/hidden": 0.8828125,
"loss/logits": 0.15358075499534607,
"loss/reg": 0.005834224168211222,
"step": 311
},
{
"epoch": 0.039,
"grad_norm": 2.215728759765625,
"grad_norm_var": 0.06696490679455162,
"learning_rate": 0.0001,
"loss": 1.1411,
"loss/crossentropy": 2.4583277702331543,
"loss/hidden": 0.9140625,
"loss/logits": 0.1687404215335846,
"loss/reg": 0.005832599475979805,
"step": 312
},
{
"epoch": 0.039125,
"grad_norm": 2.8934550285339355,
"grad_norm_var": 0.06994228066174794,
"learning_rate": 0.0001,
"loss": 1.2763,
"loss/crossentropy": 2.409702777862549,
"loss/hidden": 1.0390625,
"loss/logits": 0.17889352142810822,
"loss/reg": 0.005831001792103052,
"step": 313
},
{
"epoch": 0.03925,
"grad_norm": 8.741681098937988,
"grad_norm_var": 2.4613182467650705,
"learning_rate": 0.0001,
"loss": 1.1972,
"loss/crossentropy": 2.3858492374420166,
"loss/hidden": 0.96875,
"loss/logits": 0.1701970100402832,
"loss/reg": 0.005829236935824156,
"step": 314
},
{
"epoch": 0.039375,
"grad_norm": 7.412417411804199,
"grad_norm_var": 3.707354176329111,
"learning_rate": 0.0001,
"loss": 1.3096,
"loss/crossentropy": 2.3804125785827637,
"loss/hidden": 1.1015625,
"loss/logits": 0.149795800447464,
"loss/reg": 0.005827469285577536,
"step": 315
},
{
"epoch": 0.0395,
"grad_norm": 3.1443870067596436,
"grad_norm_var": 3.6580641482995806,
"learning_rate": 0.0001,
"loss": 1.1365,
"loss/crossentropy": 2.481820583343506,
"loss/hidden": 0.90234375,
"loss/logits": 0.1759084165096283,
"loss/reg": 0.005825776606798172,
"step": 316
},
{
"epoch": 0.039625,
"grad_norm": 2.8567562103271484,
"grad_norm_var": 3.6479706732170993,
"learning_rate": 0.0001,
"loss": 1.0023,
"loss/crossentropy": 2.5141823291778564,
"loss/hidden": 0.80078125,
"loss/logits": 0.14331723749637604,
"loss/reg": 0.005824015475809574,
"step": 317
},
{
"epoch": 0.03975,
"grad_norm": 2.2817444801330566,
"grad_norm_var": 3.674359828489624,
"learning_rate": 0.0001,
"loss": 1.0893,
"loss/crossentropy": 2.184128999710083,
"loss/hidden": 0.875,
"loss/logits": 0.15605026483535767,
"loss/reg": 0.00582248717546463,
"step": 318
},
{
"epoch": 0.039875,
"grad_norm": 2.249969005584717,
"grad_norm_var": 3.736641439481692,
"learning_rate": 0.0001,
"loss": 1.008,
"loss/crossentropy": 2.768484354019165,
"loss/hidden": 0.80078125,
"loss/logits": 0.14897163212299347,
"loss/reg": 0.00582079216837883,
"step": 319
},
{
"epoch": 0.04,
"grad_norm": 2.6358306407928467,
"grad_norm_var": 3.684102068428194,
"learning_rate": 0.0001,
"loss": 1.3237,
"loss/crossentropy": 2.301954507827759,
"loss/hidden": 1.015625,
"loss/logits": 0.24987459182739258,
"loss/reg": 0.005819002632051706,
"step": 320
},
{
"epoch": 0.040125,
"grad_norm": 2.353457450866699,
"grad_norm_var": 3.6871065280104496,
"learning_rate": 0.0001,
"loss": 1.1095,
"loss/crossentropy": 2.379765272140503,
"loss/hidden": 0.89453125,
"loss/logits": 0.15680107474327087,
"loss/reg": 0.005817302968353033,
"step": 321
},
{
"epoch": 0.04025,
"grad_norm": 2.4568967819213867,
"grad_norm_var": 3.712514538750317,
"learning_rate": 0.0001,
"loss": 0.9706,
"loss/crossentropy": 2.380795955657959,
"loss/hidden": 0.77734375,
"loss/logits": 0.13508911430835724,
"loss/reg": 0.005815597716718912,
"step": 322
},
{
"epoch": 0.040375,
"grad_norm": 3.207794189453125,
"grad_norm_var": 3.6654654630236734,
"learning_rate": 0.0001,
"loss": 1.3668,
"loss/crossentropy": 1.949703574180603,
"loss/hidden": 1.1171875,
"loss/logits": 0.19150257110595703,
"loss/reg": 0.005813860800117254,
"step": 323
},
{
"epoch": 0.0405,
"grad_norm": 3.156318187713623,
"grad_norm_var": 3.6284383166396252,
"learning_rate": 0.0001,
"loss": 1.2742,
"loss/crossentropy": 2.1970410346984863,
"loss/hidden": 1.0,
"loss/logits": 0.21606677770614624,
"loss/reg": 0.005812041461467743,
"step": 324
},
{
"epoch": 0.040625,
"grad_norm": 2.556889533996582,
"grad_norm_var": 3.611332493108523,
"learning_rate": 0.0001,
"loss": 0.9529,
"loss/crossentropy": 2.7647974491119385,
"loss/hidden": 0.7578125,
"loss/logits": 0.1369488537311554,
"loss/reg": 0.00581031059846282,
"step": 325
},
{
"epoch": 0.04075,
"grad_norm": 2.2634167671203613,
"grad_norm_var": 3.653624545749698,
"learning_rate": 0.0001,
"loss": 1.0757,
"loss/crossentropy": 2.334134340286255,
"loss/hidden": 0.859375,
"loss/logits": 0.1581987738609314,
"loss/reg": 0.005808570422232151,
"step": 326
},
{
"epoch": 0.040875,
"grad_norm": 2.3521125316619873,
"grad_norm_var": 3.6551397839485555,
"learning_rate": 0.0001,
"loss": 0.9965,
"loss/crossentropy": 2.78828763961792,
"loss/hidden": 0.79296875,
"loss/logits": 0.1454332172870636,
"loss/reg": 0.005806888919323683,
"step": 327
},
{
"epoch": 0.041,
"grad_norm": 3.0836093425750732,
"grad_norm_var": 3.5768996944618254,
"learning_rate": 0.0001,
"loss": 1.1938,
"loss/crossentropy": 2.2781612873077393,
"loss/hidden": 0.9609375,
"loss/logits": 0.1747758537530899,
"loss/reg": 0.005805303808301687,
"step": 328
},
{
"epoch": 0.041125,
"grad_norm": 3.6110970973968506,
"grad_norm_var": 3.5651235487558246,
"learning_rate": 0.0001,
"loss": 1.1693,
"loss/crossentropy": 2.812913417816162,
"loss/hidden": 0.9375,
"loss/logits": 0.17377659678459167,
"loss/reg": 0.005803780164569616,
"step": 329
},
{
"epoch": 0.04125,
"grad_norm": 2.5020155906677246,
"grad_norm_var": 1.552569952590708,
"learning_rate": 0.0001,
"loss": 1.0862,
"loss/crossentropy": 2.6585140228271484,
"loss/hidden": 0.86328125,
"loss/logits": 0.16489718854427338,
"loss/reg": 0.005802258383482695,
"step": 330
},
{
"epoch": 0.041375,
"grad_norm": 2.383924961090088,
"grad_norm_var": 0.17978007457456116,
"learning_rate": 0.0001,
"loss": 1.1592,
"loss/crossentropy": 2.4862210750579834,
"loss/hidden": 0.94921875,
"loss/logits": 0.15199331939220428,
"loss/reg": 0.005800731014460325,
"step": 331
},
{
"epoch": 0.0415,
"grad_norm": 2.187321424484253,
"grad_norm_var": 0.17949311071790794,
"learning_rate": 0.0001,
"loss": 1.0507,
"loss/crossentropy": 2.6380603313446045,
"loss/hidden": 0.84765625,
"loss/logits": 0.14507073163986206,
"loss/reg": 0.005798923317342997,
"step": 332
},
{
"epoch": 0.041625,
"grad_norm": 2.21768856048584,
"grad_norm_var": 0.18601193201957902,
"learning_rate": 0.0001,
"loss": 1.1027,
"loss/crossentropy": 2.3925793170928955,
"loss/hidden": 0.875,
"loss/logits": 0.16972869634628296,
"loss/reg": 0.00579707371070981,
"step": 333
},
{
"epoch": 0.04175,
"grad_norm": 2.682497262954712,
"grad_norm_var": 0.17937770683656615,
"learning_rate": 0.0001,
"loss": 1.3272,
"loss/crossentropy": 2.3586106300354004,
"loss/hidden": 1.078125,
"loss/logits": 0.1911502480506897,
"loss/reg": 0.005795224104076624,
"step": 334
},
{
"epoch": 0.041875,
"grad_norm": 3.0983307361602783,
"grad_norm_var": 0.1826395003188658,
"learning_rate": 0.0001,
"loss": 1.1675,
"loss/crossentropy": 2.436326265335083,
"loss/hidden": 0.91796875,
"loss/logits": 0.1915540099143982,
"loss/reg": 0.005793258547782898,
"step": 335
},
{
"epoch": 0.042,
"grad_norm": 6.251674652099609,
"grad_norm_var": 0.982431631272856,
"learning_rate": 0.0001,
"loss": 1.6879,
"loss/crossentropy": 2.3841142654418945,
"loss/hidden": 1.265625,
"loss/logits": 0.3643344044685364,
"loss/reg": 0.0057912725023925304,
"step": 336
},
{
"epoch": 0.042125,
"grad_norm": 3.0111782550811768,
"grad_norm_var": 0.9617308564996427,
"learning_rate": 0.0001,
"loss": 1.3497,
"loss/crossentropy": 2.430532217025757,
"loss/hidden": 1.0703125,
"loss/logits": 0.2214677333831787,
"loss/reg": 0.00578899122774601,
"step": 337
},
{
"epoch": 0.04225,
"grad_norm": 2.4221205711364746,
"grad_norm_var": 0.9640415151512265,
"learning_rate": 0.0001,
"loss": 1.0955,
"loss/crossentropy": 2.4376015663146973,
"loss/hidden": 0.890625,
"loss/logits": 0.1470467746257782,
"loss/reg": 0.005786662455648184,
"step": 338
},
{
"epoch": 0.042375,
"grad_norm": 2.615758180618286,
"grad_norm_var": 0.9645524062068328,
"learning_rate": 0.0001,
"loss": 1.0887,
"loss/crossentropy": 2.5318005084991455,
"loss/hidden": 0.875,
"loss/logits": 0.15580901503562927,
"loss/reg": 0.0057848175056278706,
"step": 339
},
{
"epoch": 0.0425,
"grad_norm": 2.857177972793579,
"grad_norm_var": 0.9599117798964886,
"learning_rate": 0.0001,
"loss": 1.1153,
"loss/crossentropy": 2.4260058403015137,
"loss/hidden": 0.89453125,
"loss/logits": 0.16291844844818115,
"loss/reg": 0.005782809574157,
"step": 340
},
{
"epoch": 0.042625,
"grad_norm": 2.4030630588531494,
"grad_norm_var": 0.9680393035693963,
"learning_rate": 0.0001,
"loss": 1.2054,
"loss/crossentropy": 2.3009443283081055,
"loss/hidden": 0.953125,
"loss/logits": 0.194431871175766,
"loss/reg": 0.005780525505542755,
"step": 341
},
{
"epoch": 0.04275,
"grad_norm": 2.264251470565796,
"grad_norm_var": 0.9679716782722624,
"learning_rate": 0.0001,
"loss": 1.0227,
"loss/crossentropy": 2.597288131713867,
"loss/hidden": 0.8203125,
"loss/logits": 0.14457917213439941,
"loss/reg": 0.005778233055025339,
"step": 342
},
{
"epoch": 0.042875,
"grad_norm": 2.2368180751800537,
"grad_norm_var": 0.9767866404468121,
"learning_rate": 0.0001,
"loss": 0.943,
"loss/crossentropy": 2.4534237384796143,
"loss/hidden": 0.7578125,
"loss/logits": 0.12742644548416138,
"loss/reg": 0.005776000674813986,
"step": 343
},
{
"epoch": 0.043,
"grad_norm": 2.469120979309082,
"grad_norm_var": 0.9824165851632264,
"learning_rate": 0.0001,
"loss": 1.0531,
"loss/crossentropy": 2.793834686279297,
"loss/hidden": 0.83984375,
"loss/logits": 0.15554235875606537,
"loss/reg": 0.005774145945906639,
"step": 344
},
{
"epoch": 0.043125,
"grad_norm": 2.8334686756134033,
"grad_norm_var": 0.9387961568478952,
"learning_rate": 0.0001,
"loss": 0.9467,
"loss/crossentropy": 2.678666830062866,
"loss/hidden": 0.7578125,
"loss/logits": 0.13116785883903503,
"loss/reg": 0.005771928001195192,
"step": 345
},
{
"epoch": 0.04325,
"grad_norm": 7.863356590270996,
"grad_norm_var": 2.5385263322105893,
"learning_rate": 0.0001,
"loss": 1.4695,
"loss/crossentropy": 2.613318920135498,
"loss/hidden": 1.2734375,
"loss/logits": 0.13832132518291473,
"loss/reg": 0.005770097486674786,
"step": 346
},
{
"epoch": 0.043375,
"grad_norm": 2.763582468032837,
"grad_norm_var": 2.510660987467067,
"learning_rate": 0.0001,
"loss": 1.1302,
"loss/crossentropy": 2.846453905105591,
"loss/hidden": 0.90625,
"loss/logits": 0.166295126080513,
"loss/reg": 0.0057678911834955215,
"step": 347
},
{
"epoch": 0.0435,
"grad_norm": 3.600456714630127,
"grad_norm_var": 2.4567056984087676,
"learning_rate": 0.0001,
"loss": 1.2108,
"loss/crossentropy": 2.515092372894287,
"loss/hidden": 0.96875,
"loss/logits": 0.18436874449253082,
"loss/reg": 0.005765695124864578,
"step": 348
},
{
"epoch": 0.043625,
"grad_norm": 4.2698073387146,
"grad_norm_var": 2.4444505062987636,
"learning_rate": 0.0001,
"loss": 1.1224,
"loss/crossentropy": 2.3673834800720215,
"loss/hidden": 0.8984375,
"loss/logits": 0.16628439724445343,
"loss/reg": 0.005763507913798094,
"step": 349
},
{
"epoch": 0.04375,
"grad_norm": 2.962045192718506,
"grad_norm_var": 2.42435544256402,
"learning_rate": 0.0001,
"loss": 1.079,
"loss/crossentropy": 2.9470205307006836,
"loss/hidden": 0.83203125,
"loss/logits": 0.1893935650587082,
"loss/reg": 0.005761242005974054,
"step": 350
},
{
"epoch": 0.043875,
"grad_norm": 3.0306880474090576,
"grad_norm_var": 2.427092851603572,
"learning_rate": 0.0001,
"loss": 1.0201,
"loss/crossentropy": 2.3637542724609375,
"loss/hidden": 0.83203125,
"loss/logits": 0.13047108054161072,
"loss/reg": 0.0057592191733419895,
"step": 351
},
{
"epoch": 0.044,
"grad_norm": 2.599585771560669,
"grad_norm_var": 1.855493477227511,
"learning_rate": 0.0001,
"loss": 0.9429,
"loss/crossentropy": 2.9222559928894043,
"loss/hidden": 0.7578125,
"loss/logits": 0.12747693061828613,
"loss/reg": 0.005757040809839964,
"step": 352
},
{
"epoch": 0.044125,
"grad_norm": 2.4723081588745117,
"grad_norm_var": 1.882729557078295,
"learning_rate": 0.0001,
"loss": 1.2276,
"loss/crossentropy": 2.5835001468658447,
"loss/hidden": 0.94921875,
"loss/logits": 0.220790833234787,
"loss/reg": 0.005754764657467604,
"step": 353
},
{
"epoch": 0.04425,
"grad_norm": 2.5266165733337402,
"grad_norm_var": 1.873911870827686,
"learning_rate": 0.0001,
"loss": 1.1879,
"loss/crossentropy": 2.4273722171783447,
"loss/hidden": 0.97265625,
"loss/logits": 0.15772980451583862,
"loss/reg": 0.005752884317189455,
"step": 354
},
{
"epoch": 0.044375,
"grad_norm": 2.8139867782592773,
"grad_norm_var": 1.8632913443851133,
"learning_rate": 0.0001,
"loss": 1.2803,
"loss/crossentropy": 2.591078996658325,
"loss/hidden": 1.0234375,
"loss/logits": 0.19931599497795105,
"loss/reg": 0.0057507967576384544,
"step": 355
},
{
"epoch": 0.0445,
"grad_norm": 2.0173490047454834,
"grad_norm_var": 1.9371277324683585,
"learning_rate": 0.0001,
"loss": 1.0066,
"loss/crossentropy": 2.415416955947876,
"loss/hidden": 0.80859375,
"loss/logits": 0.14050991833209991,
"loss/reg": 0.005748571362346411,
"step": 356
},
{
"epoch": 0.044625,
"grad_norm": 3.5304269790649414,
"grad_norm_var": 1.916250206343263,
"learning_rate": 0.0001,
"loss": 1.2665,
"loss/crossentropy": 2.7149741649627686,
"loss/hidden": 1.0390625,
"loss/logits": 0.16997796297073364,
"loss/reg": 0.005746254697442055,
"step": 357
},
{
"epoch": 0.04475,
"grad_norm": 47.96537399291992,
"grad_norm_var": 127.11164707702224,
"learning_rate": 0.0001,
"loss": 1.4579,
"loss/crossentropy": 2.7637100219726562,
"loss/hidden": 1.2265625,
"loss/logits": 0.17390823364257812,
"loss/reg": 0.005744417663663626,
"step": 358
},
{
"epoch": 0.044875,
"grad_norm": 2.253833055496216,
"grad_norm_var": 127.10313415769795,
"learning_rate": 0.0001,
"loss": 1.1382,
"loss/crossentropy": 2.3016419410705566,
"loss/hidden": 0.9140625,
"loss/logits": 0.16676074266433716,
"loss/reg": 0.005742207169532776,
"step": 359
},
{
"epoch": 0.045,
"grad_norm": 3.2059576511383057,
"grad_norm_var": 126.79034824550331,
"learning_rate": 0.0001,
"loss": 1.2389,
"loss/crossentropy": 2.624589204788208,
"loss/hidden": 1.0,
"loss/logits": 0.18154433369636536,
"loss/reg": 0.005740353371948004,
"step": 360
},
{
"epoch": 0.045125,
"grad_norm": 2.456129789352417,
"grad_norm_var": 126.9607902891753,
"learning_rate": 0.0001,
"loss": 1.0342,
"loss/crossentropy": 2.500290870666504,
"loss/hidden": 0.83203125,
"loss/logits": 0.14475134015083313,
"loss/reg": 0.005738324951380491,
"step": 361
},
{
"epoch": 0.04525,
"grad_norm": 3.081372022628784,
"grad_norm_var": 127.21513938268541,
"learning_rate": 0.0001,
"loss": 1.1093,
"loss/crossentropy": 2.3305118083953857,
"loss/hidden": 0.8984375,
"loss/logits": 0.15346962213516235,
"loss/reg": 0.0057361493818461895,
"step": 362
},
{
"epoch": 0.045375,
"grad_norm": 2.2634801864624023,
"grad_norm_var": 127.4280286195785,
"learning_rate": 0.0001,
"loss": 1.0956,
"loss/crossentropy": 2.4553990364074707,
"loss/hidden": 0.875,
"loss/logits": 0.16324618458747864,
"loss/reg": 0.005734298378229141,
"step": 363
},
{
"epoch": 0.0455,
"grad_norm": 3.9597907066345215,
"grad_norm_var": 127.3359579534097,
"learning_rate": 0.0001,
"loss": 1.3557,
"loss/crossentropy": 2.6449685096740723,
"loss/hidden": 1.078125,
"loss/logits": 0.2202637791633606,
"loss/reg": 0.005732398014515638,
"step": 364
},
{
"epoch": 0.045625,
"grad_norm": 2.7794013023376465,
"grad_norm_var": 127.76159157574789,
"learning_rate": 0.0001,
"loss": 1.0787,
"loss/crossentropy": 2.3118059635162354,
"loss/hidden": 0.86328125,
"loss/logits": 0.1581302285194397,
"loss/reg": 0.005730301141738892,
"step": 365
},
{
"epoch": 0.04575,
"grad_norm": 4.7589192390441895,
"grad_norm_var": 127.32661229099328,
"learning_rate": 0.0001,
"loss": 1.3244,
"loss/crossentropy": 2.5914306640625,
"loss/hidden": 1.078125,
"loss/logits": 0.18898184597492218,
"loss/reg": 0.005728167947381735,
"step": 366
},
{
"epoch": 0.045875,
"grad_norm": 4.024761199951172,
"grad_norm_var": 127.03030673720949,
"learning_rate": 0.0001,
"loss": 1.421,
"loss/crossentropy": 2.083667755126953,
"loss/hidden": 1.1640625,
"loss/logits": 0.1997053027153015,
"loss/reg": 0.005726283416152,
"step": 367
},
{
"epoch": 0.046,
"grad_norm": 2.9291043281555176,
"grad_norm_var": 126.89672944049376,
"learning_rate": 0.0001,
"loss": 1.1321,
"loss/crossentropy": 2.7017500400543213,
"loss/hidden": 0.90625,
"loss/logits": 0.1686232089996338,
"loss/reg": 0.005724436603486538,
"step": 368
},
{
"epoch": 0.046125,
"grad_norm": 2.289379119873047,
"grad_norm_var": 126.98034912166224,
"learning_rate": 0.0001,
"loss": 1.0433,
"loss/crossentropy": 2.404045581817627,
"loss/hidden": 0.8359375,
"loss/logits": 0.1501048356294632,
"loss/reg": 0.005722455680370331,
"step": 369
},
{
"epoch": 0.04625,
"grad_norm": 2.5955307483673096,
"grad_norm_var": 126.95053618311779,
"learning_rate": 0.0001,
"loss": 1.1052,
"loss/crossentropy": 2.555497407913208,
"loss/hidden": 0.87890625,
"loss/logits": 0.16912290453910828,
"loss/reg": 0.0057206167839467525,
"step": 370
},
{
"epoch": 0.046375,
"grad_norm": 2.5631515979766846,
"grad_norm_var": 127.05459572518181,
"learning_rate": 0.0001,
"loss": 1.0105,
"loss/crossentropy": 2.3253824710845947,
"loss/hidden": 0.80859375,
"loss/logits": 0.14470672607421875,
"loss/reg": 0.005718756001442671,
"step": 371
},
{
"epoch": 0.0465,
"grad_norm": 2.8995003700256348,
"grad_norm_var": 126.65924311218065,
"learning_rate": 0.0001,
"loss": 1.0727,
"loss/crossentropy": 2.5171523094177246,
"loss/hidden": 0.859375,
"loss/logits": 0.15616215765476227,
"loss/reg": 0.005716769490391016,
"step": 372
},
{
"epoch": 0.046625,
"grad_norm": 2.4674322605133057,
"grad_norm_var": 127.0582358856119,
"learning_rate": 0.0001,
"loss": 0.9544,
"loss/crossentropy": 2.426679849624634,
"loss/hidden": 0.765625,
"loss/logits": 0.13166998326778412,
"loss/reg": 0.005714884493499994,
"step": 373
},
{
"epoch": 0.04675,
"grad_norm": 2.1486146450042725,
"grad_norm_var": 0.5554253140062239,
"learning_rate": 0.0001,
"loss": 1.0123,
"loss/crossentropy": 2.3567564487457275,
"loss/hidden": 0.8203125,
"loss/logits": 0.1348218023777008,
"loss/reg": 0.0057129692286252975,
"step": 374
},
{
"epoch": 0.046875,
"grad_norm": 2.4249770641326904,
"grad_norm_var": 0.5421168003854054,
"learning_rate": 0.0001,
"loss": 1.0005,
"loss/crossentropy": 2.575383424758911,
"loss/hidden": 0.80078125,
"loss/logits": 0.1425924003124237,
"loss/reg": 0.005710979457944632,
"step": 375
},
{
"epoch": 0.047,
"grad_norm": 3.9449760913848877,
"grad_norm_var": 0.6036429091311817,
"learning_rate": 0.0001,
"loss": 1.1428,
"loss/crossentropy": 2.5839173793792725,
"loss/hidden": 0.94921875,
"loss/logits": 0.13653349876403809,
"loss/reg": 0.0057089440524578094,
"step": 376
},
{
"epoch": 0.047125,
"grad_norm": 2.3119592666625977,
"grad_norm_var": 0.6148998912723904,
"learning_rate": 0.0001,
"loss": 1.088,
"loss/crossentropy": 2.492663860321045,
"loss/hidden": 0.859375,
"loss/logits": 0.1715661883354187,
"loss/reg": 0.005707095842808485,
"step": 377
},
{
"epoch": 0.04725,
"grad_norm": 3.586817979812622,
"grad_norm_var": 0.6386998540868449,
"learning_rate": 0.0001,
"loss": 1.0907,
"loss/crossentropy": 2.8210177421569824,
"loss/hidden": 0.87890625,
"loss/logits": 0.15476316213607788,
"loss/reg": 0.005705154500901699,
"step": 378
},
{
"epoch": 0.047375,
"grad_norm": 2.805647850036621,
"grad_norm_var": 0.6040650287121667,
"learning_rate": 0.0001,
"loss": 1.0792,
"loss/crossentropy": 2.54019832611084,
"loss/hidden": 0.859375,
"loss/logits": 0.16280022263526917,
"loss/reg": 0.005703243892639875,
"step": 379
},
{
"epoch": 0.0475,
"grad_norm": 2.7932748794555664,
"grad_norm_var": 0.5445939245804574,
"learning_rate": 0.0001,
"loss": 1.4621,
"loss/crossentropy": 2.2343437671661377,
"loss/hidden": 1.1953125,
"loss/logits": 0.20978981256484985,
"loss/reg": 0.005701290909200907,
"step": 380
},
{
"epoch": 0.047625,
"grad_norm": 2.661917209625244,
"grad_norm_var": 0.5482497924242672,
"learning_rate": 0.0001,
"loss": 0.9746,
"loss/crossentropy": 2.782052516937256,
"loss/hidden": 0.78125,
"loss/logits": 0.13640211522579193,
"loss/reg": 0.0056994096376001835,
"step": 381
},
{
"epoch": 0.04775,
"grad_norm": 2.4914302825927734,
"grad_norm_var": 0.3228126995822395,
"learning_rate": 0.0001,
"loss": 1.126,
"loss/crossentropy": 2.166295051574707,
"loss/hidden": 0.91015625,
"loss/logits": 0.1589164137840271,
"loss/reg": 0.005697426851838827,
"step": 382
},
{
"epoch": 0.047875,
"grad_norm": 2.961653709411621,
"grad_norm_var": 0.22106978564282992,
"learning_rate": 0.0001,
"loss": 1.1071,
"loss/crossentropy": 2.5477302074432373,
"loss/hidden": 0.8828125,
"loss/logits": 0.16730068624019623,
"loss/reg": 0.005695413798093796,
"step": 383
},
{
"epoch": 0.048,
"grad_norm": 2.9396286010742188,
"grad_norm_var": 0.22133896443579198,
"learning_rate": 0.0001,
"loss": 1.0254,
"loss/crossentropy": 2.555258274078369,
"loss/hidden": 0.828125,
"loss/logits": 0.1403425633907318,
"loss/reg": 0.005693417973816395,
"step": 384
},
{
"epoch": 0.048125,
"grad_norm": 2.8298912048339844,
"grad_norm_var": 0.20691636732209961,
"learning_rate": 0.0001,
"loss": 1.195,
"loss/crossentropy": 2.472844362258911,
"loss/hidden": 0.984375,
"loss/logits": 0.15367946028709412,
"loss/reg": 0.005691539496183395,
"step": 385
},
{
"epoch": 0.04825,
"grad_norm": 15.47062873840332,
"grad_norm_var": 10.256501481265339,
"learning_rate": 0.0001,
"loss": 1.4448,
"loss/crossentropy": 2.521524667739868,
"loss/hidden": 1.203125,
"loss/logits": 0.1847420334815979,
"loss/reg": 0.005689616315066814,
"step": 386
},
{
"epoch": 0.048375,
"grad_norm": 2.455294370651245,
"grad_norm_var": 10.271871141002237,
"learning_rate": 0.0001,
"loss": 1.1018,
"loss/crossentropy": 2.309390068054199,
"loss/hidden": 0.89453125,
"loss/logits": 0.15039557218551636,
"loss/reg": 0.005687698721885681,
"step": 387
},
{
"epoch": 0.0485,
"grad_norm": 3.23420786857605,
"grad_norm_var": 10.248744715041969,
"learning_rate": 0.0001,
"loss": 1.2879,
"loss/crossentropy": 2.4902544021606445,
"loss/hidden": 1.015625,
"loss/logits": 0.2154603898525238,
"loss/reg": 0.005685731768608093,
"step": 388
},
{
"epoch": 0.048625,
"grad_norm": 2.660858631134033,
"grad_norm_var": 10.221989434520331,
"learning_rate": 0.0001,
"loss": 1.025,
"loss/crossentropy": 2.31535267829895,
"loss/hidden": 0.8359375,
"loss/logits": 0.13224059343338013,
"loss/reg": 0.005683773662894964,
"step": 389
},
{
"epoch": 0.04875,
"grad_norm": 2.4209847450256348,
"grad_norm_var": 10.173641089965429,
"learning_rate": 0.0001,
"loss": 0.9974,
"loss/crossentropy": 2.1761093139648438,
"loss/hidden": 0.8125,
"loss/logits": 0.12805956602096558,
"loss/reg": 0.005681932438164949,
"step": 390
},
{
"epoch": 0.048875,
"grad_norm": 3.108008623123169,
"grad_norm_var": 10.09354551501582,
"learning_rate": 0.0001,
"loss": 0.979,
"loss/crossentropy": 2.721165657043457,
"loss/hidden": 0.78125,
"loss/logits": 0.14099523425102234,
"loss/reg": 0.005679869093000889,
"step": 391
},
{
"epoch": 0.049,
"grad_norm": 2.6531527042388916,
"grad_norm_var": 10.150022289467502,
"learning_rate": 0.0001,
"loss": 1.1723,
"loss/crossentropy": 2.518146514892578,
"loss/hidden": 0.9375,
"loss/logits": 0.17805764079093933,
"loss/reg": 0.005677856504917145,
"step": 392
},
{
"epoch": 0.049125,
"grad_norm": 2.2534499168395996,
"grad_norm_var": 10.160179916565673,
"learning_rate": 0.0001,
"loss": 1.1292,
"loss/crossentropy": 2.633385181427002,
"loss/hidden": 0.91015625,
"loss/logits": 0.16230204701423645,
"loss/reg": 0.005675735417753458,
"step": 393
},
{
"epoch": 0.04925,
"grad_norm": 2.9424333572387695,
"grad_norm_var": 10.185797665159741,
"learning_rate": 0.0001,
"loss": 1.4214,
"loss/crossentropy": 2.62923002243042,
"loss/hidden": 1.15625,
"loss/logits": 0.20838308334350586,
"loss/reg": 0.00567356962710619,
"step": 394
},
{
"epoch": 0.049375,
"grad_norm": 2.622178792953491,
"grad_norm_var": 10.20593051221178,
"learning_rate": 0.0001,
"loss": 0.9697,
"loss/crossentropy": 2.5544826984405518,
"loss/hidden": 0.78125,
"loss/logits": 0.13172510266304016,
"loss/reg": 0.005671407096087933,
"step": 395
},
{
"epoch": 0.0495,
"grad_norm": 2.635505199432373,
"grad_norm_var": 10.223008906743342,
"learning_rate": 0.0001,
"loss": 0.933,
"loss/crossentropy": 2.5959105491638184,
"loss/hidden": 0.75390625,
"loss/logits": 0.12239634245634079,
"loss/reg": 0.0056692929938435555,
"step": 396
},
{
"epoch": 0.049625,
"grad_norm": 2.6063406467437744,
"grad_norm_var": 10.229570355797922,
"learning_rate": 0.0001,
"loss": 1.0478,
"loss/crossentropy": 2.719916343688965,
"loss/hidden": 0.83984375,
"loss/logits": 0.15127256512641907,
"loss/reg": 0.0056673381477594376,
"step": 397
},
{
"epoch": 0.04975,
"grad_norm": 2.589893102645874,
"grad_norm_var": 10.216701025853546,
"learning_rate": 0.0001,
"loss": 1.1265,
"loss/crossentropy": 2.3730130195617676,
"loss/hidden": 0.90234375,
"loss/logits": 0.16749918460845947,
"loss/reg": 0.0056652189232409,
"step": 398
},
{
"epoch": 0.049875,
"grad_norm": 2.1503751277923584,
"grad_norm_var": 10.318666846324161,
"learning_rate": 0.0001,
"loss": 1.1685,
"loss/crossentropy": 2.2147741317749023,
"loss/hidden": 0.92578125,
"loss/logits": 0.1860472559928894,
"loss/reg": 0.005663097370415926,
"step": 399
},
{
"epoch": 0.05,
"grad_norm": 3.6945109367370605,
"grad_norm_var": 10.300567557859127,
"learning_rate": 0.0001,
"loss": 1.1272,
"loss/crossentropy": 2.4212143421173096,
"loss/hidden": 0.921875,
"loss/logits": 0.1487593650817871,
"loss/reg": 0.005661314353346825,
"step": 400
},
{
"epoch": 0.050125,
"grad_norm": 3.7444777488708496,
"grad_norm_var": 10.268632820538057,
"learning_rate": 0.0001,
"loss": 1.1221,
"loss/crossentropy": 2.5369904041290283,
"loss/hidden": 0.90625,
"loss/logits": 0.15929211676120758,
"loss/reg": 0.005659462418407202,
"step": 401
},
{
"epoch": 0.05025,
"grad_norm": 5.121776580810547,
"grad_norm_var": 0.5518050614602837,
"learning_rate": 0.0001,
"loss": 1.4671,
"loss/crossentropy": 2.2371129989624023,
"loss/hidden": 1.2109375,
"loss/logits": 0.19960111379623413,
"loss/reg": 0.005657529458403587,
"step": 402
},
{
"epoch": 0.050375,
"grad_norm": 28.607572555541992,
"grad_norm_var": 41.63994308721723,
"learning_rate": 0.0001,
"loss": 1.1515,
"loss/crossentropy": 2.84385347366333,
"loss/hidden": 0.90234375,
"loss/logits": 0.19263674318790436,
"loss/reg": 0.005655454937368631,
"step": 403
},
{
"epoch": 0.0505,
"grad_norm": 2.38948655128479,
"grad_norm_var": 41.834466994087045,
"learning_rate": 0.0001,
"loss": 1.0929,
"loss/crossentropy": 2.2518088817596436,
"loss/hidden": 0.8984375,
"loss/logits": 0.13791221380233765,
"loss/reg": 0.005653408356010914,
"step": 404
},
{
"epoch": 0.050625,
"grad_norm": 6.887917518615723,
"grad_norm_var": 41.907583648135414,
"learning_rate": 0.0001,
"loss": 1.2522,
"loss/crossentropy": 2.8729405403137207,
"loss/hidden": 1.046875,
"loss/logits": 0.14880970120429993,
"loss/reg": 0.005651514511555433,
"step": 405
},
{
"epoch": 0.05075,
"grad_norm": 3.2420449256896973,
"grad_norm_var": 41.69182027548524,
"learning_rate": 0.0001,
"loss": 1.2031,
"loss/crossentropy": 2.598705530166626,
"loss/hidden": 0.98046875,
"loss/logits": 0.16617505252361298,
"loss/reg": 0.005649634636938572,
"step": 406
},
{
"epoch": 0.050875,
"grad_norm": 2.3294692039489746,
"grad_norm_var": 41.9082544413822,
"learning_rate": 0.0001,
"loss": 1.0316,
"loss/crossentropy": 2.7743589878082275,
"loss/hidden": 0.84375,
"loss/logits": 0.13134868443012238,
"loss/reg": 0.005647764541208744,
"step": 407
},
{
"epoch": 0.051,
"grad_norm": 2.3849406242370605,
"grad_norm_var": 41.988788990047645,
"learning_rate": 0.0001,
"loss": 1.1579,
"loss/crossentropy": 2.2934722900390625,
"loss/hidden": 0.9375,
"loss/logits": 0.16397064924240112,
"loss/reg": 0.00564591446891427,
"step": 408
},
{
"epoch": 0.051125,
"grad_norm": 2.616523504257202,
"grad_norm_var": 41.875558070811756,
"learning_rate": 0.0001,
"loss": 0.9281,
"loss/crossentropy": 2.617312431335449,
"loss/hidden": 0.7734375,
"loss/logits": 0.09819567203521729,
"loss/reg": 0.005644225515425205,
"step": 409
},
{
"epoch": 0.05125,
"grad_norm": 2.302281141281128,
"grad_norm_var": 42.058469053043055,
"learning_rate": 0.0001,
"loss": 1.0583,
"loss/crossentropy": 2.8029561042785645,
"loss/hidden": 0.859375,
"loss/logits": 0.14253735542297363,
"loss/reg": 0.005642317235469818,
"step": 410
},
{
"epoch": 0.051375,
"grad_norm": 2.1521739959716797,
"grad_norm_var": 42.20532780726832,
"learning_rate": 0.0001,
"loss": 0.996,
"loss/crossentropy": 2.5798304080963135,
"loss/hidden": 0.80078125,
"loss/logits": 0.13881272077560425,
"loss/reg": 0.005640234332531691,
"step": 411
},
{
"epoch": 0.0515,
"grad_norm": 4.3292155265808105,
"grad_norm_var": 41.914794683811124,
"learning_rate": 0.0001,
"loss": 1.3517,
"loss/crossentropy": 2.4219868183135986,
"loss/hidden": 1.0390625,
"loss/logits": 0.2562292516231537,
"loss/reg": 0.005638125352561474,
"step": 412
},
{
"epoch": 0.051625,
"grad_norm": 19.01975440979004,
"grad_norm_var": 53.903843358167165,
"learning_rate": 0.0001,
"loss": 1.3283,
"loss/crossentropy": 2.2926077842712402,
"loss/hidden": 1.078125,
"loss/logits": 0.19380658864974976,
"loss/reg": 0.005636140704154968,
"step": 413
},
{
"epoch": 0.05175,
"grad_norm": 2.859027862548828,
"grad_norm_var": 53.791467006877085,
"learning_rate": 0.0001,
"loss": 1.1115,
"loss/crossentropy": 2.429117441177368,
"loss/hidden": 0.90234375,
"loss/logits": 0.1528070569038391,
"loss/reg": 0.005634027067571878,
"step": 414
},
{
"epoch": 0.051875,
"grad_norm": 2.385204792022705,
"grad_norm_var": 53.67862289213027,
"learning_rate": 0.0001,
"loss": 1.0186,
"loss/crossentropy": 2.710325002670288,
"loss/hidden": 0.81640625,
"loss/logits": 0.1458669900894165,
"loss/reg": 0.005631967913359404,
"step": 415
},
{
"epoch": 0.052,
"grad_norm": 2.3011677265167236,
"grad_norm_var": 54.20582073402194,
"learning_rate": 0.0001,
"loss": 1.0843,
"loss/crossentropy": 2.485734701156616,
"loss/hidden": 0.87109375,
"loss/logits": 0.1569264829158783,
"loss/reg": 0.0056300037540495396,
"step": 416
},
{
"epoch": 0.052125,
"grad_norm": 2.7714357376098633,
"grad_norm_var": 54.53064815195892,
"learning_rate": 0.0001,
"loss": 1.0741,
"loss/crossentropy": 2.6249403953552246,
"loss/hidden": 0.85546875,
"loss/logits": 0.1623522937297821,
"loss/reg": 0.0056281075812876225,
"step": 417
},
{
"epoch": 0.05225,
"grad_norm": 2.376473903656006,
"grad_norm_var": 55.22478277620113,
"learning_rate": 0.0001,
"loss": 1.2116,
"loss/crossentropy": 2.5150105953216553,
"loss/hidden": 0.95703125,
"loss/logits": 0.19830524921417236,
"loss/reg": 0.005626222584396601,
"step": 418
},
{
"epoch": 0.052375,
"grad_norm": 2.6247470378875732,
"grad_norm_var": 17.572360223815615,
"learning_rate": 0.0001,
"loss": 1.172,
"loss/crossentropy": 2.7201685905456543,
"loss/hidden": 0.9453125,
"loss/logits": 0.17042091488838196,
"loss/reg": 0.005624283570796251,
"step": 419
},
{
"epoch": 0.0525,
"grad_norm": 49.02815628051758,
"grad_norm_var": 143.90483482694842,
"learning_rate": 0.0001,
"loss": 5.3824,
"loss/crossentropy": 2.692047357559204,
"loss/hidden": 4.84375,
"loss/logits": 0.48245739936828613,
"loss/reg": 0.005622203927487135,
"step": 420
},
{
"epoch": 0.052625,
"grad_norm": 2.6867082118988037,
"grad_norm_var": 144.9870986829453,
"learning_rate": 0.0001,
"loss": 1.2507,
"loss/crossentropy": 2.404517412185669,
"loss/hidden": 1.0,
"loss/logits": 0.19445687532424927,
"loss/reg": 0.005620268173515797,
"step": 421
},
{
"epoch": 0.05275,
"grad_norm": 4.397704124450684,
"grad_norm_var": 144.55498651709914,
"learning_rate": 0.0001,
"loss": 1.4596,
"loss/crossentropy": 2.1510226726531982,
"loss/hidden": 1.2109375,
"loss/logits": 0.19246640801429749,
"loss/reg": 0.005618296563625336,
"step": 422
},
{
"epoch": 0.052875,
"grad_norm": 4.239573955535889,
"grad_norm_var": 143.68003611616095,
"learning_rate": 0.0001,
"loss": 1.3275,
"loss/crossentropy": 2.686849355697632,
"loss/hidden": 1.09375,
"loss/logits": 0.17758557200431824,
"loss/reg": 0.005616751033812761,
"step": 423
},
{
"epoch": 0.053,
"grad_norm": 2.749202251434326,
"grad_norm_var": 143.4748837350726,
"learning_rate": 0.0001,
"loss": 1.0827,
"loss/crossentropy": 2.8104846477508545,
"loss/hidden": 0.8828125,
"loss/logits": 0.1437493860721588,
"loss/reg": 0.005615332629531622,
"step": 424
},
{
"epoch": 0.053125,
"grad_norm": 2.459291458129883,
"grad_norm_var": 143.5641839570371,
"learning_rate": 0.0001,
"loss": 1.0548,
"loss/crossentropy": 2.5806379318237305,
"loss/hidden": 0.8515625,
"loss/logits": 0.14714661240577698,
"loss/reg": 0.005613364279270172,
"step": 425
},
{
"epoch": 0.05325,
"grad_norm": 2.294171094894409,
"grad_norm_var": 143.56904366210821,
"learning_rate": 0.0001,
"loss": 1.1486,
"loss/crossentropy": 2.6366002559661865,
"loss/hidden": 0.90234375,
"loss/logits": 0.19014191627502441,
"loss/reg": 0.005611394997686148,
"step": 426
},
{
"epoch": 0.053375,
"grad_norm": 2.2255382537841797,
"grad_norm_var": 143.52399251007708,
"learning_rate": 0.0001,
"loss": 1.0752,
"loss/crossentropy": 2.542306661605835,
"loss/hidden": 0.875,
"loss/logits": 0.14408408105373383,
"loss/reg": 0.005609368905425072,
"step": 427
},
{
"epoch": 0.0535,
"grad_norm": 3.5708723068237305,
"grad_norm_var": 143.80942972780392,
"learning_rate": 0.0001,
"loss": 1.0863,
"loss/crossentropy": 2.2636356353759766,
"loss/hidden": 0.8828125,
"loss/logits": 0.14744916558265686,
"loss/reg": 0.005607361439615488,
"step": 428
},
{
"epoch": 0.053625,
"grad_norm": 2.9189610481262207,
"grad_norm_var": 133.66980873374825,
"learning_rate": 0.0001,
"loss": 0.9895,
"loss/crossentropy": 2.7651426792144775,
"loss/hidden": 0.78515625,
"loss/logits": 0.1482805609703064,
"loss/reg": 0.005605428479611874,
"step": 429
},
{
"epoch": 0.05375,
"grad_norm": 3.2735564708709717,
"grad_norm_var": 133.5211490137515,
"learning_rate": 0.0001,
"loss": 1.2363,
"loss/crossentropy": 2.248082399368286,
"loss/hidden": 0.98046875,
"loss/logits": 0.19977417588233948,
"loss/reg": 0.0056034415028989315,
"step": 430
},
{
"epoch": 0.053875,
"grad_norm": 3.5670769214630127,
"grad_norm_var": 133.0752341056661,
"learning_rate": 0.0001,
"loss": 1.2766,
"loss/crossentropy": 2.500338554382324,
"loss/hidden": 1.0234375,
"loss/logits": 0.19719059765338898,
"loss/reg": 0.005601502023637295,
"step": 431
},
{
"epoch": 0.054,
"grad_norm": 2.2697787284851074,
"grad_norm_var": 133.0901180807591,
"learning_rate": 0.0001,
"loss": 0.9931,
"loss/crossentropy": 2.6418793201446533,
"loss/hidden": 0.7890625,
"loss/logits": 0.14799568057060242,
"loss/reg": 0.005599519703537226,
"step": 432
},
{
"epoch": 0.054125,
"grad_norm": 3.220383405685425,
"grad_norm_var": 132.91898234062202,
"learning_rate": 0.0001,
"loss": 1.2515,
"loss/crossentropy": 2.5073025226593018,
"loss/hidden": 1.0390625,
"loss/logits": 0.15643876791000366,
"loss/reg": 0.005597477313131094,
"step": 433
},
{
"epoch": 0.05425,
"grad_norm": 3.2845206260681152,
"grad_norm_var": 132.5476800488924,
"learning_rate": 0.0001,
"loss": 1.1441,
"loss/crossentropy": 2.509037971496582,
"loss/hidden": 0.9296875,
"loss/logits": 0.15849418938159943,
"loss/reg": 0.005595567170530558,
"step": 434
},
{
"epoch": 0.054375,
"grad_norm": 2.254239320755005,
"grad_norm_var": 132.71932731242507,
"learning_rate": 0.0001,
"loss": 0.9815,
"loss/crossentropy": 2.567584991455078,
"loss/hidden": 0.78125,
"loss/logits": 0.14433184266090393,
"loss/reg": 0.005593593697994947,
"step": 435
},
{
"epoch": 0.0545,
"grad_norm": 3.2273480892181396,
"grad_norm_var": 0.4676980414191933,
"learning_rate": 0.0001,
"loss": 1.1645,
"loss/crossentropy": 2.3639349937438965,
"loss/hidden": 0.94921875,
"loss/logits": 0.15934088826179504,
"loss/reg": 0.0055916691198945045,
"step": 436
},
{
"epoch": 0.054625,
"grad_norm": 2.6044058799743652,
"grad_norm_var": 0.47199755801328347,
"learning_rate": 0.0001,
"loss": 1.1033,
"loss/crossentropy": 2.539247989654541,
"loss/hidden": 0.8984375,
"loss/logits": 0.14898554980754852,
"loss/reg": 0.005589775741100311,
"step": 437
},
{
"epoch": 0.05475,
"grad_norm": 2.9674391746520996,
"grad_norm_var": 0.3399405404704983,
"learning_rate": 0.0001,
"loss": 1.252,
"loss/crossentropy": 2.5642499923706055,
"loss/hidden": 0.9921875,
"loss/logits": 0.20391228795051575,
"loss/reg": 0.005587900057435036,
"step": 438
},
{
"epoch": 0.054875,
"grad_norm": 2.4164047241210938,
"grad_norm_var": 0.23308679379454797,
"learning_rate": 0.0001,
"loss": 1.1939,
"loss/crossentropy": 2.3462696075439453,
"loss/hidden": 0.93359375,
"loss/logits": 0.2044137418270111,
"loss/reg": 0.005585688166320324,
"step": 439
},
{
"epoch": 0.055,
"grad_norm": 2.7590599060058594,
"grad_norm_var": 0.2329847653181711,
"learning_rate": 0.0001,
"loss": 1.0377,
"loss/crossentropy": 2.775485038757324,
"loss/hidden": 0.84375,
"loss/logits": 0.13808496296405792,
"loss/reg": 0.0055835009552538395,
"step": 440
},
{
"epoch": 0.055125,
"grad_norm": 2.7251267433166504,
"grad_norm_var": 0.224188675724659,
"learning_rate": 0.0001,
"loss": 1.0001,
"loss/crossentropy": 2.4934420585632324,
"loss/hidden": 0.80859375,
"loss/logits": 0.1357189267873764,
"loss/reg": 0.005581483710557222,
"step": 441
},
{
"epoch": 0.05525,
"grad_norm": 2.4774584770202637,
"grad_norm_var": 0.21273704839308963,
"learning_rate": 0.0001,
"loss": 1.2166,
"loss/crossentropy": 2.426271438598633,
"loss/hidden": 0.95703125,
"loss/logits": 0.20375394821166992,
"loss/reg": 0.0055792308412492275,
"step": 442
},
{
"epoch": 0.055375,
"grad_norm": 3.2236833572387695,
"grad_norm_var": 0.1905493662305197,
"learning_rate": 0.0001,
"loss": 1.1724,
"loss/crossentropy": 2.9799797534942627,
"loss/hidden": 0.92578125,
"loss/logits": 0.19083930552005768,
"loss/reg": 0.005577271804213524,
"step": 443
},
{
"epoch": 0.0555,
"grad_norm": 2.5997183322906494,
"grad_norm_var": 0.16554225723918894,
"learning_rate": 0.0001,
"loss": 1.126,
"loss/crossentropy": 2.2098257541656494,
"loss/hidden": 0.92578125,
"loss/logits": 0.14447355270385742,
"loss/reg": 0.005575183313339949,
"step": 444
},
{
"epoch": 0.055625,
"grad_norm": 2.5179152488708496,
"grad_norm_var": 0.1725392629592297,
"learning_rate": 0.0001,
"loss": 1.2018,
"loss/crossentropy": 2.0029213428497314,
"loss/hidden": 0.98046875,
"loss/logits": 0.1655960977077484,
"loss/reg": 0.005572900176048279,
"step": 445
},
{
"epoch": 0.05575,
"grad_norm": 2.5075204372406006,
"grad_norm_var": 0.16460110044899826,
"learning_rate": 0.0001,
"loss": 1.0614,
"loss/crossentropy": 2.3672924041748047,
"loss/hidden": 0.85546875,
"loss/logits": 0.15021467208862305,
"loss/reg": 0.005570439621806145,
"step": 446
},
{
"epoch": 0.055875,
"grad_norm": 2.441183567047119,
"grad_norm_var": 0.12700610259855102,
"learning_rate": 0.0001,
"loss": 0.9323,
"loss/crossentropy": 2.311056137084961,
"loss/hidden": 0.7578125,
"loss/logits": 0.11881721019744873,
"loss/reg": 0.00556844100356102,
"step": 447
},
{
"epoch": 0.056,
"grad_norm": 2.6724319458007812,
"grad_norm_var": 0.11304803744365562,
"learning_rate": 0.0001,
"loss": 1.0937,
"loss/crossentropy": 2.562101364135742,
"loss/hidden": 0.8671875,
"loss/logits": 0.1708334982395172,
"loss/reg": 0.005566492676734924,
"step": 448
},
{
"epoch": 0.056125,
"grad_norm": 2.196300506591797,
"grad_norm_var": 0.11350312697665288,
"learning_rate": 0.0001,
"loss": 0.9882,
"loss/crossentropy": 2.4227116107940674,
"loss/hidden": 0.80078125,
"loss/logits": 0.13182450830936432,
"loss/reg": 0.00556437112390995,
"step": 449
},
{
"epoch": 0.05625,
"grad_norm": 2.912667989730835,
"grad_norm_var": 0.0921566818687341,
"learning_rate": 0.0001,
"loss": 1.3721,
"loss/crossentropy": 1.9439491033554077,
"loss/hidden": 1.109375,
"loss/logits": 0.2070913016796112,
"loss/reg": 0.0055623650550842285,
"step": 450
},
{
"epoch": 0.056375,
"grad_norm": 2.011991500854492,
"grad_norm_var": 0.10881512213368959,
"learning_rate": 0.0001,
"loss": 1.0172,
"loss/crossentropy": 2.498812675476074,
"loss/hidden": 0.81640625,
"loss/logits": 0.14521706104278564,
"loss/reg": 0.005560221150517464,
"step": 451
},
{
"epoch": 0.0565,
"grad_norm": 2.2709267139434814,
"grad_norm_var": 0.0912508163184422,
"learning_rate": 0.0001,
"loss": 1.1384,
"loss/crossentropy": 2.320579767227173,
"loss/hidden": 0.9140625,
"loss/logits": 0.16879746317863464,
"loss/reg": 0.005558326840400696,
"step": 452
},
{
"epoch": 0.056625,
"grad_norm": 2.954127788543701,
"grad_norm_var": 0.09996231296479816,
"learning_rate": 0.0001,
"loss": 1.2415,
"loss/crossentropy": 2.483376979827881,
"loss/hidden": 0.99609375,
"loss/logits": 0.18988527357578278,
"loss/reg": 0.005556488875299692,
"step": 453
},
{
"epoch": 0.05675,
"grad_norm": 2.442729949951172,
"grad_norm_var": 0.0916992305907788,
"learning_rate": 0.0001,
"loss": 1.0533,
"loss/crossentropy": 2.414472818374634,
"loss/hidden": 0.84765625,
"loss/logits": 0.1501239389181137,
"loss/reg": 0.005554646719247103,
"step": 454
},
{
"epoch": 0.056875,
"grad_norm": 2.598292589187622,
"grad_norm_var": 0.09002796513685567,
"learning_rate": 0.0001,
"loss": 0.9797,
"loss/crossentropy": 2.8175811767578125,
"loss/hidden": 0.78515625,
"loss/logits": 0.13899990916252136,
"loss/reg": 0.005552831571549177,
"step": 455
},
{
"epoch": 0.057,
"grad_norm": 2.284618616104126,
"grad_norm_var": 0.09289234998963139,
"learning_rate": 0.0001,
"loss": 1.1767,
"loss/crossentropy": 2.5178730487823486,
"loss/hidden": 0.953125,
"loss/logits": 0.1680239588022232,
"loss/reg": 0.005550856236368418,
"step": 456
},
{
"epoch": 0.057125,
"grad_norm": 2.9749691486358643,
"grad_norm_var": 0.10255115779464533,
"learning_rate": 0.0001,
"loss": 1.146,
"loss/crossentropy": 2.6965036392211914,
"loss/hidden": 0.89453125,
"loss/logits": 0.19602364301681519,
"loss/reg": 0.005548745859414339,
"step": 457
},
{
"epoch": 0.05725,
"grad_norm": 2.4419991970062256,
"grad_norm_var": 0.10305738190390912,
"learning_rate": 0.0001,
"loss": 1.0782,
"loss/crossentropy": 2.507200241088867,
"loss/hidden": 0.87890625,
"loss/logits": 0.14385411143302917,
"loss/reg": 0.005546758882701397,
"step": 458
},
{
"epoch": 0.057375,
"grad_norm": 2.41898250579834,
"grad_norm_var": 0.07293072023693033,
"learning_rate": 0.0001,
"loss": 1.0665,
"loss/crossentropy": 2.4068796634674072,
"loss/hidden": 0.87109375,
"loss/logits": 0.13996180891990662,
"loss/reg": 0.005544655025005341,
"step": 459
},
{
"epoch": 0.0575,
"grad_norm": 3.584895372390747,
"grad_norm_var": 0.1446675774892469,
"learning_rate": 0.0001,
"loss": 1.419,
"loss/crossentropy": 2.4029970169067383,
"loss/hidden": 1.15625,
"loss/logits": 0.20734865963459015,
"loss/reg": 0.005542535334825516,
"step": 460
},
{
"epoch": 0.057625,
"grad_norm": 2.5190699100494385,
"grad_norm_var": 0.14465856873481447,
"learning_rate": 0.0001,
"loss": 1.0687,
"loss/crossentropy": 2.632817268371582,
"loss/hidden": 0.84375,
"loss/logits": 0.16959112882614136,
"loss/reg": 0.005540382582694292,
"step": 461
},
{
"epoch": 0.05775,
"grad_norm": 3.293412446975708,
"grad_norm_var": 0.1759751166057581,
"learning_rate": 0.0001,
"loss": 1.2079,
"loss/crossentropy": 1.8526346683502197,
"loss/hidden": 0.984375,
"loss/logits": 0.16817334294319153,
"loss/reg": 0.005538390018045902,
"step": 462
},
{
"epoch": 0.057875,
"grad_norm": 2.090097665786743,
"grad_norm_var": 0.1923380804679141,
"learning_rate": 0.0001,
"loss": 1.0403,
"loss/crossentropy": 2.7256767749786377,
"loss/hidden": 0.83984375,
"loss/logits": 0.14509689807891846,
"loss/reg": 0.005536381620913744,
"step": 463
},
{
"epoch": 0.058,
"grad_norm": 2.367372751235962,
"grad_norm_var": 0.19537989350592183,
"learning_rate": 0.0001,
"loss": 0.967,
"loss/crossentropy": 2.440683603286743,
"loss/hidden": 0.78125,
"loss/logits": 0.13041679561138153,
"loss/reg": 0.005534291733056307,
"step": 464
},
{
"epoch": 0.058125,
"grad_norm": 2.5434730052948,
"grad_norm_var": 0.18491306851457617,
"learning_rate": 0.0001,
"loss": 1.1396,
"loss/crossentropy": 2.811406373977661,
"loss/hidden": 0.91015625,
"loss/logits": 0.1740744560956955,
"loss/reg": 0.005532294511795044,
"step": 465
},
{
"epoch": 0.05825,
"grad_norm": 2.613758087158203,
"grad_norm_var": 0.17830906169392974,
"learning_rate": 0.0001,
"loss": 1.0313,
"loss/crossentropy": 2.5138356685638428,
"loss/hidden": 0.828125,
"loss/logits": 0.1479034125804901,
"loss/reg": 0.005530340131372213,
"step": 466
},
{
"epoch": 0.058375,
"grad_norm": 3.6053991317749023,
"grad_norm_var": 0.21458171164135606,
"learning_rate": 0.0001,
"loss": 1.2109,
"loss/crossentropy": 1.9949983358383179,
"loss/hidden": 1.0,
"loss/logits": 0.155661940574646,
"loss/reg": 0.0055284383706748486,
"step": 467
},
{
"epoch": 0.0585,
"grad_norm": 2.2574644088745117,
"grad_norm_var": 0.21534123971961966,
"learning_rate": 0.0001,
"loss": 1.08,
"loss/crossentropy": 2.514662981033325,
"loss/hidden": 0.859375,
"loss/logits": 0.16538314521312714,
"loss/reg": 0.005526562221348286,
"step": 468
},
{
"epoch": 0.058625,
"grad_norm": 2.2614095211029053,
"grad_norm_var": 0.2206521084247221,
"learning_rate": 0.0001,
"loss": 1.2297,
"loss/crossentropy": 2.4910507202148438,
"loss/hidden": 0.98046875,
"loss/logits": 0.19400066137313843,
"loss/reg": 0.005524714011698961,
"step": 469
},
{
"epoch": 0.05875,
"grad_norm": 3.083524465560913,
"grad_norm_var": 0.22915168035201153,
"learning_rate": 0.0001,
"loss": 1.1725,
"loss/crossentropy": 2.5548853874206543,
"loss/hidden": 0.92578125,
"loss/logits": 0.19151920080184937,
"loss/reg": 0.005522689316421747,
"step": 470
},
{
"epoch": 0.058875,
"grad_norm": 2.6530709266662598,
"grad_norm_var": 0.2287156357176549,
"learning_rate": 0.0001,
"loss": 0.9819,
"loss/crossentropy": 2.5769848823547363,
"loss/hidden": 0.79296875,
"loss/logits": 0.1337730437517166,
"loss/reg": 0.00552078802138567,
"step": 471
},
{
"epoch": 0.059,
"grad_norm": 2.857489585876465,
"grad_norm_var": 0.21848469951039154,
"learning_rate": 0.0001,
"loss": 1.2335,
"loss/crossentropy": 2.6933629512786865,
"loss/hidden": 0.98828125,
"loss/logits": 0.19003306329250336,
"loss/reg": 0.005518974736332893,
"step": 472
},
{
"epoch": 0.059125,
"grad_norm": 1.960106372833252,
"grad_norm_var": 0.24874750636482734,
"learning_rate": 0.0001,
"loss": 0.9776,
"loss/crossentropy": 2.534855365753174,
"loss/hidden": 0.7890625,
"loss/logits": 0.13338381052017212,
"loss/reg": 0.005517229437828064,
"step": 473
},
{
"epoch": 0.05925,
"grad_norm": 2.787822961807251,
"grad_norm_var": 0.24619457779295406,
"learning_rate": 0.0001,
"loss": 1.0858,
"loss/crossentropy": 2.396390438079834,
"loss/hidden": 0.88671875,
"loss/logits": 0.14397624135017395,
"loss/reg": 0.005515479948371649,
"step": 474
},
{
"epoch": 0.059375,
"grad_norm": 2.3396122455596924,
"grad_norm_var": 0.24936205040752385,
"learning_rate": 0.0001,
"loss": 1.0392,
"loss/crossentropy": 2.6306259632110596,
"loss/hidden": 0.83984375,
"loss/logits": 0.14426180720329285,
"loss/reg": 0.005513759795576334,
"step": 475
},
{
"epoch": 0.0595,
"grad_norm": 2.367551803588867,
"grad_norm_var": 0.19447740210993794,
"learning_rate": 0.0001,
"loss": 1.1071,
"loss/crossentropy": 2.342672348022461,
"loss/hidden": 0.890625,
"loss/logits": 0.16136375069618225,
"loss/reg": 0.0055120959877967834,
"step": 476
},
{
"epoch": 0.059625,
"grad_norm": 2.3029873371124268,
"grad_norm_var": 0.19972845357339655,
"learning_rate": 0.0001,
"loss": 0.9785,
"loss/crossentropy": 2.725276231765747,
"loss/hidden": 0.796875,
"loss/logits": 0.12647491693496704,
"loss/reg": 0.0055101178586483,
"step": 477
},
{
"epoch": 0.05975,
"grad_norm": 2.3109138011932373,
"grad_norm_var": 0.1674590503375268,
"learning_rate": 0.0001,
"loss": 1.012,
"loss/crossentropy": 2.6665799617767334,
"loss/hidden": 0.81640625,
"loss/logits": 0.14054208993911743,
"loss/reg": 0.005508116912096739,
"step": 478
},
{
"epoch": 0.059875,
"grad_norm": 2.8778023719787598,
"grad_norm_var": 0.1605488706137739,
"learning_rate": 0.0001,
"loss": 1.0028,
"loss/crossentropy": 2.599010705947876,
"loss/hidden": 0.80078125,
"loss/logits": 0.14697444438934326,
"loss/reg": 0.0055063748732209206,
"step": 479
},
{
"epoch": 0.06,
"grad_norm": 2.7762978076934814,
"grad_norm_var": 0.15971446982347573,
"learning_rate": 0.0001,
"loss": 1.1492,
"loss/crossentropy": 2.6345436573028564,
"loss/hidden": 0.9296875,
"loss/logits": 0.1645045280456543,
"loss/reg": 0.005504653323441744,
"step": 480
},
{
"epoch": 0.060125,
"grad_norm": 3.0745112895965576,
"grad_norm_var": 0.1733429982183973,
"learning_rate": 0.0001,
"loss": 1.2914,
"loss/crossentropy": 2.1021008491516113,
"loss/hidden": 1.0546875,
"loss/logits": 0.18168240785598755,
"loss/reg": 0.005502650048583746,
"step": 481
},
{
"epoch": 0.06025,
"grad_norm": 2.5635828971862793,
"grad_norm_var": 0.17362979402171655,
"learning_rate": 0.0001,
"loss": 1.1746,
"loss/crossentropy": 2.599754810333252,
"loss/hidden": 0.9453125,
"loss/logits": 0.1743006557226181,
"loss/reg": 0.005500909872353077,
"step": 482
},
{
"epoch": 0.060375,
"grad_norm": 2.982170343399048,
"grad_norm_var": 0.11685041441696337,
"learning_rate": 0.0001,
"loss": 1.084,
"loss/crossentropy": 2.780411958694458,
"loss/hidden": 0.875,
"loss/logits": 0.15399503707885742,
"loss/reg": 0.005499421618878841,
"step": 483
},
{
"epoch": 0.0605,
"grad_norm": 6.475743770599365,
"grad_norm_var": 1.0413639393420129,
"learning_rate": 0.0001,
"loss": 2.1473,
"loss/crossentropy": 2.3867931365966797,
"loss/hidden": 1.703125,
"loss/logits": 0.38922837376594543,
"loss/reg": 0.005497433710843325,
"step": 484
},
{
"epoch": 0.060625,
"grad_norm": 2.522434711456299,
"grad_norm_var": 1.024975132918582,
"learning_rate": 0.0001,
"loss": 1.0915,
"loss/crossentropy": 2.741684675216675,
"loss/hidden": 0.88671875,
"loss/logits": 0.14987404644489288,
"loss/reg": 0.0054954588413238525,
"step": 485
},
{
"epoch": 0.06075,
"grad_norm": 2.6852359771728516,
"grad_norm_var": 1.0236023483547378,
"learning_rate": 0.0001,
"loss": 1.0905,
"loss/crossentropy": 2.2552525997161865,
"loss/hidden": 0.8984375,
"loss/logits": 0.13711076974868774,
"loss/reg": 0.005493887234479189,
"step": 486
},
{
"epoch": 0.060875,
"grad_norm": 6.048346996307373,
"grad_norm_var": 1.65671866064532,
"learning_rate": 0.0001,
"loss": 1.4058,
"loss/crossentropy": 3.1526873111724854,
"loss/hidden": 1.0625,
"loss/logits": 0.2884060740470886,
"loss/reg": 0.005492268595844507,
"step": 487
},
{
"epoch": 0.061,
"grad_norm": 5.24729061126709,
"grad_norm_var": 1.9496829900519608,
"learning_rate": 0.0001,
"loss": 1.5487,
"loss/crossentropy": 2.391798496246338,
"loss/hidden": 1.234375,
"loss/logits": 0.2594112157821655,
"loss/reg": 0.0054903156124055386,
"step": 488
},
{
"epoch": 0.061125,
"grad_norm": 3.4879932403564453,
"grad_norm_var": 1.8414378354073275,
"learning_rate": 0.0001,
"loss": 1.2408,
"loss/crossentropy": 2.3853161334991455,
"loss/hidden": 1.015625,
"loss/logits": 0.1702655553817749,
"loss/reg": 0.005488729570060968,
"step": 489
},
{
"epoch": 0.06125,
"grad_norm": 2.416243076324463,
"grad_norm_var": 1.875598350696971,
"learning_rate": 0.0001,
"loss": 1.0646,
"loss/crossentropy": 2.310605049133301,
"loss/hidden": 0.86328125,
"loss/logits": 0.146418958902359,
"loss/reg": 0.005487216170877218,
"step": 490
},
{
"epoch": 0.061375,
"grad_norm": 2.9619152545928955,
"grad_norm_var": 1.8217813283025472,
"learning_rate": 0.0001,
"loss": 1.2577,
"loss/crossentropy": 2.3735132217407227,
"loss/hidden": 1.015625,
"loss/logits": 0.18721503019332886,
"loss/reg": 0.005485245026648045,
"step": 491
},
{
"epoch": 0.0615,
"grad_norm": 2.9602112770080566,
"grad_norm_var": 1.7685642295810833,
"learning_rate": 0.0001,
"loss": 1.1274,
"loss/crossentropy": 2.6420083045959473,
"loss/hidden": 0.90234375,
"loss/logits": 0.17025524377822876,
"loss/reg": 0.005483296699821949,
"step": 492
},
{
"epoch": 0.061625,
"grad_norm": 2.5772223472595215,
"grad_norm_var": 1.7347667738241757,
"learning_rate": 0.0001,
"loss": 1.1004,
"loss/crossentropy": 2.4166319370269775,
"loss/hidden": 0.890625,
"loss/logits": 0.15491390228271484,
"loss/reg": 0.005481342785060406,
"step": 493
},
{
"epoch": 0.06175,
"grad_norm": 2.6494603157043457,
"grad_norm_var": 1.693988292922673,
"learning_rate": 0.0001,
"loss": 1.0762,
"loss/crossentropy": 2.7021005153656006,
"loss/hidden": 0.8671875,
"loss/logits": 0.1542307734489441,
"loss/reg": 0.005479689687490463,
"step": 494
},
{
"epoch": 0.061875,
"grad_norm": 2.065351963043213,
"grad_norm_var": 1.7911776893626628,
"learning_rate": 0.0001,
"loss": 1.015,
"loss/crossentropy": 2.4842755794525146,
"loss/hidden": 0.8203125,
"loss/logits": 0.13995476067066193,
"loss/reg": 0.005478002596646547,
"step": 495
},
{
"epoch": 0.062,
"grad_norm": 2.650660753250122,
"grad_norm_var": 1.8016636980513454,
"learning_rate": 0.0001,
"loss": 1.1699,
"loss/crossentropy": 2.3899097442626953,
"loss/hidden": 0.94921875,
"loss/logits": 0.16591498255729675,
"loss/reg": 0.005476430524140596,
"step": 496
},
{
"epoch": 0.062125,
"grad_norm": 3.412050724029541,
"grad_norm_var": 1.7970375838694677,
"learning_rate": 0.0001,
"loss": 1.1983,
"loss/crossentropy": 2.4459383487701416,
"loss/hidden": 0.94140625,
"loss/logits": 0.20212361216545105,
"loss/reg": 0.005474465899169445,
"step": 497
},
{
"epoch": 0.06225,
"grad_norm": 2.7389674186706543,
"grad_norm_var": 1.7804152177025587,
"learning_rate": 0.0001,
"loss": 1.1076,
"loss/crossentropy": 2.6794888973236084,
"loss/hidden": 0.90625,
"loss/logits": 0.1465749740600586,
"loss/reg": 0.005472847726196051,
"step": 498
},
{
"epoch": 0.062375,
"grad_norm": 20.56003761291504,
"grad_norm_var": 20.18846043733062,
"learning_rate": 0.0001,
"loss": 1.0568,
"loss/crossentropy": 2.527268409729004,
"loss/hidden": 0.859375,
"loss/logits": 0.14275437593460083,
"loss/reg": 0.005471326876431704,
"step": 499
},
{
"epoch": 0.0625,
"grad_norm": 2.9909119606018066,
"grad_norm_var": 20.013739807194945,
"learning_rate": 0.0001,
"loss": 1.0002,
"loss/crossentropy": 2.311053991317749,
"loss/hidden": 0.80859375,
"loss/logits": 0.13688521087169647,
"loss/reg": 0.005469587165862322,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2202930782208e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}