regup50mm / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
770d401 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0625,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000125,
"grad_norm": 2.377527952194214,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.2768,
"loss/crossentropy": 2.697097063064575,
"loss/hidden": 1.1171875,
"loss/logits": 0.15893849730491638,
"loss/reg": 6.247002602322027e-05,
"step": 1
},
{
"epoch": 0.00025,
"grad_norm": 4.216994762420654,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3752,
"loss/crossentropy": 3.101844310760498,
"loss/hidden": 1.1796875,
"loss/logits": 0.1949012577533722,
"loss/reg": 6.247002602322027e-05,
"step": 2
},
{
"epoch": 0.000375,
"grad_norm": 2.3287529945373535,
"learning_rate": 3e-06,
"loss": 1.2785,
"loss/crossentropy": 2.63712477684021,
"loss/hidden": 1.09375,
"loss/logits": 0.18410107493400574,
"loss/reg": 6.246996053960174e-05,
"step": 3
},
{
"epoch": 0.0005,
"grad_norm": 5.415231227874756,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4285,
"loss/crossentropy": 2.5702285766601562,
"loss/hidden": 1.265625,
"loss/logits": 0.16228657960891724,
"loss/reg": 6.246980774449185e-05,
"step": 4
},
{
"epoch": 0.000625,
"grad_norm": 4.888370513916016,
"learning_rate": 5e-06,
"loss": 1.5121,
"loss/crossentropy": 2.439383029937744,
"loss/hidden": 1.3125,
"loss/logits": 0.19899356365203857,
"loss/reg": 6.24695821898058e-05,
"step": 5
},
{
"epoch": 0.00075,
"grad_norm": 2.608705997467041,
"learning_rate": 6e-06,
"loss": 1.293,
"loss/crossentropy": 2.668699026107788,
"loss/hidden": 1.109375,
"loss/logits": 0.18298496305942535,
"loss/reg": 6.246933480724692e-05,
"step": 6
},
{
"epoch": 0.000875,
"grad_norm": 2.8447623252868652,
"learning_rate": 7.000000000000001e-06,
"loss": 1.5339,
"loss/crossentropy": 2.5219366550445557,
"loss/hidden": 1.296875,
"loss/logits": 0.2364223599433899,
"loss/reg": 6.246914563234895e-05,
"step": 7
},
{
"epoch": 0.001,
"grad_norm": 3.7877628803253174,
"learning_rate": 8.000000000000001e-06,
"loss": 1.8218,
"loss/crossentropy": 2.1927688121795654,
"loss/hidden": 1.5546875,
"loss/logits": 0.2664879262447357,
"loss/reg": 6.246889097383246e-05,
"step": 8
},
{
"epoch": 0.001125,
"grad_norm": 2.988516330718994,
"learning_rate": 9e-06,
"loss": 1.7373,
"loss/crossentropy": 2.3826897144317627,
"loss/hidden": 1.421875,
"loss/logits": 0.314752995967865,
"loss/reg": 6.246858538361266e-05,
"step": 9
},
{
"epoch": 0.00125,
"grad_norm": 2.143723726272583,
"learning_rate": 1e-05,
"loss": 1.405,
"loss/crossentropy": 2.2246415615081787,
"loss/hidden": 1.234375,
"loss/logits": 0.16997714340686798,
"loss/reg": 6.246842531254515e-05,
"step": 10
},
{
"epoch": 0.001375,
"grad_norm": 2.4413657188415527,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.4206,
"loss/crossentropy": 2.4612021446228027,
"loss/hidden": 1.1796875,
"loss/logits": 0.24033024907112122,
"loss/reg": 6.246819975785911e-05,
"step": 11
},
{
"epoch": 0.0015,
"grad_norm": 2.483156204223633,
"learning_rate": 1.2e-05,
"loss": 1.6449,
"loss/crossentropy": 2.2882771492004395,
"loss/hidden": 1.4140625,
"loss/logits": 0.23023059964179993,
"loss/reg": 6.246790871955454e-05,
"step": 12
},
{
"epoch": 0.001625,
"grad_norm": 2.7368147373199463,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.4981,
"loss/crossentropy": 2.6942052841186523,
"loss/hidden": 1.265625,
"loss/logits": 0.23185348510742188,
"loss/reg": 6.24675813014619e-05,
"step": 13
},
{
"epoch": 0.00175,
"grad_norm": 5.189184665679932,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.946,
"loss/crossentropy": 2.3771214485168457,
"loss/hidden": 1.625,
"loss/logits": 0.320385217666626,
"loss/reg": 6.246678822208196e-05,
"step": 14
},
{
"epoch": 0.001875,
"grad_norm": 2.305589437484741,
"learning_rate": 1.5e-05,
"loss": 1.4982,
"loss/crossentropy": 2.7562549114227295,
"loss/hidden": 1.25,
"loss/logits": 0.2476150244474411,
"loss/reg": 6.246620614547282e-05,
"step": 15
},
{
"epoch": 0.002,
"grad_norm": 2.3378520011901855,
"grad_norm_var": 1.2675163586822178,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3302,
"loss/crossentropy": 2.445441961288452,
"loss/hidden": 1.125,
"loss/logits": 0.20453599095344543,
"loss/reg": 6.246585689950734e-05,
"step": 16
},
{
"epoch": 0.002125,
"grad_norm": 1.7903435230255127,
"grad_norm_var": 1.3529406709866008,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.1333,
"loss/crossentropy": 2.323503017425537,
"loss/hidden": 0.984375,
"loss/logits": 0.14828170835971832,
"loss/reg": 6.246510747587308e-05,
"step": 17
},
{
"epoch": 0.00225,
"grad_norm": 3.363795518875122,
"grad_norm_var": 1.277817936381435,
"learning_rate": 1.8e-05,
"loss": 1.7292,
"loss/crossentropy": 2.6075525283813477,
"loss/hidden": 1.46875,
"loss/logits": 0.25987327098846436,
"loss/reg": 6.24642925686203e-05,
"step": 18
},
{
"epoch": 0.002375,
"grad_norm": 2.162050724029541,
"grad_norm_var": 1.2967721886362786,
"learning_rate": 1.9e-05,
"loss": 1.3146,
"loss/crossentropy": 2.570558786392212,
"loss/hidden": 1.125,
"loss/logits": 0.18898281455039978,
"loss/reg": 6.246323027880862e-05,
"step": 19
},
{
"epoch": 0.0025,
"grad_norm": 2.147024393081665,
"grad_norm_var": 0.9523869945360727,
"learning_rate": 2e-05,
"loss": 1.3484,
"loss/crossentropy": 2.6676244735717773,
"loss/hidden": 1.1484375,
"loss/logits": 0.19929195940494537,
"loss/reg": 6.246233533602208e-05,
"step": 20
},
{
"epoch": 0.002625,
"grad_norm": 2.0668728351593018,
"grad_norm_var": 0.6976603751830339,
"learning_rate": 2.1e-05,
"loss": 1.1929,
"loss/crossentropy": 2.401143789291382,
"loss/hidden": 1.03125,
"loss/logits": 0.1610003113746643,
"loss/reg": 6.246144039323553e-05,
"step": 21
},
{
"epoch": 0.00275,
"grad_norm": 2.8019566535949707,
"grad_norm_var": 0.6973240463492516,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.419,
"loss/crossentropy": 2.627523183822632,
"loss/hidden": 1.203125,
"loss/logits": 0.2152642011642456,
"loss/reg": 6.246032717172056e-05,
"step": 22
},
{
"epoch": 0.002875,
"grad_norm": 3.8118937015533447,
"grad_norm_var": 0.7713008187193999,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.4284,
"loss/crossentropy": 2.7227890491485596,
"loss/hidden": 1.1640625,
"loss/logits": 0.2637593150138855,
"loss/reg": 6.245896656764671e-05,
"step": 23
},
{
"epoch": 0.003,
"grad_norm": 2.1418018341064453,
"grad_norm_var": 0.7205284729945551,
"learning_rate": 2.4e-05,
"loss": 1.3002,
"loss/crossentropy": 2.545552968978882,
"loss/hidden": 1.1328125,
"loss/logits": 0.16680249571800232,
"loss/reg": 6.245774420676753e-05,
"step": 24
},
{
"epoch": 0.003125,
"grad_norm": 3.5331156253814697,
"grad_norm_var": 0.7613226543465996,
"learning_rate": 2.5e-05,
"loss": 1.3224,
"loss/crossentropy": 2.2371270656585693,
"loss/hidden": 1.15625,
"loss/logits": 0.16548338532447815,
"loss/reg": 6.245705299079418e-05,
"step": 25
},
{
"epoch": 0.00325,
"grad_norm": 1.9795947074890137,
"grad_norm_var": 0.7755306597344306,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.3209,
"loss/crossentropy": 2.7113037109375,
"loss/hidden": 1.1328125,
"loss/logits": 0.18742361664772034,
"loss/reg": 6.245569966267794e-05,
"step": 26
},
{
"epoch": 0.003375,
"grad_norm": 2.6044108867645264,
"grad_norm_var": 0.7714440385524235,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.4566,
"loss/crossentropy": 2.6034419536590576,
"loss/hidden": 1.2265625,
"loss/logits": 0.22937631607055664,
"loss/reg": 6.245376425795257e-05,
"step": 27
},
{
"epoch": 0.0035,
"grad_norm": 2.48085355758667,
"grad_norm_var": 0.7715158471256792,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.4579,
"loss/crossentropy": 2.5794363021850586,
"loss/hidden": 1.2421875,
"loss/logits": 0.21509718894958496,
"loss/reg": 6.245166878215969e-05,
"step": 28
},
{
"epoch": 0.003625,
"grad_norm": 3.0413854122161865,
"grad_norm_var": 0.7781660489700184,
"learning_rate": 2.9e-05,
"loss": 1.6102,
"loss/crossentropy": 2.4173922538757324,
"loss/hidden": 1.375,
"loss/logits": 0.23455965518951416,
"loss/reg": 6.244902033358812e-05,
"step": 29
},
{
"epoch": 0.00375,
"grad_norm": 2.1076390743255615,
"grad_norm_var": 0.36324525064493024,
"learning_rate": 3e-05,
"loss": 1.0735,
"loss/crossentropy": 2.4064886569976807,
"loss/hidden": 0.9453125,
"loss/logits": 0.12752822041511536,
"loss/reg": 6.244838004931808e-05,
"step": 30
},
{
"epoch": 0.003875,
"grad_norm": 2.5296630859375,
"grad_norm_var": 0.359312391151574,
"learning_rate": 3.1e-05,
"loss": 1.3467,
"loss/crossentropy": 2.61391544342041,
"loss/hidden": 1.15625,
"loss/logits": 0.18978667259216309,
"loss/reg": 6.244736141525209e-05,
"step": 31
},
{
"epoch": 0.004,
"grad_norm": 2.123671054840088,
"grad_norm_var": 0.3684168280400947,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.2191,
"loss/crossentropy": 2.6056668758392334,
"loss/hidden": 1.0546875,
"loss/logits": 0.16381201148033142,
"loss/reg": 6.244605174288154e-05,
"step": 32
},
{
"epoch": 0.004125,
"grad_norm": 3.685770034790039,
"grad_norm_var": 0.4027733703548923,
"learning_rate": 3.3e-05,
"loss": 1.6794,
"loss/crossentropy": 2.519561290740967,
"loss/hidden": 1.3828125,
"loss/logits": 0.29592496156692505,
"loss/reg": 6.24443418928422e-05,
"step": 33
},
{
"epoch": 0.00425,
"grad_norm": 1.9660468101501465,
"grad_norm_var": 0.393966226946808,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.3395,
"loss/crossentropy": 2.638051986694336,
"loss/hidden": 1.15625,
"loss/logits": 0.18261724710464478,
"loss/reg": 6.244314135983586e-05,
"step": 34
},
{
"epoch": 0.004375,
"grad_norm": 2.3111677169799805,
"grad_norm_var": 0.38716579449971367,
"learning_rate": 3.5e-05,
"loss": 1.3501,
"loss/crossentropy": 2.599940776824951,
"loss/hidden": 1.15625,
"loss/logits": 0.19327056407928467,
"loss/reg": 6.244215182960033e-05,
"step": 35
},
{
"epoch": 0.0045,
"grad_norm": 2.5357542037963867,
"grad_norm_var": 0.3739975607775089,
"learning_rate": 3.6e-05,
"loss": 1.287,
"loss/crossentropy": 2.9884798526763916,
"loss/hidden": 1.1171875,
"loss/logits": 0.16922441124916077,
"loss/reg": 6.244022370083258e-05,
"step": 36
},
{
"epoch": 0.004625,
"grad_norm": 1.7781621217727661,
"grad_norm_var": 0.40002233468076764,
"learning_rate": 3.7e-05,
"loss": 1.074,
"loss/crossentropy": 2.669071674346924,
"loss/hidden": 0.93359375,
"loss/logits": 0.13981276750564575,
"loss/reg": 6.243858661036938e-05,
"step": 37
},
{
"epoch": 0.00475,
"grad_norm": 24.6973819732666,
"grad_norm_var": 30.983207545217457,
"learning_rate": 3.8e-05,
"loss": 1.3637,
"loss/crossentropy": 2.482579469680786,
"loss/hidden": 1.1953125,
"loss/logits": 0.16777344048023224,
"loss/reg": 6.243725511012599e-05,
"step": 38
},
{
"epoch": 0.004875,
"grad_norm": 2.5728342533111572,
"grad_norm_var": 31.103302953089262,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.3424,
"loss/crossentropy": 2.2785422801971436,
"loss/hidden": 1.171875,
"loss/logits": 0.16988611221313477,
"loss/reg": 6.243555981200188e-05,
"step": 39
},
{
"epoch": 0.005,
"grad_norm": 1.7385622262954712,
"grad_norm_var": 31.206951393275006,
"learning_rate": 4e-05,
"loss": 1.077,
"loss/crossentropy": 2.7017714977264404,
"loss/hidden": 0.9453125,
"loss/logits": 0.13102804124355316,
"loss/reg": 6.243350071599707e-05,
"step": 40
},
{
"epoch": 0.005125,
"grad_norm": 2.455116033554077,
"grad_norm_var": 31.325901099338942,
"learning_rate": 4.1e-05,
"loss": 1.178,
"loss/crossentropy": 2.6521873474121094,
"loss/hidden": 1.015625,
"loss/logits": 0.16170336306095123,
"loss/reg": 6.243147072382271e-05,
"step": 41
},
{
"epoch": 0.00525,
"grad_norm": 3.0441935062408447,
"grad_norm_var": 31.14003983168487,
"learning_rate": 4.2e-05,
"loss": 1.488,
"loss/crossentropy": 2.5000290870666504,
"loss/hidden": 1.265625,
"loss/logits": 0.2217317819595337,
"loss/reg": 6.24291569693014e-05,
"step": 42
},
{
"epoch": 0.005375,
"grad_norm": 2.6227200031280518,
"grad_norm_var": 31.137008952861066,
"learning_rate": 4.3e-05,
"loss": 1.3106,
"loss/crossentropy": 2.6832528114318848,
"loss/hidden": 1.1171875,
"loss/logits": 0.19276997447013855,
"loss/reg": 6.242711242521182e-05,
"step": 43
},
{
"epoch": 0.0055,
"grad_norm": 2.9194633960723877,
"grad_norm_var": 31.06863081080745,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.5396,
"loss/crossentropy": 2.483938455581665,
"loss/hidden": 1.3046875,
"loss/logits": 0.23424991965293884,
"loss/reg": 6.242513336474076e-05,
"step": 44
},
{
"epoch": 0.005625,
"grad_norm": 2.2491037845611572,
"grad_norm_var": 31.196778907875057,
"learning_rate": 4.5e-05,
"loss": 1.2321,
"loss/crossentropy": 2.9735186100006104,
"loss/hidden": 1.0625,
"loss/logits": 0.1689363420009613,
"loss/reg": 6.242344534257427e-05,
"step": 45
},
{
"epoch": 0.00575,
"grad_norm": 2.687225103378296,
"grad_norm_var": 31.084396554405373,
"learning_rate": 4.600000000000001e-05,
"loss": 1.2443,
"loss/crossentropy": 2.913846254348755,
"loss/hidden": 1.0625,
"loss/logits": 0.18112678825855255,
"loss/reg": 6.242193921934813e-05,
"step": 46
},
{
"epoch": 0.005875,
"grad_norm": 2.3648312091827393,
"grad_norm_var": 31.1155476706496,
"learning_rate": 4.7e-05,
"loss": 1.2044,
"loss/crossentropy": 2.374119520187378,
"loss/hidden": 1.046875,
"loss/logits": 0.15688437223434448,
"loss/reg": 6.242006929824129e-05,
"step": 47
},
{
"epoch": 0.006,
"grad_norm": 1.896540880203247,
"grad_norm_var": 31.171339818602494,
"learning_rate": 4.8e-05,
"loss": 1.238,
"loss/crossentropy": 2.613962173461914,
"loss/hidden": 1.0546875,
"loss/logits": 0.1826920211315155,
"loss/reg": 6.24187450739555e-05,
"step": 48
},
{
"epoch": 0.006125,
"grad_norm": 1.7585434913635254,
"grad_norm_var": 31.44447201393312,
"learning_rate": 4.9e-05,
"loss": 1.1411,
"loss/crossentropy": 2.5672757625579834,
"loss/hidden": 1.0,
"loss/logits": 0.14043202996253967,
"loss/reg": 6.241785740712658e-05,
"step": 49
},
{
"epoch": 0.00625,
"grad_norm": 1.8257592916488647,
"grad_norm_var": 31.47860052328912,
"learning_rate": 5e-05,
"loss": 1.2643,
"loss/crossentropy": 2.4829366207122803,
"loss/hidden": 1.0859375,
"loss/logits": 0.1777852475643158,
"loss/reg": 6.2416227592621e-05,
"step": 50
},
{
"epoch": 0.006375,
"grad_norm": 1.9530550241470337,
"grad_norm_var": 31.553698309541367,
"learning_rate": 5.1000000000000006e-05,
"loss": 1.1787,
"loss/crossentropy": 2.501922369003296,
"loss/hidden": 1.015625,
"loss/logits": 0.16241338849067688,
"loss/reg": 6.241373193915933e-05,
"step": 51
},
{
"epoch": 0.0065,
"grad_norm": 2.366898536682129,
"grad_norm_var": 31.58155048439878,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.476,
"loss/crossentropy": 2.557314872741699,
"loss/hidden": 1.234375,
"loss/logits": 0.24098029732704163,
"loss/reg": 6.241213122848421e-05,
"step": 52
},
{
"epoch": 0.006625,
"grad_norm": 2.139944553375244,
"grad_norm_var": 31.497838767117898,
"learning_rate": 5.300000000000001e-05,
"loss": 1.3057,
"loss/crossentropy": 2.5664379596710205,
"loss/hidden": 1.125,
"loss/logits": 0.18005570769309998,
"loss/reg": 6.241026130737737e-05,
"step": 53
},
{
"epoch": 0.00675,
"grad_norm": 2.2614963054656982,
"grad_norm_var": 0.16298419379227144,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.2081,
"loss/crossentropy": 2.5651533603668213,
"loss/hidden": 1.046875,
"loss/logits": 0.1606135070323944,
"loss/reg": 6.240784568944946e-05,
"step": 54
},
{
"epoch": 0.006875,
"grad_norm": 1.88372802734375,
"grad_norm_var": 0.16791840248250048,
"learning_rate": 5.500000000000001e-05,
"loss": 1.2037,
"loss/crossentropy": 2.0431623458862305,
"loss/hidden": 1.0703125,
"loss/logits": 0.13271506130695343,
"loss/reg": 6.240410584723577e-05,
"step": 55
},
{
"epoch": 0.007,
"grad_norm": 1.7579172849655151,
"grad_norm_var": 0.16659499666655736,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.0787,
"loss/crossentropy": 2.5805883407592773,
"loss/hidden": 0.94140625,
"loss/logits": 0.13670633733272552,
"loss/reg": 6.240163202164695e-05,
"step": 56
},
{
"epoch": 0.007125,
"grad_norm": 2.740758180618286,
"grad_norm_var": 0.17906241043444873,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.2499,
"loss/crossentropy": 2.821078062057495,
"loss/hidden": 1.0859375,
"loss/logits": 0.16337308287620544,
"loss/reg": 6.239958747755736e-05,
"step": 57
},
{
"epoch": 0.00725,
"grad_norm": 3.3393216133117676,
"grad_norm_var": 0.21459676497742203,
"learning_rate": 5.8e-05,
"loss": 1.5094,
"loss/crossentropy": 2.6574273109436035,
"loss/hidden": 1.2265625,
"loss/logits": 0.2822623550891876,
"loss/reg": 6.239775393623859e-05,
"step": 58
},
{
"epoch": 0.007375,
"grad_norm": 2.1151742935180664,
"grad_norm_var": 0.20871929877454623,
"learning_rate": 5.9e-05,
"loss": 1.31,
"loss/crossentropy": 2.28176212310791,
"loss/hidden": 1.125,
"loss/logits": 0.18433833122253418,
"loss/reg": 6.239649519557133e-05,
"step": 59
},
{
"epoch": 0.0075,
"grad_norm": 1.9203850030899048,
"grad_norm_var": 0.18408730894700795,
"learning_rate": 6e-05,
"loss": 1.2862,
"loss/crossentropy": 2.319091558456421,
"loss/hidden": 1.09375,
"loss/logits": 0.1918697953224182,
"loss/reg": 6.239335925783962e-05,
"step": 60
},
{
"epoch": 0.007625,
"grad_norm": 2.689425230026245,
"grad_norm_var": 0.1988651894699956,
"learning_rate": 6.1e-05,
"loss": 1.2077,
"loss/crossentropy": 2.396440029144287,
"loss/hidden": 1.0546875,
"loss/logits": 0.1523526906967163,
"loss/reg": 6.239157664822415e-05,
"step": 61
},
{
"epoch": 0.00775,
"grad_norm": 2.0848548412323,
"grad_norm_var": 0.184926237897677,
"learning_rate": 6.2e-05,
"loss": 1.1889,
"loss/crossentropy": 2.375331401824951,
"loss/hidden": 1.03125,
"loss/logits": 0.15707406401634216,
"loss/reg": 6.238814967218786e-05,
"step": 62
},
{
"epoch": 0.007875,
"grad_norm": 1.9770179986953735,
"grad_norm_var": 0.18547542502594508,
"learning_rate": 6.3e-05,
"loss": 1.1255,
"loss/crossentropy": 2.5883288383483887,
"loss/hidden": 0.984375,
"loss/logits": 0.14046350121498108,
"loss/reg": 6.238514470169321e-05,
"step": 63
},
{
"epoch": 0.008,
"grad_norm": 1.9654349088668823,
"grad_norm_var": 0.1832653842408547,
"learning_rate": 6.400000000000001e-05,
"loss": 1.1315,
"loss/crossentropy": 2.6122260093688965,
"loss/hidden": 0.9765625,
"loss/logits": 0.1543133556842804,
"loss/reg": 6.238299101823941e-05,
"step": 64
},
{
"epoch": 0.008125,
"grad_norm": 2.110621690750122,
"grad_norm_var": 0.1715223081433841,
"learning_rate": 6.500000000000001e-05,
"loss": 1.1513,
"loss/crossentropy": 2.3829517364501953,
"loss/hidden": 1.0,
"loss/logits": 0.15063607692718506,
"loss/reg": 6.237896013772115e-05,
"step": 65
},
{
"epoch": 0.00825,
"grad_norm": 3.1477179527282715,
"grad_norm_var": 0.21553302023151552,
"learning_rate": 6.6e-05,
"loss": 1.4659,
"loss/crossentropy": 2.2805211544036865,
"loss/hidden": 1.2421875,
"loss/logits": 0.22310970723628998,
"loss/reg": 6.237393972696736e-05,
"step": 66
},
{
"epoch": 0.008375,
"grad_norm": 2.482203722000122,
"grad_norm_var": 0.21008166056666275,
"learning_rate": 6.7e-05,
"loss": 1.0839,
"loss/crossentropy": 2.982119560241699,
"loss/hidden": 0.94140625,
"loss/logits": 0.14186254143714905,
"loss/reg": 6.236990884644911e-05,
"step": 67
},
{
"epoch": 0.0085,
"grad_norm": 2.198028087615967,
"grad_norm_var": 0.21061508280485744,
"learning_rate": 6.800000000000001e-05,
"loss": 1.2007,
"loss/crossentropy": 2.725332498550415,
"loss/hidden": 1.0390625,
"loss/logits": 0.1610267162322998,
"loss/reg": 6.236397166503593e-05,
"step": 68
},
{
"epoch": 0.008625,
"grad_norm": 1.9412530660629272,
"grad_norm_var": 0.21734592747188602,
"learning_rate": 6.9e-05,
"loss": 1.1269,
"loss/crossentropy": 2.682379722595215,
"loss/hidden": 0.984375,
"loss/logits": 0.14185243844985962,
"loss/reg": 6.235777982510626e-05,
"step": 69
},
{
"epoch": 0.00875,
"grad_norm": 2.223443031311035,
"grad_norm_var": 0.21757323137186588,
"learning_rate": 7e-05,
"loss": 1.3663,
"loss/crossentropy": 2.6186935901641846,
"loss/hidden": 1.1640625,
"loss/logits": 0.2016535997390747,
"loss/reg": 6.23530286247842e-05,
"step": 70
},
{
"epoch": 0.008875,
"grad_norm": 3.4456241130828857,
"grad_norm_var": 0.28625219910078287,
"learning_rate": 7.1e-05,
"loss": 1.6214,
"loss/crossentropy": 2.054266929626465,
"loss/hidden": 1.421875,
"loss/logits": 0.19887767732143402,
"loss/reg": 6.234741158550605e-05,
"step": 71
},
{
"epoch": 0.009,
"grad_norm": 1.9013352394104004,
"grad_norm_var": 0.27557130255187207,
"learning_rate": 7.2e-05,
"loss": 1.1365,
"loss/crossentropy": 2.422841787338257,
"loss/hidden": 0.9765625,
"loss/logits": 0.15926527976989746,
"loss/reg": 6.234211468836293e-05,
"step": 72
},
{
"epoch": 0.009125,
"grad_norm": 2.4032697677612305,
"grad_norm_var": 0.267026183625853,
"learning_rate": 7.3e-05,
"loss": 1.4414,
"loss/crossentropy": 2.4159440994262695,
"loss/hidden": 1.21875,
"loss/logits": 0.22204136848449707,
"loss/reg": 6.233662861632183e-05,
"step": 73
},
{
"epoch": 0.00925,
"grad_norm": 1.915128231048584,
"grad_norm_var": 0.21002777018266153,
"learning_rate": 7.4e-05,
"loss": 1.2439,
"loss/crossentropy": 2.587275505065918,
"loss/hidden": 1.0625,
"loss/logits": 0.1807810664176941,
"loss/reg": 6.232755549717695e-05,
"step": 74
},
{
"epoch": 0.009375,
"grad_norm": 3.4048879146575928,
"grad_norm_var": 0.28520435687560547,
"learning_rate": 7.500000000000001e-05,
"loss": 1.2774,
"loss/crossentropy": 2.6182703971862793,
"loss/hidden": 1.125,
"loss/logits": 0.15172982215881348,
"loss/reg": 6.231923180166632e-05,
"step": 75
},
{
"epoch": 0.0095,
"grad_norm": 2.3605074882507324,
"grad_norm_var": 0.27132747056331724,
"learning_rate": 7.6e-05,
"loss": 1.1409,
"loss/crossentropy": 2.6013262271881104,
"loss/hidden": 0.98828125,
"loss/logits": 0.151985764503479,
"loss/reg": 6.231063889572397e-05,
"step": 76
},
{
"epoch": 0.009625,
"grad_norm": 2.6056039333343506,
"grad_norm_var": 0.2684276793201585,
"learning_rate": 7.7e-05,
"loss": 1.1,
"loss/crossentropy": 2.534158945083618,
"loss/hidden": 0.94921875,
"loss/logits": 0.1501779407262802,
"loss/reg": 6.230256258277223e-05,
"step": 77
},
{
"epoch": 0.00975,
"grad_norm": 1.7923972606658936,
"grad_norm_var": 0.285494251958092,
"learning_rate": 7.800000000000001e-05,
"loss": 1.1471,
"loss/crossentropy": 2.3036601543426514,
"loss/hidden": 0.98828125,
"loss/logits": 0.15817409753799438,
"loss/reg": 6.229766586329788e-05,
"step": 78
},
{
"epoch": 0.009875,
"grad_norm": 2.0376312732696533,
"grad_norm_var": 0.2825708803585835,
"learning_rate": 7.900000000000001e-05,
"loss": 1.2985,
"loss/crossentropy": 2.5548579692840576,
"loss/hidden": 1.140625,
"loss/logits": 0.1572834551334381,
"loss/reg": 6.229063728824258e-05,
"step": 79
},
{
"epoch": 0.01,
"grad_norm": 2.998662233352661,
"grad_norm_var": 0.29342903010298654,
"learning_rate": 8e-05,
"loss": 1.5504,
"loss/crossentropy": 2.4098215103149414,
"loss/hidden": 1.3046875,
"loss/logits": 0.24512597918510437,
"loss/reg": 6.22822335571982e-05,
"step": 80
},
{
"epoch": 0.010125,
"grad_norm": 2.103449583053589,
"grad_norm_var": 0.29374293883859787,
"learning_rate": 8.1e-05,
"loss": 1.2985,
"loss/crossentropy": 2.380378484725952,
"loss/hidden": 1.125,
"loss/logits": 0.17282900214195251,
"loss/reg": 6.227292760740966e-05,
"step": 81
},
{
"epoch": 0.01025,
"grad_norm": 2.6376256942749023,
"grad_norm_var": 0.2615363410208279,
"learning_rate": 8.2e-05,
"loss": 1.266,
"loss/crossentropy": 2.4291374683380127,
"loss/hidden": 1.1015625,
"loss/logits": 0.16384728252887726,
"loss/reg": 6.226752884685993e-05,
"step": 82
},
{
"epoch": 0.010375,
"grad_norm": 2.0763561725616455,
"grad_norm_var": 0.2675552215302521,
"learning_rate": 8.3e-05,
"loss": 1.1733,
"loss/crossentropy": 2.423896312713623,
"loss/hidden": 1.015625,
"loss/logits": 0.15705125033855438,
"loss/reg": 6.225931429071352e-05,
"step": 83
},
{
"epoch": 0.0105,
"grad_norm": 4.398110866546631,
"grad_norm_var": 0.5173355174320988,
"learning_rate": 8.4e-05,
"loss": 1.5654,
"loss/crossentropy": 2.230816602706909,
"loss/hidden": 1.296875,
"loss/logits": 0.26791903376579285,
"loss/reg": 6.225006654858589e-05,
"step": 84
},
{
"epoch": 0.010625,
"grad_norm": 2.7163784503936768,
"grad_norm_var": 0.4955558090734691,
"learning_rate": 8.5e-05,
"loss": 1.2008,
"loss/crossentropy": 2.1671087741851807,
"loss/hidden": 1.0546875,
"loss/logits": 0.145525261759758,
"loss/reg": 6.224414391908795e-05,
"step": 85
},
{
"epoch": 0.01075,
"grad_norm": 1.9465394020080566,
"grad_norm_var": 0.5129132822581631,
"learning_rate": 8.6e-05,
"loss": 1.0109,
"loss/crossentropy": 2.218550443649292,
"loss/hidden": 0.90234375,
"loss/logits": 0.10795612633228302,
"loss/reg": 6.22385778115131e-05,
"step": 86
},
{
"epoch": 0.010875,
"grad_norm": 5.668015956878662,
"grad_norm_var": 1.0880389746416426,
"learning_rate": 8.7e-05,
"loss": 1.2925,
"loss/crossentropy": 2.360995292663574,
"loss/hidden": 1.1484375,
"loss/logits": 0.1434704214334488,
"loss/reg": 6.223141826922074e-05,
"step": 87
},
{
"epoch": 0.011,
"grad_norm": 3.4049394130706787,
"grad_norm_var": 1.0721571012465496,
"learning_rate": 8.800000000000001e-05,
"loss": 1.6353,
"loss/crossentropy": 1.9898579120635986,
"loss/hidden": 1.3828125,
"loss/logits": 0.25186440348625183,
"loss/reg": 6.222462252480909e-05,
"step": 88
},
{
"epoch": 0.011125,
"grad_norm": 1.885895013809204,
"grad_norm_var": 1.1148297312339375,
"learning_rate": 8.900000000000001e-05,
"loss": 1.0561,
"loss/crossentropy": 2.670912027359009,
"loss/hidden": 0.92578125,
"loss/logits": 0.12972213327884674,
"loss/reg": 6.221828516572714e-05,
"step": 89
},
{
"epoch": 0.01125,
"grad_norm": 1.886960506439209,
"grad_norm_var": 1.118003608268531,
"learning_rate": 9e-05,
"loss": 1.1335,
"loss/crossentropy": 2.5691866874694824,
"loss/hidden": 0.97265625,
"loss/logits": 0.16021151840686798,
"loss/reg": 6.221193325472996e-05,
"step": 90
},
{
"epoch": 0.011375,
"grad_norm": 3.117880344390869,
"grad_norm_var": 1.0979090394478965,
"learning_rate": 9.1e-05,
"loss": 1.3175,
"loss/crossentropy": 2.7383711338043213,
"loss/hidden": 1.140625,
"loss/logits": 0.1762513369321823,
"loss/reg": 6.220516661414877e-05,
"step": 91
},
{
"epoch": 0.0115,
"grad_norm": 2.5928220748901367,
"grad_norm_var": 1.0899203711980436,
"learning_rate": 9.200000000000001e-05,
"loss": 1.3898,
"loss/crossentropy": 2.255321741104126,
"loss/hidden": 1.171875,
"loss/logits": 0.21727776527404785,
"loss/reg": 6.2199542298913e-05,
"step": 92
},
{
"epoch": 0.011625,
"grad_norm": 2.5842387676239014,
"grad_norm_var": 1.09033696415262,
"learning_rate": 9.300000000000001e-05,
"loss": 1.3599,
"loss/crossentropy": 2.7780256271362305,
"loss/hidden": 1.15625,
"loss/logits": 0.203078031539917,
"loss/reg": 6.219152419362217e-05,
"step": 93
},
{
"epoch": 0.01175,
"grad_norm": 2.497912645339966,
"grad_norm_var": 1.032260222561935,
"learning_rate": 9.4e-05,
"loss": 1.2791,
"loss/crossentropy": 2.0482513904571533,
"loss/hidden": 1.109375,
"loss/logits": 0.16910339891910553,
"loss/reg": 6.218066846486181e-05,
"step": 94
},
{
"epoch": 0.011875,
"grad_norm": 2.1033713817596436,
"grad_norm_var": 1.0259829914817806,
"learning_rate": 9.5e-05,
"loss": 1.0875,
"loss/crossentropy": 2.427816152572632,
"loss/hidden": 0.94921875,
"loss/logits": 0.13770164549350739,
"loss/reg": 6.21745057287626e-05,
"step": 95
},
{
"epoch": 0.012,
"grad_norm": 2.063559055328369,
"grad_norm_var": 1.0544556100156115,
"learning_rate": 9.6e-05,
"loss": 1.217,
"loss/crossentropy": 2.498270034790039,
"loss/hidden": 1.046875,
"loss/logits": 0.16950619220733643,
"loss/reg": 6.216309702722356e-05,
"step": 96
},
{
"epoch": 0.012125,
"grad_norm": 2.3693654537200928,
"grad_norm_var": 1.036651450071012,
"learning_rate": 9.7e-05,
"loss": 1.2016,
"loss/crossentropy": 2.8368701934814453,
"loss/hidden": 1.0390625,
"loss/logits": 0.16189493238925934,
"loss/reg": 6.215785833774135e-05,
"step": 97
},
{
"epoch": 0.01225,
"grad_norm": 2.2980258464813232,
"grad_norm_var": 1.0488061784492646,
"learning_rate": 9.8e-05,
"loss": 1.5249,
"loss/crossentropy": 2.194488525390625,
"loss/hidden": 1.2421875,
"loss/logits": 0.2820858359336853,
"loss/reg": 6.215048051672056e-05,
"step": 98
},
{
"epoch": 0.012375,
"grad_norm": 3.147524833679199,
"grad_norm_var": 1.0277853179901806,
"learning_rate": 9.900000000000001e-05,
"loss": 1.7374,
"loss/crossentropy": 2.7856016159057617,
"loss/hidden": 1.4609375,
"loss/logits": 0.27581536769866943,
"loss/reg": 6.214459426701069e-05,
"step": 99
},
{
"epoch": 0.0125,
"grad_norm": 2.1317031383514404,
"grad_norm_var": 0.8636563030021608,
"learning_rate": 0.0001,
"loss": 1.3633,
"loss/crossentropy": 2.282402753829956,
"loss/hidden": 1.1484375,
"loss/logits": 0.2142634242773056,
"loss/reg": 6.213640881469473e-05,
"step": 100
},
{
"epoch": 0.012625,
"grad_norm": 2.2720911502838135,
"grad_norm_var": 0.8721171319962743,
"learning_rate": 0.0001,
"loss": 1.2405,
"loss/crossentropy": 2.8501064777374268,
"loss/hidden": 1.0625,
"loss/logits": 0.17741592228412628,
"loss/reg": 6.21288490947336e-05,
"step": 101
},
{
"epoch": 0.01275,
"grad_norm": 2.879110097885132,
"grad_norm_var": 0.8423375514351165,
"learning_rate": 0.0001,
"loss": 1.3486,
"loss/crossentropy": 2.4649596214294434,
"loss/hidden": 1.171875,
"loss/logits": 0.1761254221200943,
"loss/reg": 6.211963773239404e-05,
"step": 102
},
{
"epoch": 0.012875,
"grad_norm": 2.2214345932006836,
"grad_norm_var": 0.2123174305005847,
"learning_rate": 0.0001,
"loss": 1.1049,
"loss/crossentropy": 2.513540029525757,
"loss/hidden": 0.96484375,
"loss/logits": 0.13943374156951904,
"loss/reg": 6.21131548541598e-05,
"step": 103
},
{
"epoch": 0.013,
"grad_norm": 1.9674383401870728,
"grad_norm_var": 0.16151448650877043,
"learning_rate": 0.0001,
"loss": 1.2055,
"loss/crossentropy": 2.4960575103759766,
"loss/hidden": 1.03125,
"loss/logits": 0.17365112900733948,
"loss/reg": 6.210394349182025e-05,
"step": 104
},
{
"epoch": 0.013125,
"grad_norm": 2.152989387512207,
"grad_norm_var": 0.1485118756217919,
"learning_rate": 0.0001,
"loss": 1.3728,
"loss/crossentropy": 2.651463508605957,
"loss/hidden": 1.1796875,
"loss/logits": 0.1924474835395813,
"loss/reg": 6.209702405612916e-05,
"step": 105
},
{
"epoch": 0.01325,
"grad_norm": 2.591555118560791,
"grad_norm_var": 0.13200909593287988,
"learning_rate": 0.0001,
"loss": 1.5933,
"loss/crossentropy": 2.1848952770233154,
"loss/hidden": 1.375,
"loss/logits": 0.21770122647285461,
"loss/reg": 6.208720878930762e-05,
"step": 106
},
{
"epoch": 0.013375,
"grad_norm": 2.205780029296875,
"grad_norm_var": 0.10119294371901374,
"learning_rate": 0.0001,
"loss": 0.9785,
"loss/crossentropy": 2.4988999366760254,
"loss/hidden": 0.8671875,
"loss/logits": 0.1106652021408081,
"loss/reg": 6.207643309608102e-05,
"step": 107
},
{
"epoch": 0.0135,
"grad_norm": 2.427882671356201,
"grad_norm_var": 0.09821140867718908,
"learning_rate": 0.0001,
"loss": 1.2968,
"loss/crossentropy": 2.5072600841522217,
"loss/hidden": 1.09375,
"loss/logits": 0.20241403579711914,
"loss/reg": 6.206895341165364e-05,
"step": 108
},
{
"epoch": 0.013625,
"grad_norm": 2.4435040950775146,
"grad_norm_var": 0.09542213222792188,
"learning_rate": 0.0001,
"loss": 1.2803,
"loss/crossentropy": 2.2629339694976807,
"loss/hidden": 1.1015625,
"loss/logits": 0.17810457944869995,
"loss/reg": 6.205752288224176e-05,
"step": 109
},
{
"epoch": 0.01375,
"grad_norm": 2.9938735961914062,
"grad_norm_var": 0.11986086275213564,
"learning_rate": 0.0001,
"loss": 1.2708,
"loss/crossentropy": 2.5084388256073,
"loss/hidden": 1.09375,
"loss/logits": 0.1764756739139557,
"loss/reg": 6.204319652169943e-05,
"step": 110
},
{
"epoch": 0.013875,
"grad_norm": 2.499802827835083,
"grad_norm_var": 0.11443625726480532,
"learning_rate": 0.0001,
"loss": 1.3281,
"loss/crossentropy": 2.342087507247925,
"loss/hidden": 1.15625,
"loss/logits": 0.17120838165283203,
"loss/reg": 6.20328210061416e-05,
"step": 111
},
{
"epoch": 0.014,
"grad_norm": 3.28193736076355,
"grad_norm_var": 0.149862047644675,
"learning_rate": 0.0001,
"loss": 1.3891,
"loss/crossentropy": 2.396040916442871,
"loss/hidden": 1.1953125,
"loss/logits": 0.193180650472641,
"loss/reg": 6.202506483532488e-05,
"step": 112
},
{
"epoch": 0.014125,
"grad_norm": 2.2074780464172363,
"grad_norm_var": 0.15416329735346365,
"learning_rate": 0.0001,
"loss": 1.2137,
"loss/crossentropy": 2.501718759536743,
"loss/hidden": 1.0546875,
"loss/logits": 0.1583903729915619,
"loss/reg": 6.201667565619573e-05,
"step": 113
},
{
"epoch": 0.01425,
"grad_norm": 2.888498306274414,
"grad_norm_var": 0.1614203311265588,
"learning_rate": 0.0001,
"loss": 1.3498,
"loss/crossentropy": 3.097370147705078,
"loss/hidden": 1.15625,
"loss/logits": 0.19293376803398132,
"loss/reg": 6.200573989190161e-05,
"step": 114
},
{
"epoch": 0.014375,
"grad_norm": 2.385442018508911,
"grad_norm_var": 0.1339080451651928,
"learning_rate": 0.0001,
"loss": 1.3415,
"loss/crossentropy": 2.4950473308563232,
"loss/hidden": 1.15625,
"loss/logits": 0.18464481830596924,
"loss/reg": 6.199457857292145e-05,
"step": 115
},
{
"epoch": 0.0145,
"grad_norm": 3.3269190788269043,
"grad_norm_var": 0.16897616880053803,
"learning_rate": 0.0001,
"loss": 1.6405,
"loss/crossentropy": 2.19484806060791,
"loss/hidden": 1.3828125,
"loss/logits": 0.2570968270301819,
"loss/reg": 6.198590563144535e-05,
"step": 116
},
{
"epoch": 0.014625,
"grad_norm": 2.2415361404418945,
"grad_norm_var": 0.17015290356553733,
"learning_rate": 0.0001,
"loss": 1.2381,
"loss/crossentropy": 2.540816068649292,
"loss/hidden": 1.0625,
"loss/logits": 0.1749531626701355,
"loss/reg": 6.197726906975731e-05,
"step": 117
},
{
"epoch": 0.01475,
"grad_norm": 2.397615671157837,
"grad_norm_var": 0.1631737555736056,
"learning_rate": 0.0001,
"loss": 1.2192,
"loss/crossentropy": 2.6213266849517822,
"loss/hidden": 1.0546875,
"loss/logits": 0.16386428475379944,
"loss/reg": 6.197066250024363e-05,
"step": 118
},
{
"epoch": 0.014875,
"grad_norm": 2.75325345993042,
"grad_norm_var": 0.16006220619054398,
"learning_rate": 0.0001,
"loss": 1.5693,
"loss/crossentropy": 2.3850035667419434,
"loss/hidden": 1.34375,
"loss/logits": 0.2249460369348526,
"loss/reg": 6.196285539772362e-05,
"step": 119
},
{
"epoch": 0.015,
"grad_norm": 2.675480842590332,
"grad_norm_var": 0.13660137165245084,
"learning_rate": 0.0001,
"loss": 1.299,
"loss/crossentropy": 2.380896806716919,
"loss/hidden": 1.125,
"loss/logits": 0.17339974641799927,
"loss/reg": 6.195474998094141e-05,
"step": 120
},
{
"epoch": 0.015125,
"grad_norm": 2.611541509628296,
"grad_norm_var": 0.12289609882195597,
"learning_rate": 0.0001,
"loss": 1.2924,
"loss/crossentropy": 2.7064404487609863,
"loss/hidden": 1.109375,
"loss/logits": 0.18236055970191956,
"loss/reg": 6.194705929374322e-05,
"step": 121
},
{
"epoch": 0.01525,
"grad_norm": 2.3449323177337646,
"grad_norm_var": 0.12765774775469155,
"learning_rate": 0.0001,
"loss": 1.2957,
"loss/crossentropy": 2.5846447944641113,
"loss/hidden": 1.1171875,
"loss/logits": 0.17786133289337158,
"loss/reg": 6.193818262545392e-05,
"step": 122
},
{
"epoch": 0.015375,
"grad_norm": 2.1001734733581543,
"grad_norm_var": 0.13398098136615483,
"learning_rate": 0.0001,
"loss": 1.1704,
"loss/crossentropy": 2.504185676574707,
"loss/hidden": 1.015625,
"loss/logits": 0.15416675806045532,
"loss/reg": 6.192670116433874e-05,
"step": 123
},
{
"epoch": 0.0155,
"grad_norm": 2.365839719772339,
"grad_norm_var": 0.13563497966163046,
"learning_rate": 0.0001,
"loss": 1.3773,
"loss/crossentropy": 2.3259832859039307,
"loss/hidden": 1.171875,
"loss/logits": 0.20480972528457642,
"loss/reg": 6.19165730313398e-05,
"step": 124
},
{
"epoch": 0.015625,
"grad_norm": 2.1480026245117188,
"grad_norm_var": 0.1470561705316013,
"learning_rate": 0.0001,
"loss": 1.2768,
"loss/crossentropy": 2.288093090057373,
"loss/hidden": 1.109375,
"loss/logits": 0.16683252155780792,
"loss/reg": 6.19063139311038e-05,
"step": 125
},
{
"epoch": 0.01575,
"grad_norm": 2.2346343994140625,
"grad_norm_var": 0.14082182611320845,
"learning_rate": 0.0001,
"loss": 1.1441,
"loss/crossentropy": 2.6062135696411133,
"loss/hidden": 1.0,
"loss/logits": 0.14351129531860352,
"loss/reg": 6.189729174366221e-05,
"step": 126
},
{
"epoch": 0.015875,
"grad_norm": 3.187627077102661,
"grad_norm_var": 0.16771827237098264,
"learning_rate": 0.0001,
"loss": 1.4505,
"loss/crossentropy": 2.3607077598571777,
"loss/hidden": 1.2265625,
"loss/logits": 0.22327345609664917,
"loss/reg": 6.189044506754726e-05,
"step": 127
},
{
"epoch": 0.016,
"grad_norm": 2.1208789348602295,
"grad_norm_var": 0.1420574537193353,
"learning_rate": 0.0001,
"loss": 1.1414,
"loss/crossentropy": 2.408287286758423,
"loss/hidden": 1.0,
"loss/logits": 0.14076298475265503,
"loss/reg": 6.188445695443079e-05,
"step": 128
},
{
"epoch": 0.016125,
"grad_norm": 2.4475457668304443,
"grad_norm_var": 0.13631644029428572,
"learning_rate": 0.0001,
"loss": 1.2863,
"loss/crossentropy": 2.4705042839050293,
"loss/hidden": 1.1171875,
"loss/logits": 0.16846278309822083,
"loss/reg": 6.187462713569403e-05,
"step": 129
},
{
"epoch": 0.01625,
"grad_norm": 2.3132476806640625,
"grad_norm_var": 0.128302854564951,
"learning_rate": 0.0001,
"loss": 1.2265,
"loss/crossentropy": 2.323221445083618,
"loss/hidden": 1.0625,
"loss/logits": 0.16340406239032745,
"loss/reg": 6.18634803686291e-05,
"step": 130
},
{
"epoch": 0.016375,
"grad_norm": 2.6015546321868896,
"grad_norm_var": 0.12854282273958592,
"learning_rate": 0.0001,
"loss": 1.0946,
"loss/crossentropy": 2.554730176925659,
"loss/hidden": 0.9609375,
"loss/logits": 0.13307343423366547,
"loss/reg": 6.185180245665833e-05,
"step": 131
},
{
"epoch": 0.0165,
"grad_norm": 2.040545701980591,
"grad_norm_var": 0.08874970269449302,
"learning_rate": 0.0001,
"loss": 1.1715,
"loss/crossentropy": 2.6177141666412354,
"loss/hidden": 1.0078125,
"loss/logits": 0.163020521402359,
"loss/reg": 6.184292578836903e-05,
"step": 132
},
{
"epoch": 0.016625,
"grad_norm": 2.4451427459716797,
"grad_norm_var": 0.08672588329890019,
"learning_rate": 0.0001,
"loss": 1.2794,
"loss/crossentropy": 2.6671459674835205,
"loss/hidden": 1.109375,
"loss/logits": 0.16941678524017334,
"loss/reg": 6.18349076830782e-05,
"step": 133
},
{
"epoch": 0.01675,
"grad_norm": 2.5730879306793213,
"grad_norm_var": 0.08802712142174655,
"learning_rate": 0.0001,
"loss": 1.356,
"loss/crossentropy": 2.483858585357666,
"loss/hidden": 1.171875,
"loss/logits": 0.1835438758134842,
"loss/reg": 6.182605284266174e-05,
"step": 134
},
{
"epoch": 0.016875,
"grad_norm": 2.996643543243408,
"grad_norm_var": 0.10205043083370029,
"learning_rate": 0.0001,
"loss": 1.5067,
"loss/crossentropy": 2.267930507659912,
"loss/hidden": 1.3046875,
"loss/logits": 0.20140591263771057,
"loss/reg": 6.181577919051051e-05,
"step": 135
},
{
"epoch": 0.017,
"grad_norm": 2.2333881855010986,
"grad_norm_var": 0.10100001995976887,
"learning_rate": 0.0001,
"loss": 1.23,
"loss/crossentropy": 2.552584648132324,
"loss/hidden": 1.0546875,
"loss/logits": 0.17466390132904053,
"loss/reg": 6.180404307087883e-05,
"step": 136
},
{
"epoch": 0.017125,
"grad_norm": 2.476086378097534,
"grad_norm_var": 0.09873795942098601,
"learning_rate": 0.0001,
"loss": 1.2347,
"loss/crossentropy": 2.2955551147460938,
"loss/hidden": 1.09375,
"loss/logits": 0.1402929574251175,
"loss/reg": 6.179526099003851e-05,
"step": 137
},
{
"epoch": 0.01725,
"grad_norm": 2.9701859951019287,
"grad_norm_var": 0.11738609069977789,
"learning_rate": 0.0001,
"loss": 1.1041,
"loss/crossentropy": 2.4560158252716064,
"loss/hidden": 0.97265625,
"loss/logits": 0.1307787150144577,
"loss/reg": 6.178120383992791e-05,
"step": 138
},
{
"epoch": 0.017375,
"grad_norm": 2.151567220687866,
"grad_norm_var": 0.11513060923898569,
"learning_rate": 0.0001,
"loss": 1.1406,
"loss/crossentropy": 2.6192235946655273,
"loss/hidden": 0.98828125,
"loss/logits": 0.15172292292118073,
"loss/reg": 6.176753231557086e-05,
"step": 139
},
{
"epoch": 0.0175,
"grad_norm": 2.0209085941314697,
"grad_norm_var": 0.1267419293205286,
"learning_rate": 0.0001,
"loss": 1.0928,
"loss/crossentropy": 2.6628799438476562,
"loss/hidden": 0.94921875,
"loss/logits": 0.14296585321426392,
"loss/reg": 6.175567978061736e-05,
"step": 140
},
{
"epoch": 0.017625,
"grad_norm": 3.458299398422241,
"grad_norm_var": 0.18389511336323494,
"learning_rate": 0.0001,
"loss": 1.3966,
"loss/crossentropy": 2.885798692703247,
"loss/hidden": 1.171875,
"loss/logits": 0.22411209344863892,
"loss/reg": 6.174653390189633e-05,
"step": 141
},
{
"epoch": 0.01775,
"grad_norm": 2.608558177947998,
"grad_norm_var": 0.17855808227350187,
"learning_rate": 0.0001,
"loss": 1.1734,
"loss/crossentropy": 2.2689590454101562,
"loss/hidden": 1.0234375,
"loss/logits": 0.1493585705757141,
"loss/reg": 6.1732207541354e-05,
"step": 142
},
{
"epoch": 0.017875,
"grad_norm": 2.7264318466186523,
"grad_norm_var": 0.1520478077633771,
"learning_rate": 0.0001,
"loss": 1.2868,
"loss/crossentropy": 2.3888814449310303,
"loss/hidden": 1.1171875,
"loss/logits": 0.16896918416023254,
"loss/reg": 6.172260327730328e-05,
"step": 143
},
{
"epoch": 0.018,
"grad_norm": 2.4999561309814453,
"grad_norm_var": 0.14128539295791806,
"learning_rate": 0.0001,
"loss": 1.3804,
"loss/crossentropy": 2.442732572555542,
"loss/hidden": 1.1875,
"loss/logits": 0.19230639934539795,
"loss/reg": 6.171311542857438e-05,
"step": 144
},
{
"epoch": 0.018125,
"grad_norm": 3.084848642349243,
"grad_norm_var": 0.1592220375940921,
"learning_rate": 0.0001,
"loss": 1.5124,
"loss/crossentropy": 2.6801810264587402,
"loss/hidden": 1.2421875,
"loss/logits": 0.2696050703525543,
"loss/reg": 6.170615233713761e-05,
"step": 145
},
{
"epoch": 0.01825,
"grad_norm": 3.0833539962768555,
"grad_norm_var": 0.16940866671487811,
"learning_rate": 0.0001,
"loss": 1.294,
"loss/crossentropy": 2.434020519256592,
"loss/hidden": 1.140625,
"loss/logits": 0.15272179245948792,
"loss/reg": 6.170049164211378e-05,
"step": 146
},
{
"epoch": 0.018375,
"grad_norm": 2.2046446800231934,
"grad_norm_var": 0.18039814292173043,
"learning_rate": 0.0001,
"loss": 1.1769,
"loss/crossentropy": 2.5624289512634277,
"loss/hidden": 1.015625,
"loss/logits": 0.160653755068779,
"loss/reg": 6.169131665956229e-05,
"step": 147
},
{
"epoch": 0.0185,
"grad_norm": 1.9920902252197266,
"grad_norm_var": 0.18414873169562326,
"learning_rate": 0.0001,
"loss": 1.1186,
"loss/crossentropy": 2.709728479385376,
"loss/hidden": 0.96875,
"loss/logits": 0.1492651402950287,
"loss/reg": 6.168704567244276e-05,
"step": 148
},
{
"epoch": 0.018625,
"grad_norm": 2.7053756713867188,
"grad_norm_var": 0.18317033653553666,
"learning_rate": 0.0001,
"loss": 1.2849,
"loss/crossentropy": 2.594032049179077,
"loss/hidden": 1.09375,
"loss/logits": 0.1905450075864792,
"loss/reg": 6.168089748825878e-05,
"step": 149
},
{
"epoch": 0.01875,
"grad_norm": 2.1234872341156006,
"grad_norm_var": 0.1981121598309187,
"learning_rate": 0.0001,
"loss": 1.2526,
"loss/crossentropy": 2.5880792140960693,
"loss/hidden": 1.0703125,
"loss/logits": 0.18171370029449463,
"loss/reg": 6.167205719975755e-05,
"step": 150
},
{
"epoch": 0.018875,
"grad_norm": 2.4820902347564697,
"grad_norm_var": 0.18631464898325945,
"learning_rate": 0.0001,
"loss": 1.1869,
"loss/crossentropy": 2.2422618865966797,
"loss/hidden": 1.0234375,
"loss/logits": 0.16288068890571594,
"loss/reg": 6.166584353195503e-05,
"step": 151
},
{
"epoch": 0.019,
"grad_norm": 2.5669338703155518,
"grad_norm_var": 0.17912821539433874,
"learning_rate": 0.0001,
"loss": 1.0968,
"loss/crossentropy": 2.5655312538146973,
"loss/hidden": 0.953125,
"loss/logits": 0.1430792212486267,
"loss/reg": 6.165904778754339e-05,
"step": 152
},
{
"epoch": 0.019125,
"grad_norm": 2.191638469696045,
"grad_norm_var": 0.18782946638749062,
"learning_rate": 0.0001,
"loss": 1.297,
"loss/crossentropy": 2.3935883045196533,
"loss/hidden": 1.109375,
"loss/logits": 0.18698745965957642,
"loss/reg": 6.165434024296701e-05,
"step": 153
},
{
"epoch": 0.01925,
"grad_norm": 1.9139376878738403,
"grad_norm_var": 0.19900155234911943,
"learning_rate": 0.0001,
"loss": 1.1497,
"loss/crossentropy": 2.5978732109069824,
"loss/hidden": 0.99609375,
"loss/logits": 0.1530168354511261,
"loss/reg": 6.164138176245615e-05,
"step": 154
},
{
"epoch": 0.019375,
"grad_norm": 2.061805486679077,
"grad_norm_var": 0.20353621009625153,
"learning_rate": 0.0001,
"loss": 1.034,
"loss/crossentropy": 2.29733943939209,
"loss/hidden": 0.91015625,
"loss/logits": 0.12318030744791031,
"loss/reg": 6.162770296214148e-05,
"step": 155
},
{
"epoch": 0.0195,
"grad_norm": 2.686328649520874,
"grad_norm_var": 0.19023239802865194,
"learning_rate": 0.0001,
"loss": 1.4235,
"loss/crossentropy": 2.2928433418273926,
"loss/hidden": 1.2265625,
"loss/logits": 0.19631928205490112,
"loss/reg": 6.16170436842367e-05,
"step": 156
},
{
"epoch": 0.019625,
"grad_norm": 2.6863300800323486,
"grad_norm_var": 0.13134889378527811,
"learning_rate": 0.0001,
"loss": 1.4147,
"loss/crossentropy": 2.289113759994507,
"loss/hidden": 1.21875,
"loss/logits": 0.19536322355270386,
"loss/reg": 6.160605698823929e-05,
"step": 157
},
{
"epoch": 0.01975,
"grad_norm": 3.7774782180786133,
"grad_norm_var": 0.2373896188726722,
"learning_rate": 0.0001,
"loss": 1.3606,
"loss/crossentropy": 2.4960098266601562,
"loss/hidden": 1.171875,
"loss/logits": 0.18812544643878937,
"loss/reg": 6.159812619443983e-05,
"step": 158
},
{
"epoch": 0.019875,
"grad_norm": 2.5556654930114746,
"grad_norm_var": 0.23517615853210802,
"learning_rate": 0.0001,
"loss": 1.1015,
"loss/crossentropy": 2.4794013500213623,
"loss/hidden": 0.9609375,
"loss/logits": 0.1399209052324295,
"loss/reg": 6.158895121188834e-05,
"step": 159
},
{
"epoch": 0.02,
"grad_norm": 2.3351266384124756,
"grad_norm_var": 0.23772124659223212,
"learning_rate": 0.0001,
"loss": 1.1072,
"loss/crossentropy": 2.402188301086426,
"loss/hidden": 0.96484375,
"loss/logits": 0.14173097908496857,
"loss/reg": 6.158249016152695e-05,
"step": 160
},
{
"epoch": 0.020125,
"grad_norm": 2.319366455078125,
"grad_norm_var": 0.21752957054554395,
"learning_rate": 0.0001,
"loss": 1.1774,
"loss/crossentropy": 2.1729917526245117,
"loss/hidden": 1.0234375,
"loss/logits": 0.15335121750831604,
"loss/reg": 6.157202733447775e-05,
"step": 161
},
{
"epoch": 0.02025,
"grad_norm": 2.0917341709136963,
"grad_norm_var": 0.19926011430610652,
"learning_rate": 0.0001,
"loss": 1.2443,
"loss/crossentropy": 2.276581048965454,
"loss/hidden": 1.0859375,
"loss/logits": 0.1577274203300476,
"loss/reg": 6.156737799756229e-05,
"step": 162
},
{
"epoch": 0.020375,
"grad_norm": 4.31035041809082,
"grad_norm_var": 0.41637723338655513,
"learning_rate": 0.0001,
"loss": 1.8974,
"loss/crossentropy": 2.6449058055877686,
"loss/hidden": 1.5625,
"loss/logits": 0.33430173993110657,
"loss/reg": 6.156211748020723e-05,
"step": 163
},
{
"epoch": 0.0205,
"grad_norm": 2.145301342010498,
"grad_norm_var": 0.4064476055559296,
"learning_rate": 0.0001,
"loss": 1.2636,
"loss/crossentropy": 2.613586664199829,
"loss/hidden": 1.078125,
"loss/logits": 0.1848127692937851,
"loss/reg": 6.155785376904532e-05,
"step": 164
},
{
"epoch": 0.020625,
"grad_norm": 3.6308248043060303,
"grad_norm_var": 0.47796885273955964,
"learning_rate": 0.0001,
"loss": 1.2327,
"loss/crossentropy": 2.599729537963867,
"loss/hidden": 1.046875,
"loss/logits": 0.1852511763572693,
"loss/reg": 6.154972652439028e-05,
"step": 165
},
{
"epoch": 0.02075,
"grad_norm": 2.812910556793213,
"grad_norm_var": 0.4622733920417279,
"learning_rate": 0.0001,
"loss": 1.3898,
"loss/crossentropy": 2.7171225547790527,
"loss/hidden": 1.1875,
"loss/logits": 0.20167264342308044,
"loss/reg": 6.154461152618751e-05,
"step": 166
},
{
"epoch": 0.020875,
"grad_norm": 2.4922893047332764,
"grad_norm_var": 0.46203729327833537,
"learning_rate": 0.0001,
"loss": 1.3528,
"loss/crossentropy": 2.648606777191162,
"loss/hidden": 1.140625,
"loss/logits": 0.21159711480140686,
"loss/reg": 6.153558933874592e-05,
"step": 167
},
{
"epoch": 0.021,
"grad_norm": 2.2380781173706055,
"grad_norm_var": 0.47292652355391496,
"learning_rate": 0.0001,
"loss": 1.3863,
"loss/crossentropy": 2.5556812286376953,
"loss/hidden": 1.1796875,
"loss/logits": 0.20603393018245697,
"loss/reg": 6.152570131234825e-05,
"step": 168
},
{
"epoch": 0.021125,
"grad_norm": 2.8179726600646973,
"grad_norm_var": 0.4599538691877346,
"learning_rate": 0.0001,
"loss": 1.3315,
"loss/crossentropy": 2.285341262817383,
"loss/hidden": 1.140625,
"loss/logits": 0.19030849635601044,
"loss/reg": 6.151832349132746e-05,
"step": 169
},
{
"epoch": 0.02125,
"grad_norm": 2.933023691177368,
"grad_norm_var": 0.42080948451517297,
"learning_rate": 0.0001,
"loss": 1.5924,
"loss/crossentropy": 2.254920482635498,
"loss/hidden": 1.3828125,
"loss/logits": 0.20900759100914001,
"loss/reg": 6.151078559923917e-05,
"step": 170
},
{
"epoch": 0.021375,
"grad_norm": 2.9309163093566895,
"grad_norm_var": 0.38903358238886365,
"learning_rate": 0.0001,
"loss": 1.2104,
"loss/crossentropy": 2.771516799926758,
"loss/hidden": 1.0546875,
"loss/logits": 0.15512725710868835,
"loss/reg": 6.14999225945212e-05,
"step": 171
},
{
"epoch": 0.0215,
"grad_norm": 2.7658286094665527,
"grad_norm_var": 0.3882477326935183,
"learning_rate": 0.0001,
"loss": 1.2183,
"loss/crossentropy": 2.565211296081543,
"loss/hidden": 1.0546875,
"loss/logits": 0.16297924518585205,
"loss/reg": 6.149257387733087e-05,
"step": 172
},
{
"epoch": 0.021625,
"grad_norm": 3.39176344871521,
"grad_norm_var": 0.40840451933244426,
"learning_rate": 0.0001,
"loss": 1.3931,
"loss/crossentropy": 2.4181013107299805,
"loss/hidden": 1.1875,
"loss/logits": 0.2049458771944046,
"loss/reg": 6.148203829070553e-05,
"step": 173
},
{
"epoch": 0.02175,
"grad_norm": 2.7971994876861572,
"grad_norm_var": 0.3468190736041642,
"learning_rate": 0.0001,
"loss": 1.2467,
"loss/crossentropy": 2.644824981689453,
"loss/hidden": 1.0703125,
"loss/logits": 0.17579111456871033,
"loss/reg": 6.147275416878983e-05,
"step": 174
},
{
"epoch": 0.021875,
"grad_norm": 7.143955707550049,
"grad_norm_var": 1.5219747541806836,
"learning_rate": 0.0001,
"loss": 1.3279,
"loss/crossentropy": 2.6274638175964355,
"loss/hidden": 1.171875,
"loss/logits": 0.15536972880363464,
"loss/reg": 6.146173836896196e-05,
"step": 175
},
{
"epoch": 0.022,
"grad_norm": 8.911324501037598,
"grad_norm_var": 3.578509022301667,
"learning_rate": 0.0001,
"loss": 1.8863,
"loss/crossentropy": 1.8980119228363037,
"loss/hidden": 1.765625,
"loss/logits": 0.12003660202026367,
"loss/reg": 6.145203224150464e-05,
"step": 176
},
{
"epoch": 0.022125,
"grad_norm": 2.14353609085083,
"grad_norm_var": 3.6077286646662734,
"learning_rate": 0.0001,
"loss": 1.1573,
"loss/crossentropy": 2.1538591384887695,
"loss/hidden": 1.015625,
"loss/logits": 0.1410439908504486,
"loss/reg": 6.144325743662193e-05,
"step": 177
},
{
"epoch": 0.02225,
"grad_norm": 4.625613212585449,
"grad_norm_var": 3.542583274880191,
"learning_rate": 0.0001,
"loss": 1.6226,
"loss/crossentropy": 2.7923362255096436,
"loss/hidden": 1.375,
"loss/logits": 0.24694563448429108,
"loss/reg": 6.143252539914101e-05,
"step": 178
},
{
"epoch": 0.022375,
"grad_norm": 2.543745517730713,
"grad_norm_var": 3.5775446556342367,
"learning_rate": 0.0001,
"loss": 1.4192,
"loss/crossentropy": 2.3237483501434326,
"loss/hidden": 1.203125,
"loss/logits": 0.21549411118030548,
"loss/reg": 6.14215387031436e-05,
"step": 179
},
{
"epoch": 0.0225,
"grad_norm": 2.3068995475769043,
"grad_norm_var": 3.5495511663474453,
"learning_rate": 0.0001,
"loss": 1.2428,
"loss/crossentropy": 2.7135560512542725,
"loss/hidden": 1.0859375,
"loss/logits": 0.1562565714120865,
"loss/reg": 6.141421181382611e-05,
"step": 180
},
{
"epoch": 0.022625,
"grad_norm": 3.465264081954956,
"grad_norm_var": 3.5490467443763025,
"learning_rate": 0.0001,
"loss": 1.4771,
"loss/crossentropy": 3.3183774948120117,
"loss/hidden": 1.234375,
"loss/logits": 0.2421126663684845,
"loss/reg": 6.140418554423377e-05,
"step": 181
},
{
"epoch": 0.02275,
"grad_norm": 2.696394205093384,
"grad_norm_var": 3.5608805573030993,
"learning_rate": 0.0001,
"loss": 1.2269,
"loss/crossentropy": 2.609964370727539,
"loss/hidden": 1.0546875,
"loss/logits": 0.17162814736366272,
"loss/reg": 6.139430479379371e-05,
"step": 182
},
{
"epoch": 0.022875,
"grad_norm": 2.3278727531433105,
"grad_norm_var": 3.5849405900569513,
"learning_rate": 0.0001,
"loss": 1.0795,
"loss/crossentropy": 2.753383159637451,
"loss/hidden": 0.9453125,
"loss/logits": 0.1335984170436859,
"loss/reg": 6.138216122053564e-05,
"step": 183
},
{
"epoch": 0.023,
"grad_norm": 2.4336531162261963,
"grad_norm_var": 3.554360278579671,
"learning_rate": 0.0001,
"loss": 1.3948,
"loss/crossentropy": 2.4162991046905518,
"loss/hidden": 1.171875,
"loss/logits": 0.22235547006130219,
"loss/reg": 6.137174204923213e-05,
"step": 184
},
{
"epoch": 0.023125,
"grad_norm": 2.420710802078247,
"grad_norm_var": 3.601127481620784,
"learning_rate": 0.0001,
"loss": 1.4926,
"loss/crossentropy": 2.30292010307312,
"loss/hidden": 1.296875,
"loss/logits": 0.19511133432388306,
"loss/reg": 6.136245065135881e-05,
"step": 185
},
{
"epoch": 0.02325,
"grad_norm": 2.727184534072876,
"grad_norm_var": 3.6190579859970224,
"learning_rate": 0.0001,
"loss": 1.2816,
"loss/crossentropy": 2.4605464935302734,
"loss/hidden": 1.0703125,
"loss/logits": 0.2107134908437729,
"loss/reg": 6.135714647825807e-05,
"step": 186
},
{
"epoch": 0.023375,
"grad_norm": 1.9292963743209839,
"grad_norm_var": 3.754688597499932,
"learning_rate": 0.0001,
"loss": 1.1628,
"loss/crossentropy": 2.5925047397613525,
"loss/hidden": 1.0,
"loss/logits": 0.16220712661743164,
"loss/reg": 6.134893919806927e-05,
"step": 187
},
{
"epoch": 0.0235,
"grad_norm": 2.1395771503448486,
"grad_norm_var": 3.833355540800866,
"learning_rate": 0.0001,
"loss": 1.2712,
"loss/crossentropy": 2.227994441986084,
"loss/hidden": 1.0859375,
"loss/logits": 0.18463259935379028,
"loss/reg": 6.134230352472514e-05,
"step": 188
},
{
"epoch": 0.023625,
"grad_norm": 3.552602529525757,
"grad_norm_var": 3.8353265135005175,
"learning_rate": 0.0001,
"loss": 1.2518,
"loss/crossentropy": 2.562777280807495,
"loss/hidden": 1.0859375,
"loss/logits": 0.16521015763282776,
"loss/reg": 6.13337178947404e-05,
"step": 189
},
{
"epoch": 0.02375,
"grad_norm": 2.766602039337158,
"grad_norm_var": 3.8377842837978386,
"learning_rate": 0.0001,
"loss": 1.3731,
"loss/crossentropy": 2.4200425148010254,
"loss/hidden": 1.203125,
"loss/logits": 0.1694013774394989,
"loss/reg": 6.132431008154526e-05,
"step": 190
},
{
"epoch": 0.023875,
"grad_norm": 2.403444528579712,
"grad_norm_var": 2.8653780273055327,
"learning_rate": 0.0001,
"loss": 1.1651,
"loss/crossentropy": 2.6963400840759277,
"loss/hidden": 1.0078125,
"loss/logits": 0.1566968709230423,
"loss/reg": 6.132054841145873e-05,
"step": 191
},
{
"epoch": 0.024,
"grad_norm": 2.0356028079986572,
"grad_norm_var": 0.4806738598539164,
"learning_rate": 0.0001,
"loss": 1.4298,
"loss/crossentropy": 2.174285650253296,
"loss/hidden": 1.21875,
"loss/logits": 0.21048110723495483,
"loss/reg": 6.13146330579184e-05,
"step": 192
},
{
"epoch": 0.024125,
"grad_norm": 2.501723051071167,
"grad_norm_var": 0.4641524277019669,
"learning_rate": 0.0001,
"loss": 1.2669,
"loss/crossentropy": 2.6477620601654053,
"loss/hidden": 1.09375,
"loss/logits": 0.17256709933280945,
"loss/reg": 6.130609108367935e-05,
"step": 193
},
{
"epoch": 0.02425,
"grad_norm": 2.8256325721740723,
"grad_norm_var": 0.19964871735684203,
"learning_rate": 0.0001,
"loss": 1.364,
"loss/crossentropy": 2.4205310344696045,
"loss/hidden": 1.1875,
"loss/logits": 0.17588719725608826,
"loss/reg": 6.129377288743854e-05,
"step": 194
},
{
"epoch": 0.024375,
"grad_norm": 3.715850353240967,
"grad_norm_var": 0.28183777248683595,
"learning_rate": 0.0001,
"loss": 1.4108,
"loss/crossentropy": 2.5872642993927,
"loss/hidden": 1.234375,
"loss/logits": 0.1758473813533783,
"loss/reg": 6.128078530309722e-05,
"step": 195
},
{
"epoch": 0.0245,
"grad_norm": 3.3498318195343018,
"grad_norm_var": 0.3034271167360647,
"learning_rate": 0.0001,
"loss": 1.3691,
"loss/crossentropy": 2.6444506645202637,
"loss/hidden": 1.171875,
"loss/logits": 0.19665929675102234,
"loss/reg": 6.126934749772772e-05,
"step": 196
},
{
"epoch": 0.024625,
"grad_norm": 2.0526957511901855,
"grad_norm_var": 0.2850787945150557,
"learning_rate": 0.0001,
"loss": 1.2051,
"loss/crossentropy": 2.592327117919922,
"loss/hidden": 1.0390625,
"loss/logits": 0.16540399193763733,
"loss/reg": 6.125810614321381e-05,
"step": 197
},
{
"epoch": 0.02475,
"grad_norm": 2.4300317764282227,
"grad_norm_var": 0.28670823409057716,
"learning_rate": 0.0001,
"loss": 1.5286,
"loss/crossentropy": 2.36305570602417,
"loss/hidden": 1.2890625,
"loss/logits": 0.2389371693134308,
"loss/reg": 6.124811625340953e-05,
"step": 198
},
{
"epoch": 0.024875,
"grad_norm": 2.3255856037139893,
"grad_norm_var": 0.28679178178242776,
"learning_rate": 0.0001,
"loss": 1.1743,
"loss/crossentropy": 2.0803394317626953,
"loss/hidden": 1.03125,
"loss/logits": 0.1424179971218109,
"loss/reg": 6.124229548731819e-05,
"step": 199
},
{
"epoch": 0.025,
"grad_norm": 2.2634005546569824,
"grad_norm_var": 0.2923937566916393,
"learning_rate": 0.0001,
"loss": 1.2619,
"loss/crossentropy": 2.427354574203491,
"loss/hidden": 1.0859375,
"loss/logits": 0.1753256618976593,
"loss/reg": 6.123317871242762e-05,
"step": 200
},
{
"epoch": 0.025125,
"grad_norm": 2.789698839187622,
"grad_norm_var": 0.292575209213462,
"learning_rate": 0.0001,
"loss": 1.2794,
"loss/crossentropy": 2.4137160778045654,
"loss/hidden": 1.1328125,
"loss/logits": 0.14599566161632538,
"loss/reg": 6.122920603957027e-05,
"step": 201
},
{
"epoch": 0.02525,
"grad_norm": 2.23150897026062,
"grad_norm_var": 0.3003877767651639,
"learning_rate": 0.0001,
"loss": 1.2906,
"loss/crossentropy": 2.502619743347168,
"loss/hidden": 1.09375,
"loss/logits": 0.19620737433433533,
"loss/reg": 6.122409831732512e-05,
"step": 202
},
{
"epoch": 0.025375,
"grad_norm": 3.3167238235473633,
"grad_norm_var": 0.2999410613935005,
"learning_rate": 0.0001,
"loss": 1.4511,
"loss/crossentropy": 2.5889461040496826,
"loss/hidden": 1.2265625,
"loss/logits": 0.2239363044500351,
"loss/reg": 6.122187187429518e-05,
"step": 203
},
{
"epoch": 0.0255,
"grad_norm": 2.5847971439361572,
"grad_norm_var": 0.28091485279191464,
"learning_rate": 0.0001,
"loss": 1.248,
"loss/crossentropy": 2.4720451831817627,
"loss/hidden": 1.078125,
"loss/logits": 0.16930653154850006,
"loss/reg": 6.120974285295233e-05,
"step": 204
},
{
"epoch": 0.025625,
"grad_norm": 2.071563243865967,
"grad_norm_var": 0.24897236933793085,
"learning_rate": 0.0001,
"loss": 1.1016,
"loss/crossentropy": 2.5648884773254395,
"loss/hidden": 0.96875,
"loss/logits": 0.13218875229358673,
"loss/reg": 6.120166654000059e-05,
"step": 205
},
{
"epoch": 0.02575,
"grad_norm": 2.9454479217529297,
"grad_norm_var": 0.2548478796483238,
"learning_rate": 0.0001,
"loss": 1.3574,
"loss/crossentropy": 2.607356309890747,
"loss/hidden": 1.15625,
"loss/logits": 0.20053817331790924,
"loss/reg": 6.119644967839122e-05,
"step": 206
},
{
"epoch": 0.025875,
"grad_norm": 3.396070718765259,
"grad_norm_var": 0.28840087929906133,
"learning_rate": 0.0001,
"loss": 1.1743,
"loss/crossentropy": 2.682058334350586,
"loss/hidden": 1.0078125,
"loss/logits": 0.16590501368045807,
"loss/reg": 6.11838695476763e-05,
"step": 207
},
{
"epoch": 0.026,
"grad_norm": 2.4477601051330566,
"grad_norm_var": 0.26375613878289506,
"learning_rate": 0.0001,
"loss": 1.3022,
"loss/crossentropy": 2.819031000137329,
"loss/hidden": 1.109375,
"loss/logits": 0.19222432374954224,
"loss/reg": 6.117635348346084e-05,
"step": 208
},
{
"epoch": 0.026125,
"grad_norm": 2.5916216373443604,
"grad_norm_var": 0.2618484053528464,
"learning_rate": 0.0001,
"loss": 1.353,
"loss/crossentropy": 2.529510259628296,
"loss/hidden": 1.15625,
"loss/logits": 0.19612029194831848,
"loss/reg": 6.116151052992791e-05,
"step": 209
},
{
"epoch": 0.02625,
"grad_norm": 2.108261823654175,
"grad_norm_var": 0.28282181699858694,
"learning_rate": 0.0001,
"loss": 1.2782,
"loss/crossentropy": 2.3222012519836426,
"loss/hidden": 1.09375,
"loss/logits": 0.18379396200180054,
"loss/reg": 6.114997813710943e-05,
"step": 210
},
{
"epoch": 0.026375,
"grad_norm": 2.48710560798645,
"grad_norm_var": 0.20482550381518247,
"learning_rate": 0.0001,
"loss": 1.2718,
"loss/crossentropy": 2.6183624267578125,
"loss/hidden": 1.0859375,
"loss/logits": 0.18522073328495026,
"loss/reg": 6.114102870924398e-05,
"step": 211
},
{
"epoch": 0.0265,
"grad_norm": 2.63779616355896,
"grad_norm_var": 0.1640915083279668,
"learning_rate": 0.0001,
"loss": 1.3499,
"loss/crossentropy": 2.391116142272949,
"loss/hidden": 1.1640625,
"loss/logits": 0.18524512648582458,
"loss/reg": 6.112866685725749e-05,
"step": 212
},
{
"epoch": 0.026625,
"grad_norm": 2.7476329803466797,
"grad_norm_var": 0.14889028663519804,
"learning_rate": 0.0001,
"loss": 1.2842,
"loss/crossentropy": 2.5770251750946045,
"loss/hidden": 1.1171875,
"loss/logits": 0.16641706228256226,
"loss/reg": 6.111864786362275e-05,
"step": 213
},
{
"epoch": 0.02675,
"grad_norm": 2.565723419189453,
"grad_norm_var": 0.14722036218699916,
"learning_rate": 0.0001,
"loss": 1.2381,
"loss/crossentropy": 2.80257248878479,
"loss/hidden": 1.0546875,
"loss/logits": 0.18279102444648743,
"loss/reg": 6.110716640250757e-05,
"step": 214
},
{
"epoch": 0.026875,
"grad_norm": 4.107775688171387,
"grad_norm_var": 0.2818514081658729,
"learning_rate": 0.0001,
"loss": 1.5243,
"loss/crossentropy": 2.4806065559387207,
"loss/hidden": 1.3046875,
"loss/logits": 0.2190462350845337,
"loss/reg": 6.109999230829999e-05,
"step": 215
},
{
"epoch": 0.027,
"grad_norm": 2.3829445838928223,
"grad_norm_var": 0.27569299833046823,
"learning_rate": 0.0001,
"loss": 1.2079,
"loss/crossentropy": 2.466684579849243,
"loss/hidden": 1.046875,
"loss/logits": 0.16046380996704102,
"loss/reg": 6.108790694270283e-05,
"step": 216
},
{
"epoch": 0.027125,
"grad_norm": 2.554863929748535,
"grad_norm_var": 0.2767468455530223,
"learning_rate": 0.0001,
"loss": 1.1988,
"loss/crossentropy": 2.582035541534424,
"loss/hidden": 1.046875,
"loss/logits": 0.15130122005939484,
"loss/reg": 6.1076192650944e-05,
"step": 217
},
{
"epoch": 0.02725,
"grad_norm": 2.7898809909820557,
"grad_norm_var": 0.26145832144768877,
"learning_rate": 0.0001,
"loss": 1.6592,
"loss/crossentropy": 2.655186414718628,
"loss/hidden": 1.3984375,
"loss/logits": 0.26013702154159546,
"loss/reg": 6.107001536292955e-05,
"step": 218
},
{
"epoch": 0.027375,
"grad_norm": 2.7881548404693604,
"grad_norm_var": 0.2378165583524293,
"learning_rate": 0.0001,
"loss": 1.5451,
"loss/crossentropy": 2.4413743019104004,
"loss/hidden": 1.3203125,
"loss/logits": 0.2241469919681549,
"loss/reg": 6.106249202275649e-05,
"step": 219
},
{
"epoch": 0.0275,
"grad_norm": 2.2896728515625,
"grad_norm_var": 0.24781162791184835,
"learning_rate": 0.0001,
"loss": 1.2198,
"loss/crossentropy": 2.4421772956848145,
"loss/hidden": 1.0703125,
"loss/logits": 0.14890027046203613,
"loss/reg": 6.105640932219103e-05,
"step": 220
},
{
"epoch": 0.027625,
"grad_norm": 2.324869155883789,
"grad_norm_var": 0.23120432182346703,
"learning_rate": 0.0001,
"loss": 1.3402,
"loss/crossentropy": 2.526216745376587,
"loss/hidden": 1.140625,
"loss/logits": 0.19898337125778198,
"loss/reg": 6.10438291914761e-05,
"step": 221
},
{
"epoch": 0.02775,
"grad_norm": 2.88158917427063,
"grad_norm_var": 0.22935101127255847,
"learning_rate": 0.0001,
"loss": 1.372,
"loss/crossentropy": 2.361729621887207,
"loss/hidden": 1.15625,
"loss/logits": 0.21510916948318481,
"loss/reg": 6.10318202234339e-05,
"step": 222
},
{
"epoch": 0.027875,
"grad_norm": 2.9760019779205322,
"grad_norm_var": 0.20104925696453316,
"learning_rate": 0.0001,
"loss": 1.2925,
"loss/crossentropy": 2.5573909282684326,
"loss/hidden": 1.1171875,
"loss/logits": 0.1747477501630783,
"loss/reg": 6.1027145420666784e-05,
"step": 223
},
{
"epoch": 0.028,
"grad_norm": 2.702091932296753,
"grad_norm_var": 0.19763696198550798,
"learning_rate": 0.0001,
"loss": 1.3524,
"loss/crossentropy": 2.717195510864258,
"loss/hidden": 1.15625,
"loss/logits": 0.19553202390670776,
"loss/reg": 6.1014961829641834e-05,
"step": 224
},
{
"epoch": 0.028125,
"grad_norm": 2.1232945919036865,
"grad_norm_var": 0.21708226542899425,
"learning_rate": 0.0001,
"loss": 1.2661,
"loss/crossentropy": 2.4481968879699707,
"loss/hidden": 1.0859375,
"loss/logits": 0.1795472800731659,
"loss/reg": 6.100164682720788e-05,
"step": 225
},
{
"epoch": 0.02825,
"grad_norm": 2.191066026687622,
"grad_norm_var": 0.2114830183011783,
"learning_rate": 0.0001,
"loss": 1.1895,
"loss/crossentropy": 2.34470534324646,
"loss/hidden": 1.03125,
"loss/logits": 0.15763415396213531,
"loss/reg": 6.099118763813749e-05,
"step": 226
},
{
"epoch": 0.028375,
"grad_norm": 2.3068013191223145,
"grad_norm_var": 0.21765702233228598,
"learning_rate": 0.0001,
"loss": 1.539,
"loss/crossentropy": 2.5549845695495605,
"loss/hidden": 1.328125,
"loss/logits": 0.21025767922401428,
"loss/reg": 6.09817034273874e-05,
"step": 227
},
{
"epoch": 0.0285,
"grad_norm": 2.890655279159546,
"grad_norm_var": 0.221304562186567,
"learning_rate": 0.0001,
"loss": 1.5638,
"loss/crossentropy": 2.2339606285095215,
"loss/hidden": 1.34375,
"loss/logits": 0.21939440071582794,
"loss/reg": 6.096933429944329e-05,
"step": 228
},
{
"epoch": 0.028625,
"grad_norm": 2.182521343231201,
"grad_norm_var": 0.2349577927735633,
"learning_rate": 0.0001,
"loss": 1.2085,
"loss/crossentropy": 2.641230583190918,
"loss/hidden": 1.046875,
"loss/logits": 0.161014586687088,
"loss/reg": 6.095720891607925e-05,
"step": 229
},
{
"epoch": 0.02875,
"grad_norm": 2.704406976699829,
"grad_norm_var": 0.23499684870281476,
"learning_rate": 0.0001,
"loss": 1.3456,
"loss/crossentropy": 2.6833486557006836,
"loss/hidden": 1.15625,
"loss/logits": 0.18876385688781738,
"loss/reg": 6.094613127061166e-05,
"step": 230
},
{
"epoch": 0.028875,
"grad_norm": 3.4925310611724854,
"grad_norm_var": 0.13802667852219105,
"learning_rate": 0.0001,
"loss": 1.3709,
"loss/crossentropy": 2.1604089736938477,
"loss/hidden": 1.1953125,
"loss/logits": 0.17500904202461243,
"loss/reg": 6.093499541748315e-05,
"step": 231
},
{
"epoch": 0.029,
"grad_norm": 2.344773530960083,
"grad_norm_var": 0.13921650701028032,
"learning_rate": 0.0001,
"loss": 1.4725,
"loss/crossentropy": 2.493307113647461,
"loss/hidden": 1.25,
"loss/logits": 0.22193682193756104,
"loss/reg": 6.092391777201556e-05,
"step": 232
},
{
"epoch": 0.029125,
"grad_norm": 1.8828089237213135,
"grad_norm_var": 0.17117140448626647,
"learning_rate": 0.0001,
"loss": 1.1104,
"loss/crossentropy": 2.5302743911743164,
"loss/hidden": 0.9765625,
"loss/logits": 0.1331850290298462,
"loss/reg": 6.0912472690688446e-05,
"step": 233
},
{
"epoch": 0.02925,
"grad_norm": 2.747770071029663,
"grad_norm_var": 0.16996031408720758,
"learning_rate": 0.0001,
"loss": 1.1371,
"loss/crossentropy": 2.4189980030059814,
"loss/hidden": 0.99609375,
"loss/logits": 0.14035619795322418,
"loss/reg": 6.089695307309739e-05,
"step": 234
},
{
"epoch": 0.029375,
"grad_norm": 1.8742481470108032,
"grad_norm_var": 0.1933626604088189,
"learning_rate": 0.0001,
"loss": 1.1601,
"loss/crossentropy": 2.2694003582000732,
"loss/hidden": 1.015625,
"loss/logits": 0.14385350048542023,
"loss/reg": 6.088387090130709e-05,
"step": 235
},
{
"epoch": 0.0295,
"grad_norm": 2.0313689708709717,
"grad_norm_var": 0.20459374724346724,
"learning_rate": 0.0001,
"loss": 1.2446,
"loss/crossentropy": 2.4902865886688232,
"loss/hidden": 1.0703125,
"loss/logits": 0.17369529604911804,
"loss/reg": 6.086897337809205e-05,
"step": 236
},
{
"epoch": 0.029625,
"grad_norm": 2.3882880210876465,
"grad_norm_var": 0.20354561810974156,
"learning_rate": 0.0001,
"loss": 1.3947,
"loss/crossentropy": 2.4032340049743652,
"loss/hidden": 1.1875,
"loss/logits": 0.20656049251556396,
"loss/reg": 6.085408676881343e-05,
"step": 237
},
{
"epoch": 0.02975,
"grad_norm": 1.7327938079833984,
"grad_norm_var": 0.22490130088653987,
"learning_rate": 0.0001,
"loss": 1.1777,
"loss/crossentropy": 2.4949777126312256,
"loss/hidden": 1.015625,
"loss/logits": 0.1614799201488495,
"loss/reg": 6.084307824494317e-05,
"step": 238
},
{
"epoch": 0.029875,
"grad_norm": 2.2483370304107666,
"grad_norm_var": 0.20314943964483845,
"learning_rate": 0.0001,
"loss": 1.331,
"loss/crossentropy": 2.5907418727874756,
"loss/hidden": 1.1328125,
"loss/logits": 0.19753864407539368,
"loss/reg": 6.0828475398011506e-05,
"step": 239
},
{
"epoch": 0.03,
"grad_norm": 2.5151193141937256,
"grad_norm_var": 0.19693662117647784,
"learning_rate": 0.0001,
"loss": 1.2278,
"loss/crossentropy": 2.6233856678009033,
"loss/hidden": 1.0546875,
"loss/logits": 0.1725194901227951,
"loss/reg": 6.0820282669737935e-05,
"step": 240
},
{
"epoch": 0.030125,
"grad_norm": 2.198249101638794,
"grad_norm_var": 0.19498660957211478,
"learning_rate": 0.0001,
"loss": 1.1441,
"loss/crossentropy": 2.368884563446045,
"loss/hidden": 0.99609375,
"loss/logits": 0.1473642736673355,
"loss/reg": 6.0812566516688094e-05,
"step": 241
},
{
"epoch": 0.03025,
"grad_norm": 2.195218563079834,
"grad_norm_var": 0.1948951313244331,
"learning_rate": 0.0001,
"loss": 1.2993,
"loss/crossentropy": 2.352041721343994,
"loss/hidden": 1.1171875,
"loss/logits": 0.1815069168806076,
"loss/reg": 6.080829552956857e-05,
"step": 242
},
{
"epoch": 0.030375,
"grad_norm": 2.6142425537109375,
"grad_norm_var": 0.19868367561009795,
"learning_rate": 0.0001,
"loss": 1.3644,
"loss/crossentropy": 2.497286558151245,
"loss/hidden": 1.1875,
"loss/logits": 0.17629210650920868,
"loss/reg": 6.0799306083936244e-05,
"step": 243
},
{
"epoch": 0.0305,
"grad_norm": 2.342033624649048,
"grad_norm_var": 0.1799734399041227,
"learning_rate": 0.0001,
"loss": 1.1311,
"loss/crossentropy": 2.5182478427886963,
"loss/hidden": 0.984375,
"loss/logits": 0.1461625099182129,
"loss/reg": 6.078776277718134e-05,
"step": 244
},
{
"epoch": 0.030625,
"grad_norm": 2.3943874835968018,
"grad_norm_var": 0.17823371257387344,
"learning_rate": 0.0001,
"loss": 1.1773,
"loss/crossentropy": 2.575707197189331,
"loss/hidden": 1.015625,
"loss/logits": 0.1610667109489441,
"loss/reg": 6.078143633203581e-05,
"step": 245
},
{
"epoch": 0.03075,
"grad_norm": 2.2752902507781982,
"grad_norm_var": 0.16984605758260846,
"learning_rate": 0.0001,
"loss": 1.3322,
"loss/crossentropy": 2.228628635406494,
"loss/hidden": 1.1484375,
"loss/logits": 0.18314987421035767,
"loss/reg": 6.077219222788699e-05,
"step": 246
},
{
"epoch": 0.030875,
"grad_norm": 2.1779940128326416,
"grad_norm_var": 0.07406002979102144,
"learning_rate": 0.0001,
"loss": 1.179,
"loss/crossentropy": 2.4325718879699707,
"loss/hidden": 1.0078125,
"loss/logits": 0.17062756419181824,
"loss/reg": 6.076457793824375e-05,
"step": 247
},
{
"epoch": 0.031,
"grad_norm": 2.031386613845825,
"grad_norm_var": 0.07614130749575872,
"learning_rate": 0.0001,
"loss": 1.3177,
"loss/crossentropy": 2.3050920963287354,
"loss/hidden": 1.1328125,
"loss/logits": 0.18426315486431122,
"loss/reg": 6.075216515455395e-05,
"step": 248
},
{
"epoch": 0.031125,
"grad_norm": 2.4880683422088623,
"grad_norm_var": 0.07117238958467732,
"learning_rate": 0.0001,
"loss": 1.2617,
"loss/crossentropy": 2.690160036087036,
"loss/hidden": 1.0625,
"loss/logits": 0.1985635757446289,
"loss/reg": 6.0742688219761476e-05,
"step": 249
},
{
"epoch": 0.03125,
"grad_norm": 2.631229877471924,
"grad_norm_var": 0.06453399427719399,
"learning_rate": 0.0001,
"loss": 1.3072,
"loss/crossentropy": 2.4459030628204346,
"loss/hidden": 1.109375,
"loss/logits": 0.1971898078918457,
"loss/reg": 6.0733007558155805e-05,
"step": 250
},
{
"epoch": 0.031375,
"grad_norm": 2.7028048038482666,
"grad_norm_var": 0.06497512863382227,
"learning_rate": 0.0001,
"loss": 1.3656,
"loss/crossentropy": 2.7830824851989746,
"loss/hidden": 1.1796875,
"loss/logits": 0.18533006310462952,
"loss/reg": 6.0722686612280086e-05,
"step": 251
},
{
"epoch": 0.0315,
"grad_norm": 3.7025880813598633,
"grad_norm_var": 0.17735395269518506,
"learning_rate": 0.0001,
"loss": 1.2542,
"loss/crossentropy": 2.4722542762756348,
"loss/hidden": 1.078125,
"loss/logits": 0.17551761865615845,
"loss/reg": 6.0708127421094105e-05,
"step": 252
},
{
"epoch": 0.031625,
"grad_norm": 2.1496498584747314,
"grad_norm_var": 0.18175923180052275,
"learning_rate": 0.0001,
"loss": 1.0403,
"loss/crossentropy": 2.4383487701416016,
"loss/hidden": 0.91015625,
"loss/logits": 0.12949630618095398,
"loss/reg": 6.069323717383668e-05,
"step": 253
},
{
"epoch": 0.03175,
"grad_norm": 3.212991237640381,
"grad_norm_var": 0.18702365671043306,
"learning_rate": 0.0001,
"loss": 1.3555,
"loss/crossentropy": 2.1896352767944336,
"loss/hidden": 1.1953125,
"loss/logits": 0.1595323085784912,
"loss/reg": 6.067836147849448e-05,
"step": 254
},
{
"epoch": 0.031875,
"grad_norm": 2.53044056892395,
"grad_norm_var": 0.18281462084492142,
"learning_rate": 0.0001,
"loss": 1.2462,
"loss/crossentropy": 2.8005239963531494,
"loss/hidden": 1.0625,
"loss/logits": 0.18304814398288727,
"loss/reg": 6.0668298829114065e-05,
"step": 255
},
{
"epoch": 0.032,
"grad_norm": 5.920226573944092,
"grad_norm_var": 0.9097630014084027,
"learning_rate": 0.0001,
"loss": 1.9011,
"loss/crossentropy": 2.2827932834625244,
"loss/hidden": 1.59375,
"loss/logits": 0.3067648708820343,
"loss/reg": 6.0657377616735175e-05,
"step": 256
},
{
"epoch": 0.032125,
"grad_norm": 3.144649028778076,
"grad_norm_var": 0.8995354429829506,
"learning_rate": 0.0001,
"loss": 1.2361,
"loss/crossentropy": 2.9163215160369873,
"loss/hidden": 1.078125,
"loss/logits": 0.15732741355895996,
"loss/reg": 6.064687840989791e-05,
"step": 257
},
{
"epoch": 0.03225,
"grad_norm": 2.677065849304199,
"grad_norm_var": 0.8763431299745091,
"learning_rate": 0.0001,
"loss": 1.3123,
"loss/crossentropy": 2.9036660194396973,
"loss/hidden": 1.125,
"loss/logits": 0.18664765357971191,
"loss/reg": 6.0635462432401255e-05,
"step": 258
},
{
"epoch": 0.032375,
"grad_norm": 1.9815617799758911,
"grad_norm_var": 0.9180593253885627,
"learning_rate": 0.0001,
"loss": 1.2567,
"loss/crossentropy": 2.6647751331329346,
"loss/hidden": 1.0703125,
"loss/logits": 0.18578888475894928,
"loss/reg": 6.062128159101121e-05,
"step": 259
},
{
"epoch": 0.0325,
"grad_norm": 2.6094260215759277,
"grad_norm_var": 0.9071755924568459,
"learning_rate": 0.0001,
"loss": 1.4176,
"loss/crossentropy": 2.9915220737457275,
"loss/hidden": 1.21875,
"loss/logits": 0.19824379682540894,
"loss/reg": 6.060625673853792e-05,
"step": 260
},
{
"epoch": 0.032625,
"grad_norm": 2.4859585762023926,
"grad_norm_var": 0.9028772625757899,
"learning_rate": 0.0001,
"loss": 1.2047,
"loss/crossentropy": 2.325611114501953,
"loss/hidden": 1.03125,
"loss/logits": 0.17281952500343323,
"loss/reg": 6.0591693909373134e-05,
"step": 261
},
{
"epoch": 0.03275,
"grad_norm": 4.910043716430664,
"grad_norm_var": 1.154144117287072,
"learning_rate": 0.0001,
"loss": 1.2858,
"loss/crossentropy": 2.568098306655884,
"loss/hidden": 1.109375,
"loss/logits": 0.17582398653030396,
"loss/reg": 6.057979408069514e-05,
"step": 262
},
{
"epoch": 0.032875,
"grad_norm": 2.2592694759368896,
"grad_norm_var": 1.1460852387432343,
"learning_rate": 0.0001,
"loss": 1.3156,
"loss/crossentropy": 2.5264766216278076,
"loss/hidden": 1.1171875,
"loss/logits": 0.19776055216789246,
"loss/reg": 6.056776692275889e-05,
"step": 263
},
{
"epoch": 0.033,
"grad_norm": 2.6964571475982666,
"grad_norm_var": 1.0909556269012999,
"learning_rate": 0.0001,
"loss": 1.0468,
"loss/crossentropy": 2.740647792816162,
"loss/hidden": 0.91796875,
"loss/logits": 0.12825211882591248,
"loss/reg": 6.0556718381121755e-05,
"step": 264
},
{
"epoch": 0.033125,
"grad_norm": 2.112201690673828,
"grad_norm_var": 1.125761935491216,
"learning_rate": 0.0001,
"loss": 1.2175,
"loss/crossentropy": 2.475130081176758,
"loss/hidden": 1.0390625,
"loss/logits": 0.1778050661087036,
"loss/reg": 6.0543683503055945e-05,
"step": 265
},
{
"epoch": 0.03325,
"grad_norm": 1.8527328968048096,
"grad_norm_var": 1.2001448152569836,
"learning_rate": 0.0001,
"loss": 1.1913,
"loss/crossentropy": 2.2017788887023926,
"loss/hidden": 1.0234375,
"loss/logits": 0.16727614402770996,
"loss/reg": 6.053145989426412e-05,
"step": 266
},
{
"epoch": 0.033375,
"grad_norm": 2.2294929027557373,
"grad_norm_var": 1.2287526925730277,
"learning_rate": 0.0001,
"loss": 1.3521,
"loss/crossentropy": 2.268073558807373,
"loss/hidden": 1.1640625,
"loss/logits": 0.18739831447601318,
"loss/reg": 6.052442768123001e-05,
"step": 267
},
{
"epoch": 0.0335,
"grad_norm": 2.185410499572754,
"grad_norm_var": 1.2112062552861744,
"learning_rate": 0.0001,
"loss": 1.44,
"loss/crossentropy": 2.390622138977051,
"loss/hidden": 1.234375,
"loss/logits": 0.20500804483890533,
"loss/reg": 6.051711898180656e-05,
"step": 268
},
{
"epoch": 0.033625,
"grad_norm": 2.616452693939209,
"grad_norm_var": 1.1837342905938153,
"learning_rate": 0.0001,
"loss": 1.3338,
"loss/crossentropy": 2.3374340534210205,
"loss/hidden": 1.15625,
"loss/logits": 0.17693625390529633,
"loss/reg": 6.0506343288579956e-05,
"step": 269
},
{
"epoch": 0.03375,
"grad_norm": 2.5214874744415283,
"grad_norm_var": 1.1791403953024882,
"learning_rate": 0.0001,
"loss": 1.4572,
"loss/crossentropy": 2.6334807872772217,
"loss/hidden": 1.25,
"loss/logits": 0.20655225217342377,
"loss/reg": 6.0493421187857166e-05,
"step": 270
},
{
"epoch": 0.033875,
"grad_norm": 2.3426766395568848,
"grad_norm_var": 1.18798729537596,
"learning_rate": 0.0001,
"loss": 1.2858,
"loss/crossentropy": 2.362666130065918,
"loss/hidden": 1.1171875,
"loss/logits": 0.16799038648605347,
"loss/reg": 6.047951683285646e-05,
"step": 271
},
{
"epoch": 0.034,
"grad_norm": 2.483227491378784,
"grad_norm_var": 0.4891016266434789,
"learning_rate": 0.0001,
"loss": 1.4126,
"loss/crossentropy": 2.6330323219299316,
"loss/hidden": 1.203125,
"loss/logits": 0.20882482826709747,
"loss/reg": 6.046749331289902e-05,
"step": 272
},
{
"epoch": 0.034125,
"grad_norm": 3.3453869819641113,
"grad_norm_var": 0.5070205087741229,
"learning_rate": 0.0001,
"loss": 1.3731,
"loss/crossentropy": 2.6637308597564697,
"loss/hidden": 1.171875,
"loss/logits": 0.20059773325920105,
"loss/reg": 6.0458773077698424e-05,
"step": 273
},
{
"epoch": 0.03425,
"grad_norm": 2.2971482276916504,
"grad_norm_var": 0.5112160036914843,
"learning_rate": 0.0001,
"loss": 1.3516,
"loss/crossentropy": 2.400428533554077,
"loss/hidden": 1.1640625,
"loss/logits": 0.18688717484474182,
"loss/reg": 6.0452930483734235e-05,
"step": 274
},
{
"epoch": 0.034375,
"grad_norm": 11.117164611816406,
"grad_norm_var": 5.025199240890341,
"learning_rate": 0.0001,
"loss": 2.1956,
"loss/crossentropy": 2.7653286457061768,
"loss/hidden": 1.8984375,
"loss/logits": 0.2965186834335327,
"loss/reg": 6.045090049155988e-05,
"step": 275
},
{
"epoch": 0.0345,
"grad_norm": 3.6517550945281982,
"grad_norm_var": 5.020888752799834,
"learning_rate": 0.0001,
"loss": 1.4104,
"loss/crossentropy": 2.8897998332977295,
"loss/hidden": 1.1484375,
"loss/logits": 0.26139265298843384,
"loss/reg": 6.0451366152847186e-05,
"step": 276
},
{
"epoch": 0.034625,
"grad_norm": 2.6342201232910156,
"grad_norm_var": 5.008262345647254,
"learning_rate": 0.0001,
"loss": 1.272,
"loss/crossentropy": 2.662801504135132,
"loss/hidden": 1.09375,
"loss/logits": 0.17764705419540405,
"loss/reg": 6.0443973779911175e-05,
"step": 277
},
{
"epoch": 0.03475,
"grad_norm": 2.613866090774536,
"grad_norm_var": 4.815302301096653,
"learning_rate": 0.0001,
"loss": 1.3,
"loss/crossentropy": 2.2599401473999023,
"loss/hidden": 1.125,
"loss/logits": 0.1744215488433838,
"loss/reg": 6.04407032369636e-05,
"step": 278
},
{
"epoch": 0.034875,
"grad_norm": 2.4121639728546143,
"grad_norm_var": 4.800441045565859,
"learning_rate": 0.0001,
"loss": 1.2736,
"loss/crossentropy": 2.3868885040283203,
"loss/hidden": 1.109375,
"loss/logits": 0.16360533237457275,
"loss/reg": 6.0438182117650285e-05,
"step": 279
},
{
"epoch": 0.035,
"grad_norm": 2.257427930831909,
"grad_norm_var": 4.834324037466968,
"learning_rate": 0.0001,
"loss": 1.3236,
"loss/crossentropy": 2.452359914779663,
"loss/hidden": 1.1328125,
"loss/logits": 0.19017404317855835,
"loss/reg": 6.043619578122161e-05,
"step": 280
},
{
"epoch": 0.035125,
"grad_norm": 2.3916571140289307,
"grad_norm_var": 4.8045581397439525,
"learning_rate": 0.0001,
"loss": 1.3161,
"loss/crossentropy": 2.4834201335906982,
"loss/hidden": 1.109375,
"loss/logits": 0.20611721277236938,
"loss/reg": 6.043669054633938e-05,
"step": 281
},
{
"epoch": 0.03525,
"grad_norm": 2.815398931503296,
"grad_norm_var": 4.707581175884913,
"learning_rate": 0.0001,
"loss": 1.1312,
"loss/crossentropy": 3.0801713466644287,
"loss/hidden": 0.98828125,
"loss/logits": 0.14229975640773773,
"loss/reg": 6.044648034730926e-05,
"step": 282
},
{
"epoch": 0.035375,
"grad_norm": 3.1715469360351562,
"grad_norm_var": 4.651233430019207,
"learning_rate": 0.0001,
"loss": 1.409,
"loss/crossentropy": 2.354785919189453,
"loss/hidden": 1.1953125,
"loss/logits": 0.21305763721466064,
"loss/reg": 6.0437832871684805e-05,
"step": 283
},
{
"epoch": 0.0355,
"grad_norm": 2.5010037422180176,
"grad_norm_var": 4.615667456235268,
"learning_rate": 0.0001,
"loss": 1.3572,
"loss/crossentropy": 2.492047071456909,
"loss/hidden": 1.1640625,
"loss/logits": 0.1925477683544159,
"loss/reg": 6.044709516572766e-05,
"step": 284
},
{
"epoch": 0.035625,
"grad_norm": 1.964429259300232,
"grad_norm_var": 4.6928209367171645,
"learning_rate": 0.0001,
"loss": 1.1671,
"loss/crossentropy": 2.3351125717163086,
"loss/hidden": 0.99609375,
"loss/logits": 0.1704423427581787,
"loss/reg": 6.0453679907368496e-05,
"step": 285
},
{
"epoch": 0.03575,
"grad_norm": 2.3656678199768066,
"grad_norm_var": 4.707552916907375,
"learning_rate": 0.0001,
"loss": 1.5385,
"loss/crossentropy": 2.4216158390045166,
"loss/hidden": 1.28125,
"loss/logits": 0.2566841244697571,
"loss/reg": 6.0443537222454324e-05,
"step": 286
},
{
"epoch": 0.035875,
"grad_norm": 3.140928030014038,
"grad_norm_var": 4.661686527481659,
"learning_rate": 0.0001,
"loss": 1.3637,
"loss/crossentropy": 2.8347983360290527,
"loss/hidden": 1.15625,
"loss/logits": 0.20682096481323242,
"loss/reg": 6.043089888407849e-05,
"step": 287
},
{
"epoch": 0.036,
"grad_norm": 2.6460797786712646,
"grad_norm_var": 4.647830565858565,
"learning_rate": 0.0001,
"loss": 1.3928,
"loss/crossentropy": 2.108215093612671,
"loss/hidden": 1.2109375,
"loss/logits": 0.18129181861877441,
"loss/reg": 6.042820677976124e-05,
"step": 288
},
{
"epoch": 0.036125,
"grad_norm": 2.879531145095825,
"grad_norm_var": 4.652852381956769,
"learning_rate": 0.0001,
"loss": 1.4359,
"loss/crossentropy": 2.90163516998291,
"loss/hidden": 1.25,
"loss/logits": 0.1853410005569458,
"loss/reg": 6.042792301741429e-05,
"step": 289
},
{
"epoch": 0.03625,
"grad_norm": 2.5701370239257812,
"grad_norm_var": 4.625421100051376,
"learning_rate": 0.0001,
"loss": 1.3639,
"loss/crossentropy": 2.6896326541900635,
"loss/hidden": 1.15625,
"loss/logits": 0.2070741057395935,
"loss/reg": 6.0414979088818654e-05,
"step": 290
},
{
"epoch": 0.036375,
"grad_norm": 2.988196849822998,
"grad_norm_var": 0.16977142791367086,
"learning_rate": 0.0001,
"loss": 1.103,
"loss/crossentropy": 2.8485705852508545,
"loss/hidden": 0.96875,
"loss/logits": 0.13362044095993042,
"loss/reg": 6.0413527535274625e-05,
"step": 291
},
{
"epoch": 0.0365,
"grad_norm": 5.9153923988342285,
"grad_norm_var": 0.7809789933836029,
"learning_rate": 0.0001,
"loss": 1.6292,
"loss/crossentropy": 2.607590436935425,
"loss/hidden": 1.4375,
"loss/logits": 0.19109681248664856,
"loss/reg": 6.041422238922678e-05,
"step": 292
},
{
"epoch": 0.036625,
"grad_norm": 1.932381510734558,
"grad_norm_var": 0.8300136192923785,
"learning_rate": 0.0001,
"loss": 1.1314,
"loss/crossentropy": 2.2319207191467285,
"loss/hidden": 0.9921875,
"loss/logits": 0.13856041431427002,
"loss/reg": 6.041810775059275e-05,
"step": 293
},
{
"epoch": 0.03675,
"grad_norm": 2.1218042373657227,
"grad_norm_var": 0.8563980373093443,
"learning_rate": 0.0001,
"loss": 1.1898,
"loss/crossentropy": 2.7033910751342773,
"loss/hidden": 1.03125,
"loss/logits": 0.15791726112365723,
"loss/reg": 6.0404745454434305e-05,
"step": 294
},
{
"epoch": 0.036875,
"grad_norm": 3.239748954772949,
"grad_norm_var": 0.8614170936653748,
"learning_rate": 0.0001,
"loss": 1.6186,
"loss/crossentropy": 2.3478281497955322,
"loss/hidden": 1.3671875,
"loss/logits": 0.2507687509059906,
"loss/reg": 6.039286745362915e-05,
"step": 295
},
{
"epoch": 0.037,
"grad_norm": 2.361431121826172,
"grad_norm_var": 0.8544814148079373,
"learning_rate": 0.0001,
"loss": 1.2822,
"loss/crossentropy": 2.4396111965179443,
"loss/hidden": 1.0859375,
"loss/logits": 0.19569119811058044,
"loss/reg": 6.0390335420379415e-05,
"step": 296
},
{
"epoch": 0.037125,
"grad_norm": 2.6921112537384033,
"grad_norm_var": 0.8432509023111928,
"learning_rate": 0.0001,
"loss": 1.3584,
"loss/crossentropy": 2.3235762119293213,
"loss/hidden": 1.15625,
"loss/logits": 0.20157676935195923,
"loss/reg": 6.037576531525701e-05,
"step": 297
},
{
"epoch": 0.03725,
"grad_norm": 2.2376601696014404,
"grad_norm_var": 0.8653611900667765,
"learning_rate": 0.0001,
"loss": 1.3703,
"loss/crossentropy": 2.441978693008423,
"loss/hidden": 1.1875,
"loss/logits": 0.1821848303079605,
"loss/reg": 6.036146805854514e-05,
"step": 298
},
{
"epoch": 0.037375,
"grad_norm": 2.5022082328796387,
"grad_norm_var": 0.8598019948407729,
"learning_rate": 0.0001,
"loss": 1.2909,
"loss/crossentropy": 2.4099972248077393,
"loss/hidden": 1.09375,
"loss/logits": 0.1965959370136261,
"loss/reg": 6.035445403540507e-05,
"step": 299
},
{
"epoch": 0.0375,
"grad_norm": 2.323599338531494,
"grad_norm_var": 0.8677455500426021,
"learning_rate": 0.0001,
"loss": 1.2301,
"loss/crossentropy": 2.714334011077881,
"loss/hidden": 1.0625,
"loss/logits": 0.16703477501869202,
"loss/reg": 6.034153193468228e-05,
"step": 300
},
{
"epoch": 0.037625,
"grad_norm": 2.902794361114502,
"grad_norm_var": 0.8254198045813945,
"learning_rate": 0.0001,
"loss": 1.287,
"loss/crossentropy": 2.5897319316864014,
"loss/hidden": 1.1171875,
"loss/logits": 0.1692187488079071,
"loss/reg": 6.032464443705976e-05,
"step": 301
},
{
"epoch": 0.03775,
"grad_norm": 2.455423355102539,
"grad_norm_var": 0.8207107650276014,
"learning_rate": 0.0001,
"loss": 1.3118,
"loss/crossentropy": 2.2553625106811523,
"loss/hidden": 1.15625,
"loss/logits": 0.15494795143604279,
"loss/reg": 6.031416342011653e-05,
"step": 302
},
{
"epoch": 0.037875,
"grad_norm": 2.70770001411438,
"grad_norm_var": 0.8131429553718594,
"learning_rate": 0.0001,
"loss": 1.3645,
"loss/crossentropy": 2.298628807067871,
"loss/hidden": 1.1875,
"loss/logits": 0.17642799019813538,
"loss/reg": 6.029937867424451e-05,
"step": 303
},
{
"epoch": 0.038,
"grad_norm": 2.4096872806549072,
"grad_norm_var": 0.8208490888498592,
"learning_rate": 0.0001,
"loss": 1.2573,
"loss/crossentropy": 2.6787161827087402,
"loss/hidden": 1.078125,
"loss/logits": 0.17861339449882507,
"loss/reg": 6.027881318004802e-05,
"step": 304
},
{
"epoch": 0.038125,
"grad_norm": 2.364800214767456,
"grad_norm_var": 0.8295471446711137,
"learning_rate": 0.0001,
"loss": 1.3251,
"loss/crossentropy": 2.351970911026001,
"loss/hidden": 1.140625,
"loss/logits": 0.18391045928001404,
"loss/reg": 6.026409027981572e-05,
"step": 305
},
{
"epoch": 0.03825,
"grad_norm": 2.0991923809051514,
"grad_norm_var": 0.8536240669336511,
"learning_rate": 0.0001,
"loss": 1.078,
"loss/crossentropy": 2.7187068462371826,
"loss/hidden": 0.9453125,
"loss/logits": 0.13205038011074066,
"loss/reg": 6.0248257796047255e-05,
"step": 306
},
{
"epoch": 0.038375,
"grad_norm": 2.7471582889556885,
"grad_norm_var": 0.8481018158238611,
"learning_rate": 0.0001,
"loss": 1.4035,
"loss/crossentropy": 2.1265523433685303,
"loss/hidden": 1.21875,
"loss/logits": 0.18416792154312134,
"loss/reg": 6.0230733652133495e-05,
"step": 307
},
{
"epoch": 0.0385,
"grad_norm": 2.2592687606811523,
"grad_norm_var": 0.11041007633642194,
"learning_rate": 0.0001,
"loss": 1.271,
"loss/crossentropy": 2.66719651222229,
"loss/hidden": 1.0859375,
"loss/logits": 0.184452086687088,
"loss/reg": 6.0217109421500936e-05,
"step": 308
},
{
"epoch": 0.038625,
"grad_norm": 2.2400615215301514,
"grad_norm_var": 0.09468951175299385,
"learning_rate": 0.0001,
"loss": 1.2348,
"loss/crossentropy": 2.3710193634033203,
"loss/hidden": 1.0625,
"loss/logits": 0.1717246174812317,
"loss/reg": 6.020214277668856e-05,
"step": 309
},
{
"epoch": 0.03875,
"grad_norm": 2.0783209800720215,
"grad_norm_var": 0.09687885973874776,
"learning_rate": 0.0001,
"loss": 1.2085,
"loss/crossentropy": 2.2699692249298096,
"loss/hidden": 1.03125,
"loss/logits": 0.17665645480155945,
"loss/reg": 6.018438944010995e-05,
"step": 310
},
{
"epoch": 0.038875,
"grad_norm": 2.077648162841797,
"grad_norm_var": 0.06299334570375853,
"learning_rate": 0.0001,
"loss": 1.2169,
"loss/crossentropy": 2.334127426147461,
"loss/hidden": 1.0625,
"loss/logits": 0.15378312766551971,
"loss/reg": 6.0161146393511444e-05,
"step": 311
},
{
"epoch": 0.039,
"grad_norm": 2.440629482269287,
"grad_norm_var": 0.06293910816862744,
"learning_rate": 0.0001,
"loss": 1.2956,
"loss/crossentropy": 2.791874408721924,
"loss/hidden": 1.1171875,
"loss/logits": 0.1777758002281189,
"loss/reg": 6.014638711349107e-05,
"step": 312
},
{
"epoch": 0.039125,
"grad_norm": 2.853940963745117,
"grad_norm_var": 0.07069242228717272,
"learning_rate": 0.0001,
"loss": 1.2688,
"loss/crossentropy": 2.5036516189575195,
"loss/hidden": 1.0859375,
"loss/logits": 0.18226328492164612,
"loss/reg": 6.013087840983644e-05,
"step": 313
},
{
"epoch": 0.03925,
"grad_norm": 3.287529230117798,
"grad_norm_var": 0.11423125477930943,
"learning_rate": 0.0001,
"loss": 1.2435,
"loss/crossentropy": 2.696265697479248,
"loss/hidden": 1.0703125,
"loss/logits": 0.17254707217216492,
"loss/reg": 6.011854929965921e-05,
"step": 314
},
{
"epoch": 0.039375,
"grad_norm": 3.1080963611602783,
"grad_norm_var": 0.1386158794861321,
"learning_rate": 0.0001,
"loss": 1.473,
"loss/crossentropy": 2.1882760524749756,
"loss/hidden": 1.25,
"loss/logits": 0.2224160134792328,
"loss/reg": 6.0103353462181985e-05,
"step": 315
},
{
"epoch": 0.0395,
"grad_norm": 2.7303977012634277,
"grad_norm_var": 0.13818442385569654,
"learning_rate": 0.0001,
"loss": 1.4029,
"loss/crossentropy": 2.361660957336426,
"loss/hidden": 1.2109375,
"loss/logits": 0.19139324128627777,
"loss/reg": 6.008424679748714e-05,
"step": 316
},
{
"epoch": 0.039625,
"grad_norm": 1.7651097774505615,
"grad_norm_var": 0.16520987140884788,
"learning_rate": 0.0001,
"loss": 1.0765,
"loss/crossentropy": 2.435858964920044,
"loss/hidden": 0.953125,
"loss/logits": 0.1227254569530487,
"loss/reg": 6.007165211485699e-05,
"step": 317
},
{
"epoch": 0.03975,
"grad_norm": 2.128772258758545,
"grad_norm_var": 0.17279926669385734,
"learning_rate": 0.0001,
"loss": 1.1848,
"loss/crossentropy": 2.334495782852173,
"loss/hidden": 1.0546875,
"loss/logits": 0.12953956425189972,
"loss/reg": 6.005321120028384e-05,
"step": 318
},
{
"epoch": 0.039875,
"grad_norm": 2.1308538913726807,
"grad_norm_var": 0.1742483958439737,
"learning_rate": 0.0001,
"loss": 1.3191,
"loss/crossentropy": 2.3873021602630615,
"loss/hidden": 1.125,
"loss/logits": 0.19348952174186707,
"loss/reg": 6.0041034885216504e-05,
"step": 319
},
{
"epoch": 0.04,
"grad_norm": 2.706742286682129,
"grad_norm_var": 0.17935140917835876,
"learning_rate": 0.0001,
"loss": 1.4123,
"loss/crossentropy": 2.5321033000946045,
"loss/hidden": 1.203125,
"loss/logits": 0.20852993428707123,
"loss/reg": 6.002993177389726e-05,
"step": 320
},
{
"epoch": 0.040125,
"grad_norm": 6.118154525756836,
"grad_norm_var": 1.0228689502418715,
"learning_rate": 0.0001,
"loss": 1.7298,
"loss/crossentropy": 2.457045316696167,
"loss/hidden": 1.515625,
"loss/logits": 0.2136228382587433,
"loss/reg": 6.001694418955594e-05,
"step": 321
},
{
"epoch": 0.04025,
"grad_norm": 3.091947317123413,
"grad_norm_var": 1.0084811477178388,
"learning_rate": 0.0001,
"loss": 1.6635,
"loss/crossentropy": 2.6943020820617676,
"loss/hidden": 1.40625,
"loss/logits": 0.25662127137184143,
"loss/reg": 6.0004946135450155e-05,
"step": 322
},
{
"epoch": 0.040375,
"grad_norm": 2.488391637802124,
"grad_norm_var": 1.0122566583255546,
"learning_rate": 0.0001,
"loss": 1.2065,
"loss/crossentropy": 2.646897792816162,
"loss/hidden": 1.0546875,
"loss/logits": 0.15123483538627625,
"loss/reg": 5.9991711168549955e-05,
"step": 323
},
{
"epoch": 0.0405,
"grad_norm": 3.0675456523895264,
"grad_norm_var": 1.0035307165437406,
"learning_rate": 0.0001,
"loss": 1.4832,
"loss/crossentropy": 2.4176406860351562,
"loss/hidden": 1.2421875,
"loss/logits": 0.24040505290031433,
"loss/reg": 5.9981128288200125e-05,
"step": 324
},
{
"epoch": 0.040625,
"grad_norm": 2.424546957015991,
"grad_norm_var": 0.9926314451715664,
"learning_rate": 0.0001,
"loss": 1.07,
"loss/crossentropy": 2.703134059906006,
"loss/hidden": 0.94140625,
"loss/logits": 0.12803316116333008,
"loss/reg": 5.997138941893354e-05,
"step": 325
},
{
"epoch": 0.04075,
"grad_norm": 2.9345507621765137,
"grad_norm_var": 0.9582126623175621,
"learning_rate": 0.0001,
"loss": 1.4247,
"loss/crossentropy": 2.8940789699554443,
"loss/hidden": 1.21875,
"loss/logits": 0.20539763569831848,
"loss/reg": 5.996019172016531e-05,
"step": 326
},
{
"epoch": 0.040875,
"grad_norm": 3.069572925567627,
"grad_norm_var": 0.9195850402401864,
"learning_rate": 0.0001,
"loss": 1.3896,
"loss/crossentropy": 2.4416871070861816,
"loss/hidden": 1.1875,
"loss/logits": 0.20154833793640137,
"loss/reg": 5.9947429690510035e-05,
"step": 327
},
{
"epoch": 0.041,
"grad_norm": 2.323606491088867,
"grad_norm_var": 0.9275566292830253,
"learning_rate": 0.0001,
"loss": 1.2888,
"loss/crossentropy": 2.811528444290161,
"loss/hidden": 1.1015625,
"loss/logits": 0.18662354350090027,
"loss/reg": 5.9936231991741806e-05,
"step": 328
},
{
"epoch": 0.041125,
"grad_norm": 3.1679723262786865,
"grad_norm_var": 0.9322370885273564,
"learning_rate": 0.0001,
"loss": 1.5559,
"loss/crossentropy": 2.3170981407165527,
"loss/hidden": 1.3046875,
"loss/logits": 0.2506353557109833,
"loss/reg": 5.991987563902512e-05,
"step": 329
},
{
"epoch": 0.04125,
"grad_norm": 2.7683303356170654,
"grad_norm_var": 0.9228798875820224,
"learning_rate": 0.0001,
"loss": 1.3127,
"loss/crossentropy": 2.51680850982666,
"loss/hidden": 1.1171875,
"loss/logits": 0.1949077993631363,
"loss/reg": 5.990756835672073e-05,
"step": 330
},
{
"epoch": 0.041375,
"grad_norm": 2.4825031757354736,
"grad_norm_var": 0.9280253827718864,
"learning_rate": 0.0001,
"loss": 1.3408,
"loss/crossentropy": 2.605055332183838,
"loss/hidden": 1.140625,
"loss/logits": 0.19955970346927643,
"loss/reg": 5.989522469462827e-05,
"step": 331
},
{
"epoch": 0.0415,
"grad_norm": 3.2399041652679443,
"grad_norm_var": 0.9369785308922095,
"learning_rate": 0.0001,
"loss": 1.5753,
"loss/crossentropy": 2.7269279956817627,
"loss/hidden": 1.3515625,
"loss/logits": 0.22315430641174316,
"loss/reg": 5.988113844068721e-05,
"step": 332
},
{
"epoch": 0.041625,
"grad_norm": 2.8936927318573,
"grad_norm_var": 0.8504314928241191,
"learning_rate": 0.0001,
"loss": 1.3222,
"loss/crossentropy": 2.812412738800049,
"loss/hidden": 1.140625,
"loss/logits": 0.1809367835521698,
"loss/reg": 5.9867059462703764e-05,
"step": 333
},
{
"epoch": 0.04175,
"grad_norm": 2.432213068008423,
"grad_norm_var": 0.8233723477256942,
"learning_rate": 0.0001,
"loss": 1.4094,
"loss/crossentropy": 2.6377694606781006,
"loss/hidden": 1.203125,
"loss/logits": 0.20563456416130066,
"loss/reg": 5.9853711718460545e-05,
"step": 334
},
{
"epoch": 0.041875,
"grad_norm": 2.422299861907959,
"grad_norm_var": 0.7965082638815336,
"learning_rate": 0.0001,
"loss": 1.2328,
"loss/crossentropy": 2.5352189540863037,
"loss/hidden": 1.078125,
"loss/logits": 0.15405428409576416,
"loss/reg": 5.984482049825601e-05,
"step": 335
},
{
"epoch": 0.042,
"grad_norm": 2.703420877456665,
"grad_norm_var": 0.7966286375145801,
"learning_rate": 0.0001,
"loss": 1.2981,
"loss/crossentropy": 2.525949716567993,
"loss/hidden": 1.1015625,
"loss/logits": 0.1959662139415741,
"loss/reg": 5.983649680274539e-05,
"step": 336
},
{
"epoch": 0.042125,
"grad_norm": 3.625760078430176,
"grad_norm_var": 0.14094485019601447,
"learning_rate": 0.0001,
"loss": 1.6517,
"loss/crossentropy": 1.9824917316436768,
"loss/hidden": 1.3828125,
"loss/logits": 0.2682979702949524,
"loss/reg": 5.9825455537065864e-05,
"step": 337
},
{
"epoch": 0.04225,
"grad_norm": 2.2066762447357178,
"grad_norm_var": 0.1579467344221198,
"learning_rate": 0.0001,
"loss": 1.1768,
"loss/crossentropy": 2.5151102542877197,
"loss/hidden": 1.0078125,
"loss/logits": 0.16843904554843903,
"loss/reg": 5.981199865345843e-05,
"step": 338
},
{
"epoch": 0.042375,
"grad_norm": 2.961968421936035,
"grad_norm_var": 0.15445451920782696,
"learning_rate": 0.0001,
"loss": 1.5446,
"loss/crossentropy": 2.397102117538452,
"loss/hidden": 1.3046875,
"loss/logits": 0.23934724926948547,
"loss/reg": 5.979971319902688e-05,
"step": 339
},
{
"epoch": 0.0425,
"grad_norm": 2.4696779251098633,
"grad_norm_var": 0.15509145555751214,
"learning_rate": 0.0001,
"loss": 1.2907,
"loss/crossentropy": 2.518648624420166,
"loss/hidden": 1.1171875,
"loss/logits": 0.17289261519908905,
"loss/reg": 5.979237175779417e-05,
"step": 340
},
{
"epoch": 0.042625,
"grad_norm": 2.2886741161346436,
"grad_norm_var": 0.16228478040589658,
"learning_rate": 0.0001,
"loss": 1.2915,
"loss/crossentropy": 2.4755570888519287,
"loss/hidden": 1.109375,
"loss/logits": 0.18152545392513275,
"loss/reg": 5.978640547255054e-05,
"step": 341
},
{
"epoch": 0.04275,
"grad_norm": 2.4154622554779053,
"grad_norm_var": 0.16631279956205466,
"learning_rate": 0.0001,
"loss": 1.1361,
"loss/crossentropy": 2.620903730392456,
"loss/hidden": 0.9921875,
"loss/logits": 0.1432739496231079,
"loss/reg": 5.977362161502242e-05,
"step": 342
},
{
"epoch": 0.042875,
"grad_norm": 3.9107778072357178,
"grad_norm_var": 0.25008606934497735,
"learning_rate": 0.0001,
"loss": 1.6206,
"loss/crossentropy": 3.3820858001708984,
"loss/hidden": 1.40625,
"loss/logits": 0.21375682950019836,
"loss/reg": 5.976331885904074e-05,
"step": 343
},
{
"epoch": 0.043,
"grad_norm": 2.2201833724975586,
"grad_norm_var": 0.25690416036597197,
"learning_rate": 0.0001,
"loss": 1.2446,
"loss/crossentropy": 2.467216730117798,
"loss/hidden": 1.0625,
"loss/logits": 0.18146604299545288,
"loss/reg": 5.975304884486832e-05,
"step": 344
},
{
"epoch": 0.043125,
"grad_norm": 2.1915907859802246,
"grad_norm_var": 0.26377805805320803,
"learning_rate": 0.0001,
"loss": 1.4337,
"loss/crossentropy": 2.3638522624969482,
"loss/hidden": 1.203125,
"loss/logits": 0.22996577620506287,
"loss/reg": 5.974585292278789e-05,
"step": 345
},
{
"epoch": 0.04325,
"grad_norm": 2.2508416175842285,
"grad_norm_var": 0.27594342104869135,
"learning_rate": 0.0001,
"loss": 1.1804,
"loss/crossentropy": 2.5332260131835938,
"loss/hidden": 1.0234375,
"loss/logits": 0.15640094876289368,
"loss/reg": 5.973771112621762e-05,
"step": 346
},
{
"epoch": 0.043375,
"grad_norm": 2.0090150833129883,
"grad_norm_var": 0.30177518099136,
"learning_rate": 0.0001,
"loss": 1.2994,
"loss/crossentropy": 2.511950731277466,
"loss/hidden": 1.125,
"loss/logits": 0.17384442687034607,
"loss/reg": 5.9728798078140244e-05,
"step": 347
},
{
"epoch": 0.0435,
"grad_norm": 2.7306134700775146,
"grad_norm_var": 0.277258656834267,
"learning_rate": 0.0001,
"loss": 1.4199,
"loss/crossentropy": 2.6389760971069336,
"loss/hidden": 1.1875,
"loss/logits": 0.2318291962146759,
"loss/reg": 5.9719615819631144e-05,
"step": 348
},
{
"epoch": 0.043625,
"grad_norm": 2.270148515701294,
"grad_norm_var": 0.27783213891549774,
"learning_rate": 0.0001,
"loss": 1.5484,
"loss/crossentropy": 2.312831163406372,
"loss/hidden": 1.2890625,
"loss/logits": 0.2587454915046692,
"loss/reg": 5.970869824523106e-05,
"step": 349
},
{
"epoch": 0.04375,
"grad_norm": 2.0988070964813232,
"grad_norm_var": 0.2908751450016543,
"learning_rate": 0.0001,
"loss": 1.2681,
"loss/crossentropy": 2.378908634185791,
"loss/hidden": 1.109375,
"loss/logits": 0.1581004559993744,
"loss/reg": 5.970033089397475e-05,
"step": 350
},
{
"epoch": 0.043875,
"grad_norm": 2.045546770095825,
"grad_norm_var": 0.30608582246859417,
"learning_rate": 0.0001,
"loss": 1.1063,
"loss/crossentropy": 2.4011952877044678,
"loss/hidden": 0.96875,
"loss/logits": 0.13691341876983643,
"loss/reg": 5.969877020106651e-05,
"step": 351
},
{
"epoch": 0.044,
"grad_norm": 2.9582409858703613,
"grad_norm_var": 0.316207957574548,
"learning_rate": 0.0001,
"loss": 1.2072,
"loss/crossentropy": 2.643101215362549,
"loss/hidden": 1.046875,
"loss/logits": 0.15975871682167053,
"loss/reg": 5.9694295487133786e-05,
"step": 352
},
{
"epoch": 0.044125,
"grad_norm": 2.125020742416382,
"grad_norm_var": 0.23988746234485703,
"learning_rate": 0.0001,
"loss": 1.2268,
"loss/crossentropy": 2.5923550128936768,
"loss/hidden": 1.0390625,
"loss/logits": 0.18714120984077454,
"loss/reg": 5.968381810816936e-05,
"step": 353
},
{
"epoch": 0.04425,
"grad_norm": 2.2348685264587402,
"grad_norm_var": 0.2390334750954897,
"learning_rate": 0.0001,
"loss": 1.3948,
"loss/crossentropy": 2.549100637435913,
"loss/hidden": 1.1953125,
"loss/logits": 0.198894202709198,
"loss/reg": 5.9669990150723606e-05,
"step": 354
},
{
"epoch": 0.044375,
"grad_norm": 2.6807351112365723,
"grad_norm_var": 0.2247355561703434,
"learning_rate": 0.0001,
"loss": 1.5721,
"loss/crossentropy": 2.2256884574890137,
"loss/hidden": 1.3046875,
"loss/logits": 0.26683151721954346,
"loss/reg": 5.966486787656322e-05,
"step": 355
},
{
"epoch": 0.0445,
"grad_norm": 3.1524059772491455,
"grad_norm_var": 0.2573648537337417,
"learning_rate": 0.0001,
"loss": 1.5458,
"loss/crossentropy": 2.4026124477386475,
"loss/hidden": 1.3203125,
"loss/logits": 0.22484509646892548,
"loss/reg": 5.965128730167635e-05,
"step": 356
},
{
"epoch": 0.044625,
"grad_norm": 3.806107759475708,
"grad_norm_var": 0.3637951956534662,
"learning_rate": 0.0001,
"loss": 1.2257,
"loss/crossentropy": 2.534790277481079,
"loss/hidden": 1.1015625,
"loss/logits": 0.12353114783763885,
"loss/reg": 5.9637932281475514e-05,
"step": 357
},
{
"epoch": 0.04475,
"grad_norm": 2.6499619483947754,
"grad_norm_var": 0.36243857175732047,
"learning_rate": 0.0001,
"loss": 1.2577,
"loss/crossentropy": 2.786536931991577,
"loss/hidden": 1.078125,
"loss/logits": 0.17896610498428345,
"loss/reg": 5.962959403404966e-05,
"step": 358
},
{
"epoch": 0.044875,
"grad_norm": 2.750371217727661,
"grad_norm_var": 0.24122897908522703,
"learning_rate": 0.0001,
"loss": 1.3213,
"loss/crossentropy": 2.5112698078155518,
"loss/hidden": 1.125,
"loss/logits": 0.19569161534309387,
"loss/reg": 5.961711940472014e-05,
"step": 359
},
{
"epoch": 0.045,
"grad_norm": 2.4145219326019287,
"grad_norm_var": 0.23605635737508593,
"learning_rate": 0.0001,
"loss": 1.4067,
"loss/crossentropy": 2.4327914714813232,
"loss/hidden": 1.171875,
"loss/logits": 0.23425719141960144,
"loss/reg": 5.960506314295344e-05,
"step": 360
},
{
"epoch": 0.045125,
"grad_norm": 2.7820589542388916,
"grad_norm_var": 0.2317516785903725,
"learning_rate": 0.0001,
"loss": 1.5833,
"loss/crossentropy": 2.6201419830322266,
"loss/hidden": 1.3359375,
"loss/logits": 0.2468121349811554,
"loss/reg": 5.959635382168926e-05,
"step": 361
},
{
"epoch": 0.04525,
"grad_norm": 3.0179331302642822,
"grad_norm_var": 0.23691283979908515,
"learning_rate": 0.0001,
"loss": 1.3921,
"loss/crossentropy": 2.728665351867676,
"loss/hidden": 1.1953125,
"loss/logits": 0.19617268443107605,
"loss/reg": 5.958346446277574e-05,
"step": 362
},
{
"epoch": 0.045375,
"grad_norm": 2.577760934829712,
"grad_norm_var": 0.21171492452191767,
"learning_rate": 0.0001,
"loss": 1.3343,
"loss/crossentropy": 2.4396915435791016,
"loss/hidden": 1.140625,
"loss/logits": 0.19306717813014984,
"loss/reg": 5.957194298389368e-05,
"step": 363
},
{
"epoch": 0.0455,
"grad_norm": 2.2478973865509033,
"grad_norm_var": 0.22066793284785244,
"learning_rate": 0.0001,
"loss": 1.2107,
"loss/crossentropy": 2.5703125,
"loss/hidden": 1.046875,
"loss/logits": 0.16320618987083435,
"loss/reg": 5.955886445008218e-05,
"step": 364
},
{
"epoch": 0.045625,
"grad_norm": 2.8303184509277344,
"grad_norm_var": 0.21465200545435412,
"learning_rate": 0.0001,
"loss": 1.3184,
"loss/crossentropy": 2.5793418884277344,
"loss/hidden": 1.1171875,
"loss/logits": 0.20061752200126648,
"loss/reg": 5.954650623607449e-05,
"step": 365
},
{
"epoch": 0.04575,
"grad_norm": 2.3407225608825684,
"grad_norm_var": 0.20058607793752117,
"learning_rate": 0.0001,
"loss": 1.2154,
"loss/crossentropy": 2.5118396282196045,
"loss/hidden": 1.0546875,
"loss/logits": 0.16011780500411987,
"loss/reg": 5.9531517763389274e-05,
"step": 366
},
{
"epoch": 0.045875,
"grad_norm": 2.9164462089538574,
"grad_norm_var": 0.17624459628143327,
"learning_rate": 0.0001,
"loss": 2.3079,
"loss/crossentropy": 2.530949831008911,
"loss/hidden": 1.7890625,
"loss/logits": 0.5182523727416992,
"loss/reg": 5.9519883507164195e-05,
"step": 367
},
{
"epoch": 0.046,
"grad_norm": 2.6031134128570557,
"grad_norm_var": 0.17274354994838556,
"learning_rate": 0.0001,
"loss": 1.3529,
"loss/crossentropy": 2.5211331844329834,
"loss/hidden": 1.140625,
"loss/logits": 0.21172133088111877,
"loss/reg": 5.950441482127644e-05,
"step": 368
},
{
"epoch": 0.046125,
"grad_norm": 2.2432241439819336,
"grad_norm_var": 0.16462358021652007,
"learning_rate": 0.0001,
"loss": 1.2433,
"loss/crossentropy": 2.469212055206299,
"loss/hidden": 1.0625,
"loss/logits": 0.18018998205661774,
"loss/reg": 5.9490499552339315e-05,
"step": 369
},
{
"epoch": 0.04625,
"grad_norm": 3.287365674972534,
"grad_norm_var": 0.16815977224474163,
"learning_rate": 0.0001,
"loss": 1.3311,
"loss/crossentropy": 2.7330899238586426,
"loss/hidden": 1.140625,
"loss/logits": 0.18986304104328156,
"loss/reg": 5.9471924032550305e-05,
"step": 370
},
{
"epoch": 0.046375,
"grad_norm": 2.6555063724517822,
"grad_norm_var": 0.16849581874392333,
"learning_rate": 0.0001,
"loss": 1.3069,
"loss/crossentropy": 2.4908649921417236,
"loss/hidden": 1.1171875,
"loss/logits": 0.1890988051891327,
"loss/reg": 5.9457710449351e-05,
"step": 371
},
{
"epoch": 0.0465,
"grad_norm": 2.2832915782928467,
"grad_norm_var": 0.1710711381355336,
"learning_rate": 0.0001,
"loss": 1.251,
"loss/crossentropy": 2.6485414505004883,
"loss/hidden": 1.0859375,
"loss/logits": 0.16444087028503418,
"loss/reg": 5.9437123127281666e-05,
"step": 372
},
{
"epoch": 0.046625,
"grad_norm": 1.9312299489974976,
"grad_norm_var": 0.11748808484953574,
"learning_rate": 0.0001,
"loss": 1.3104,
"loss/crossentropy": 2.4345285892486572,
"loss/hidden": 1.125,
"loss/logits": 0.1848057061433792,
"loss/reg": 5.941649214946665e-05,
"step": 373
},
{
"epoch": 0.04675,
"grad_norm": 2.2687668800354004,
"grad_norm_var": 0.12381368567199799,
"learning_rate": 0.0001,
"loss": 1.2697,
"loss/crossentropy": 2.514896869659424,
"loss/hidden": 1.0859375,
"loss/logits": 0.18321493268013,
"loss/reg": 5.939120819675736e-05,
"step": 374
},
{
"epoch": 0.046875,
"grad_norm": 2.1616384983062744,
"grad_norm_var": 0.131467626574521,
"learning_rate": 0.0001,
"loss": 1.2405,
"loss/crossentropy": 2.698112964630127,
"loss/hidden": 1.0625,
"loss/logits": 0.1773754358291626,
"loss/reg": 5.936667002970353e-05,
"step": 375
},
{
"epoch": 0.047,
"grad_norm": 2.6922011375427246,
"grad_norm_var": 0.13182201209023336,
"learning_rate": 0.0001,
"loss": 1.3426,
"loss/crossentropy": 2.538865327835083,
"loss/hidden": 1.15625,
"loss/logits": 0.1857489049434662,
"loss/reg": 5.9345431509427726e-05,
"step": 376
},
{
"epoch": 0.047125,
"grad_norm": 2.2630982398986816,
"grad_norm_var": 0.13276797957838743,
"learning_rate": 0.0001,
"loss": 1.2869,
"loss/crossentropy": 2.4644358158111572,
"loss/hidden": 1.109375,
"loss/logits": 0.17694343626499176,
"loss/reg": 5.933275315328501e-05,
"step": 377
},
{
"epoch": 0.04725,
"grad_norm": 2.479646682739258,
"grad_norm_var": 0.11514238570119009,
"learning_rate": 0.0001,
"loss": 1.1618,
"loss/crossentropy": 2.5582141876220703,
"loss/hidden": 1.015625,
"loss/logits": 0.14554372429847717,
"loss/reg": 5.931046689511277e-05,
"step": 378
},
{
"epoch": 0.047375,
"grad_norm": 2.466947317123413,
"grad_norm_var": 0.114559834161389,
"learning_rate": 0.0001,
"loss": 1.48,
"loss/crossentropy": 2.4128925800323486,
"loss/hidden": 1.2421875,
"loss/logits": 0.23724211752414703,
"loss/reg": 5.929026156081818e-05,
"step": 379
},
{
"epoch": 0.0475,
"grad_norm": 2.538424015045166,
"grad_norm_var": 0.11086504579424972,
"learning_rate": 0.0001,
"loss": 1.4136,
"loss/crossentropy": 2.0768887996673584,
"loss/hidden": 1.234375,
"loss/logits": 0.17867109179496765,
"loss/reg": 5.92764736211393e-05,
"step": 380
},
{
"epoch": 0.047625,
"grad_norm": 2.654524564743042,
"grad_norm_var": 0.1049983644074643,
"learning_rate": 0.0001,
"loss": 1.3221,
"loss/crossentropy": 2.1216413974761963,
"loss/hidden": 1.15625,
"loss/logits": 0.16521546244621277,
"loss/reg": 5.926107769482769e-05,
"step": 381
},
{
"epoch": 0.04775,
"grad_norm": 2.237818717956543,
"grad_norm_var": 0.10766217194697697,
"learning_rate": 0.0001,
"loss": 1.2236,
"loss/crossentropy": 2.6475207805633545,
"loss/hidden": 1.0546875,
"loss/logits": 0.16833502054214478,
"loss/reg": 5.924178913119249e-05,
"step": 382
},
{
"epoch": 0.047875,
"grad_norm": 2.7116799354553223,
"grad_norm_var": 0.09837235459497572,
"learning_rate": 0.0001,
"loss": 1.3102,
"loss/crossentropy": 2.6615209579467773,
"loss/hidden": 1.1171875,
"loss/logits": 0.1924624741077423,
"loss/reg": 5.921960837440565e-05,
"step": 383
},
{
"epoch": 0.048,
"grad_norm": 2.5439391136169434,
"grad_norm_var": 0.09752047633307553,
"learning_rate": 0.0001,
"loss": 1.3258,
"loss/crossentropy": 2.10198974609375,
"loss/hidden": 1.15625,
"loss/logits": 0.1689702719449997,
"loss/reg": 5.919525210629217e-05,
"step": 384
},
{
"epoch": 0.048125,
"grad_norm": 2.617921829223633,
"grad_norm_var": 0.09528014676361156,
"learning_rate": 0.0001,
"loss": 1.61,
"loss/crossentropy": 2.445833206176758,
"loss/hidden": 1.328125,
"loss/logits": 0.28133296966552734,
"loss/reg": 5.917950693401508e-05,
"step": 385
},
{
"epoch": 0.04825,
"grad_norm": 2.514899730682373,
"grad_norm_var": 0.05015297139615639,
"learning_rate": 0.0001,
"loss": 1.1964,
"loss/crossentropy": 2.4887778759002686,
"loss/hidden": 1.0390625,
"loss/logits": 0.1567072868347168,
"loss/reg": 5.916162990615703e-05,
"step": 386
},
{
"epoch": 0.048375,
"grad_norm": 2.1075565814971924,
"grad_norm_var": 0.053089324895933446,
"learning_rate": 0.0001,
"loss": 1.0537,
"loss/crossentropy": 2.4045815467834473,
"loss/hidden": 0.921875,
"loss/logits": 0.1312153935432434,
"loss/reg": 5.914089342695661e-05,
"step": 387
},
{
"epoch": 0.0485,
"grad_norm": 2.475404739379883,
"grad_norm_var": 0.05228874002812057,
"learning_rate": 0.0001,
"loss": 1.3003,
"loss/crossentropy": 2.591153383255005,
"loss/hidden": 1.109375,
"loss/logits": 0.19037862122058868,
"loss/reg": 5.9116682677995414e-05,
"step": 388
},
{
"epoch": 0.048625,
"grad_norm": 4.638079643249512,
"grad_norm_var": 0.33504973194641535,
"learning_rate": 0.0001,
"loss": 1.7407,
"loss/crossentropy": 2.992236852645874,
"loss/hidden": 1.4609375,
"loss/logits": 0.2792096734046936,
"loss/reg": 5.9097284974996e-05,
"step": 389
},
{
"epoch": 0.04875,
"grad_norm": 2.4662392139434814,
"grad_norm_var": 0.32913998556907487,
"learning_rate": 0.0001,
"loss": 1.1454,
"loss/crossentropy": 2.9239540100097656,
"loss/hidden": 0.9921875,
"loss/logits": 0.15260137617588043,
"loss/reg": 5.907983722863719e-05,
"step": 390
},
{
"epoch": 0.048875,
"grad_norm": 2.439119338989258,
"grad_norm_var": 0.31780327994806234,
"learning_rate": 0.0001,
"loss": 1.3638,
"loss/crossentropy": 2.450254440307617,
"loss/hidden": 1.1640625,
"loss/logits": 0.19915927946567535,
"loss/reg": 5.9063841035822406e-05,
"step": 391
},
{
"epoch": 0.049,
"grad_norm": 2.3475067615509033,
"grad_norm_var": 0.3217026075593497,
"learning_rate": 0.0001,
"loss": 1.533,
"loss/crossentropy": 2.617830753326416,
"loss/hidden": 1.265625,
"loss/logits": 0.26678475737571716,
"loss/reg": 5.90429590374697e-05,
"step": 392
},
{
"epoch": 0.049125,
"grad_norm": 4.364901065826416,
"grad_norm_var": 0.5050899240629005,
"learning_rate": 0.0001,
"loss": 1.4632,
"loss/crossentropy": 2.4607560634613037,
"loss/hidden": 1.2421875,
"loss/logits": 0.22039487957954407,
"loss/reg": 5.902666089241393e-05,
"step": 393
},
{
"epoch": 0.04925,
"grad_norm": 2.338758707046509,
"grad_norm_var": 0.5109449021123245,
"learning_rate": 0.0001,
"loss": 1.2995,
"loss/crossentropy": 2.6618576049804688,
"loss/hidden": 1.1171875,
"loss/logits": 0.1817541867494583,
"loss/reg": 5.9010566474171355e-05,
"step": 394
},
{
"epoch": 0.049375,
"grad_norm": 3.5642833709716797,
"grad_norm_var": 0.549694181009107,
"learning_rate": 0.0001,
"loss": 1.3152,
"loss/crossentropy": 2.300379753112793,
"loss/hidden": 1.1171875,
"loss/logits": 0.1974020004272461,
"loss/reg": 5.8987676311517134e-05,
"step": 395
},
{
"epoch": 0.0495,
"grad_norm": 2.1328978538513184,
"grad_norm_var": 0.573308372527261,
"learning_rate": 0.0001,
"loss": 1.244,
"loss/crossentropy": 2.4386301040649414,
"loss/hidden": 1.0625,
"loss/logits": 0.18090221285820007,
"loss/reg": 5.8964946219930425e-05,
"step": 396
},
{
"epoch": 0.049625,
"grad_norm": 3.0894107818603516,
"grad_norm_var": 0.5790289690992334,
"learning_rate": 0.0001,
"loss": 1.5661,
"loss/crossentropy": 2.365107297897339,
"loss/hidden": 1.3671875,
"loss/logits": 0.1983477920293808,
"loss/reg": 5.8950212405761704e-05,
"step": 397
},
{
"epoch": 0.04975,
"grad_norm": 3.194427967071533,
"grad_norm_var": 0.566188494588774,
"learning_rate": 0.0001,
"loss": 1.4269,
"loss/crossentropy": 2.384216547012329,
"loss/hidden": 1.21875,
"loss/logits": 0.20751546323299408,
"loss/reg": 5.8928319049300626e-05,
"step": 398
},
{
"epoch": 0.049875,
"grad_norm": 2.5108933448791504,
"grad_norm_var": 0.5723226037333423,
"learning_rate": 0.0001,
"loss": 1.2127,
"loss/crossentropy": 2.5466771125793457,
"loss/hidden": 1.0546875,
"loss/logits": 0.1574660688638687,
"loss/reg": 5.891324326512404e-05,
"step": 399
},
{
"epoch": 0.05,
"grad_norm": 2.9769773483276367,
"grad_norm_var": 0.5672869916808385,
"learning_rate": 0.0001,
"loss": 1.3045,
"loss/crossentropy": 2.7223000526428223,
"loss/hidden": 1.1171875,
"loss/logits": 0.18677057325839996,
"loss/reg": 5.889027670491487e-05,
"step": 400
},
{
"epoch": 0.050125,
"grad_norm": 3.31915283203125,
"grad_norm_var": 0.5752734489563172,
"learning_rate": 0.0001,
"loss": 1.2639,
"loss/crossentropy": 2.4886364936828613,
"loss/hidden": 1.078125,
"loss/logits": 0.1851940155029297,
"loss/reg": 5.887265797355212e-05,
"step": 401
},
{
"epoch": 0.05025,
"grad_norm": 1.8946937322616577,
"grad_norm_var": 0.6315760522485537,
"learning_rate": 0.0001,
"loss": 1.2326,
"loss/crossentropy": 2.414213180541992,
"loss/hidden": 1.0703125,
"loss/logits": 0.16165336966514587,
"loss/reg": 5.885552673134953e-05,
"step": 402
},
{
"epoch": 0.050375,
"grad_norm": 2.5370404720306396,
"grad_norm_var": 0.5996572790739425,
"learning_rate": 0.0001,
"loss": 1.5079,
"loss/crossentropy": 2.3421835899353027,
"loss/hidden": 1.28125,
"loss/logits": 0.22602099180221558,
"loss/reg": 5.88419679843355e-05,
"step": 403
},
{
"epoch": 0.0505,
"grad_norm": 2.4215445518493652,
"grad_norm_var": 0.6028382899137373,
"learning_rate": 0.0001,
"loss": 1.4975,
"loss/crossentropy": 2.7361152172088623,
"loss/hidden": 1.2421875,
"loss/logits": 0.25468122959136963,
"loss/reg": 5.88247858104296e-05,
"step": 404
},
{
"epoch": 0.050625,
"grad_norm": 2.049978733062744,
"grad_norm_var": 0.4181645547932513,
"learning_rate": 0.0001,
"loss": 1.1088,
"loss/crossentropy": 2.350353717803955,
"loss/hidden": 0.953125,
"loss/logits": 0.15509989857673645,
"loss/reg": 5.880888784304261e-05,
"step": 405
},
{
"epoch": 0.05075,
"grad_norm": 2.7967936992645264,
"grad_norm_var": 0.41345734870287976,
"learning_rate": 0.0001,
"loss": 1.3869,
"loss/crossentropy": 2.5875766277313232,
"loss/hidden": 1.171875,
"loss/logits": 0.21445012092590332,
"loss/reg": 5.8793633797904477e-05,
"step": 406
},
{
"epoch": 0.050875,
"grad_norm": 2.169900894165039,
"grad_norm_var": 0.429098064205416,
"learning_rate": 0.0001,
"loss": 1.0776,
"loss/crossentropy": 2.398125410079956,
"loss/hidden": 0.93359375,
"loss/logits": 0.14346018433570862,
"loss/reg": 5.877741932636127e-05,
"step": 407
},
{
"epoch": 0.051,
"grad_norm": 2.5045695304870605,
"grad_norm_var": 0.4225916301522199,
"learning_rate": 0.0001,
"loss": 1.5355,
"loss/crossentropy": 2.1697590351104736,
"loss/hidden": 1.328125,
"loss/logits": 0.20677754282951355,
"loss/reg": 5.876670911675319e-05,
"step": 408
},
{
"epoch": 0.051125,
"grad_norm": 18.23008918762207,
"grad_norm_var": 15.43871781968465,
"learning_rate": 0.0001,
"loss": 1.4882,
"loss/crossentropy": 2.602886438369751,
"loss/hidden": 1.3046875,
"loss/logits": 0.18289120495319366,
"loss/reg": 5.874884300283156e-05,
"step": 409
},
{
"epoch": 0.05125,
"grad_norm": 2.8436660766601562,
"grad_norm_var": 15.369190103974788,
"learning_rate": 0.0001,
"loss": 1.3294,
"loss/crossentropy": 2.4684174060821533,
"loss/hidden": 1.140625,
"loss/logits": 0.18818634748458862,
"loss/reg": 5.873553891433403e-05,
"step": 410
},
{
"epoch": 0.051375,
"grad_norm": 2.2729334831237793,
"grad_norm_var": 15.486411427985377,
"learning_rate": 0.0001,
"loss": 1.2331,
"loss/crossentropy": 2.550140857696533,
"loss/hidden": 1.0390625,
"loss/logits": 0.19348369538784027,
"loss/reg": 5.8720732340589166e-05,
"step": 411
},
{
"epoch": 0.0515,
"grad_norm": 2.5612359046936035,
"grad_norm_var": 15.416427881560285,
"learning_rate": 0.0001,
"loss": 1.3333,
"loss/crossentropy": 2.4774818420410156,
"loss/hidden": 1.125,
"loss/logits": 0.2077203392982483,
"loss/reg": 5.8710702433018014e-05,
"step": 412
},
{
"epoch": 0.051625,
"grad_norm": 4.02579927444458,
"grad_norm_var": 15.409250289477422,
"learning_rate": 0.0001,
"loss": 1.507,
"loss/crossentropy": 2.555722713470459,
"loss/hidden": 1.3046875,
"loss/logits": 0.20171231031417847,
"loss/reg": 5.870195309398696e-05,
"step": 413
},
{
"epoch": 0.05175,
"grad_norm": 2.443574905395508,
"grad_norm_var": 15.489530544756628,
"learning_rate": 0.0001,
"loss": 1.2774,
"loss/crossentropy": 2.706422805786133,
"loss/hidden": 1.09375,
"loss/logits": 0.1831112802028656,
"loss/reg": 5.8690613514045253e-05,
"step": 414
},
{
"epoch": 0.051875,
"grad_norm": 2.079418897628784,
"grad_norm_var": 15.563674426313279,
"learning_rate": 0.0001,
"loss": 1.1798,
"loss/crossentropy": 2.6763839721679688,
"loss/hidden": 1.03125,
"loss/logits": 0.14800235629081726,
"loss/reg": 5.8675475884228945e-05,
"step": 415
},
{
"epoch": 0.052,
"grad_norm": 2.7786471843719482,
"grad_norm_var": 15.581826938638233,
"learning_rate": 0.0001,
"loss": 1.2465,
"loss/crossentropy": 2.6709306240081787,
"loss/hidden": 1.078125,
"loss/logits": 0.1678304374217987,
"loss/reg": 5.866462379344739e-05,
"step": 416
},
{
"epoch": 0.052125,
"grad_norm": 2.770376443862915,
"grad_norm_var": 15.618130403520784,
"learning_rate": 0.0001,
"loss": 1.3111,
"loss/crossentropy": 2.646826982498169,
"loss/hidden": 1.1328125,
"loss/logits": 0.1777157187461853,
"loss/reg": 5.865520142833702e-05,
"step": 417
},
{
"epoch": 0.05225,
"grad_norm": 2.092414617538452,
"grad_norm_var": 15.57762685735369,
"learning_rate": 0.0001,
"loss": 1.3353,
"loss/crossentropy": 2.62361741065979,
"loss/hidden": 1.15625,
"loss/logits": 0.17848479747772217,
"loss/reg": 5.8638761402107775e-05,
"step": 418
},
{
"epoch": 0.052375,
"grad_norm": 2.05226731300354,
"grad_norm_var": 15.656891853986265,
"learning_rate": 0.0001,
"loss": 1.14,
"loss/crossentropy": 2.697723865509033,
"loss/hidden": 0.9921875,
"loss/logits": 0.14726917445659637,
"loss/reg": 5.862316902494058e-05,
"step": 419
},
{
"epoch": 0.0525,
"grad_norm": 2.6924796104431152,
"grad_norm_var": 15.622310414474152,
"learning_rate": 0.0001,
"loss": 1.404,
"loss/crossentropy": 2.601827383041382,
"loss/hidden": 1.2109375,
"loss/logits": 0.19246245920658112,
"loss/reg": 5.860950841451995e-05,
"step": 420
},
{
"epoch": 0.052625,
"grad_norm": 5.301983833312988,
"grad_norm_var": 15.644682914862404,
"learning_rate": 0.0001,
"loss": 1.4862,
"loss/crossentropy": 2.6217854022979736,
"loss/hidden": 1.296875,
"loss/logits": 0.18871337175369263,
"loss/reg": 5.8600846386980265e-05,
"step": 421
},
{
"epoch": 0.05275,
"grad_norm": 2.114091634750366,
"grad_norm_var": 15.758396712898662,
"learning_rate": 0.0001,
"loss": 1.2033,
"loss/crossentropy": 2.5663623809814453,
"loss/hidden": 1.0390625,
"loss/logits": 0.16362521052360535,
"loss/reg": 5.8592915593180805e-05,
"step": 422
},
{
"epoch": 0.052875,
"grad_norm": 2.757091999053955,
"grad_norm_var": 15.661455859551703,
"learning_rate": 0.0001,
"loss": 1.1223,
"loss/crossentropy": 2.4681971073150635,
"loss/hidden": 0.97265625,
"loss/logits": 0.14905983209609985,
"loss/reg": 5.858425720361993e-05,
"step": 423
},
{
"epoch": 0.053,
"grad_norm": 2.4524407386779785,
"grad_norm_var": 15.670073831964206,
"learning_rate": 0.0001,
"loss": 1.2938,
"loss/crossentropy": 2.4758145809173584,
"loss/hidden": 1.1015625,
"loss/logits": 0.19164547324180603,
"loss/reg": 5.857350697624497e-05,
"step": 424
},
{
"epoch": 0.053125,
"grad_norm": 2.3052892684936523,
"grad_norm_var": 0.7038252417895506,
"learning_rate": 0.0001,
"loss": 1.2565,
"loss/crossentropy": 2.597487211227417,
"loss/hidden": 1.0703125,
"loss/logits": 0.18559187650680542,
"loss/reg": 5.855830750078894e-05,
"step": 425
},
{
"epoch": 0.05325,
"grad_norm": 2.7276995182037354,
"grad_norm_var": 0.7027765205874381,
"learning_rate": 0.0001,
"loss": 1.4141,
"loss/crossentropy": 2.6818253993988037,
"loss/hidden": 1.21875,
"loss/logits": 0.1948131024837494,
"loss/reg": 5.854442133568227e-05,
"step": 426
},
{
"epoch": 0.053375,
"grad_norm": 1.725293517112732,
"grad_norm_var": 0.7537440425638384,
"learning_rate": 0.0001,
"loss": 1.1664,
"loss/crossentropy": 2.4244258403778076,
"loss/hidden": 1.015625,
"loss/logits": 0.1502000093460083,
"loss/reg": 5.85384841542691e-05,
"step": 427
},
{
"epoch": 0.0535,
"grad_norm": 2.6642932891845703,
"grad_norm_var": 0.7527758186064119,
"learning_rate": 0.0001,
"loss": 1.5211,
"loss/crossentropy": 2.1209182739257812,
"loss/hidden": 1.328125,
"loss/logits": 0.192403644323349,
"loss/reg": 5.852692629559897e-05,
"step": 428
},
{
"epoch": 0.053625,
"grad_norm": 2.7787868976593018,
"grad_norm_var": 0.6272740663233074,
"learning_rate": 0.0001,
"loss": 1.3046,
"loss/crossentropy": 2.3020565509796143,
"loss/hidden": 1.125,
"loss/logits": 0.179016575217247,
"loss/reg": 5.851646346854977e-05,
"step": 429
},
{
"epoch": 0.05375,
"grad_norm": 2.891101360321045,
"grad_norm_var": 0.6299498912530666,
"learning_rate": 0.0001,
"loss": 1.4198,
"loss/crossentropy": 2.33249568939209,
"loss/hidden": 1.203125,
"loss/logits": 0.21604114770889282,
"loss/reg": 5.850956222275272e-05,
"step": 430
},
{
"epoch": 0.053875,
"grad_norm": 2.7940289974212646,
"grad_norm_var": 0.608789107013446,
"learning_rate": 0.0001,
"loss": 1.1825,
"loss/crossentropy": 2.6553549766540527,
"loss/hidden": 1.03125,
"loss/logits": 0.15064392983913422,
"loss/reg": 5.8500536397332326e-05,
"step": 431
},
{
"epoch": 0.054,
"grad_norm": 25.06597328186035,
"grad_norm_var": 31.943843646690855,
"learning_rate": 0.0001,
"loss": 2.4055,
"loss/crossentropy": 2.7126245498657227,
"loss/hidden": 2.03125,
"loss/logits": 0.3736712336540222,
"loss/reg": 5.849341687280685e-05,
"step": 432
},
{
"epoch": 0.054125,
"grad_norm": 2.4612748622894287,
"grad_norm_var": 32.003546233579016,
"learning_rate": 0.0001,
"loss": 1.4832,
"loss/crossentropy": 2.6244633197784424,
"loss/hidden": 1.25,
"loss/logits": 0.23266229033470154,
"loss/reg": 5.847978172823787e-05,
"step": 433
},
{
"epoch": 0.05425,
"grad_norm": 2.413149356842041,
"grad_norm_var": 31.926055741483236,
"learning_rate": 0.0001,
"loss": 1.405,
"loss/crossentropy": 2.513383626937866,
"loss/hidden": 1.1953125,
"loss/logits": 0.2091376930475235,
"loss/reg": 5.847239663125947e-05,
"step": 434
},
{
"epoch": 0.054375,
"grad_norm": 2.1266605854034424,
"grad_norm_var": 31.906339652731415,
"learning_rate": 0.0001,
"loss": 1.2307,
"loss/crossentropy": 2.645113706588745,
"loss/hidden": 1.0546875,
"loss/logits": 0.17538747191429138,
"loss/reg": 5.8466725022299215e-05,
"step": 435
},
{
"epoch": 0.0545,
"grad_norm": 2.693485975265503,
"grad_norm_var": 31.906153605922054,
"learning_rate": 0.0001,
"loss": 1.3491,
"loss/crossentropy": 2.5616350173950195,
"loss/hidden": 1.171875,
"loss/logits": 0.1766662299633026,
"loss/reg": 5.845691339345649e-05,
"step": 436
},
{
"epoch": 0.054625,
"grad_norm": 3.594322681427002,
"grad_norm_var": 31.81007436255887,
"learning_rate": 0.0001,
"loss": 1.4456,
"loss/crossentropy": 2.320868492126465,
"loss/hidden": 1.171875,
"loss/logits": 0.2730950713157654,
"loss/reg": 5.845166742801666e-05,
"step": 437
},
{
"epoch": 0.05475,
"grad_norm": 2.725066900253296,
"grad_norm_var": 31.681987454427826,
"learning_rate": 0.0001,
"loss": 1.4368,
"loss/crossentropy": 2.4526007175445557,
"loss/hidden": 1.21875,
"loss/logits": 0.21745863556861877,
"loss/reg": 5.844476982019842e-05,
"step": 438
},
{
"epoch": 0.054875,
"grad_norm": 2.615208625793457,
"grad_norm_var": 31.706966746538818,
"learning_rate": 0.0001,
"loss": 1.2902,
"loss/crossentropy": 2.5873489379882812,
"loss/hidden": 1.109375,
"loss/logits": 0.18027284741401672,
"loss/reg": 5.843998587806709e-05,
"step": 439
},
{
"epoch": 0.055,
"grad_norm": 2.679504632949829,
"grad_norm_var": 31.66327199965654,
"learning_rate": 0.0001,
"loss": 1.4142,
"loss/crossentropy": 2.171384811401367,
"loss/hidden": 1.21875,
"loss/logits": 0.1948787271976471,
"loss/reg": 5.8425270253792405e-05,
"step": 440
},
{
"epoch": 0.055125,
"grad_norm": 2.781118869781494,
"grad_norm_var": 31.56886824166385,
"learning_rate": 0.0001,
"loss": 1.2261,
"loss/crossentropy": 2.616610050201416,
"loss/hidden": 1.0625,
"loss/logits": 0.16300562024116516,
"loss/reg": 5.841004167450592e-05,
"step": 441
},
{
"epoch": 0.05525,
"grad_norm": 2.8343710899353027,
"grad_norm_var": 31.550828531904,
"learning_rate": 0.0001,
"loss": 1.6654,
"loss/crossentropy": 2.254971504211426,
"loss/hidden": 1.390625,
"loss/logits": 0.27416497468948364,
"loss/reg": 5.840086305397563e-05,
"step": 442
},
{
"epoch": 0.055375,
"grad_norm": 2.943516254425049,
"grad_norm_var": 31.26553828771242,
"learning_rate": 0.0001,
"loss": 1.3037,
"loss/crossentropy": 2.607365131378174,
"loss/hidden": 1.140625,
"loss/logits": 0.16250211000442505,
"loss/reg": 5.8392772189108655e-05,
"step": 443
},
{
"epoch": 0.0555,
"grad_norm": 4.3494696617126465,
"grad_norm_var": 31.11395178262311,
"learning_rate": 0.0001,
"loss": 1.4874,
"loss/crossentropy": 2.803809642791748,
"loss/hidden": 1.265625,
"loss/logits": 0.22114460170269012,
"loss/reg": 5.8383415307616815e-05,
"step": 444
},
{
"epoch": 0.055625,
"grad_norm": 2.3149962425231934,
"grad_norm_var": 31.21739595793184,
"learning_rate": 0.0001,
"loss": 1.1723,
"loss/crossentropy": 2.7661781311035156,
"loss/hidden": 1.015625,
"loss/logits": 0.1560768485069275,
"loss/reg": 5.8376208471599966e-05,
"step": 445
},
{
"epoch": 0.05575,
"grad_norm": 2.5312862396240234,
"grad_norm_var": 31.288532129977195,
"learning_rate": 0.0001,
"loss": 1.4583,
"loss/crossentropy": 2.3608808517456055,
"loss/hidden": 1.234375,
"loss/logits": 0.22338923811912537,
"loss/reg": 5.83621695113834e-05,
"step": 446
},
{
"epoch": 0.055875,
"grad_norm": 2.0245697498321533,
"grad_norm_var": 31.468007952235922,
"learning_rate": 0.0001,
"loss": 1.2537,
"loss/crossentropy": 2.6646907329559326,
"loss/hidden": 1.0859375,
"loss/logits": 0.1671399027109146,
"loss/reg": 5.835363481310196e-05,
"step": 447
},
{
"epoch": 0.056,
"grad_norm": 4.180586338043213,
"grad_norm_var": 0.4425575902395887,
"learning_rate": 0.0001,
"loss": 1.4287,
"loss/crossentropy": 2.478865623474121,
"loss/hidden": 1.1796875,
"loss/logits": 0.2484455555677414,
"loss/reg": 5.833926479681395e-05,
"step": 448
},
{
"epoch": 0.056125,
"grad_norm": 2.2291383743286133,
"grad_norm_var": 0.4573160813014281,
"learning_rate": 0.0001,
"loss": 1.2997,
"loss/crossentropy": 2.244389295578003,
"loss/hidden": 1.15625,
"loss/logits": 0.14287710189819336,
"loss/reg": 5.833054456161335e-05,
"step": 449
},
{
"epoch": 0.05625,
"grad_norm": 2.204925060272217,
"grad_norm_var": 0.47117643459253195,
"learning_rate": 0.0001,
"loss": 1.2876,
"loss/crossentropy": 2.3469107151031494,
"loss/hidden": 1.1015625,
"loss/logits": 0.1854255050420761,
"loss/reg": 5.832717943121679e-05,
"step": 450
},
{
"epoch": 0.056375,
"grad_norm": 2.5266880989074707,
"grad_norm_var": 0.4451698073358396,
"learning_rate": 0.0001,
"loss": 1.4392,
"loss/crossentropy": 2.440885305404663,
"loss/hidden": 1.2109375,
"loss/logits": 0.22769977152347565,
"loss/reg": 5.8323836128693074e-05,
"step": 451
},
{
"epoch": 0.0565,
"grad_norm": 2.410515785217285,
"grad_norm_var": 0.455202882380185,
"learning_rate": 0.0001,
"loss": 1.4083,
"loss/crossentropy": 2.4578142166137695,
"loss/hidden": 1.203125,
"loss/logits": 0.20461352169513702,
"loss/reg": 5.830869122291915e-05,
"step": 452
},
{
"epoch": 0.056625,
"grad_norm": 2.0389811992645264,
"grad_norm_var": 0.4435531519851603,
"learning_rate": 0.0001,
"loss": 1.1318,
"loss/crossentropy": 2.139033317565918,
"loss/hidden": 0.9921875,
"loss/logits": 0.1390083134174347,
"loss/reg": 5.829246947541833e-05,
"step": 453
},
{
"epoch": 0.05675,
"grad_norm": 1.979454517364502,
"grad_norm_var": 0.47698744011981165,
"learning_rate": 0.0001,
"loss": 1.3115,
"loss/crossentropy": 2.546844005584717,
"loss/hidden": 1.125,
"loss/logits": 0.18587306141853333,
"loss/reg": 5.8282243116991594e-05,
"step": 454
},
{
"epoch": 0.056875,
"grad_norm": 2.0210747718811035,
"grad_norm_var": 0.5030154373593951,
"learning_rate": 0.0001,
"loss": 1.21,
"loss/crossentropy": 2.6095550060272217,
"loss/hidden": 1.046875,
"loss/logits": 0.16256017982959747,
"loss/reg": 5.8266243286198005e-05,
"step": 455
},
{
"epoch": 0.057,
"grad_norm": 2.0944671630859375,
"grad_norm_var": 0.520400331750174,
"learning_rate": 0.0001,
"loss": 1.1407,
"loss/crossentropy": 2.450681447982788,
"loss/hidden": 0.98828125,
"loss/logits": 0.15184549987316132,
"loss/reg": 5.8250909205526114e-05,
"step": 456
},
{
"epoch": 0.057125,
"grad_norm": 2.5854806900024414,
"grad_norm_var": 0.5178481401493921,
"learning_rate": 0.0001,
"loss": 1.1308,
"loss/crossentropy": 2.8090949058532715,
"loss/hidden": 0.97265625,
"loss/logits": 0.15754011273384094,
"loss/reg": 5.8233421441400424e-05,
"step": 457
},
{
"epoch": 0.05725,
"grad_norm": 6.832178592681885,
"grad_norm_var": 1.6526915128701443,
"learning_rate": 0.0001,
"loss": 1.7544,
"loss/crossentropy": 2.4325008392333984,
"loss/hidden": 1.5625,
"loss/logits": 0.1913643479347229,
"loss/reg": 5.821782906423323e-05,
"step": 458
},
{
"epoch": 0.057375,
"grad_norm": 2.4911727905273438,
"grad_norm_var": 1.6585857165051416,
"learning_rate": 0.0001,
"loss": 1.277,
"loss/crossentropy": 2.5682671070098877,
"loss/hidden": 1.09375,
"loss/logits": 0.18270117044448853,
"loss/reg": 5.820325532113202e-05,
"step": 459
},
{
"epoch": 0.0575,
"grad_norm": 2.2592287063598633,
"grad_norm_var": 1.5000806172221008,
"learning_rate": 0.0001,
"loss": 1.149,
"loss/crossentropy": 2.3300366401672363,
"loss/hidden": 0.98828125,
"loss/logits": 0.16016384959220886,
"loss/reg": 5.8191151765640825e-05,
"step": 460
},
{
"epoch": 0.057625,
"grad_norm": 2.6110737323760986,
"grad_norm_var": 1.4915332961489087,
"learning_rate": 0.0001,
"loss": 1.4344,
"loss/crossentropy": 2.560197591781616,
"loss/hidden": 1.21875,
"loss/logits": 0.21507461369037628,
"loss/reg": 5.817634882987477e-05,
"step": 461
},
{
"epoch": 0.05775,
"grad_norm": 2.6446752548217773,
"grad_norm_var": 1.48995546498276,
"learning_rate": 0.0001,
"loss": 1.2381,
"loss/crossentropy": 2.5068211555480957,
"loss/hidden": 1.0546875,
"loss/logits": 0.1828281581401825,
"loss/reg": 5.816355405841023e-05,
"step": 462
},
{
"epoch": 0.057875,
"grad_norm": 2.498300075531006,
"grad_norm_var": 1.4615785550667995,
"learning_rate": 0.0001,
"loss": 1.3019,
"loss/crossentropy": 2.3765523433685303,
"loss/hidden": 1.1328125,
"loss/logits": 0.16848215460777283,
"loss/reg": 5.814860560349189e-05,
"step": 463
},
{
"epoch": 0.058,
"grad_norm": 2.4674289226531982,
"grad_norm_var": 1.3126372255276026,
"learning_rate": 0.0001,
"loss": 1.3472,
"loss/crossentropy": 2.714657783508301,
"loss/hidden": 1.1640625,
"loss/logits": 0.18256625533103943,
"loss/reg": 5.81321437493898e-05,
"step": 464
},
{
"epoch": 0.058125,
"grad_norm": 3.7482964992523193,
"grad_norm_var": 1.3780257940909062,
"learning_rate": 0.0001,
"loss": 1.4579,
"loss/crossentropy": 2.7645256519317627,
"loss/hidden": 1.2109375,
"loss/logits": 0.24636635184288025,
"loss/reg": 5.811548908241093e-05,
"step": 465
},
{
"epoch": 0.05825,
"grad_norm": 3.1881492137908936,
"grad_norm_var": 1.3717908440858895,
"learning_rate": 0.0001,
"loss": 1.2469,
"loss/crossentropy": 2.6280384063720703,
"loss/hidden": 1.078125,
"loss/logits": 0.16818463802337646,
"loss/reg": 5.8095396525459364e-05,
"step": 466
},
{
"epoch": 0.058375,
"grad_norm": 3.4882731437683105,
"grad_norm_var": 1.3977675144088226,
"learning_rate": 0.0001,
"loss": 1.5403,
"loss/crossentropy": 1.8358429670333862,
"loss/hidden": 1.3203125,
"loss/logits": 0.21941694617271423,
"loss/reg": 5.807522757095285e-05,
"step": 467
},
{
"epoch": 0.0585,
"grad_norm": 2.530682325363159,
"grad_norm_var": 1.391870091660969,
"learning_rate": 0.0001,
"loss": 1.1578,
"loss/crossentropy": 2.3950142860412598,
"loss/hidden": 0.99609375,
"loss/logits": 0.1611400693655014,
"loss/reg": 5.80518099013716e-05,
"step": 468
},
{
"epoch": 0.058625,
"grad_norm": 3.4676575660705566,
"grad_norm_var": 1.366390295617852,
"learning_rate": 0.0001,
"loss": 1.5162,
"loss/crossentropy": 2.851280689239502,
"loss/hidden": 1.234375,
"loss/logits": 0.28122612833976746,
"loss/reg": 5.8030982472701e-05,
"step": 469
},
{
"epoch": 0.05875,
"grad_norm": 2.9446208477020264,
"grad_norm_var": 1.302065384350945,
"learning_rate": 0.0001,
"loss": 1.3015,
"loss/crossentropy": 2.740093469619751,
"loss/hidden": 1.125,
"loss/logits": 0.17590749263763428,
"loss/reg": 5.800585859105922e-05,
"step": 470
},
{
"epoch": 0.058875,
"grad_norm": 2.7597243785858154,
"grad_norm_var": 1.2405377686230998,
"learning_rate": 0.0001,
"loss": 1.1651,
"loss/crossentropy": 2.440762996673584,
"loss/hidden": 1.015625,
"loss/logits": 0.14888577163219452,
"loss/reg": 5.7990357163362205e-05,
"step": 471
},
{
"epoch": 0.059,
"grad_norm": 2.8147523403167725,
"grad_norm_var": 1.182327943249795,
"learning_rate": 0.0001,
"loss": 1.3195,
"loss/crossentropy": 2.5801327228546143,
"loss/hidden": 1.140625,
"loss/logits": 0.17824885249137878,
"loss/reg": 5.7975972595158964e-05,
"step": 472
},
{
"epoch": 0.059125,
"grad_norm": 2.4511027336120605,
"grad_norm_var": 1.1923747545104257,
"learning_rate": 0.0001,
"loss": 1.4217,
"loss/crossentropy": 2.5711913108825684,
"loss/hidden": 1.203125,
"loss/logits": 0.2180328667163849,
"loss/reg": 5.796052937512286e-05,
"step": 473
},
{
"epoch": 0.05925,
"grad_norm": 2.9213221073150635,
"grad_norm_var": 0.1890407192544025,
"learning_rate": 0.0001,
"loss": 1.2735,
"loss/crossentropy": 2.5805675983428955,
"loss/hidden": 1.1015625,
"loss/logits": 0.17132875323295593,
"loss/reg": 5.794024036731571e-05,
"step": 474
},
{
"epoch": 0.059375,
"grad_norm": 2.6587464809417725,
"grad_norm_var": 0.1832162860499608,
"learning_rate": 0.0001,
"loss": 1.6569,
"loss/crossentropy": 2.356299638748169,
"loss/hidden": 1.40625,
"loss/logits": 0.25005391240119934,
"loss/reg": 5.791860894532874e-05,
"step": 475
},
{
"epoch": 0.0595,
"grad_norm": 3.5978729724884033,
"grad_norm_var": 0.19139826910290647,
"learning_rate": 0.0001,
"loss": 1.7357,
"loss/crossentropy": 2.0626883506774902,
"loss/hidden": 1.4765625,
"loss/logits": 0.2585859000682831,
"loss/reg": 5.790415525552817e-05,
"step": 476
},
{
"epoch": 0.059625,
"grad_norm": 2.8491876125335693,
"grad_norm_var": 0.18498974202791843,
"learning_rate": 0.0001,
"loss": 1.5276,
"loss/crossentropy": 2.5583596229553223,
"loss/hidden": 1.2734375,
"loss/logits": 0.25358158349990845,
"loss/reg": 5.788617272628471e-05,
"step": 477
},
{
"epoch": 0.05975,
"grad_norm": 2.5821259021759033,
"grad_norm_var": 0.1876924518839881,
"learning_rate": 0.0001,
"loss": 1.3568,
"loss/crossentropy": 2.486640453338623,
"loss/hidden": 1.1640625,
"loss/logits": 0.19216927886009216,
"loss/reg": 5.786680776509456e-05,
"step": 478
},
{
"epoch": 0.059875,
"grad_norm": 2.877934217453003,
"grad_norm_var": 0.17456917708907038,
"learning_rate": 0.0001,
"loss": 1.5607,
"loss/crossentropy": 2.3836066722869873,
"loss/hidden": 1.3203125,
"loss/logits": 0.23981472849845886,
"loss/reg": 5.785070243291557e-05,
"step": 479
},
{
"epoch": 0.06,
"grad_norm": 2.3281009197235107,
"grad_norm_var": 0.1849188959934999,
"learning_rate": 0.0001,
"loss": 1.2716,
"loss/crossentropy": 2.508988380432129,
"loss/hidden": 1.078125,
"loss/logits": 0.19294525682926178,
"loss/reg": 5.783725646324456e-05,
"step": 480
},
{
"epoch": 0.060125,
"grad_norm": 2.8099567890167236,
"grad_norm_var": 0.14013939438571937,
"learning_rate": 0.0001,
"loss": 1.5081,
"loss/crossentropy": 2.3855881690979004,
"loss/hidden": 1.25,
"loss/logits": 0.2575419545173645,
"loss/reg": 5.782474545412697e-05,
"step": 481
},
{
"epoch": 0.06025,
"grad_norm": 2.9827277660369873,
"grad_norm_var": 0.134662315913679,
"learning_rate": 0.0001,
"loss": 1.4593,
"loss/crossentropy": 2.5487606525421143,
"loss/hidden": 1.25,
"loss/logits": 0.2087090015411377,
"loss/reg": 5.7816720072878525e-05,
"step": 482
},
{
"epoch": 0.060375,
"grad_norm": 2.306149959564209,
"grad_norm_var": 0.1259770764512929,
"learning_rate": 0.0001,
"loss": 1.2076,
"loss/crossentropy": 2.4755747318267822,
"loss/hidden": 1.046875,
"loss/logits": 0.16014963388442993,
"loss/reg": 5.781082290923223e-05,
"step": 483
},
{
"epoch": 0.0605,
"grad_norm": 2.4719114303588867,
"grad_norm_var": 0.12834384378027816,
"learning_rate": 0.0001,
"loss": 1.3745,
"loss/crossentropy": 2.8203346729278564,
"loss/hidden": 1.171875,
"loss/logits": 0.20208273828029633,
"loss/reg": 5.7795077736955136e-05,
"step": 484
},
{
"epoch": 0.060625,
"grad_norm": 2.300952911376953,
"grad_norm_var": 0.10978991346620433,
"learning_rate": 0.0001,
"loss": 1.464,
"loss/crossentropy": 2.610508680343628,
"loss/hidden": 1.2265625,
"loss/logits": 0.2368427813053131,
"loss/reg": 5.778546983492561e-05,
"step": 485
},
{
"epoch": 0.06075,
"grad_norm": 3.3388009071350098,
"grad_norm_var": 0.13085586368501342,
"learning_rate": 0.0001,
"loss": 1.5116,
"loss/crossentropy": 2.763427972793579,
"loss/hidden": 1.296875,
"loss/logits": 0.21419215202331543,
"loss/reg": 5.7770797866396606e-05,
"step": 486
},
{
"epoch": 0.060875,
"grad_norm": 2.102293014526367,
"grad_norm_var": 0.1572983810037916,
"learning_rate": 0.0001,
"loss": 1.1595,
"loss/crossentropy": 2.204011917114258,
"loss/hidden": 1.0,
"loss/logits": 0.158901646733284,
"loss/reg": 5.7755187299335375e-05,
"step": 487
},
{
"epoch": 0.061,
"grad_norm": 2.766934633255005,
"grad_norm_var": 0.15678694409689248,
"learning_rate": 0.0001,
"loss": 1.4246,
"loss/crossentropy": 2.537151575088501,
"loss/hidden": 1.2109375,
"loss/logits": 0.2130882441997528,
"loss/reg": 5.774224700871855e-05,
"step": 488
},
{
"epoch": 0.061125,
"grad_norm": 2.0001540184020996,
"grad_norm_var": 0.18501104247654798,
"learning_rate": 0.0001,
"loss": 1.103,
"loss/crossentropy": 2.3592050075531006,
"loss/hidden": 0.96484375,
"loss/logits": 0.13754940032958984,
"loss/reg": 5.77289865759667e-05,
"step": 489
},
{
"epoch": 0.06125,
"grad_norm": 2.3166351318359375,
"grad_norm_var": 0.18848381138351228,
"learning_rate": 0.0001,
"loss": 1.3329,
"loss/crossentropy": 2.7236411571502686,
"loss/hidden": 1.15625,
"loss/logits": 0.1761033535003662,
"loss/reg": 5.771181167801842e-05,
"step": 490
},
{
"epoch": 0.061375,
"grad_norm": 2.357775926589966,
"grad_norm_var": 0.19351960086170053,
"learning_rate": 0.0001,
"loss": 1.1437,
"loss/crossentropy": 2.866445779800415,
"loss/hidden": 0.98828125,
"loss/logits": 0.15484049916267395,
"loss/reg": 5.769642666564323e-05,
"step": 491
},
{
"epoch": 0.0615,
"grad_norm": 3.680264949798584,
"grad_norm_var": 0.20463866822373877,
"learning_rate": 0.0001,
"loss": 1.2002,
"loss/crossentropy": 3.115431308746338,
"loss/hidden": 1.0390625,
"loss/logits": 0.16054463386535645,
"loss/reg": 5.7679084420669824e-05,
"step": 492
},
{
"epoch": 0.061625,
"grad_norm": 2.3650856018066406,
"grad_norm_var": 0.2051052996774897,
"learning_rate": 0.0001,
"loss": 1.1996,
"loss/crossentropy": 2.6519298553466797,
"loss/hidden": 1.0234375,
"loss/logits": 0.17554257810115814,
"loss/reg": 5.766074173152447e-05,
"step": 493
},
{
"epoch": 0.06175,
"grad_norm": 2.7080323696136475,
"grad_norm_var": 0.2058088113620099,
"learning_rate": 0.0001,
"loss": 1.365,
"loss/crossentropy": 2.329538106918335,
"loss/hidden": 1.15625,
"loss/logits": 0.20815491676330566,
"loss/reg": 5.764625166193582e-05,
"step": 494
},
{
"epoch": 0.061875,
"grad_norm": 2.2859530448913574,
"grad_norm_var": 0.2063347958167308,
"learning_rate": 0.0001,
"loss": 1.2994,
"loss/crossentropy": 2.6445348262786865,
"loss/hidden": 1.125,
"loss/logits": 0.1738019585609436,
"loss/reg": 5.763155422755517e-05,
"step": 495
},
{
"epoch": 0.062,
"grad_norm": 2.771320343017578,
"grad_norm_var": 0.20431087500909348,
"learning_rate": 0.0001,
"loss": 1.4714,
"loss/crossentropy": 2.340728282928467,
"loss/hidden": 1.2578125,
"loss/logits": 0.21303007006645203,
"loss/reg": 5.761897409684025e-05,
"step": 496
},
{
"epoch": 0.062125,
"grad_norm": 3.022183656692505,
"grad_norm_var": 0.21312900983479016,
"learning_rate": 0.0001,
"loss": 1.4858,
"loss/crossentropy": 2.6772336959838867,
"loss/hidden": 1.265625,
"loss/logits": 0.2196260541677475,
"loss/reg": 5.761081411037594e-05,
"step": 497
},
{
"epoch": 0.06225,
"grad_norm": 13.948429107666016,
"grad_norm_var": 8.27193520122967,
"learning_rate": 0.0001,
"loss": 1.3633,
"loss/crossentropy": 2.862323760986328,
"loss/hidden": 1.171875,
"loss/logits": 0.19083081185817719,
"loss/reg": 5.7596374972490594e-05,
"step": 498
},
{
"epoch": 0.062375,
"grad_norm": 2.6107678413391113,
"grad_norm_var": 8.237513777759569,
"learning_rate": 0.0001,
"loss": 1.6771,
"loss/crossentropy": 2.1725099086761475,
"loss/hidden": 1.40625,
"loss/logits": 0.2702314555644989,
"loss/reg": 5.7586628827266395e-05,
"step": 499
},
{
"epoch": 0.0625,
"grad_norm": 2.5658040046691895,
"grad_norm_var": 8.22750426778598,
"learning_rate": 0.0001,
"loss": 1.4381,
"loss/crossentropy": 2.246595859527588,
"loss/hidden": 1.25,
"loss/logits": 0.18755751848220825,
"loss/reg": 5.756897371611558e-05,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2202930782208e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}