qwen4b-4k / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
43d37f8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 2000,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 2128.0,
"learning_rate": 1.9e-05,
"loss": 69.9557,
"loss/crossentropy": 12.354743599891663,
"loss/hidden": 18.71875,
"loss/jsd": 5.161534905433655,
"loss/logits": 0.0,
"step": 10
},
{
"epoch": 0.001,
"grad_norm": 266.0,
"grad_norm_var": 15343106.783333333,
"learning_rate": 2.8000000000000003e-05,
"loss": 52.9613,
"loss/crossentropy": 9.517439389228821,
"loss/hidden": 18.68125,
"loss/jsd": 3.39926393032074,
"loss/logits": 0.0,
"step": 20
},
{
"epoch": 0.0015,
"grad_norm": 186.0,
"grad_norm_var": 174925.440625,
"learning_rate": 3.7e-05,
"loss": 48.1973,
"loss/crossentropy": 8.46514676809311,
"loss/hidden": 18.065625,
"loss/jsd": 2.9373991966247557,
"loss/logits": 0.0,
"step": 30
},
{
"epoch": 0.002,
"grad_norm": 532.0,
"grad_norm_var": 39180.229166666664,
"learning_rate": 4.600000000000001e-05,
"loss": 45.9066,
"loss/crossentropy": 8.040922927856446,
"loss/hidden": 17.096875,
"loss/jsd": 2.860607051849365,
"loss/logits": 0.0,
"step": 40
},
{
"epoch": 0.0025,
"grad_norm": 334.0,
"grad_norm_var": 38047.8,
"learning_rate": 5.500000000000001e-05,
"loss": 41.576,
"loss/crossentropy": 7.545825862884522,
"loss/hidden": 16.803125,
"loss/jsd": 2.476469251513481,
"loss/logits": 0.0,
"step": 50
},
{
"epoch": 0.003,
"grad_norm": 238.0,
"grad_norm_var": 76798.25,
"learning_rate": 6.400000000000001e-05,
"loss": 36.7377,
"loss/crossentropy": 6.656394875049591,
"loss/hidden": 15.9859375,
"loss/jsd": 2.101923054456711,
"loss/logits": 0.0,
"step": 60
},
{
"epoch": 0.0035,
"grad_norm": 221.0,
"grad_norm_var": 72765.58333333333,
"learning_rate": 7.3e-05,
"loss": 28.2567,
"loss/crossentropy": 5.261470526456833,
"loss/hidden": 13.6265625,
"loss/jsd": 1.4439617365598678,
"loss/logits": 0.0,
"step": 70
},
{
"epoch": 0.004,
"grad_norm": 185.0,
"grad_norm_var": 16524.266666666666,
"learning_rate": 8.200000000000001e-05,
"loss": 19.3251,
"loss/crossentropy": 4.03509070277214,
"loss/hidden": 11.1984375,
"loss/jsd": 0.8947193071246147,
"loss/logits": 0.0,
"step": 80
},
{
"epoch": 0.0045,
"grad_norm": 107.5,
"grad_norm_var": 1300.190625,
"learning_rate": 9.1e-05,
"loss": 14.15,
"loss/crossentropy": 3.2564123183488847,
"loss/hidden": 9.371875,
"loss/jsd": 0.4821927219629288,
"loss/logits": 0.0,
"step": 90
},
{
"epoch": 0.005,
"grad_norm": 113.0,
"grad_norm_var": 747.6072916666667,
"learning_rate": 0.0001,
"loss": 12.3004,
"loss/crossentropy": 2.9699372231960295,
"loss/hidden": 8.3890625,
"loss/jsd": 0.37094187960028646,
"loss/logits": 0.0,
"step": 100
},
{
"epoch": 0.0055,
"grad_norm": 143.0,
"grad_norm_var": 1498.7333333333333,
"learning_rate": 0.0001,
"loss": 11.0558,
"loss/crossentropy": 3.028834396600723,
"loss/hidden": 7.6984375,
"loss/jsd": 0.32162978053092955,
"loss/logits": 0.0,
"step": 110
},
{
"epoch": 0.006,
"grad_norm": 141.0,
"grad_norm_var": 384.065625,
"learning_rate": 0.0001,
"loss": 10.3695,
"loss/crossentropy": 2.8776101738214495,
"loss/hidden": 7.54375,
"loss/jsd": 0.31908423118293283,
"loss/logits": 0.0,
"step": 120
},
{
"epoch": 0.0065,
"grad_norm": 126.5,
"grad_norm_var": 376.53229166666665,
"learning_rate": 0.0001,
"loss": 9.7913,
"loss/crossentropy": 2.742277052998543,
"loss/hidden": 7.16328125,
"loss/jsd": 0.2711725488305092,
"loss/logits": 0.0,
"step": 130
},
{
"epoch": 0.007,
"grad_norm": 129.0,
"grad_norm_var": 266.0,
"learning_rate": 0.0001,
"loss": 9.524,
"loss/crossentropy": 2.4384234696626663,
"loss/hidden": 6.9765625,
"loss/jsd": 0.2616196651011705,
"loss/logits": 0.0,
"step": 140
},
{
"epoch": 0.0075,
"grad_norm": 100.5,
"grad_norm_var": 409.03229166666665,
"learning_rate": 0.0001,
"loss": 9.1046,
"loss/crossentropy": 2.8043846026062966,
"loss/hidden": 6.6234375,
"loss/jsd": 0.2574016904458404,
"loss/logits": 0.0,
"step": 150
},
{
"epoch": 0.008,
"grad_norm": 78.0,
"grad_norm_var": 385.70729166666666,
"learning_rate": 0.0001,
"loss": 8.961,
"loss/crossentropy": 2.6820163667201995,
"loss/hidden": 6.44609375,
"loss/jsd": 0.22497861441224815,
"loss/logits": 0.0,
"step": 160
},
{
"epoch": 0.0085,
"grad_norm": 73.0,
"grad_norm_var": 180.540625,
"learning_rate": 0.0001,
"loss": 8.6716,
"loss/crossentropy": 2.56088288128376,
"loss/hidden": 6.525,
"loss/jsd": 0.23445787131786347,
"loss/logits": 0.0,
"step": 170
},
{
"epoch": 0.009,
"grad_norm": 70.5,
"grad_norm_var": 66.43333333333334,
"learning_rate": 0.0001,
"loss": 8.3449,
"loss/crossentropy": 2.5659249514341353,
"loss/hidden": 6.0875,
"loss/jsd": 0.20521375369280576,
"loss/logits": 0.0,
"step": 180
},
{
"epoch": 0.0095,
"grad_norm": 59.0,
"grad_norm_var": 125.99895833333333,
"learning_rate": 0.0001,
"loss": 8.2048,
"loss/crossentropy": 2.4801410123705865,
"loss/hidden": 5.96875,
"loss/jsd": 0.20023126248270273,
"loss/logits": 0.0,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 95.5,
"grad_norm_var": 151.215625,
"learning_rate": 0.0001,
"loss": 7.9327,
"loss/crossentropy": 2.7575797021389006,
"loss/hidden": 5.9078125,
"loss/jsd": 0.21125836484134197,
"loss/logits": 0.0,
"step": 200
},
{
"epoch": 0.0105,
"grad_norm": 69.5,
"grad_norm_var": 81.01666666666667,
"learning_rate": 0.0001,
"loss": 7.867,
"loss/crossentropy": 2.584353247284889,
"loss/hidden": 5.79140625,
"loss/jsd": 0.18391123060137032,
"loss/logits": 0.0,
"step": 210
},
{
"epoch": 0.011,
"grad_norm": 67.0,
"grad_norm_var": 111.115625,
"learning_rate": 0.0001,
"loss": 7.5262,
"loss/crossentropy": 2.5395505383610724,
"loss/hidden": 5.68125,
"loss/jsd": 0.17292506210505962,
"loss/logits": 0.0,
"step": 220
},
{
"epoch": 0.0115,
"grad_norm": 80.0,
"grad_norm_var": 114.725,
"learning_rate": 0.0001,
"loss": 7.553,
"loss/crossentropy": 2.469125708937645,
"loss/hidden": 5.62890625,
"loss/jsd": 0.1715977793559432,
"loss/logits": 0.0,
"step": 230
},
{
"epoch": 0.012,
"grad_norm": 59.75,
"grad_norm_var": 93.15,
"learning_rate": 0.0001,
"loss": 7.3673,
"loss/crossentropy": 2.572914382815361,
"loss/hidden": 5.51171875,
"loss/jsd": 0.18267902322113513,
"loss/logits": 0.0,
"step": 240
},
{
"epoch": 0.0125,
"grad_norm": 53.5,
"grad_norm_var": 99.42395833333333,
"learning_rate": 0.0001,
"loss": 7.3184,
"loss/crossentropy": 2.6171721309423446,
"loss/hidden": 5.53515625,
"loss/jsd": 0.17945121377706527,
"loss/logits": 0.0,
"step": 250
},
{
"epoch": 0.013,
"grad_norm": 48.75,
"grad_norm_var": 118.975,
"learning_rate": 0.0001,
"loss": 7.2085,
"loss/crossentropy": 2.4379070818424227,
"loss/hidden": 5.509375,
"loss/jsd": 0.1914055148139596,
"loss/logits": 0.0,
"step": 260
},
{
"epoch": 0.0135,
"grad_norm": 67.5,
"grad_norm_var": 102.2875,
"learning_rate": 0.0001,
"loss": 7.0206,
"loss/crossentropy": 2.5107616782188416,
"loss/hidden": 5.4359375,
"loss/jsd": 0.19947240259498358,
"loss/logits": 0.0,
"step": 270
},
{
"epoch": 0.014,
"grad_norm": 67.5,
"grad_norm_var": 136.27057291666668,
"learning_rate": 0.0001,
"loss": 6.9768,
"loss/crossentropy": 2.4130793780088426,
"loss/hidden": 5.28359375,
"loss/jsd": 0.18424466587603092,
"loss/logits": 0.0,
"step": 280
},
{
"epoch": 0.0145,
"grad_norm": 62.0,
"grad_norm_var": 98.04895833333333,
"learning_rate": 0.0001,
"loss": 6.8743,
"loss/crossentropy": 2.382996806502342,
"loss/hidden": 5.20625,
"loss/jsd": 0.1648038787767291,
"loss/logits": 0.0,
"step": 290
},
{
"epoch": 0.015,
"grad_norm": 57.75,
"grad_norm_var": 81.59895833333333,
"learning_rate": 0.0001,
"loss": 6.7946,
"loss/crossentropy": 2.5844862312078476,
"loss/hidden": 5.22265625,
"loss/jsd": 0.1599080903455615,
"loss/logits": 0.0,
"step": 300
},
{
"epoch": 0.0155,
"grad_norm": 77.0,
"grad_norm_var": 103.13229166666666,
"learning_rate": 0.0001,
"loss": 6.7739,
"loss/crossentropy": 2.4337188243865966,
"loss/hidden": 4.98046875,
"loss/jsd": 0.14282424729317428,
"loss/logits": 0.0,
"step": 310
},
{
"epoch": 0.016,
"grad_norm": 51.75,
"grad_norm_var": 154.47265625,
"learning_rate": 0.0001,
"loss": 6.6113,
"loss/crossentropy": 2.516378104686737,
"loss/hidden": 5.03828125,
"loss/jsd": 0.1448629444465041,
"loss/logits": 0.0,
"step": 320
},
{
"epoch": 0.0165,
"grad_norm": 47.0,
"grad_norm_var": 43.2625,
"learning_rate": 0.0001,
"loss": 6.4669,
"loss/crossentropy": 2.5109775930643083,
"loss/hidden": 4.9265625,
"loss/jsd": 0.14978713616728784,
"loss/logits": 0.0,
"step": 330
},
{
"epoch": 0.017,
"grad_norm": 57.75,
"grad_norm_var": 64.21087239583333,
"learning_rate": 0.0001,
"loss": 6.4713,
"loss/crossentropy": 2.497659134864807,
"loss/hidden": 4.91796875,
"loss/jsd": 0.14760203529149293,
"loss/logits": 0.0,
"step": 340
},
{
"epoch": 0.0175,
"grad_norm": 52.5,
"grad_norm_var": 120.26243489583334,
"learning_rate": 0.0001,
"loss": 6.4978,
"loss/crossentropy": 2.402846799790859,
"loss/hidden": 4.7796875,
"loss/jsd": 0.13832223881036043,
"loss/logits": 0.0,
"step": 350
},
{
"epoch": 0.018,
"grad_norm": 46.5,
"grad_norm_var": 21.149739583333332,
"learning_rate": 0.0001,
"loss": 6.3607,
"loss/crossentropy": 2.3924304962158205,
"loss/hidden": 4.9890625,
"loss/jsd": 0.1568290738388896,
"loss/logits": 0.0,
"step": 360
},
{
"epoch": 0.0185,
"grad_norm": 44.75,
"grad_norm_var": 49.326822916666664,
"learning_rate": 0.0001,
"loss": 6.3592,
"loss/crossentropy": 2.4209784388542177,
"loss/hidden": 4.89765625,
"loss/jsd": 0.14134480394423007,
"loss/logits": 0.0,
"step": 370
},
{
"epoch": 0.019,
"grad_norm": 43.75,
"grad_norm_var": 71.22057291666667,
"learning_rate": 0.0001,
"loss": 6.2124,
"loss/crossentropy": 2.549247406423092,
"loss/hidden": 4.7390625,
"loss/jsd": 0.140831589885056,
"loss/logits": 0.0,
"step": 380
},
{
"epoch": 0.0195,
"grad_norm": 92.5,
"grad_norm_var": 9.065077296740351e+17,
"learning_rate": 0.0001,
"loss": 6.2864,
"loss/crossentropy": 2.4922619298100472,
"loss/hidden": 4.87734375,
"loss/jsd": 0.1634673684835434,
"loss/logits": 0.0,
"step": 390
},
{
"epoch": 0.02,
"grad_norm": 43.75,
"grad_norm_var": 9.065077288409414e+17,
"learning_rate": 0.0001,
"loss": 6.2254,
"loss/crossentropy": 2.469875320792198,
"loss/hidden": 4.82265625,
"loss/jsd": 0.1564602382481098,
"loss/logits": 0.0,
"step": 400
},
{
"epoch": 0.0205,
"grad_norm": 56.0,
"grad_norm_var": 47.498958333333334,
"learning_rate": 0.0001,
"loss": 6.1795,
"loss/crossentropy": 2.548477476835251,
"loss/hidden": 4.75703125,
"loss/jsd": 0.17199970744550228,
"loss/logits": 0.0,
"step": 410
},
{
"epoch": 0.021,
"grad_norm": 54.75,
"grad_norm_var": 720.9768229166667,
"learning_rate": 0.0001,
"loss": 6.252,
"loss/crossentropy": 2.479714798927307,
"loss/hidden": 4.68828125,
"loss/jsd": 0.1501935562118888,
"loss/logits": 0.0,
"step": 420
},
{
"epoch": 0.0215,
"grad_norm": 47.25,
"grad_norm_var": 723.6166666666667,
"learning_rate": 0.0001,
"loss": 6.1002,
"loss/crossentropy": 2.529230397939682,
"loss/hidden": 4.7921875,
"loss/jsd": 0.15877617206424474,
"loss/logits": 0.0,
"step": 430
},
{
"epoch": 0.022,
"grad_norm": 53.0,
"grad_norm_var": 1.207597994464615e+18,
"learning_rate": 0.0001,
"loss": 6.0501,
"loss/crossentropy": 2.2404126971960068,
"loss/hidden": 4.54140625,
"loss/jsd": 0.1322522010654211,
"loss/logits": 0.0,
"step": 440
},
{
"epoch": 0.0225,
"grad_norm": 3875536896.0,
"grad_norm_var": 2.004372710541947e+18,
"learning_rate": 0.0001,
"loss": 6.1466,
"loss/crossentropy": 2.430220237374306,
"loss/hidden": 4.62109375,
"loss/jsd": 0.14306345414370297,
"loss/logits": 0.0,
"step": 450
},
{
"epoch": 0.023,
"grad_norm": 43.0,
"grad_norm_var": 9.387366184428504e+17,
"learning_rate": 0.0001,
"loss": 6.0412,
"loss/crossentropy": 2.345375160872936,
"loss/hidden": 4.620703125,
"loss/jsd": 0.1385633122175932,
"loss/logits": 0.0,
"step": 460
},
{
"epoch": 0.0235,
"grad_norm": 42.75,
"grad_norm_var": 21.448958333333334,
"learning_rate": 0.0001,
"loss": 5.9336,
"loss/crossentropy": 2.425405339896679,
"loss/hidden": 4.60546875,
"loss/jsd": 0.13772829296067357,
"loss/logits": 0.0,
"step": 470
},
{
"epoch": 0.024,
"grad_norm": 39.75,
"grad_norm_var": 10.595572916666667,
"learning_rate": 0.0001,
"loss": 5.9238,
"loss/crossentropy": 2.1817762181162834,
"loss/hidden": 4.540234375,
"loss/jsd": 0.12882032115012407,
"loss/logits": 0.0,
"step": 480
},
{
"epoch": 0.0245,
"grad_norm": 33.75,
"grad_norm_var": 21.001822916666665,
"learning_rate": 0.0001,
"loss": 6.0109,
"loss/crossentropy": 2.3736354261636734,
"loss/hidden": 4.64140625,
"loss/jsd": 0.1405269218608737,
"loss/logits": 0.0,
"step": 490
},
{
"epoch": 0.025,
"grad_norm": 41.25,
"grad_norm_var": 220.015625,
"learning_rate": 0.0001,
"loss": 5.9307,
"loss/crossentropy": 2.5042927861213684,
"loss/hidden": 4.7546875,
"loss/jsd": 0.18516455199569465,
"loss/logits": 0.0,
"step": 500
},
{
"epoch": 0.0255,
"grad_norm": 41.0,
"grad_norm_var": 16.257291666666667,
"learning_rate": 0.0001,
"loss": 5.9019,
"loss/crossentropy": 2.526998797059059,
"loss/hidden": 4.47109375,
"loss/jsd": 0.13217656817287207,
"loss/logits": 0.0,
"step": 510
},
{
"epoch": 0.026,
"grad_norm": 32.25,
"grad_norm_var": 16.782291666666666,
"learning_rate": 0.0001,
"loss": 5.8327,
"loss/crossentropy": 2.316130298376083,
"loss/hidden": 4.387109375,
"loss/jsd": 0.12394356895238161,
"loss/logits": 0.0,
"step": 520
},
{
"epoch": 0.0265,
"grad_norm": 39.0,
"grad_norm_var": 24.970833333333335,
"learning_rate": 0.0001,
"loss": 5.8284,
"loss/crossentropy": 2.214504113793373,
"loss/hidden": 4.623046875,
"loss/jsd": 0.15524424342438578,
"loss/logits": 0.0,
"step": 530
},
{
"epoch": 0.027,
"grad_norm": 35.75,
"grad_norm_var": 11.79375,
"learning_rate": 0.0001,
"loss": 5.7037,
"loss/crossentropy": 2.336098350584507,
"loss/hidden": 4.33515625,
"loss/jsd": 0.12178284991532565,
"loss/logits": 0.0,
"step": 540
},
{
"epoch": 0.0275,
"grad_norm": 38.0,
"grad_norm_var": 13.470768229166667,
"learning_rate": 0.0001,
"loss": 5.7146,
"loss/crossentropy": 2.4750932276248934,
"loss/hidden": 4.41953125,
"loss/jsd": 0.12415571566671132,
"loss/logits": 0.0,
"step": 550
},
{
"epoch": 0.028,
"grad_norm": 37.0,
"grad_norm_var": 15.298958333333333,
"learning_rate": 0.0001,
"loss": 5.6597,
"loss/crossentropy": 2.360400839149952,
"loss/hidden": 4.45546875,
"loss/jsd": 0.1331789677962661,
"loss/logits": 0.0,
"step": 560
},
{
"epoch": 0.0285,
"grad_norm": 26.75,
"grad_norm_var": 108.82057291666666,
"learning_rate": 0.0001,
"loss": 5.6905,
"loss/crossentropy": 2.547207270562649,
"loss/hidden": 4.413671875,
"loss/jsd": 0.13257503397762777,
"loss/logits": 0.0,
"step": 570
},
{
"epoch": 0.029,
"grad_norm": 38.25,
"grad_norm_var": 82.65149739583333,
"learning_rate": 0.0001,
"loss": 5.707,
"loss/crossentropy": 2.4661644257605078,
"loss/hidden": 4.43046875,
"loss/jsd": 0.13218661015853286,
"loss/logits": 0.0,
"step": 580
},
{
"epoch": 0.0295,
"grad_norm": 33.0,
"grad_norm_var": 14.656705729166667,
"learning_rate": 0.0001,
"loss": 5.6198,
"loss/crossentropy": 2.3429581418633463,
"loss/hidden": 4.35390625,
"loss/jsd": 0.1255058040842414,
"loss/logits": 0.0,
"step": 590
},
{
"epoch": 0.03,
"grad_norm": 30.5,
"grad_norm_var": 16.014518229166665,
"learning_rate": 0.0001,
"loss": 5.5969,
"loss/crossentropy": 2.6043634325265885,
"loss/hidden": 4.3796875,
"loss/jsd": 0.1311176208779216,
"loss/logits": 0.0,
"step": 600
},
{
"epoch": 0.0305,
"grad_norm": 33.0,
"grad_norm_var": 10.665625,
"learning_rate": 0.0001,
"loss": 5.5352,
"loss/crossentropy": 2.4040530994534492,
"loss/hidden": 4.219140625,
"loss/jsd": 0.12296068714931607,
"loss/logits": 0.0,
"step": 610
},
{
"epoch": 0.031,
"grad_norm": 38.75,
"grad_norm_var": 16.33125,
"learning_rate": 0.0001,
"loss": 5.4814,
"loss/crossentropy": 2.390893703699112,
"loss/hidden": 4.291796875,
"loss/jsd": 0.11912889536470175,
"loss/logits": 0.0,
"step": 620
},
{
"epoch": 0.0315,
"grad_norm": 34.75,
"grad_norm_var": 19.909375,
"learning_rate": 0.0001,
"loss": 5.5724,
"loss/crossentropy": 2.5551778227090836,
"loss/hidden": 4.251171875,
"loss/jsd": 0.134556083381176,
"loss/logits": 0.0,
"step": 630
},
{
"epoch": 0.032,
"grad_norm": 33.0,
"grad_norm_var": 1.2447526950627446e+18,
"learning_rate": 0.0001,
"loss": 5.6162,
"loss/crossentropy": 2.4906763210892677,
"loss/hidden": 4.20234375,
"loss/jsd": 0.1178798858076334,
"loss/logits": 0.0,
"step": 640
},
{
"epoch": 0.0325,
"grad_norm": 29.875,
"grad_norm_var": 1.2447526957786422e+18,
"learning_rate": 0.0001,
"loss": 5.5184,
"loss/crossentropy": 2.437400442361832,
"loss/hidden": 4.23046875,
"loss/jsd": 0.12956738714128732,
"loss/logits": 0.0,
"step": 650
},
{
"epoch": 0.033,
"grad_norm": 33.0,
"grad_norm_var": 27.3134765625,
"learning_rate": 0.0001,
"loss": 5.6369,
"loss/crossentropy": 2.4849177479743956,
"loss/hidden": 4.262109375,
"loss/jsd": 0.12098300596699119,
"loss/logits": 0.0,
"step": 660
},
{
"epoch": 0.0335,
"grad_norm": 28.5,
"grad_norm_var": 17.055989583333332,
"learning_rate": 0.0001,
"loss": 5.4991,
"loss/crossentropy": 2.4364880681037904,
"loss/hidden": 4.26171875,
"loss/jsd": 0.12965436410158873,
"loss/logits": 0.0,
"step": 670
},
{
"epoch": 0.034,
"grad_norm": 28.375,
"grad_norm_var": 19.137955729166666,
"learning_rate": 0.0001,
"loss": 5.5161,
"loss/crossentropy": 2.392630486190319,
"loss/hidden": 4.173828125,
"loss/jsd": 0.11459105852991343,
"loss/logits": 0.0,
"step": 680
},
{
"epoch": 0.0345,
"grad_norm": 27.25,
"grad_norm_var": 13.9322265625,
"learning_rate": 0.0001,
"loss": 5.4332,
"loss/crossentropy": 2.344803684949875,
"loss/hidden": 4.176953125,
"loss/jsd": 0.11463690986856818,
"loss/logits": 0.0,
"step": 690
},
{
"epoch": 0.035,
"grad_norm": 34.75,
"grad_norm_var": 15.854622395833333,
"learning_rate": 0.0001,
"loss": 5.5003,
"loss/crossentropy": 2.395221236348152,
"loss/hidden": 4.260546875,
"loss/jsd": 0.1217193447984755,
"loss/logits": 0.0,
"step": 700
},
{
"epoch": 0.0355,
"grad_norm": 25.25,
"grad_norm_var": 14.663541666666667,
"learning_rate": 0.0001,
"loss": 5.4171,
"loss/crossentropy": 2.4193977400660516,
"loss/hidden": 4.23828125,
"loss/jsd": 0.12150606149807572,
"loss/logits": 0.0,
"step": 710
},
{
"epoch": 0.036,
"grad_norm": 26.875,
"grad_norm_var": 13.142643229166667,
"learning_rate": 0.0001,
"loss": 5.3761,
"loss/crossentropy": 2.2133478626608847,
"loss/hidden": 4.141796875,
"loss/jsd": 0.11149341901764273,
"loss/logits": 0.0,
"step": 720
},
{
"epoch": 0.0365,
"grad_norm": 34.25,
"grad_norm_var": 14.420572916666666,
"learning_rate": 0.0001,
"loss": 5.3258,
"loss/crossentropy": 2.3443893820047377,
"loss/hidden": 4.090234375,
"loss/jsd": 0.11677124733105301,
"loss/logits": 0.0,
"step": 730
},
{
"epoch": 0.037,
"grad_norm": 31.25,
"grad_norm_var": 9.551822916666667,
"learning_rate": 0.0001,
"loss": 5.3054,
"loss/crossentropy": 2.3357947677373887,
"loss/hidden": 4.194140625,
"loss/jsd": 0.12083362191915512,
"loss/logits": 0.0,
"step": 740
},
{
"epoch": 0.0375,
"grad_norm": 32.25,
"grad_norm_var": 9.950455729166666,
"learning_rate": 0.0001,
"loss": 5.2645,
"loss/crossentropy": 2.4039614737033843,
"loss/hidden": 4.08671875,
"loss/jsd": 0.1069810570217669,
"loss/logits": 0.0,
"step": 750
},
{
"epoch": 0.038,
"grad_norm": 24.0,
"grad_norm_var": 1.1710062557908698e+18,
"learning_rate": 0.0001,
"loss": 5.3587,
"loss/crossentropy": 2.4738259255886077,
"loss/hidden": 4.209765625,
"loss/jsd": 0.13927901685237884,
"loss/logits": 0.0,
"step": 760
},
{
"epoch": 0.0385,
"grad_norm": 29.125,
"grad_norm_var": 1.1710062386255852e+18,
"learning_rate": 0.0001,
"loss": 5.3753,
"loss/crossentropy": 2.2876866430044176,
"loss/hidden": 4.1421875,
"loss/jsd": 0.11211317665874958,
"loss/logits": 0.0,
"step": 770
},
{
"epoch": 0.039,
"grad_norm": 27.875,
"grad_norm_var": 485.3025390625,
"learning_rate": 0.0001,
"loss": 5.2875,
"loss/crossentropy": 2.3789359077811243,
"loss/hidden": 4.13828125,
"loss/jsd": 0.11359207816421986,
"loss/logits": 0.0,
"step": 780
},
{
"epoch": 0.0395,
"grad_norm": 21.875,
"grad_norm_var": 19.980208333333334,
"learning_rate": 0.0001,
"loss": 5.2659,
"loss/crossentropy": 2.4840095818042753,
"loss/hidden": 4.076953125,
"loss/jsd": 0.1078010268509388,
"loss/logits": 0.0,
"step": 790
},
{
"epoch": 0.04,
"grad_norm": 32.75,
"grad_norm_var": 21.772330729166665,
"learning_rate": 0.0001,
"loss": 5.3525,
"loss/crossentropy": 2.2179358512163163,
"loss/hidden": 4.16796875,
"loss/jsd": 0.11819018721580506,
"loss/logits": 0.0,
"step": 800
},
{
"epoch": 0.0405,
"grad_norm": 27.0,
"grad_norm_var": 22.1884765625,
"learning_rate": 0.0001,
"loss": 5.3043,
"loss/crossentropy": 2.4508845895528792,
"loss/hidden": 4.133203125,
"loss/jsd": 0.11473200833424926,
"loss/logits": 0.0,
"step": 810
},
{
"epoch": 0.041,
"grad_norm": 28.625,
"grad_norm_var": 62.53515625,
"learning_rate": 0.0001,
"loss": 5.2633,
"loss/crossentropy": 2.5463142573833464,
"loss/hidden": 4.076171875,
"loss/jsd": 0.12316551432013512,
"loss/logits": 0.0,
"step": 820
},
{
"epoch": 0.0415,
"grad_norm": 26.625,
"grad_norm_var": 29.2150390625,
"learning_rate": 0.0001,
"loss": 5.2498,
"loss/crossentropy": 2.379361332952976,
"loss/hidden": 4.125,
"loss/jsd": 0.11994905360043048,
"loss/logits": 0.0,
"step": 830
},
{
"epoch": 0.042,
"grad_norm": 27.75,
"grad_norm_var": 37.1197265625,
"learning_rate": 0.0001,
"loss": 5.25,
"loss/crossentropy": 2.448214793205261,
"loss/hidden": 4.233203125,
"loss/jsd": 0.13263647919520735,
"loss/logits": 0.0,
"step": 840
},
{
"epoch": 0.0425,
"grad_norm": 26.25,
"grad_norm_var": 13.433072916666667,
"learning_rate": 0.0001,
"loss": 5.1491,
"loss/crossentropy": 2.4302526518702505,
"loss/hidden": 4.12578125,
"loss/jsd": 0.11334973787888884,
"loss/logits": 0.0,
"step": 850
},
{
"epoch": 0.043,
"grad_norm": 23.75,
"grad_norm_var": 7.470572916666667,
"learning_rate": 0.0001,
"loss": 5.1671,
"loss/crossentropy": 2.415765553712845,
"loss/hidden": 4.11328125,
"loss/jsd": 0.11990332859568298,
"loss/logits": 0.0,
"step": 860
},
{
"epoch": 0.0435,
"grad_norm": 25.5,
"grad_norm_var": 6.077083333333333,
"learning_rate": 0.0001,
"loss": 5.1279,
"loss/crossentropy": 2.3868868976831434,
"loss/hidden": 4.0671875,
"loss/jsd": 0.11438164403662085,
"loss/logits": 0.0,
"step": 870
},
{
"epoch": 0.044,
"grad_norm": 25.0,
"grad_norm_var": 48.50416666666667,
"learning_rate": 0.0001,
"loss": 5.18,
"loss/crossentropy": 2.367817610502243,
"loss/hidden": 4.136328125,
"loss/jsd": 0.12616985198110342,
"loss/logits": 0.0,
"step": 880
},
{
"epoch": 0.0445,
"grad_norm": 23.625,
"grad_norm_var": 52.3375,
"learning_rate": 0.0001,
"loss": 5.1786,
"loss/crossentropy": 2.4342163532972334,
"loss/hidden": 4.0125,
"loss/jsd": 0.12039547078311444,
"loss/logits": 0.0,
"step": 890
},
{
"epoch": 0.045,
"grad_norm": 28.125,
"grad_norm_var": 6.708268229166666,
"learning_rate": 0.0001,
"loss": 5.1451,
"loss/crossentropy": 2.4633941307663916,
"loss/hidden": 4.08125,
"loss/jsd": 0.11877955347299576,
"loss/logits": 0.0,
"step": 900
},
{
"epoch": 0.0455,
"grad_norm": 28.5,
"grad_norm_var": 8.4603515625,
"learning_rate": 0.0001,
"loss": 5.1919,
"loss/crossentropy": 2.3779468327760696,
"loss/hidden": 4.058984375,
"loss/jsd": 0.11588607728481293,
"loss/logits": 0.0,
"step": 910
},
{
"epoch": 0.046,
"grad_norm": 38.25,
"grad_norm_var": 59.06295572916667,
"learning_rate": 0.0001,
"loss": 5.2033,
"loss/crossentropy": 2.4956902295351027,
"loss/hidden": 4.107421875,
"loss/jsd": 0.11758697256445885,
"loss/logits": 0.0,
"step": 920
},
{
"epoch": 0.0465,
"grad_norm": 22.625,
"grad_norm_var": 21.1744140625,
"learning_rate": 0.0001,
"loss": 5.1248,
"loss/crossentropy": 2.4070512309670447,
"loss/hidden": 4.123828125,
"loss/jsd": 0.12089485572651029,
"loss/logits": 0.0,
"step": 930
},
{
"epoch": 0.047,
"grad_norm": 47.75,
"grad_norm_var": 164.896875,
"learning_rate": 0.0001,
"loss": 5.1273,
"loss/crossentropy": 2.1984025448560716,
"loss/hidden": 3.83359375,
"loss/jsd": 0.10510765034705401,
"loss/logits": 0.0,
"step": 940
},
{
"epoch": 0.0475,
"grad_norm": 24.125,
"grad_norm_var": 171.48326822916667,
"learning_rate": 0.0001,
"loss": 5.0933,
"loss/crossentropy": 2.408414696156979,
"loss/hidden": 3.9015625,
"loss/jsd": 0.09813609030097722,
"loss/logits": 0.0,
"step": 950
},
{
"epoch": 0.048,
"grad_norm": 25.5,
"grad_norm_var": 10.351041666666667,
"learning_rate": 0.0001,
"loss": 5.0887,
"loss/crossentropy": 2.3635219663381575,
"loss/hidden": 3.983984375,
"loss/jsd": 0.10892721712589264,
"loss/logits": 0.0,
"step": 960
},
{
"epoch": 0.0485,
"grad_norm": 23.25,
"grad_norm_var": 15.676497395833334,
"learning_rate": 0.0001,
"loss": 5.0293,
"loss/crossentropy": 2.182341808080673,
"loss/hidden": 3.92421875,
"loss/jsd": 0.10646048728376627,
"loss/logits": 0.0,
"step": 970
},
{
"epoch": 0.049,
"grad_norm": 26.625,
"grad_norm_var": 7.992708333333334,
"learning_rate": 0.0001,
"loss": 5.1407,
"loss/crossentropy": 2.4966017305850983,
"loss/hidden": 3.909375,
"loss/jsd": 0.11931864526122808,
"loss/logits": 0.0,
"step": 980
},
{
"epoch": 0.0495,
"grad_norm": 25.125,
"grad_norm_var": 915.2077473958333,
"learning_rate": 0.0001,
"loss": 5.1799,
"loss/crossentropy": 2.3614319562911987,
"loss/hidden": 3.95390625,
"loss/jsd": 0.10783975422382355,
"loss/logits": 0.0,
"step": 990
},
{
"epoch": 0.05,
"grad_norm": 24.875,
"grad_norm_var": 862.96640625,
"learning_rate": 0.0001,
"loss": 5.1175,
"loss/crossentropy": 2.3259101063013077,
"loss/hidden": 4.09140625,
"loss/jsd": 0.11582606900483369,
"loss/logits": 0.0,
"step": 1000
},
{
"epoch": 0.0505,
"grad_norm": 27.0,
"grad_norm_var": 36.96243489583333,
"learning_rate": 0.0001,
"loss": 5.1445,
"loss/crossentropy": 2.4153922617435457,
"loss/hidden": 4.044140625,
"loss/jsd": 0.11763136927038431,
"loss/logits": 0.0,
"step": 1010
},
{
"epoch": 0.051,
"grad_norm": 27.0,
"grad_norm_var": 11.583333333333334,
"learning_rate": 0.0001,
"loss": 5.0695,
"loss/crossentropy": 2.287649059295654,
"loss/hidden": 3.97578125,
"loss/jsd": 0.10912037892267108,
"loss/logits": 0.0,
"step": 1020
},
{
"epoch": 0.0515,
"grad_norm": 34.25,
"grad_norm_var": 598.6910807291666,
"learning_rate": 0.0001,
"loss": 5.1531,
"loss/crossentropy": 2.5355153501033785,
"loss/hidden": 3.972265625,
"loss/jsd": 0.11578338220715523,
"loss/logits": 0.0,
"step": 1030
},
{
"epoch": 0.052,
"grad_norm": 23.0,
"grad_norm_var": 149.62389322916667,
"learning_rate": 0.0001,
"loss": 5.1453,
"loss/crossentropy": 2.328887623548508,
"loss/hidden": 3.84609375,
"loss/jsd": 0.1067446961067617,
"loss/logits": 0.0,
"step": 1040
},
{
"epoch": 0.0525,
"grad_norm": 22.625,
"grad_norm_var": 23.629166666666666,
"learning_rate": 0.0001,
"loss": 5.0775,
"loss/crossentropy": 2.3245414569973946,
"loss/hidden": 3.950390625,
"loss/jsd": 0.11564150396734477,
"loss/logits": 0.0,
"step": 1050
},
{
"epoch": 0.053,
"grad_norm": 29.375,
"grad_norm_var": 22.822330729166666,
"learning_rate": 0.0001,
"loss": 4.929,
"loss/crossentropy": 2.5518812984228134,
"loss/hidden": 3.76796875,
"loss/jsd": 0.10029144948348404,
"loss/logits": 0.0,
"step": 1060
},
{
"epoch": 0.0535,
"grad_norm": 22.875,
"grad_norm_var": 27.373372395833332,
"learning_rate": 0.0001,
"loss": 5.1682,
"loss/crossentropy": 2.3814490526914596,
"loss/hidden": 4.084765625,
"loss/jsd": 0.13794842325150966,
"loss/logits": 0.0,
"step": 1070
},
{
"epoch": 0.054,
"grad_norm": 30.375,
"grad_norm_var": 25.968684895833334,
"learning_rate": 0.0001,
"loss": 5.0446,
"loss/crossentropy": 2.336636045575142,
"loss/hidden": 3.98984375,
"loss/jsd": 0.11006514001637697,
"loss/logits": 0.0,
"step": 1080
},
{
"epoch": 0.0545,
"grad_norm": 25.5,
"grad_norm_var": 32.0447265625,
"learning_rate": 0.0001,
"loss": 5.0339,
"loss/crossentropy": 2.2337013885378836,
"loss/hidden": 3.945703125,
"loss/jsd": 0.11723029632121325,
"loss/logits": 0.0,
"step": 1090
},
{
"epoch": 0.055,
"grad_norm": 25.375,
"grad_norm_var": 102.66432291666666,
"learning_rate": 0.0001,
"loss": 5.0155,
"loss/crossentropy": 2.443159765005112,
"loss/hidden": 4.062890625,
"loss/jsd": 0.11166490567848086,
"loss/logits": 0.0,
"step": 1100
},
{
"epoch": 0.0555,
"grad_norm": 26.25,
"grad_norm_var": 12.558072916666667,
"learning_rate": 0.0001,
"loss": 5.0531,
"loss/crossentropy": 2.2338882118463514,
"loss/hidden": 4.025,
"loss/jsd": 0.11465255348011852,
"loss/logits": 0.0,
"step": 1110
},
{
"epoch": 0.056,
"grad_norm": 25.875,
"grad_norm_var": 8.347916666666666,
"learning_rate": 0.0001,
"loss": 5.0976,
"loss/crossentropy": 2.3596479177474974,
"loss/hidden": 3.940625,
"loss/jsd": 0.11759824641048908,
"loss/logits": 0.0,
"step": 1120
},
{
"epoch": 0.0565,
"grad_norm": 30.25,
"grad_norm_var": 188.1353515625,
"learning_rate": 0.0001,
"loss": 5.0785,
"loss/crossentropy": 2.3698789328336716,
"loss/hidden": 3.962109375,
"loss/jsd": 0.1172801936045289,
"loss/logits": 0.0,
"step": 1130
},
{
"epoch": 0.057,
"grad_norm": 26.375,
"grad_norm_var": 185.04765625,
"learning_rate": 0.0001,
"loss": 5.0927,
"loss/crossentropy": 2.3481896728277207,
"loss/hidden": 3.9609375,
"loss/jsd": 0.10608052760362625,
"loss/logits": 0.0,
"step": 1140
},
{
"epoch": 0.0575,
"grad_norm": 22.875,
"grad_norm_var": 125.32233072916667,
"learning_rate": 0.0001,
"loss": 5.0263,
"loss/crossentropy": 2.301522643119097,
"loss/hidden": 3.8,
"loss/jsd": 0.10154257528483868,
"loss/logits": 0.0,
"step": 1150
},
{
"epoch": 0.058,
"grad_norm": 27.625,
"grad_norm_var": 81.21432291666666,
"learning_rate": 0.0001,
"loss": 5.1087,
"loss/crossentropy": 2.3300373941659926,
"loss/hidden": 3.923828125,
"loss/jsd": 0.10997985871508717,
"loss/logits": 0.0,
"step": 1160
},
{
"epoch": 0.0585,
"grad_norm": 22.0,
"grad_norm_var": 37.805989583333336,
"learning_rate": 0.0001,
"loss": 4.9669,
"loss/crossentropy": 2.3570085942745207,
"loss/hidden": 3.903125,
"loss/jsd": 0.12716795089654626,
"loss/logits": 0.0,
"step": 1170
},
{
"epoch": 0.059,
"grad_norm": 28.25,
"grad_norm_var": 6.526822916666666,
"learning_rate": 0.0001,
"loss": 4.8827,
"loss/crossentropy": 2.4714103788137436,
"loss/hidden": 3.878125,
"loss/jsd": 0.11338211484253406,
"loss/logits": 0.0,
"step": 1180
},
{
"epoch": 0.0595,
"grad_norm": 25.0,
"grad_norm_var": 1.0217717449682671e+18,
"learning_rate": 0.0001,
"loss": 5.0544,
"loss/crossentropy": 2.170953643321991,
"loss/hidden": 3.91875,
"loss/jsd": 0.11225487310439348,
"loss/logits": 0.0,
"step": 1190
},
{
"epoch": 0.06,
"grad_norm": 22.125,
"grad_norm_var": 22.508072916666666,
"learning_rate": 0.0001,
"loss": 4.8895,
"loss/crossentropy": 2.4479696050286295,
"loss/hidden": 3.896484375,
"loss/jsd": 0.10494228331372142,
"loss/logits": 0.0,
"step": 1200
},
{
"epoch": 0.0605,
"grad_norm": 22.25,
"grad_norm_var": 19.080989583333334,
"learning_rate": 0.0001,
"loss": 4.8699,
"loss/crossentropy": 2.3343143433332445,
"loss/hidden": 3.787109375,
"loss/jsd": 0.10432742889970541,
"loss/logits": 0.0,
"step": 1210
},
{
"epoch": 0.061,
"grad_norm": 19.0,
"grad_norm_var": 7.299934895833333,
"learning_rate": 0.0001,
"loss": 4.9113,
"loss/crossentropy": 2.2152185067534447,
"loss/hidden": 3.838671875,
"loss/jsd": 0.10314544131979346,
"loss/logits": 0.0,
"step": 1220
},
{
"epoch": 0.0615,
"grad_norm": 25.125,
"grad_norm_var": 8.783333333333333,
"learning_rate": 0.0001,
"loss": 4.8793,
"loss/crossentropy": 2.3982258841395376,
"loss/hidden": 3.765625,
"loss/jsd": 0.1033841515891254,
"loss/logits": 0.0,
"step": 1230
},
{
"epoch": 0.062,
"grad_norm": 24.25,
"grad_norm_var": 8.654166666666667,
"learning_rate": 0.0001,
"loss": 4.936,
"loss/crossentropy": 2.3861924752593042,
"loss/hidden": 3.990625,
"loss/jsd": 0.1316368247382343,
"loss/logits": 0.0,
"step": 1240
},
{
"epoch": 0.0625,
"grad_norm": 27.625,
"grad_norm_var": 18.838997395833335,
"learning_rate": 0.0001,
"loss": 5.0574,
"loss/crossentropy": 2.3481432244181635,
"loss/hidden": 3.886328125,
"loss/jsd": 0.12455893289297819,
"loss/logits": 0.0,
"step": 1250
},
{
"epoch": 0.063,
"grad_norm": 24.875,
"grad_norm_var": 741.0330729166667,
"learning_rate": 0.0001,
"loss": 5.054,
"loss/crossentropy": 2.50970872938633,
"loss/hidden": 3.89375,
"loss/jsd": 0.11707814577966928,
"loss/logits": 0.0,
"step": 1260
},
{
"epoch": 0.0635,
"grad_norm": 22.25,
"grad_norm_var": 766.54140625,
"learning_rate": 0.0001,
"loss": 4.9292,
"loss/crossentropy": 2.214522284269333,
"loss/hidden": 3.817578125,
"loss/jsd": 0.09662074805237353,
"loss/logits": 0.0,
"step": 1270
},
{
"epoch": 0.064,
"grad_norm": 27.125,
"grad_norm_var": 1.2075980051835433e+18,
"learning_rate": 0.0001,
"loss": 4.9727,
"loss/crossentropy": 2.5177758872509,
"loss/hidden": 3.872265625,
"loss/jsd": 0.12324077049270273,
"loss/logits": 0.0,
"step": 1280
},
{
"epoch": 0.0645,
"grad_norm": 26.25,
"grad_norm_var": 4.482291666666667,
"learning_rate": 0.0001,
"loss": 4.8651,
"loss/crossentropy": 2.4133356541395186,
"loss/hidden": 3.8109375,
"loss/jsd": 0.10085376175120472,
"loss/logits": 0.0,
"step": 1290
},
{
"epoch": 0.065,
"grad_norm": 20.875,
"grad_norm_var": 4.7875,
"learning_rate": 0.0001,
"loss": 4.8874,
"loss/crossentropy": 2.211686734855175,
"loss/hidden": 3.82578125,
"loss/jsd": 0.10324386316351593,
"loss/logits": 0.0,
"step": 1300
},
{
"epoch": 0.0655,
"grad_norm": 25.125,
"grad_norm_var": 39.35045572916667,
"learning_rate": 0.0001,
"loss": 4.9265,
"loss/crossentropy": 2.386268785595894,
"loss/hidden": 3.837890625,
"loss/jsd": 0.11206256924197078,
"loss/logits": 0.0,
"step": 1310
},
{
"epoch": 0.066,
"grad_norm": 25.5,
"grad_norm_var": 39.68020833333333,
"learning_rate": 0.0001,
"loss": 4.9719,
"loss/crossentropy": 2.3294328808784486,
"loss/hidden": 3.8640625,
"loss/jsd": 0.11526230238378048,
"loss/logits": 0.0,
"step": 1320
},
{
"epoch": 0.0665,
"grad_norm": 23.375,
"grad_norm_var": 47.66295572916667,
"learning_rate": 0.0001,
"loss": 4.922,
"loss/crossentropy": 2.38586545586586,
"loss/hidden": 3.7203125,
"loss/jsd": 0.09609230635687709,
"loss/logits": 0.0,
"step": 1330
},
{
"epoch": 0.067,
"grad_norm": 25.125,
"grad_norm_var": 36.25305989583333,
"learning_rate": 0.0001,
"loss": 4.9463,
"loss/crossentropy": 2.4498814970254896,
"loss/hidden": 3.84296875,
"loss/jsd": 0.10662997653707862,
"loss/logits": 0.0,
"step": 1340
},
{
"epoch": 0.0675,
"grad_norm": 26.625,
"grad_norm_var": 34.154622395833336,
"learning_rate": 0.0001,
"loss": 4.89,
"loss/crossentropy": 2.3147580534219743,
"loss/hidden": 3.840625,
"loss/jsd": 0.10548559352755546,
"loss/logits": 0.0,
"step": 1350
},
{
"epoch": 0.068,
"grad_norm": 20.25,
"grad_norm_var": 5.3369140625,
"learning_rate": 0.0001,
"loss": 4.9495,
"loss/crossentropy": 2.2381860077381135,
"loss/hidden": 3.936328125,
"loss/jsd": 0.1049613301642239,
"loss/logits": 0.0,
"step": 1360
},
{
"epoch": 0.0685,
"grad_norm": 21.625,
"grad_norm_var": 42.57604166666667,
"learning_rate": 0.0001,
"loss": 4.9902,
"loss/crossentropy": 2.3451401717960834,
"loss/hidden": 3.97578125,
"loss/jsd": 0.10501982429996133,
"loss/logits": 0.0,
"step": 1370
},
{
"epoch": 0.069,
"grad_norm": 25.5,
"grad_norm_var": 13.911393229166666,
"learning_rate": 0.0001,
"loss": 4.8738,
"loss/crossentropy": 2.2887198269367217,
"loss/hidden": 3.948828125,
"loss/jsd": 0.10703569920733572,
"loss/logits": 0.0,
"step": 1380
},
{
"epoch": 0.0695,
"grad_norm": 24.0,
"grad_norm_var": 4.178125,
"learning_rate": 0.0001,
"loss": 4.908,
"loss/crossentropy": 2.4341419368982313,
"loss/hidden": 3.9109375,
"loss/jsd": 0.13313074046745896,
"loss/logits": 0.0,
"step": 1390
},
{
"epoch": 0.07,
"grad_norm": 25.5,
"grad_norm_var": 3.2643229166666665,
"learning_rate": 0.0001,
"loss": 4.8483,
"loss/crossentropy": 2.3005983904004097,
"loss/hidden": 3.794921875,
"loss/jsd": 0.1167063161265105,
"loss/logits": 0.0,
"step": 1400
},
{
"epoch": 0.0705,
"grad_norm": 25.375,
"grad_norm_var": 2.6684895833333333,
"learning_rate": 0.0001,
"loss": 4.8661,
"loss/crossentropy": 2.3177727833390236,
"loss/hidden": 3.76328125,
"loss/jsd": 0.09948643315583468,
"loss/logits": 0.0,
"step": 1410
},
{
"epoch": 0.071,
"grad_norm": 21.25,
"grad_norm_var": 6.670572916666667,
"learning_rate": 0.0001,
"loss": 4.8736,
"loss/crossentropy": 2.2698763489723204,
"loss/hidden": 3.831640625,
"loss/jsd": 0.10282253352925182,
"loss/logits": 0.0,
"step": 1420
},
{
"epoch": 0.0715,
"grad_norm": 18.625,
"grad_norm_var": 8.87265625,
"learning_rate": 0.0001,
"loss": 4.8039,
"loss/crossentropy": 2.360131266713142,
"loss/hidden": 3.722265625,
"loss/jsd": 0.10547879729419947,
"loss/logits": 0.0,
"step": 1430
},
{
"epoch": 0.072,
"grad_norm": 21.75,
"grad_norm_var": 3.8889973958333335,
"learning_rate": 0.0001,
"loss": 4.7269,
"loss/crossentropy": 2.311430121213198,
"loss/hidden": 3.7109375,
"loss/jsd": 0.09480313453823327,
"loss/logits": 0.0,
"step": 1440
},
{
"epoch": 0.0725,
"grad_norm": 29.75,
"grad_norm_var": 7.685416666666667,
"learning_rate": 0.0001,
"loss": 4.7292,
"loss/crossentropy": 2.4506467133760452,
"loss/hidden": 3.672265625,
"loss/jsd": 0.09663807023316622,
"loss/logits": 0.0,
"step": 1450
},
{
"epoch": 0.073,
"grad_norm": 23.25,
"grad_norm_var": 8.416666666666666,
"learning_rate": 0.0001,
"loss": 4.7346,
"loss/crossentropy": 2.2691701710224152,
"loss/hidden": 3.836328125,
"loss/jsd": 0.1028917589224875,
"loss/logits": 0.0,
"step": 1460
},
{
"epoch": 0.0735,
"grad_norm": 20.375,
"grad_norm_var": 7.246875,
"learning_rate": 0.0001,
"loss": 4.7517,
"loss/crossentropy": 2.3083701550960543,
"loss/hidden": 3.714453125,
"loss/jsd": 0.09604525147005916,
"loss/logits": 0.0,
"step": 1470
},
{
"epoch": 0.074,
"grad_norm": 22.0,
"grad_norm_var": 11.672916666666667,
"learning_rate": 0.0001,
"loss": 4.8113,
"loss/crossentropy": 2.3635326638817786,
"loss/hidden": 3.6703125,
"loss/jsd": 0.10219773268327118,
"loss/logits": 0.0,
"step": 1480
},
{
"epoch": 0.0745,
"grad_norm": 21.375,
"grad_norm_var": 5.637239583333334,
"learning_rate": 0.0001,
"loss": 4.798,
"loss/crossentropy": 2.182288531959057,
"loss/hidden": 3.784375,
"loss/jsd": 0.09713765853084624,
"loss/logits": 0.0,
"step": 1490
},
{
"epoch": 0.075,
"grad_norm": 22.375,
"grad_norm_var": 13.480143229166666,
"learning_rate": 0.0001,
"loss": 4.9073,
"loss/crossentropy": 2.209014095366001,
"loss/hidden": 3.77734375,
"loss/jsd": 0.100444171205163,
"loss/logits": 0.0,
"step": 1500
},
{
"epoch": 0.0755,
"grad_norm": 20.25,
"grad_norm_var": 15.253125,
"learning_rate": 0.0001,
"loss": 4.8648,
"loss/crossentropy": 2.307139050960541,
"loss/hidden": 3.835546875,
"loss/jsd": 0.10750290956348181,
"loss/logits": 0.0,
"step": 1510
},
{
"epoch": 0.076,
"grad_norm": 22.25,
"grad_norm_var": 5.84765625,
"learning_rate": 0.0001,
"loss": 4.7021,
"loss/crossentropy": 2.4567115128040315,
"loss/hidden": 3.641015625,
"loss/jsd": 0.0963326326571405,
"loss/logits": 0.0,
"step": 1520
},
{
"epoch": 0.0765,
"grad_norm": 23.0,
"grad_norm_var": 17.799934895833335,
"learning_rate": 0.0001,
"loss": 4.7726,
"loss/crossentropy": 2.3501833245158195,
"loss/hidden": 3.71171875,
"loss/jsd": 0.09695078176446259,
"loss/logits": 0.0,
"step": 1530
},
{
"epoch": 0.077,
"grad_norm": 26.875,
"grad_norm_var": 14.445572916666666,
"learning_rate": 0.0001,
"loss": 4.7776,
"loss/crossentropy": 2.35235877931118,
"loss/hidden": 3.7109375,
"loss/jsd": 0.09894683174788951,
"loss/logits": 0.0,
"step": 1540
},
{
"epoch": 0.0775,
"grad_norm": 20.0,
"grad_norm_var": 6.866080729166667,
"learning_rate": 0.0001,
"loss": 4.7465,
"loss/crossentropy": 2.3319214552640917,
"loss/hidden": 3.678515625,
"loss/jsd": 0.10175617430359125,
"loss/logits": 0.0,
"step": 1550
},
{
"epoch": 0.078,
"grad_norm": 25.75,
"grad_norm_var": 5.9306640625,
"learning_rate": 0.0001,
"loss": 4.7662,
"loss/crossentropy": 2.312511496245861,
"loss/hidden": 3.82421875,
"loss/jsd": 0.10424250243231654,
"loss/logits": 0.0,
"step": 1560
},
{
"epoch": 0.0785,
"grad_norm": 19.25,
"grad_norm_var": 7.527018229166667,
"learning_rate": 0.0001,
"loss": 4.7506,
"loss/crossentropy": 2.195492114126682,
"loss/hidden": 3.806640625,
"loss/jsd": 0.10378086129203438,
"loss/logits": 0.0,
"step": 1570
},
{
"epoch": 0.079,
"grad_norm": 20.875,
"grad_norm_var": 5.69765625,
"learning_rate": 0.0001,
"loss": 4.7525,
"loss/crossentropy": 2.3451679602265356,
"loss/hidden": 3.64609375,
"loss/jsd": 0.10021187355741859,
"loss/logits": 0.0,
"step": 1580
},
{
"epoch": 0.0795,
"grad_norm": 25.125,
"grad_norm_var": 4.002083333333333,
"learning_rate": 0.0001,
"loss": 4.7907,
"loss/crossentropy": 2.235419529676437,
"loss/hidden": 3.76875,
"loss/jsd": 0.1017349574714899,
"loss/logits": 0.0,
"step": 1590
},
{
"epoch": 0.08,
"grad_norm": 17.5,
"grad_norm_var": 6.187239583333334,
"learning_rate": 0.0001,
"loss": 4.7544,
"loss/crossentropy": 2.349038490653038,
"loss/hidden": 3.75859375,
"loss/jsd": 0.10723181385546923,
"loss/logits": 0.0,
"step": 1600
},
{
"epoch": 0.0805,
"grad_norm": 21.5,
"grad_norm_var": 5.036458333333333,
"learning_rate": 0.0001,
"loss": 4.7984,
"loss/crossentropy": 2.2953826270997526,
"loss/hidden": 3.735546875,
"loss/jsd": 0.10434331484138966,
"loss/logits": 0.0,
"step": 1610
},
{
"epoch": 0.081,
"grad_norm": 21.625,
"grad_norm_var": 2.8671223958333334,
"learning_rate": 0.0001,
"loss": 4.7802,
"loss/crossentropy": 2.420463111996651,
"loss/hidden": 3.7671875,
"loss/jsd": 0.09350865064188837,
"loss/logits": 0.0,
"step": 1620
},
{
"epoch": 0.0815,
"grad_norm": 21.875,
"grad_norm_var": 5.375455729166666,
"learning_rate": 0.0001,
"loss": 4.7768,
"loss/crossentropy": 2.4329511165618896,
"loss/hidden": 3.770703125,
"loss/jsd": 0.11144884563982486,
"loss/logits": 0.0,
"step": 1630
},
{
"epoch": 0.082,
"grad_norm": 18.0,
"grad_norm_var": 5.853059895833334,
"learning_rate": 0.0001,
"loss": 4.81,
"loss/crossentropy": 2.3901975452899933,
"loss/hidden": 3.816015625,
"loss/jsd": 0.11511239362880588,
"loss/logits": 0.0,
"step": 1640
},
{
"epoch": 0.0825,
"grad_norm": 18.625,
"grad_norm_var": 6.853059895833334,
"learning_rate": 0.0001,
"loss": 4.8666,
"loss/crossentropy": 2.42452190220356,
"loss/hidden": 3.791015625,
"loss/jsd": 0.10730197560042143,
"loss/logits": 0.0,
"step": 1650
},
{
"epoch": 0.083,
"grad_norm": 22.125,
"grad_norm_var": 5.945768229166666,
"learning_rate": 0.0001,
"loss": 4.825,
"loss/crossentropy": 2.415967509150505,
"loss/hidden": 3.7140625,
"loss/jsd": 0.10223841555416584,
"loss/logits": 0.0,
"step": 1660
},
{
"epoch": 0.0835,
"grad_norm": 19.625,
"grad_norm_var": 4.648372395833333,
"learning_rate": 0.0001,
"loss": 4.6893,
"loss/crossentropy": 2.346050335466862,
"loss/hidden": 3.75234375,
"loss/jsd": 0.10205129384994507,
"loss/logits": 0.0,
"step": 1670
},
{
"epoch": 0.084,
"grad_norm": 26.75,
"grad_norm_var": 6.887239583333334,
"learning_rate": 0.0001,
"loss": 4.7759,
"loss/crossentropy": 2.272695492208004,
"loss/hidden": 3.740234375,
"loss/jsd": 0.09743564091622829,
"loss/logits": 0.0,
"step": 1680
},
{
"epoch": 0.0845,
"grad_norm": 21.25,
"grad_norm_var": 8.242122395833333,
"learning_rate": 0.0001,
"loss": 4.7886,
"loss/crossentropy": 2.421866828203201,
"loss/hidden": 3.801171875,
"loss/jsd": 0.10874381214380265,
"loss/logits": 0.0,
"step": 1690
},
{
"epoch": 0.085,
"grad_norm": 21.875,
"grad_norm_var": 5.842643229166667,
"learning_rate": 0.0001,
"loss": 4.7021,
"loss/crossentropy": 2.389561951160431,
"loss/hidden": 3.679296875,
"loss/jsd": 0.1009491034783423,
"loss/logits": 0.0,
"step": 1700
},
{
"epoch": 0.0855,
"grad_norm": 21.625,
"grad_norm_var": 13.94765625,
"learning_rate": 0.0001,
"loss": 4.8137,
"loss/crossentropy": 2.3791208446025847,
"loss/hidden": 3.775,
"loss/jsd": 0.11071940269321204,
"loss/logits": 0.0,
"step": 1710
},
{
"epoch": 0.086,
"grad_norm": 20.5,
"grad_norm_var": 13.2603515625,
"learning_rate": 0.0001,
"loss": 4.738,
"loss/crossentropy": 2.4374333173036575,
"loss/hidden": 3.728125,
"loss/jsd": 0.10198512580245733,
"loss/logits": 0.0,
"step": 1720
},
{
"epoch": 0.0865,
"grad_norm": 16.625,
"grad_norm_var": 11.1853515625,
"learning_rate": 0.0001,
"loss": 4.7524,
"loss/crossentropy": 2.3030879952013494,
"loss/hidden": 3.626953125,
"loss/jsd": 0.09310725582763553,
"loss/logits": 0.0,
"step": 1730
},
{
"epoch": 0.087,
"grad_norm": 18.875,
"grad_norm_var": 6.285416666666666,
"learning_rate": 0.0001,
"loss": 4.7021,
"loss/crossentropy": 2.192840526998043,
"loss/hidden": 3.819140625,
"loss/jsd": 0.09320764979347587,
"loss/logits": 0.0,
"step": 1740
},
{
"epoch": 0.0875,
"grad_norm": 24.625,
"grad_norm_var": 6.4353515625,
"learning_rate": 0.0001,
"loss": 4.7059,
"loss/crossentropy": 2.3610597878694533,
"loss/hidden": 3.733984375,
"loss/jsd": 0.10029621962457895,
"loss/logits": 0.0,
"step": 1750
},
{
"epoch": 0.088,
"grad_norm": 19.5,
"grad_norm_var": 19.762239583333333,
"learning_rate": 0.0001,
"loss": 4.7081,
"loss/crossentropy": 2.410063475370407,
"loss/hidden": 3.65078125,
"loss/jsd": 0.10161215299740434,
"loss/logits": 0.0,
"step": 1760
},
{
"epoch": 0.0885,
"grad_norm": 33.25,
"grad_norm_var": 22.748893229166665,
"learning_rate": 0.0001,
"loss": 4.5743,
"loss/crossentropy": 2.2984881952404974,
"loss/hidden": 3.666796875,
"loss/jsd": 0.09647621251642705,
"loss/logits": 0.0,
"step": 1770
},
{
"epoch": 0.089,
"grad_norm": 20.25,
"grad_norm_var": 16.602083333333333,
"learning_rate": 0.0001,
"loss": 4.7585,
"loss/crossentropy": 2.3432783752679827,
"loss/hidden": 3.716796875,
"loss/jsd": 0.10081057399511337,
"loss/logits": 0.0,
"step": 1780
},
{
"epoch": 0.0895,
"grad_norm": 22.875,
"grad_norm_var": 5.621875,
"learning_rate": 0.0001,
"loss": 4.707,
"loss/crossentropy": 2.352738951146603,
"loss/hidden": 3.673828125,
"loss/jsd": 0.09183212611824274,
"loss/logits": 0.0,
"step": 1790
},
{
"epoch": 0.09,
"grad_norm": 33.0,
"grad_norm_var": 16.277083333333334,
"learning_rate": 0.0001,
"loss": 4.6996,
"loss/crossentropy": 2.443929785490036,
"loss/hidden": 3.55390625,
"loss/jsd": 0.09280467573553323,
"loss/logits": 0.0,
"step": 1800
},
{
"epoch": 0.0905,
"grad_norm": 17.125,
"grad_norm_var": 20.319205729166665,
"learning_rate": 0.0001,
"loss": 4.7503,
"loss/crossentropy": 2.333281812816858,
"loss/hidden": 3.687109375,
"loss/jsd": 0.09856429314240814,
"loss/logits": 0.0,
"step": 1810
},
{
"epoch": 0.091,
"grad_norm": 19.625,
"grad_norm_var": 14.943684895833334,
"learning_rate": 0.0001,
"loss": 4.8011,
"loss/crossentropy": 2.3165989741683006,
"loss/hidden": 3.93515625,
"loss/jsd": 0.1154123242944479,
"loss/logits": 0.0,
"step": 1820
},
{
"epoch": 0.0915,
"grad_norm": 19.125,
"grad_norm_var": 2.5833333333333335,
"learning_rate": 0.0001,
"loss": 4.7784,
"loss/crossentropy": 2.3343286007642745,
"loss/hidden": 3.796875,
"loss/jsd": 0.11231993734836579,
"loss/logits": 0.0,
"step": 1830
},
{
"epoch": 0.092,
"grad_norm": 18.0,
"grad_norm_var": 4.880989583333333,
"learning_rate": 0.0001,
"loss": 4.6886,
"loss/crossentropy": 2.412258565425873,
"loss/hidden": 3.78046875,
"loss/jsd": 0.10415599066764117,
"loss/logits": 0.0,
"step": 1840
},
{
"epoch": 0.0925,
"grad_norm": 17.25,
"grad_norm_var": 6.083072916666667,
"learning_rate": 0.0001,
"loss": 4.7485,
"loss/crossentropy": 2.379472056031227,
"loss/hidden": 3.6609375,
"loss/jsd": 0.09712380319833755,
"loss/logits": 0.0,
"step": 1850
},
{
"epoch": 0.093,
"grad_norm": 19.125,
"grad_norm_var": 9.0041015625,
"learning_rate": 0.0001,
"loss": 4.7145,
"loss/crossentropy": 2.286051708459854,
"loss/hidden": 3.671484375,
"loss/jsd": 0.09749153861775994,
"loss/logits": 0.0,
"step": 1860
},
{
"epoch": 0.0935,
"grad_norm": 24.25,
"grad_norm_var": 10.09765625,
"learning_rate": 0.0001,
"loss": 4.6597,
"loss/crossentropy": 2.3485587686300278,
"loss/hidden": 3.656640625,
"loss/jsd": 0.09961330010555684,
"loss/logits": 0.0,
"step": 1870
},
{
"epoch": 0.094,
"grad_norm": 25.5,
"grad_norm_var": 10.745833333333334,
"learning_rate": 0.0001,
"loss": 4.7654,
"loss/crossentropy": 2.22419136762619,
"loss/hidden": 3.680859375,
"loss/jsd": 0.09599914094433189,
"loss/logits": 0.0,
"step": 1880
},
{
"epoch": 0.0945,
"grad_norm": 23.375,
"grad_norm_var": 11.849739583333333,
"learning_rate": 0.0001,
"loss": 4.6586,
"loss/crossentropy": 2.2319135151803495,
"loss/hidden": 3.776953125,
"loss/jsd": 0.1003801210783422,
"loss/logits": 0.0,
"step": 1890
},
{
"epoch": 0.095,
"grad_norm": 20.125,
"grad_norm_var": 15.6884765625,
"learning_rate": 0.0001,
"loss": 4.7113,
"loss/crossentropy": 2.466662494838238,
"loss/hidden": 3.694140625,
"loss/jsd": 0.09942078748717904,
"loss/logits": 0.0,
"step": 1900
},
{
"epoch": 0.0955,
"grad_norm": 20.875,
"grad_norm_var": 11.4197265625,
"learning_rate": 0.0001,
"loss": 4.6638,
"loss/crossentropy": 2.3695669680833817,
"loss/hidden": 3.593359375,
"loss/jsd": 0.09504008954390883,
"loss/logits": 0.0,
"step": 1910
},
{
"epoch": 0.096,
"grad_norm": 22.125,
"grad_norm_var": 4.581184895833333,
"learning_rate": 0.0001,
"loss": 4.6473,
"loss/crossentropy": 2.345889499783516,
"loss/hidden": 3.691015625,
"loss/jsd": 0.10475197089836001,
"loss/logits": 0.0,
"step": 1920
},
{
"epoch": 0.0965,
"grad_norm": 22.5,
"grad_norm_var": 4.087239583333333,
"learning_rate": 0.0001,
"loss": 4.721,
"loss/crossentropy": 2.256808315217495,
"loss/hidden": 3.7015625,
"loss/jsd": 0.09892030693590641,
"loss/logits": 0.0,
"step": 1930
},
{
"epoch": 0.097,
"grad_norm": 19.125,
"grad_norm_var": 6.395572916666667,
"learning_rate": 0.0001,
"loss": 4.5498,
"loss/crossentropy": 2.5429009228944777,
"loss/hidden": 3.680859375,
"loss/jsd": 0.09861663114279509,
"loss/logits": 0.0,
"step": 1940
},
{
"epoch": 0.0975,
"grad_norm": 21.25,
"grad_norm_var": 5.843489583333334,
"learning_rate": 0.0001,
"loss": 4.6899,
"loss/crossentropy": 2.272120487689972,
"loss/hidden": 3.6375,
"loss/jsd": 0.09743905253708363,
"loss/logits": 0.0,
"step": 1950
},
{
"epoch": 0.098,
"grad_norm": 17.625,
"grad_norm_var": 6.62265625,
"learning_rate": 0.0001,
"loss": 4.6035,
"loss/crossentropy": 2.1649394638836386,
"loss/hidden": 3.584765625,
"loss/jsd": 0.08881366224959493,
"loss/logits": 0.0,
"step": 1960
},
{
"epoch": 0.0985,
"grad_norm": 4328521728.0,
"grad_norm_var": 1.171006260534208e+18,
"learning_rate": 0.0001,
"loss": 4.6906,
"loss/crossentropy": 2.3182963758707045,
"loss/hidden": 3.623046875,
"loss/jsd": 0.10251586111262441,
"loss/logits": 0.0,
"step": 1970
},
{
"epoch": 0.099,
"grad_norm": 20.25,
"grad_norm_var": 2.715501666496903e+18,
"learning_rate": 0.0001,
"loss": 4.7101,
"loss/crossentropy": 2.407327815890312,
"loss/hidden": 3.60703125,
"loss/jsd": 0.09459855072200299,
"loss/logits": 0.0,
"step": 1980
},
{
"epoch": 0.0995,
"grad_norm": 19.25,
"grad_norm_var": 1.7345191619224492e+18,
"learning_rate": 0.0001,
"loss": 4.6395,
"loss/crossentropy": 2.256649875640869,
"loss/hidden": 3.65234375,
"loss/jsd": 0.10230031171813607,
"loss/logits": 0.0,
"step": 1990
},
{
"epoch": 0.1,
"grad_norm": 22.0,
"grad_norm_var": 2.981184895833333,
"learning_rate": 0.0001,
"loss": 4.5112,
"loss/crossentropy": 2.3214069336652754,
"loss/hidden": 3.553515625,
"loss/jsd": 0.09316142341122031,
"loss/logits": 0.0,
"step": 2000
},
{
"epoch": 0.1005,
"grad_norm": 20.25,
"grad_norm_var": 4.611393229166667,
"learning_rate": 0.0001,
"loss": 4.5154,
"loss/crossentropy": 2.297450725734234,
"loss/hidden": 3.569140625,
"loss/jsd": 0.09217815361917019,
"loss/logits": 0.0,
"step": 2010
},
{
"epoch": 0.101,
"grad_norm": 23.25,
"grad_norm_var": 8.597249348958334,
"learning_rate": 0.0001,
"loss": 4.6108,
"loss/crossentropy": 2.3132576078176497,
"loss/hidden": 3.69375,
"loss/jsd": 0.1215221800841391,
"loss/logits": 0.0,
"step": 2020
},
{
"epoch": 0.1015,
"grad_norm": 16.75,
"grad_norm_var": 8.385400390625,
"learning_rate": 0.0001,
"loss": 4.5906,
"loss/crossentropy": 2.42258235514164,
"loss/hidden": 3.58359375,
"loss/jsd": 0.09518450712785125,
"loss/logits": 0.0,
"step": 2030
},
{
"epoch": 0.102,
"grad_norm": 17.75,
"grad_norm_var": 39.30416666666667,
"learning_rate": 0.0001,
"loss": 4.6782,
"loss/crossentropy": 2.226282720267773,
"loss/hidden": 3.6078125,
"loss/jsd": 0.08297519264742732,
"loss/logits": 0.0,
"step": 2040
},
{
"epoch": 0.1025,
"grad_norm": 18.5,
"grad_norm_var": 8.6306640625,
"learning_rate": 0.0001,
"loss": 4.6889,
"loss/crossentropy": 2.23982213139534,
"loss/hidden": 3.660546875,
"loss/jsd": 0.0924127135425806,
"loss/logits": 0.0,
"step": 2050
},
{
"epoch": 0.103,
"grad_norm": 18.25,
"grad_norm_var": 6.187744140625,
"learning_rate": 0.0001,
"loss": 4.6246,
"loss/crossentropy": 2.2483278423547746,
"loss/hidden": 3.60859375,
"loss/jsd": 0.09513462502509355,
"loss/logits": 0.0,
"step": 2060
},
{
"epoch": 0.1035,
"grad_norm": 28.375,
"grad_norm_var": 12.276416015625,
"learning_rate": 0.0001,
"loss": 4.6868,
"loss/crossentropy": 2.2927519381046295,
"loss/hidden": 3.53515625,
"loss/jsd": 0.08648296073079109,
"loss/logits": 0.0,
"step": 2070
},
{
"epoch": 0.104,
"grad_norm": 24.125,
"grad_norm_var": 14.239583333333334,
"learning_rate": 0.0001,
"loss": 4.5602,
"loss/crossentropy": 2.3293472826480865,
"loss/hidden": 3.599609375,
"loss/jsd": 0.09772532721981406,
"loss/logits": 0.0,
"step": 2080
},
{
"epoch": 0.1045,
"grad_norm": 19.75,
"grad_norm_var": 8.269205729166666,
"learning_rate": 0.0001,
"loss": 4.6017,
"loss/crossentropy": 2.3832351714372635,
"loss/hidden": 3.60078125,
"loss/jsd": 0.09314336217939853,
"loss/logits": 0.0,
"step": 2090
},
{
"epoch": 0.105,
"grad_norm": 20.0,
"grad_norm_var": 5.070833333333334,
"learning_rate": 0.0001,
"loss": 4.5706,
"loss/crossentropy": 2.4874933838844298,
"loss/hidden": 3.651171875,
"loss/jsd": 0.09874060060828924,
"loss/logits": 0.0,
"step": 2100
},
{
"epoch": 0.1055,
"grad_norm": 16.0,
"grad_norm_var": 42.828059895833334,
"learning_rate": 0.0001,
"loss": 4.6945,
"loss/crossentropy": 2.185934893786907,
"loss/hidden": 3.78046875,
"loss/jsd": 0.10176362562924623,
"loss/logits": 0.0,
"step": 2110
},
{
"epoch": 0.106,
"grad_norm": 20.5,
"grad_norm_var": 912.2905598958333,
"learning_rate": 0.0001,
"loss": 4.8579,
"loss/crossentropy": 2.337796673178673,
"loss/hidden": 3.673828125,
"loss/jsd": 0.09400355285033583,
"loss/logits": 0.0,
"step": 2120
},
{
"epoch": 0.1065,
"grad_norm": 19.0,
"grad_norm_var": 86.45514322916667,
"learning_rate": 0.0001,
"loss": 4.6905,
"loss/crossentropy": 2.191851982474327,
"loss/hidden": 3.83828125,
"loss/jsd": 0.09336025016382336,
"loss/logits": 0.0,
"step": 2130
},
{
"epoch": 0.107,
"grad_norm": 20.625,
"grad_norm_var": 12.3416015625,
"learning_rate": 0.0001,
"loss": 4.746,
"loss/crossentropy": 2.321294938027859,
"loss/hidden": 3.709375,
"loss/jsd": 0.09611575696617365,
"loss/logits": 0.0,
"step": 2140
},
{
"epoch": 0.1075,
"grad_norm": 20.5,
"grad_norm_var": 11.157291666666667,
"learning_rate": 0.0001,
"loss": 4.6848,
"loss/crossentropy": 2.3641166269779204,
"loss/hidden": 3.802734375,
"loss/jsd": 0.11719204504042864,
"loss/logits": 0.0,
"step": 2150
},
{
"epoch": 0.108,
"grad_norm": 21.25,
"grad_norm_var": 313.9947265625,
"learning_rate": 0.0001,
"loss": 4.703,
"loss/crossentropy": 2.3178130373358727,
"loss/hidden": 3.6640625,
"loss/jsd": 0.10858506197109818,
"loss/logits": 0.0,
"step": 2160
},
{
"epoch": 0.1085,
"grad_norm": 18.125,
"grad_norm_var": 505.7301432291667,
"learning_rate": 0.0001,
"loss": 4.7499,
"loss/crossentropy": 2.2643778324127197,
"loss/hidden": 3.686328125,
"loss/jsd": 0.09995021363720298,
"loss/logits": 0.0,
"step": 2170
},
{
"epoch": 0.109,
"grad_norm": 19.0,
"grad_norm_var": 4.88125,
"learning_rate": 0.0001,
"loss": 4.639,
"loss/crossentropy": 2.333830028772354,
"loss/hidden": 3.77421875,
"loss/jsd": 0.10143324267119169,
"loss/logits": 0.0,
"step": 2180
},
{
"epoch": 0.1095,
"grad_norm": 17.625,
"grad_norm_var": 1.756685316214961e+18,
"learning_rate": 0.0001,
"loss": 4.6091,
"loss/crossentropy": 2.2005941957235335,
"loss/hidden": 3.546875,
"loss/jsd": 0.08694255957379937,
"loss/logits": 0.0,
"step": 2190
},
{
"epoch": 0.11,
"grad_norm": 19.25,
"grad_norm_var": 219.81608072916666,
"learning_rate": 0.0001,
"loss": 4.6177,
"loss/crossentropy": 2.3627296075224877,
"loss/hidden": 3.74765625,
"loss/jsd": 0.10458627291955054,
"loss/logits": 0.0,
"step": 2200
},
{
"epoch": 0.1105,
"grad_norm": 23.25,
"grad_norm_var": 130.35305989583333,
"learning_rate": 0.0001,
"loss": 4.6516,
"loss/crossentropy": 2.4541628479957582,
"loss/hidden": 3.723046875,
"loss/jsd": 0.09181494554504752,
"loss/logits": 0.0,
"step": 2210
},
{
"epoch": 0.111,
"grad_norm": 20.5,
"grad_norm_var": 130.06015625,
"learning_rate": 0.0001,
"loss": 4.69,
"loss/crossentropy": 2.416627970337868,
"loss/hidden": 3.734765625,
"loss/jsd": 0.11581595735624432,
"loss/logits": 0.0,
"step": 2220
},
{
"epoch": 0.1115,
"grad_norm": 18.625,
"grad_norm_var": 5.193473307291667,
"learning_rate": 0.0001,
"loss": 4.685,
"loss/crossentropy": 2.3696270257234575,
"loss/hidden": 3.592578125,
"loss/jsd": 0.09627662082202733,
"loss/logits": 0.0,
"step": 2230
},
{
"epoch": 0.112,
"grad_norm": 17.875,
"grad_norm_var": 3.7067057291666665,
"learning_rate": 0.0001,
"loss": 4.6807,
"loss/crossentropy": 2.374240705370903,
"loss/hidden": 3.63671875,
"loss/jsd": 0.10023370888084174,
"loss/logits": 0.0,
"step": 2240
},
{
"epoch": 0.1125,
"grad_norm": 18.875,
"grad_norm_var": 6.2431640625,
"learning_rate": 0.0001,
"loss": 4.6202,
"loss/crossentropy": 2.39550845772028,
"loss/hidden": 3.655078125,
"loss/jsd": 0.10500529641285539,
"loss/logits": 0.0,
"step": 2250
},
{
"epoch": 0.113,
"grad_norm": 16.75,
"grad_norm_var": 5.78125,
"learning_rate": 0.0001,
"loss": 4.6473,
"loss/crossentropy": 2.3785043194890023,
"loss/hidden": 3.659375,
"loss/jsd": 0.09861900489777327,
"loss/logits": 0.0,
"step": 2260
},
{
"epoch": 0.1135,
"grad_norm": 20.25,
"grad_norm_var": 5.677018229166666,
"learning_rate": 0.0001,
"loss": 4.5771,
"loss/crossentropy": 2.4541394472122193,
"loss/hidden": 3.692578125,
"loss/jsd": 0.10195111334323884,
"loss/logits": 0.0,
"step": 2270
},
{
"epoch": 0.114,
"grad_norm": 21.25,
"grad_norm_var": 7.0228515625,
"learning_rate": 0.0001,
"loss": 4.597,
"loss/crossentropy": 2.3176154881715774,
"loss/hidden": 3.583984375,
"loss/jsd": 0.09049384696409106,
"loss/logits": 0.0,
"step": 2280
},
{
"epoch": 0.1145,
"grad_norm": 15.75,
"grad_norm_var": 15.241520182291667,
"learning_rate": 0.0001,
"loss": 4.5624,
"loss/crossentropy": 2.5178518027067183,
"loss/hidden": 3.528125,
"loss/jsd": 0.09066717140376568,
"loss/logits": 0.0,
"step": 2290
},
{
"epoch": 0.115,
"grad_norm": 17.625,
"grad_norm_var": 7.566520182291667,
"learning_rate": 0.0001,
"loss": 4.5471,
"loss/crossentropy": 2.3759778410196306,
"loss/hidden": 3.553125,
"loss/jsd": 0.09599322909489275,
"loss/logits": 0.0,
"step": 2300
},
{
"epoch": 0.1155,
"grad_norm": 20.0,
"grad_norm_var": 8.312434895833333,
"learning_rate": 0.0001,
"loss": 4.5075,
"loss/crossentropy": 2.3496225073933603,
"loss/hidden": 3.606640625,
"loss/jsd": 0.09744280204176903,
"loss/logits": 0.0,
"step": 2310
},
{
"epoch": 0.116,
"grad_norm": 19.75,
"grad_norm_var": 3.2108723958333334,
"learning_rate": 0.0001,
"loss": 4.5475,
"loss/crossentropy": 2.4485339492559435,
"loss/hidden": 3.523046875,
"loss/jsd": 0.0890957485884428,
"loss/logits": 0.0,
"step": 2320
},
{
"epoch": 0.1165,
"grad_norm": 19.0,
"grad_norm_var": 2.364697265625,
"learning_rate": 0.0001,
"loss": 4.5781,
"loss/crossentropy": 2.299929490685463,
"loss/hidden": 3.598828125,
"loss/jsd": 0.09711863240227103,
"loss/logits": 0.0,
"step": 2330
},
{
"epoch": 0.117,
"grad_norm": 17.75,
"grad_norm_var": 1.4955729166666667,
"learning_rate": 0.0001,
"loss": 4.5392,
"loss/crossentropy": 2.298077051341534,
"loss/hidden": 3.61015625,
"loss/jsd": 0.0896261626854539,
"loss/logits": 0.0,
"step": 2340
},
{
"epoch": 0.1175,
"grad_norm": 17.5,
"grad_norm_var": 3.0098307291666666,
"learning_rate": 0.0001,
"loss": 4.5949,
"loss/crossentropy": 2.2876608431339265,
"loss/hidden": 3.757421875,
"loss/jsd": 0.10631331414915621,
"loss/logits": 0.0,
"step": 2350
},
{
"epoch": 0.118,
"grad_norm": 21.875,
"grad_norm_var": 6.657291666666667,
"learning_rate": 0.0001,
"loss": 4.6054,
"loss/crossentropy": 2.589036238193512,
"loss/hidden": 3.7109375,
"loss/jsd": 0.09777994276955723,
"loss/logits": 0.0,
"step": 2360
},
{
"epoch": 0.1185,
"grad_norm": 19.5,
"grad_norm_var": 4.276497395833333,
"learning_rate": 0.0001,
"loss": 4.6578,
"loss/crossentropy": 2.4440223038196565,
"loss/hidden": 3.63125,
"loss/jsd": 0.10012138104066252,
"loss/logits": 0.0,
"step": 2370
},
{
"epoch": 0.119,
"grad_norm": 15.9375,
"grad_norm_var": 6.341520182291666,
"learning_rate": 0.0001,
"loss": 4.6382,
"loss/crossentropy": 2.3379690438508987,
"loss/hidden": 3.73046875,
"loss/jsd": 0.10282904924824834,
"loss/logits": 0.0,
"step": 2380
},
{
"epoch": 0.1195,
"grad_norm": 20.5,
"grad_norm_var": 7.068733723958333,
"learning_rate": 0.0001,
"loss": 4.5986,
"loss/crossentropy": 2.358085313439369,
"loss/hidden": 3.59140625,
"loss/jsd": 0.0954778247512877,
"loss/logits": 0.0,
"step": 2390
},
{
"epoch": 0.12,
"grad_norm": 18.125,
"grad_norm_var": 4.709375,
"learning_rate": 0.0001,
"loss": 4.5411,
"loss/crossentropy": 2.262301415205002,
"loss/hidden": 3.6265625,
"loss/jsd": 0.09096273891627789,
"loss/logits": 0.0,
"step": 2400
},
{
"epoch": 0.1205,
"grad_norm": 21.125,
"grad_norm_var": 2.4447916666666667,
"learning_rate": 0.0001,
"loss": 4.5437,
"loss/crossentropy": 2.493518462777138,
"loss/hidden": 3.61953125,
"loss/jsd": 0.08979002349078655,
"loss/logits": 0.0,
"step": 2410
},
{
"epoch": 0.121,
"grad_norm": 16.75,
"grad_norm_var": 6.323958333333334,
"learning_rate": 0.0001,
"loss": 4.563,
"loss/crossentropy": 2.4933597564697267,
"loss/hidden": 3.592578125,
"loss/jsd": 0.09690459789708257,
"loss/logits": 0.0,
"step": 2420
},
{
"epoch": 0.1215,
"grad_norm": 18.875,
"grad_norm_var": 6.918489583333334,
"learning_rate": 0.0001,
"loss": 4.5351,
"loss/crossentropy": 2.516791993379593,
"loss/hidden": 3.598828125,
"loss/jsd": 0.09446065053343773,
"loss/logits": 0.0,
"step": 2430
},
{
"epoch": 0.122,
"grad_norm": 19.375,
"grad_norm_var": 5.448942057291666,
"learning_rate": 0.0001,
"loss": 4.5509,
"loss/crossentropy": 2.2249866664409637,
"loss/hidden": 3.53203125,
"loss/jsd": 0.08729059183970093,
"loss/logits": 0.0,
"step": 2440
},
{
"epoch": 0.1225,
"grad_norm": 19.25,
"grad_norm_var": 5.459228515625,
"learning_rate": 0.0001,
"loss": 4.5135,
"loss/crossentropy": 2.2651902705430986,
"loss/hidden": 3.54765625,
"loss/jsd": 0.08877531317993999,
"loss/logits": 0.0,
"step": 2450
},
{
"epoch": 0.123,
"grad_norm": 18.375,
"grad_norm_var": 2.688004557291667,
"learning_rate": 0.0001,
"loss": 4.4698,
"loss/crossentropy": 2.2470821171998976,
"loss/hidden": 3.562890625,
"loss/jsd": 0.09754009852185845,
"loss/logits": 0.0,
"step": 2460
},
{
"epoch": 0.1235,
"grad_norm": 20.0,
"grad_norm_var": 4.591910807291667,
"learning_rate": 0.0001,
"loss": 4.5083,
"loss/crossentropy": 2.1959333077073095,
"loss/hidden": 3.531640625,
"loss/jsd": 0.0921278445981443,
"loss/logits": 0.0,
"step": 2470
},
{
"epoch": 0.124,
"grad_norm": 19.0,
"grad_norm_var": 3.9395182291666666,
"learning_rate": 0.0001,
"loss": 4.5378,
"loss/crossentropy": 2.2659239649772642,
"loss/hidden": 3.5875,
"loss/jsd": 0.09199469089508057,
"loss/logits": 0.0,
"step": 2480
},
{
"epoch": 0.1245,
"grad_norm": 18.375,
"grad_norm_var": 2.9302083333333333,
"learning_rate": 0.0001,
"loss": 4.4709,
"loss/crossentropy": 2.2354795530438425,
"loss/hidden": 3.49140625,
"loss/jsd": 0.08955592634156347,
"loss/logits": 0.0,
"step": 2490
},
{
"epoch": 0.125,
"grad_norm": 21.75,
"grad_norm_var": 3.544791666666667,
"learning_rate": 0.0001,
"loss": 4.546,
"loss/crossentropy": 2.321756035089493,
"loss/hidden": 3.491796875,
"loss/jsd": 0.08398934034630656,
"loss/logits": 0.0,
"step": 2500
},
{
"epoch": 0.1255,
"grad_norm": 20.375,
"grad_norm_var": 3.5992024739583335,
"learning_rate": 0.0001,
"loss": 4.5442,
"loss/crossentropy": 2.327367161214352,
"loss/hidden": 3.562109375,
"loss/jsd": 0.08928178530186415,
"loss/logits": 0.0,
"step": 2510
},
{
"epoch": 0.126,
"grad_norm": 18.875,
"grad_norm_var": 3.892041015625,
"learning_rate": 0.0001,
"loss": 4.4942,
"loss/crossentropy": 2.198644478619099,
"loss/hidden": 3.44921875,
"loss/jsd": 0.08295171349309385,
"loss/logits": 0.0,
"step": 2520
},
{
"epoch": 0.1265,
"grad_norm": 16.125,
"grad_norm_var": 5.773811848958333,
"learning_rate": 0.0001,
"loss": 4.576,
"loss/crossentropy": 2.472541335225105,
"loss/hidden": 3.597265625,
"loss/jsd": 0.10432412773370743,
"loss/logits": 0.0,
"step": 2530
},
{
"epoch": 0.127,
"grad_norm": 20.875,
"grad_norm_var": 5.364583333333333,
"learning_rate": 0.0001,
"loss": 4.5337,
"loss/crossentropy": 2.3647551596164704,
"loss/hidden": 3.618359375,
"loss/jsd": 0.10374335153028369,
"loss/logits": 0.0,
"step": 2540
},
{
"epoch": 0.1275,
"grad_norm": 37.25,
"grad_norm_var": 1281.695947265625,
"learning_rate": 0.0001,
"loss": 4.5825,
"loss/crossentropy": 2.2414861261844634,
"loss/hidden": 3.4828125,
"loss/jsd": 0.09403842501342297,
"loss/logits": 0.0,
"step": 2550
},
{
"epoch": 0.128,
"grad_norm": 14.875,
"grad_norm_var": 1240.8051432291666,
"learning_rate": 0.0001,
"loss": 4.4589,
"loss/crossentropy": 2.234823814034462,
"loss/hidden": 3.509765625,
"loss/jsd": 0.08673453908413649,
"loss/logits": 0.0,
"step": 2560
},
{
"epoch": 0.1285,
"grad_norm": 22.75,
"grad_norm_var": 16.616080729166665,
"learning_rate": 0.0001,
"loss": 4.4816,
"loss/crossentropy": 2.387620323896408,
"loss/hidden": 3.55078125,
"loss/jsd": 0.08936102241277695,
"loss/logits": 0.0,
"step": 2570
},
{
"epoch": 0.129,
"grad_norm": 16.125,
"grad_norm_var": 9.396809895833334,
"learning_rate": 0.0001,
"loss": 4.4124,
"loss/crossentropy": 2.1731797240674497,
"loss/hidden": 3.40390625,
"loss/jsd": 0.07968775480985642,
"loss/logits": 0.0,
"step": 2580
},
{
"epoch": 0.1295,
"grad_norm": 20.625,
"grad_norm_var": 8.490559895833334,
"learning_rate": 0.0001,
"loss": 4.4807,
"loss/crossentropy": 2.1817662701010705,
"loss/hidden": 3.676953125,
"loss/jsd": 0.09472927646711468,
"loss/logits": 0.0,
"step": 2590
},
{
"epoch": 0.13,
"grad_norm": 21.375,
"grad_norm_var": 4.510416666666667,
"learning_rate": 0.0001,
"loss": 4.5324,
"loss/crossentropy": 2.2697513103485107,
"loss/hidden": 3.570703125,
"loss/jsd": 0.08940641283988952,
"loss/logits": 0.0,
"step": 2600
},
{
"epoch": 0.1305,
"grad_norm": 18.375,
"grad_norm_var": 9.0634765625,
"learning_rate": 0.0001,
"loss": 4.4845,
"loss/crossentropy": 2.2707223266363146,
"loss/hidden": 3.52109375,
"loss/jsd": 0.09460832485929131,
"loss/logits": 0.0,
"step": 2610
},
{
"epoch": 0.131,
"grad_norm": 66.0,
"grad_norm_var": 143.3619140625,
"learning_rate": 0.0001,
"loss": 4.5179,
"loss/crossentropy": 2.254822887480259,
"loss/hidden": 3.531640625,
"loss/jsd": 0.09221142884343862,
"loss/logits": 0.0,
"step": 2620
},
{
"epoch": 0.1315,
"grad_norm": 21.625,
"grad_norm_var": 143.6431640625,
"learning_rate": 0.0001,
"loss": 4.4947,
"loss/crossentropy": 2.347915455698967,
"loss/hidden": 3.58203125,
"loss/jsd": 0.09412752091884613,
"loss/logits": 0.0,
"step": 2630
},
{
"epoch": 0.132,
"grad_norm": 17.375,
"grad_norm_var": 3.1708333333333334,
"learning_rate": 0.0001,
"loss": 4.528,
"loss/crossentropy": 2.2751280948519708,
"loss/hidden": 3.56328125,
"loss/jsd": 0.08851864533498884,
"loss/logits": 0.0,
"step": 2640
},
{
"epoch": 0.1325,
"grad_norm": 19.0,
"grad_norm_var": 13.576155598958334,
"learning_rate": 0.0001,
"loss": 4.4741,
"loss/crossentropy": 2.2658936589956284,
"loss/hidden": 3.54296875,
"loss/jsd": 0.08702889690175653,
"loss/logits": 0.0,
"step": 2650
},
{
"epoch": 0.133,
"grad_norm": 21.625,
"grad_norm_var": 722.3559895833333,
"learning_rate": 0.0001,
"loss": 4.5137,
"loss/crossentropy": 2.1781817600131035,
"loss/hidden": 3.56484375,
"loss/jsd": 0.09932177630253136,
"loss/logits": 0.0,
"step": 2660
},
{
"epoch": 0.1335,
"grad_norm": 17.5,
"grad_norm_var": 2.5122395833333333,
"learning_rate": 0.0001,
"loss": 4.4842,
"loss/crossentropy": 2.3243243932724,
"loss/hidden": 3.6015625,
"loss/jsd": 0.09606684306636452,
"loss/logits": 0.0,
"step": 2670
},
{
"epoch": 0.134,
"grad_norm": 16.75,
"grad_norm_var": 38.6853515625,
"learning_rate": 0.0001,
"loss": 4.5164,
"loss/crossentropy": 2.389857916533947,
"loss/hidden": 3.610546875,
"loss/jsd": 0.09261430930346251,
"loss/logits": 0.0,
"step": 2680
},
{
"epoch": 0.1345,
"grad_norm": 19.25,
"grad_norm_var": 4.7369140625,
"learning_rate": 0.0001,
"loss": 4.4902,
"loss/crossentropy": 2.3873065978288652,
"loss/hidden": 3.496875,
"loss/jsd": 0.08840383114293218,
"loss/logits": 0.0,
"step": 2690
},
{
"epoch": 0.135,
"grad_norm": 20.5,
"grad_norm_var": 3.177018229166667,
"learning_rate": 0.0001,
"loss": 4.4057,
"loss/crossentropy": 2.338147234916687,
"loss/hidden": 3.534375,
"loss/jsd": 0.09641889259219169,
"loss/logits": 0.0,
"step": 2700
},
{
"epoch": 0.1355,
"grad_norm": 17.75,
"grad_norm_var": 8.540999348958334,
"learning_rate": 0.0001,
"loss": 4.513,
"loss/crossentropy": 2.2362188696861267,
"loss/hidden": 3.6625,
"loss/jsd": 0.10190682755783201,
"loss/logits": 0.0,
"step": 2710
},
{
"epoch": 0.136,
"grad_norm": 22.75,
"grad_norm_var": 38.66183268229167,
"learning_rate": 0.0001,
"loss": 4.6171,
"loss/crossentropy": 2.222167354822159,
"loss/hidden": 3.628125,
"loss/jsd": 0.11221090480685234,
"loss/logits": 0.0,
"step": 2720
},
{
"epoch": 0.1365,
"grad_norm": 16.375,
"grad_norm_var": 37.61015625,
"learning_rate": 0.0001,
"loss": 4.539,
"loss/crossentropy": 2.3914648950099946,
"loss/hidden": 3.59765625,
"loss/jsd": 0.09164496380835771,
"loss/logits": 0.0,
"step": 2730
},
{
"epoch": 0.137,
"grad_norm": 16.5,
"grad_norm_var": 2.32890625,
"learning_rate": 0.0001,
"loss": 4.6123,
"loss/crossentropy": 2.385912075638771,
"loss/hidden": 3.541796875,
"loss/jsd": 0.08835116708651185,
"loss/logits": 0.0,
"step": 2740
},
{
"epoch": 0.1375,
"grad_norm": 17.75,
"grad_norm_var": 6.953125,
"learning_rate": 0.0001,
"loss": 4.5453,
"loss/crossentropy": 2.291456125676632,
"loss/hidden": 3.590625,
"loss/jsd": 0.0944554246030748,
"loss/logits": 0.0,
"step": 2750
},
{
"epoch": 0.138,
"grad_norm": 18.875,
"grad_norm_var": 23.817643229166666,
"learning_rate": 0.0001,
"loss": 4.4631,
"loss/crossentropy": 2.2219670079648495,
"loss/hidden": 3.562890625,
"loss/jsd": 0.08628002055920661,
"loss/logits": 0.0,
"step": 2760
},
{
"epoch": 0.1385,
"grad_norm": 17.5,
"grad_norm_var": 22.864518229166666,
"learning_rate": 0.0001,
"loss": 4.4415,
"loss/crossentropy": 2.3799121528863907,
"loss/hidden": 3.5,
"loss/jsd": 0.09274168154224753,
"loss/logits": 0.0,
"step": 2770
},
{
"epoch": 0.139,
"grad_norm": 20.25,
"grad_norm_var": 6.460791015625,
"learning_rate": 0.0001,
"loss": 4.5309,
"loss/crossentropy": 2.1055190823972225,
"loss/hidden": 3.534765625,
"loss/jsd": 0.08359876750037074,
"loss/logits": 0.0,
"step": 2780
},
{
"epoch": 0.1395,
"grad_norm": 17.875,
"grad_norm_var": 5.627587890625,
"learning_rate": 0.0001,
"loss": 4.4825,
"loss/crossentropy": 2.3334684520959854,
"loss/hidden": 3.533984375,
"loss/jsd": 0.0967961790971458,
"loss/logits": 0.0,
"step": 2790
},
{
"epoch": 0.14,
"grad_norm": 17.125,
"grad_norm_var": 4.620556640625,
"learning_rate": 0.0001,
"loss": 4.4282,
"loss/crossentropy": 2.42918943464756,
"loss/hidden": 3.526953125,
"loss/jsd": 0.08855977468192577,
"loss/logits": 0.0,
"step": 2800
},
{
"epoch": 0.1405,
"grad_norm": 18.5,
"grad_norm_var": 5.042643229166667,
"learning_rate": 0.0001,
"loss": 4.4503,
"loss/crossentropy": 2.272622914612293,
"loss/hidden": 3.58671875,
"loss/jsd": 0.0873057721182704,
"loss/logits": 0.0,
"step": 2810
},
{
"epoch": 0.141,
"grad_norm": 18.125,
"grad_norm_var": 3.658837890625,
"learning_rate": 0.0001,
"loss": 4.4984,
"loss/crossentropy": 2.1938667565584185,
"loss/hidden": 3.553515625,
"loss/jsd": 0.08576443083584309,
"loss/logits": 0.0,
"step": 2820
},
{
"epoch": 0.1415,
"grad_norm": 20.375,
"grad_norm_var": 3.8306640625,
"learning_rate": 0.0001,
"loss": 4.5033,
"loss/crossentropy": 2.3123946458101274,
"loss/hidden": 3.56171875,
"loss/jsd": 0.0924573240801692,
"loss/logits": 0.0,
"step": 2830
},
{
"epoch": 0.142,
"grad_norm": 22.75,
"grad_norm_var": 5.495768229166667,
"learning_rate": 0.0001,
"loss": 4.4168,
"loss/crossentropy": 2.3987593173980715,
"loss/hidden": 3.509765625,
"loss/jsd": 0.0879776468500495,
"loss/logits": 0.0,
"step": 2840
},
{
"epoch": 0.1425,
"grad_norm": 17.375,
"grad_norm_var": 9.082747395833334,
"learning_rate": 0.0001,
"loss": 4.4669,
"loss/crossentropy": 2.2637423157691954,
"loss/hidden": 3.5609375,
"loss/jsd": 0.10491609480232,
"loss/logits": 0.0,
"step": 2850
},
{
"epoch": 0.143,
"grad_norm": 16.625,
"grad_norm_var": 5.2478515625,
"learning_rate": 0.0001,
"loss": 4.3874,
"loss/crossentropy": 2.36127190887928,
"loss/hidden": 3.5421875,
"loss/jsd": 0.08668355047702789,
"loss/logits": 0.0,
"step": 2860
},
{
"epoch": 0.1435,
"grad_norm": 20.625,
"grad_norm_var": 7.6447265625,
"learning_rate": 0.0001,
"loss": 4.4512,
"loss/crossentropy": 2.4082365155220034,
"loss/hidden": 3.561328125,
"loss/jsd": 0.09474811758846044,
"loss/logits": 0.0,
"step": 2870
},
{
"epoch": 0.144,
"grad_norm": 21.125,
"grad_norm_var": 8.8734375,
"learning_rate": 0.0001,
"loss": 4.4895,
"loss/crossentropy": 2.2012635439634325,
"loss/hidden": 3.4703125,
"loss/jsd": 0.08498403234407306,
"loss/logits": 0.0,
"step": 2880
},
{
"epoch": 0.1445,
"grad_norm": 21.625,
"grad_norm_var": 3.4480305989583333,
"learning_rate": 0.0001,
"loss": 4.447,
"loss/crossentropy": 2.298944839835167,
"loss/hidden": 3.555078125,
"loss/jsd": 0.09882149025797844,
"loss/logits": 0.0,
"step": 2890
},
{
"epoch": 0.145,
"grad_norm": 16.75,
"grad_norm_var": 2.8893229166666665,
"learning_rate": 0.0001,
"loss": 4.5259,
"loss/crossentropy": 2.4170736342668535,
"loss/hidden": 3.62265625,
"loss/jsd": 0.10080426596105099,
"loss/logits": 0.0,
"step": 2900
},
{
"epoch": 0.1455,
"grad_norm": 24.75,
"grad_norm_var": 9.176041666666666,
"learning_rate": 0.0001,
"loss": 4.4133,
"loss/crossentropy": 2.266545096039772,
"loss/hidden": 3.488671875,
"loss/jsd": 0.08616750100627542,
"loss/logits": 0.0,
"step": 2910
},
{
"epoch": 0.146,
"grad_norm": 23.5,
"grad_norm_var": 10.60859375,
"learning_rate": 0.0001,
"loss": 4.4536,
"loss/crossentropy": 2.3165148913860323,
"loss/hidden": 3.42890625,
"loss/jsd": 0.07857409287244081,
"loss/logits": 0.0,
"step": 2920
},
{
"epoch": 0.1465,
"grad_norm": 18.125,
"grad_norm_var": 4.756884765625,
"learning_rate": 0.0001,
"loss": 4.4494,
"loss/crossentropy": 2.275170993804932,
"loss/hidden": 3.52421875,
"loss/jsd": 0.08944948101416231,
"loss/logits": 0.0,
"step": 2930
},
{
"epoch": 0.147,
"grad_norm": 19.375,
"grad_norm_var": 4.254931640625,
"learning_rate": 0.0001,
"loss": 4.4176,
"loss/crossentropy": 2.0632098406553268,
"loss/hidden": 3.54296875,
"loss/jsd": 0.09174134442582726,
"loss/logits": 0.0,
"step": 2940
},
{
"epoch": 0.1475,
"grad_norm": 20.75,
"grad_norm_var": 3.086442057291667,
"learning_rate": 0.0001,
"loss": 4.4606,
"loss/crossentropy": 2.251816061139107,
"loss/hidden": 3.43046875,
"loss/jsd": 0.08638259647414089,
"loss/logits": 0.0,
"step": 2950
},
{
"epoch": 0.148,
"grad_norm": 44.75,
"grad_norm_var": 49.42265625,
"learning_rate": 0.0001,
"loss": 4.4651,
"loss/crossentropy": 2.1415898591279983,
"loss/hidden": 3.41796875,
"loss/jsd": 0.08234252617694438,
"loss/logits": 0.0,
"step": 2960
},
{
"epoch": 0.1485,
"grad_norm": 21.0,
"grad_norm_var": 47.44993489583333,
"learning_rate": 0.0001,
"loss": 4.4563,
"loss/crossentropy": 2.2365823119878767,
"loss/hidden": 3.542578125,
"loss/jsd": 0.09432788556441665,
"loss/logits": 0.0,
"step": 2970
},
{
"epoch": 0.149,
"grad_norm": 16.875,
"grad_norm_var": 3.926155598958333,
"learning_rate": 0.0001,
"loss": 4.4151,
"loss/crossentropy": 2.397647699713707,
"loss/hidden": 3.58984375,
"loss/jsd": 0.09230242855846882,
"loss/logits": 0.0,
"step": 2980
},
{
"epoch": 0.1495,
"grad_norm": 18.25,
"grad_norm_var": 2.0455729166666665,
"learning_rate": 0.0001,
"loss": 4.4923,
"loss/crossentropy": 2.270119884610176,
"loss/hidden": 3.588671875,
"loss/jsd": 0.09977766564115882,
"loss/logits": 0.0,
"step": 2990
},
{
"epoch": 0.15,
"grad_norm": 20.875,
"grad_norm_var": 5.620947265625,
"learning_rate": 0.0001,
"loss": 4.424,
"loss/crossentropy": 2.274160121381283,
"loss/hidden": 3.493359375,
"loss/jsd": 0.08368044728413224,
"loss/logits": 0.0,
"step": 3000
},
{
"epoch": 0.1505,
"grad_norm": 18.0,
"grad_norm_var": 5.805843098958333,
"learning_rate": 0.0001,
"loss": 4.4366,
"loss/crossentropy": 2.2047403126955034,
"loss/hidden": 3.48125,
"loss/jsd": 0.09485792317427695,
"loss/logits": 0.0,
"step": 3010
},
{
"epoch": 0.151,
"grad_norm": 15.75,
"grad_norm_var": 3.952018229166667,
"learning_rate": 0.0001,
"loss": 4.4361,
"loss/crossentropy": 2.426369333267212,
"loss/hidden": 3.505078125,
"loss/jsd": 0.09415734186768532,
"loss/logits": 0.0,
"step": 3020
},
{
"epoch": 0.1515,
"grad_norm": 20.5,
"grad_norm_var": 27.647916666666667,
"learning_rate": 0.0001,
"loss": 4.386,
"loss/crossentropy": 2.1474297270178795,
"loss/hidden": 3.485546875,
"loss/jsd": 0.08619686132296919,
"loss/logits": 0.0,
"step": 3030
},
{
"epoch": 0.152,
"grad_norm": 18.125,
"grad_norm_var": 7.828369140625,
"learning_rate": 0.0001,
"loss": 4.5285,
"loss/crossentropy": 2.343987912684679,
"loss/hidden": 3.58515625,
"loss/jsd": 0.09171235403046012,
"loss/logits": 0.0,
"step": 3040
},
{
"epoch": 0.1525,
"grad_norm": 18.25,
"grad_norm_var": 8.158707682291666,
"learning_rate": 0.0001,
"loss": 4.4394,
"loss/crossentropy": 2.3131785288453104,
"loss/hidden": 3.59453125,
"loss/jsd": 0.09000935666263103,
"loss/logits": 0.0,
"step": 3050
},
{
"epoch": 0.153,
"grad_norm": 37.0,
"grad_norm_var": 68.850244140625,
"learning_rate": 0.0001,
"loss": 4.4266,
"loss/crossentropy": 2.333009423315525,
"loss/hidden": 3.5515625,
"loss/jsd": 0.10508564142510295,
"loss/logits": 0.0,
"step": 3060
},
{
"epoch": 0.1535,
"grad_norm": 18.75,
"grad_norm_var": 69.41712239583333,
"learning_rate": 0.0001,
"loss": 4.4924,
"loss/crossentropy": 2.3548025131225585,
"loss/hidden": 3.560546875,
"loss/jsd": 0.08788978308439255,
"loss/logits": 0.0,
"step": 3070
},
{
"epoch": 0.154,
"grad_norm": 19.125,
"grad_norm_var": 2.856494140625,
"learning_rate": 0.0001,
"loss": 4.4327,
"loss/crossentropy": 2.196867881715298,
"loss/hidden": 3.598046875,
"loss/jsd": 0.09646046198904515,
"loss/logits": 0.0,
"step": 3080
},
{
"epoch": 0.1545,
"grad_norm": 22.375,
"grad_norm_var": 3.7356770833333335,
"learning_rate": 0.0001,
"loss": 4.4178,
"loss/crossentropy": 2.2703019440174104,
"loss/hidden": 3.5359375,
"loss/jsd": 0.09918619338423014,
"loss/logits": 0.0,
"step": 3090
},
{
"epoch": 0.155,
"grad_norm": 21.5,
"grad_norm_var": 10.3275390625,
"learning_rate": 0.0001,
"loss": 4.3676,
"loss/crossentropy": 2.2433530882000925,
"loss/hidden": 3.416015625,
"loss/jsd": 0.08071139380335808,
"loss/logits": 0.0,
"step": 3100
},
{
"epoch": 0.1555,
"grad_norm": 18.5,
"grad_norm_var": 10.381510416666666,
"learning_rate": 0.0001,
"loss": 4.3876,
"loss/crossentropy": 2.229444594681263,
"loss/hidden": 3.46640625,
"loss/jsd": 0.08576273424550891,
"loss/logits": 0.0,
"step": 3110
},
{
"epoch": 0.156,
"grad_norm": 15.6875,
"grad_norm_var": 4.5712890625,
"learning_rate": 0.0001,
"loss": 4.345,
"loss/crossentropy": 2.2901594534516336,
"loss/hidden": 3.35859375,
"loss/jsd": 0.07994853192940354,
"loss/logits": 0.0,
"step": 3120
},
{
"epoch": 0.1565,
"grad_norm": 15.625,
"grad_norm_var": 7.159358723958333,
"learning_rate": 0.0001,
"loss": 4.3626,
"loss/crossentropy": 2.351148310303688,
"loss/hidden": 3.499609375,
"loss/jsd": 0.089451711345464,
"loss/logits": 0.0,
"step": 3130
},
{
"epoch": 0.157,
"grad_norm": 16.375,
"grad_norm_var": 6.505192057291667,
"learning_rate": 0.0001,
"loss": 4.4232,
"loss/crossentropy": 2.1818712055683136,
"loss/hidden": 3.45234375,
"loss/jsd": 0.0778524660039693,
"loss/logits": 0.0,
"step": 3140
},
{
"epoch": 0.1575,
"grad_norm": 16.75,
"grad_norm_var": 2.1102701822916665,
"learning_rate": 0.0001,
"loss": 4.4401,
"loss/crossentropy": 2.349280393123627,
"loss/hidden": 3.570703125,
"loss/jsd": 0.09622437562793493,
"loss/logits": 0.0,
"step": 3150
},
{
"epoch": 0.158,
"grad_norm": 17.625,
"grad_norm_var": 2.701546223958333,
"learning_rate": 0.0001,
"loss": 4.3796,
"loss/crossentropy": 2.329416874051094,
"loss/hidden": 3.539453125,
"loss/jsd": 0.08784733964130283,
"loss/logits": 0.0,
"step": 3160
},
{
"epoch": 0.1585,
"grad_norm": 16.375,
"grad_norm_var": 2.156363932291667,
"learning_rate": 0.0001,
"loss": 4.2973,
"loss/crossentropy": 2.2838528990745544,
"loss/hidden": 3.38984375,
"loss/jsd": 0.08452709410339594,
"loss/logits": 0.0,
"step": 3170
},
{
"epoch": 0.159,
"grad_norm": 19.0,
"grad_norm_var": 4.3259765625,
"learning_rate": 0.0001,
"loss": 4.4271,
"loss/crossentropy": 2.1395395755767823,
"loss/hidden": 3.5203125,
"loss/jsd": 0.10172450188547373,
"loss/logits": 0.0,
"step": 3180
},
{
"epoch": 0.1595,
"grad_norm": 19.375,
"grad_norm_var": 4.408268229166667,
"learning_rate": 0.0001,
"loss": 4.4574,
"loss/crossentropy": 2.394977739453316,
"loss/hidden": 3.6234375,
"loss/jsd": 0.10171002727001906,
"loss/logits": 0.0,
"step": 3190
},
{
"epoch": 0.16,
"grad_norm": 16.875,
"grad_norm_var": 6.147379557291667,
"learning_rate": 0.0001,
"loss": 4.4832,
"loss/crossentropy": 2.355364751815796,
"loss/hidden": 3.50546875,
"loss/jsd": 0.08951211860403419,
"loss/logits": 0.0,
"step": 3200
},
{
"epoch": 0.1605,
"grad_norm": 28.625,
"grad_norm_var": 1.244752705334018e+18,
"learning_rate": 0.0001,
"loss": 4.4816,
"loss/crossentropy": 2.305925354361534,
"loss/hidden": 3.58828125,
"loss/jsd": 0.08834340209141374,
"loss/logits": 0.0,
"step": 3210
},
{
"epoch": 0.161,
"grad_norm": 18.25,
"grad_norm_var": 9.839518229166666,
"learning_rate": 0.0001,
"loss": 4.4863,
"loss/crossentropy": 2.3078080981969835,
"loss/hidden": 3.5203125,
"loss/jsd": 0.0940877721644938,
"loss/logits": 0.0,
"step": 3220
},
{
"epoch": 0.1615,
"grad_norm": 16.125,
"grad_norm_var": 2.7150390625,
"learning_rate": 0.0001,
"loss": 4.4064,
"loss/crossentropy": 2.4253244906663896,
"loss/hidden": 3.58125,
"loss/jsd": 0.09865610068663955,
"loss/logits": 0.0,
"step": 3230
},
{
"epoch": 0.162,
"grad_norm": 18.75,
"grad_norm_var": 4.934749348958333,
"learning_rate": 0.0001,
"loss": 4.3174,
"loss/crossentropy": 2.4989412158727644,
"loss/hidden": 3.4234375,
"loss/jsd": 0.08196726078167557,
"loss/logits": 0.0,
"step": 3240
},
{
"epoch": 0.1625,
"grad_norm": 16.375,
"grad_norm_var": 2.9952962239583334,
"learning_rate": 0.0001,
"loss": 4.3693,
"loss/crossentropy": 2.2475881457328795,
"loss/hidden": 3.47109375,
"loss/jsd": 0.09224425395950675,
"loss/logits": 0.0,
"step": 3250
},
{
"epoch": 0.163,
"grad_norm": 15.3125,
"grad_norm_var": 1.9930826822916667,
"learning_rate": 0.0001,
"loss": 4.3783,
"loss/crossentropy": 2.2365039557218553,
"loss/hidden": 3.49921875,
"loss/jsd": 0.09003520868718624,
"loss/logits": 0.0,
"step": 3260
},
{
"epoch": 0.1635,
"grad_norm": 17.375,
"grad_norm_var": 2.0921223958333335,
"learning_rate": 0.0001,
"loss": 4.4329,
"loss/crossentropy": 2.266706997156143,
"loss/hidden": 3.542578125,
"loss/jsd": 0.09133774926885962,
"loss/logits": 0.0,
"step": 3270
},
{
"epoch": 0.164,
"grad_norm": 17.0,
"grad_norm_var": 2.531510416666667,
"learning_rate": 0.0001,
"loss": 4.5264,
"loss/crossentropy": 2.292432078719139,
"loss/hidden": 3.67265625,
"loss/jsd": 0.11363288760185242,
"loss/logits": 0.0,
"step": 3280
},
{
"epoch": 0.1645,
"grad_norm": 20.125,
"grad_norm_var": 2.6884765625,
"learning_rate": 0.0001,
"loss": 4.3941,
"loss/crossentropy": 2.308723744750023,
"loss/hidden": 3.60234375,
"loss/jsd": 0.09796320544555784,
"loss/logits": 0.0,
"step": 3290
},
{
"epoch": 0.165,
"grad_norm": 19.0,
"grad_norm_var": 3.3436848958333334,
"learning_rate": 0.0001,
"loss": 4.3738,
"loss/crossentropy": 2.3898502081632613,
"loss/hidden": 3.471484375,
"loss/jsd": 0.08741156700998545,
"loss/logits": 0.0,
"step": 3300
},
{
"epoch": 0.1655,
"grad_norm": 19.875,
"grad_norm_var": 3.4898274739583335,
"learning_rate": 0.0001,
"loss": 4.3816,
"loss/crossentropy": 2.306541550159454,
"loss/hidden": 3.592578125,
"loss/jsd": 0.09893495552241802,
"loss/logits": 0.0,
"step": 3310
},
{
"epoch": 0.166,
"grad_norm": 14.8125,
"grad_norm_var": 2.6378743489583334,
"learning_rate": 0.0001,
"loss": 4.37,
"loss/crossentropy": 2.3887303933501243,
"loss/hidden": 3.480859375,
"loss/jsd": 0.08493705298751593,
"loss/logits": 0.0,
"step": 3320
},
{
"epoch": 0.1665,
"grad_norm": 21.875,
"grad_norm_var": 3.0442545572916666,
"learning_rate": 0.0001,
"loss": 4.4504,
"loss/crossentropy": 2.3878179833292963,
"loss/hidden": 3.632421875,
"loss/jsd": 0.09913788838312029,
"loss/logits": 0.0,
"step": 3330
},
{
"epoch": 0.167,
"grad_norm": 20.25,
"grad_norm_var": 10.913785807291667,
"learning_rate": 0.0001,
"loss": 4.4553,
"loss/crossentropy": 2.2205622404813767,
"loss/hidden": 3.636328125,
"loss/jsd": 0.10992270009592175,
"loss/logits": 0.0,
"step": 3340
},
{
"epoch": 0.1675,
"grad_norm": 22.0,
"grad_norm_var": 7.966520182291666,
"learning_rate": 0.0001,
"loss": 4.3545,
"loss/crossentropy": 2.108812813460827,
"loss/hidden": 3.40234375,
"loss/jsd": 0.0757693353574723,
"loss/logits": 0.0,
"step": 3350
},
{
"epoch": 0.168,
"grad_norm": 15.6875,
"grad_norm_var": 3.8549479166666667,
"learning_rate": 0.0001,
"loss": 4.3713,
"loss/crossentropy": 2.289568629860878,
"loss/hidden": 3.541015625,
"loss/jsd": 0.10061329454183579,
"loss/logits": 0.0,
"step": 3360
},
{
"epoch": 0.1685,
"grad_norm": 17.5,
"grad_norm_var": 4.482145182291666,
"learning_rate": 0.0001,
"loss": 4.4519,
"loss/crossentropy": 2.352386988699436,
"loss/hidden": 3.42734375,
"loss/jsd": 0.08975700601004064,
"loss/logits": 0.0,
"step": 3370
},
{
"epoch": 0.169,
"grad_norm": 18.875,
"grad_norm_var": 4.2978515625,
"learning_rate": 0.0001,
"loss": 4.3503,
"loss/crossentropy": 2.31557312309742,
"loss/hidden": 3.548828125,
"loss/jsd": 0.0878440142609179,
"loss/logits": 0.0,
"step": 3380
},
{
"epoch": 0.1695,
"grad_norm": 20.875,
"grad_norm_var": 5.702604166666666,
"learning_rate": 0.0001,
"loss": 4.3864,
"loss/crossentropy": 2.339674559235573,
"loss/hidden": 3.464453125,
"loss/jsd": 0.0880395533517003,
"loss/logits": 0.0,
"step": 3390
},
{
"epoch": 0.17,
"grad_norm": 15.1875,
"grad_norm_var": 5.098551432291667,
"learning_rate": 0.0001,
"loss": 4.3726,
"loss/crossentropy": 2.2533027648925783,
"loss/hidden": 3.4921875,
"loss/jsd": 0.08741035936400295,
"loss/logits": 0.0,
"step": 3400
},
{
"epoch": 0.1705,
"grad_norm": 21.875,
"grad_norm_var": 3.4983723958333335,
"learning_rate": 0.0001,
"loss": 4.3701,
"loss/crossentropy": 2.280165506899357,
"loss/hidden": 3.52890625,
"loss/jsd": 0.09410012043081224,
"loss/logits": 0.0,
"step": 3410
},
{
"epoch": 0.171,
"grad_norm": 15.75,
"grad_norm_var": 3.486962890625,
"learning_rate": 0.0001,
"loss": 4.4465,
"loss/crossentropy": 2.3110105454921723,
"loss/hidden": 3.54453125,
"loss/jsd": 0.10350852748379111,
"loss/logits": 0.0,
"step": 3420
},
{
"epoch": 0.1715,
"grad_norm": 15.0625,
"grad_norm_var": 1.7901041666666666,
"learning_rate": 0.0001,
"loss": 4.2987,
"loss/crossentropy": 2.5183032125234606,
"loss/hidden": 3.541015625,
"loss/jsd": 0.0940008645877242,
"loss/logits": 0.0,
"step": 3430
},
{
"epoch": 0.172,
"grad_norm": 16.625,
"grad_norm_var": 1.3207509498935816e+18,
"learning_rate": 0.0001,
"loss": 4.3968,
"loss/crossentropy": 2.298141914606094,
"loss/hidden": 3.538671875,
"loss/jsd": 0.09232875565066934,
"loss/logits": 0.0,
"step": 3440
},
{
"epoch": 0.1725,
"grad_norm": 25.875,
"grad_norm_var": 10.539176432291667,
"learning_rate": 0.0001,
"loss": 4.405,
"loss/crossentropy": 2.4251868039369584,
"loss/hidden": 3.620703125,
"loss/jsd": 0.09764928705990314,
"loss/logits": 0.0,
"step": 3450
},
{
"epoch": 0.173,
"grad_norm": 16.5,
"grad_norm_var": 37.917301432291666,
"learning_rate": 0.0001,
"loss": 4.3521,
"loss/crossentropy": 2.4465878754854202,
"loss/hidden": 3.41328125,
"loss/jsd": 0.09263761136680841,
"loss/logits": 0.0,
"step": 3460
},
{
"epoch": 0.1735,
"grad_norm": 19.5,
"grad_norm_var": 10.325895182291667,
"learning_rate": 0.0001,
"loss": 4.3055,
"loss/crossentropy": 2.341625288128853,
"loss/hidden": 3.46171875,
"loss/jsd": 0.08955673705786467,
"loss/logits": 0.0,
"step": 3470
},
{
"epoch": 0.174,
"grad_norm": 18.25,
"grad_norm_var": 9.155582682291667,
"learning_rate": 0.0001,
"loss": 4.257,
"loss/crossentropy": 2.2626075088977813,
"loss/hidden": 3.38046875,
"loss/jsd": 0.08382235984317958,
"loss/logits": 0.0,
"step": 3480
},
{
"epoch": 0.1745,
"grad_norm": 24.25,
"grad_norm_var": 40.449853515625,
"learning_rate": 0.0001,
"loss": 4.365,
"loss/crossentropy": 2.22887095361948,
"loss/hidden": 3.495703125,
"loss/jsd": 0.08516010586172343,
"loss/logits": 0.0,
"step": 3490
},
{
"epoch": 0.175,
"grad_norm": 29.625,
"grad_norm_var": 22.202848307291667,
"learning_rate": 0.0001,
"loss": 4.4211,
"loss/crossentropy": 2.325581954419613,
"loss/hidden": 3.626171875,
"loss/jsd": 0.10623239502310752,
"loss/logits": 0.0,
"step": 3500
},
{
"epoch": 0.1755,
"grad_norm": 18.25,
"grad_norm_var": 20.083268229166666,
"learning_rate": 0.0001,
"loss": 4.4183,
"loss/crossentropy": 2.3138944447040557,
"loss/hidden": 3.605078125,
"loss/jsd": 0.09691860349848866,
"loss/logits": 0.0,
"step": 3510
},
{
"epoch": 0.176,
"grad_norm": 14.75,
"grad_norm_var": 3.0729166666666665,
"learning_rate": 0.0001,
"loss": 4.3327,
"loss/crossentropy": 2.335177455097437,
"loss/hidden": 3.516796875,
"loss/jsd": 0.08317867233417928,
"loss/logits": 0.0,
"step": 3520
},
{
"epoch": 0.1765,
"grad_norm": 29.375,
"grad_norm_var": 313.28943684895836,
"learning_rate": 0.0001,
"loss": 4.3804,
"loss/crossentropy": 2.2869663372635842,
"loss/hidden": 3.3875,
"loss/jsd": 0.07983446251600981,
"loss/logits": 0.0,
"step": 3530
},
{
"epoch": 0.177,
"grad_norm": 18.75,
"grad_norm_var": 294.8395182291667,
"learning_rate": 0.0001,
"loss": 4.4268,
"loss/crossentropy": 2.3664276599884033,
"loss/hidden": 3.541796875,
"loss/jsd": 0.09534696582704782,
"loss/logits": 0.0,
"step": 3540
},
{
"epoch": 0.1775,
"grad_norm": 16.875,
"grad_norm_var": 9.458707682291667,
"learning_rate": 0.0001,
"loss": 4.2841,
"loss/crossentropy": 2.3651267111301424,
"loss/hidden": 3.37890625,
"loss/jsd": 0.08079936136491597,
"loss/logits": 0.0,
"step": 3550
},
{
"epoch": 0.178,
"grad_norm": 21.125,
"grad_norm_var": 4.593473307291666,
"learning_rate": 0.0001,
"loss": 4.2825,
"loss/crossentropy": 2.392011249065399,
"loss/hidden": 3.41015625,
"loss/jsd": 0.09476534733548761,
"loss/logits": 0.0,
"step": 3560
},
{
"epoch": 0.1785,
"grad_norm": 23.0,
"grad_norm_var": 3.734375,
"learning_rate": 0.0001,
"loss": 4.3928,
"loss/crossentropy": 2.4183569096028803,
"loss/hidden": 3.3953125,
"loss/jsd": 0.0872859289869666,
"loss/logits": 0.0,
"step": 3570
},
{
"epoch": 0.179,
"grad_norm": 17.375,
"grad_norm_var": 6.357405598958334,
"learning_rate": 0.0001,
"loss": 4.366,
"loss/crossentropy": 2.228242626786232,
"loss/hidden": 3.61484375,
"loss/jsd": 0.09655670188367367,
"loss/logits": 0.0,
"step": 3580
},
{
"epoch": 0.1795,
"grad_norm": 19.625,
"grad_norm_var": 4.720247395833334,
"learning_rate": 0.0001,
"loss": 4.3721,
"loss/crossentropy": 2.3514621019363404,
"loss/hidden": 3.55703125,
"loss/jsd": 0.08998525207862258,
"loss/logits": 0.0,
"step": 3590
},
{
"epoch": 0.18,
"grad_norm": 17.75,
"grad_norm_var": 86.24099934895834,
"learning_rate": 0.0001,
"loss": 4.2887,
"loss/crossentropy": 2.3266086250543596,
"loss/hidden": 3.3921875,
"loss/jsd": 0.08203610377386213,
"loss/logits": 0.0,
"step": 3600
},
{
"epoch": 0.1805,
"grad_norm": 17.125,
"grad_norm_var": 6.689428671005983e+17,
"learning_rate": 0.0001,
"loss": 4.2832,
"loss/crossentropy": 2.358739697933197,
"loss/hidden": 3.381640625,
"loss/jsd": 0.07689286703243851,
"loss/logits": 0.0,
"step": 3610
},
{
"epoch": 0.181,
"grad_norm": 19.75,
"grad_norm_var": 6.689428671857951e+17,
"learning_rate": 0.0001,
"loss": 4.2562,
"loss/crossentropy": 2.299892693758011,
"loss/hidden": 3.358203125,
"loss/jsd": 0.07548968028277159,
"loss/logits": 0.0,
"step": 3620
},
{
"epoch": 0.1815,
"grad_norm": 16.5,
"grad_norm_var": 3.8056640625,
"learning_rate": 0.0001,
"loss": 4.3453,
"loss/crossentropy": 2.1600609093904497,
"loss/hidden": 3.521484375,
"loss/jsd": 0.0914209995418787,
"loss/logits": 0.0,
"step": 3630
},
{
"epoch": 0.182,
"grad_norm": 18.375,
"grad_norm_var": 5.445833333333334,
"learning_rate": 0.0001,
"loss": 4.261,
"loss/crossentropy": 2.2630960240960123,
"loss/hidden": 3.2734375,
"loss/jsd": 0.07350569609552622,
"loss/logits": 0.0,
"step": 3640
},
{
"epoch": 0.1825,
"grad_norm": 19.5,
"grad_norm_var": 4.786393229166666,
"learning_rate": 0.0001,
"loss": 4.2941,
"loss/crossentropy": 2.26744422018528,
"loss/hidden": 3.434765625,
"loss/jsd": 0.09071792410686612,
"loss/logits": 0.0,
"step": 3650
},
{
"epoch": 0.183,
"grad_norm": 16.375,
"grad_norm_var": 3.675520833333333,
"learning_rate": 0.0001,
"loss": 4.216,
"loss/crossentropy": 2.392577236890793,
"loss/hidden": 3.39140625,
"loss/jsd": 0.0848071664571762,
"loss/logits": 0.0,
"step": 3660
},
{
"epoch": 0.1835,
"grad_norm": 18.625,
"grad_norm_var": 2.8169270833333333,
"learning_rate": 0.0001,
"loss": 4.2512,
"loss/crossentropy": 2.3719822376966477,
"loss/hidden": 3.4421875,
"loss/jsd": 0.08796066055074334,
"loss/logits": 0.0,
"step": 3670
},
{
"epoch": 0.184,
"grad_norm": 18.125,
"grad_norm_var": 5.067692057291667,
"learning_rate": 0.0001,
"loss": 4.2015,
"loss/crossentropy": 2.2736427552998064,
"loss/hidden": 3.362109375,
"loss/jsd": 0.0805484069045633,
"loss/logits": 0.0,
"step": 3680
},
{
"epoch": 0.1845,
"grad_norm": 14.8125,
"grad_norm_var": 5.062434895833333,
"learning_rate": 0.0001,
"loss": 4.2791,
"loss/crossentropy": 2.298249673843384,
"loss/hidden": 3.403515625,
"loss/jsd": 0.08579938132315874,
"loss/logits": 0.0,
"step": 3690
},
{
"epoch": 0.185,
"grad_norm": 17.75,
"grad_norm_var": 5.132747395833333,
"learning_rate": 0.0001,
"loss": 4.2845,
"loss/crossentropy": 2.29532730281353,
"loss/hidden": 3.43359375,
"loss/jsd": 0.08520804699510336,
"loss/logits": 0.0,
"step": 3700
},
{
"epoch": 0.1855,
"grad_norm": 21.375,
"grad_norm_var": 3.1105305989583334,
"learning_rate": 0.0001,
"loss": 4.2881,
"loss/crossentropy": 2.266036620736122,
"loss/hidden": 3.4421875,
"loss/jsd": 0.08662721011787652,
"loss/logits": 0.0,
"step": 3710
},
{
"epoch": 0.186,
"grad_norm": 17.875,
"grad_norm_var": 2.1442057291666665,
"learning_rate": 0.0001,
"loss": 4.3015,
"loss/crossentropy": 2.3964017778635025,
"loss/hidden": 3.443359375,
"loss/jsd": 0.08621067805215717,
"loss/logits": 0.0,
"step": 3720
},
{
"epoch": 0.1865,
"grad_norm": 16.5,
"grad_norm_var": 3.620556640625,
"learning_rate": 0.0001,
"loss": 4.2411,
"loss/crossentropy": 2.3594220340251923,
"loss/hidden": 3.336328125,
"loss/jsd": 0.0772560654208064,
"loss/logits": 0.0,
"step": 3730
},
{
"epoch": 0.187,
"grad_norm": 16.75,
"grad_norm_var": 3.252978515625,
"learning_rate": 0.0001,
"loss": 4.184,
"loss/crossentropy": 2.2494852378964425,
"loss/hidden": 3.32421875,
"loss/jsd": 0.08321888605132699,
"loss/logits": 0.0,
"step": 3740
},
{
"epoch": 0.1875,
"grad_norm": 18.875,
"grad_norm_var": 3.7570149739583334,
"learning_rate": 0.0001,
"loss": 4.2329,
"loss/crossentropy": 2.178547790646553,
"loss/hidden": 3.322265625,
"loss/jsd": 0.07429210902191699,
"loss/logits": 0.0,
"step": 3750
},
{
"epoch": 0.188,
"grad_norm": 16.125,
"grad_norm_var": 4.394645182291667,
"learning_rate": 0.0001,
"loss": 4.2641,
"loss/crossentropy": 2.2659785449504852,
"loss/hidden": 3.4265625,
"loss/jsd": 0.08605121849104762,
"loss/logits": 0.0,
"step": 3760
},
{
"epoch": 0.1885,
"grad_norm": 19.375,
"grad_norm_var": 4.303889973958333,
"learning_rate": 0.0001,
"loss": 4.2343,
"loss/crossentropy": 2.3981280818581583,
"loss/hidden": 3.384375,
"loss/jsd": 0.0853766439948231,
"loss/logits": 0.0,
"step": 3770
},
{
"epoch": 0.189,
"grad_norm": 15.75,
"grad_norm_var": 3.1048014322916666,
"learning_rate": 0.0001,
"loss": 4.2052,
"loss/crossentropy": 2.2695484533905983,
"loss/hidden": 3.512109375,
"loss/jsd": 0.09265543352812529,
"loss/logits": 0.0,
"step": 3780
},
{
"epoch": 0.1895,
"grad_norm": 15.3125,
"grad_norm_var": 2.510139973958333,
"learning_rate": 0.0001,
"loss": 4.2526,
"loss/crossentropy": 2.1818419501185415,
"loss/hidden": 3.376171875,
"loss/jsd": 0.08593555409461259,
"loss/logits": 0.0,
"step": 3790
},
{
"epoch": 0.19,
"grad_norm": 15.0625,
"grad_norm_var": 2.105729166666667,
"learning_rate": 0.0001,
"loss": 4.2301,
"loss/crossentropy": 2.218023180961609,
"loss/hidden": 3.334765625,
"loss/jsd": 0.07895527156069874,
"loss/logits": 0.0,
"step": 3800
},
{
"epoch": 0.1905,
"grad_norm": 15.5625,
"grad_norm_var": 6.692708333333333,
"learning_rate": 0.0001,
"loss": 4.241,
"loss/crossentropy": 2.2893342286348344,
"loss/hidden": 3.296875,
"loss/jsd": 0.07659890875220299,
"loss/logits": 0.0,
"step": 3810
},
{
"epoch": 0.191,
"grad_norm": 17.25,
"grad_norm_var": 5.272509765625,
"learning_rate": 0.0001,
"loss": 4.2655,
"loss/crossentropy": 2.207420842349529,
"loss/hidden": 3.36484375,
"loss/jsd": 0.08572290684096515,
"loss/logits": 0.0,
"step": 3820
},
{
"epoch": 0.1915,
"grad_norm": 21.625,
"grad_norm_var": 5.355712890625,
"learning_rate": 0.0001,
"loss": 4.2474,
"loss/crossentropy": 2.3076944231986998,
"loss/hidden": 3.31796875,
"loss/jsd": 0.07485279012471438,
"loss/logits": 0.0,
"step": 3830
},
{
"epoch": 0.192,
"grad_norm": 15.25,
"grad_norm_var": 6.646858723958333,
"learning_rate": 0.0001,
"loss": 4.2634,
"loss/crossentropy": 2.42186721265316,
"loss/hidden": 3.4078125,
"loss/jsd": 0.08714157855138183,
"loss/logits": 0.0,
"step": 3840
},
{
"epoch": 0.1925,
"grad_norm": 4076863488.0,
"grad_norm_var": 1.0388009843068502e+18,
"learning_rate": 0.0001,
"loss": 4.2739,
"loss/crossentropy": 2.3014174938201903,
"loss/hidden": 3.364453125,
"loss/jsd": 0.07954654460772873,
"loss/logits": 0.0,
"step": 3850
},
{
"epoch": 0.193,
"grad_norm": 17.5,
"grad_norm_var": 1.0388009847272768e+18,
"learning_rate": 0.0001,
"loss": 4.2215,
"loss/crossentropy": 2.348978337645531,
"loss/hidden": 3.45625,
"loss/jsd": 0.08323998479172587,
"loss/logits": 0.0,
"step": 3860
},
{
"epoch": 0.1935,
"grad_norm": 18.75,
"grad_norm_var": 3.981884765625,
"learning_rate": 0.0001,
"loss": 4.1528,
"loss/crossentropy": 2.3754432618618013,
"loss/hidden": 3.401171875,
"loss/jsd": 0.08722320841625333,
"loss/logits": 0.0,
"step": 3870
},
{
"epoch": 0.194,
"grad_norm": 18.875,
"grad_norm_var": 3.824072265625,
"learning_rate": 0.0001,
"loss": 4.2868,
"loss/crossentropy": 2.3063534289598464,
"loss/hidden": 3.43515625,
"loss/jsd": 0.08732216758653522,
"loss/logits": 0.0,
"step": 3880
},
{
"epoch": 0.1945,
"grad_norm": 18.25,
"grad_norm_var": 3.3018229166666666,
"learning_rate": 0.0001,
"loss": 4.2463,
"loss/crossentropy": 2.4114058747887612,
"loss/hidden": 3.395703125,
"loss/jsd": 0.08345712553709746,
"loss/logits": 0.0,
"step": 3890
},
{
"epoch": 0.195,
"grad_norm": 18.0,
"grad_norm_var": 3.595833333333333,
"learning_rate": 0.0001,
"loss": 4.2128,
"loss/crossentropy": 2.1565380930900573,
"loss/hidden": 3.23984375,
"loss/jsd": 0.07183347269892693,
"loss/logits": 0.0,
"step": 3900
},
{
"epoch": 0.1955,
"grad_norm": 19.5,
"grad_norm_var": 1.669775390625,
"learning_rate": 0.0001,
"loss": 4.2174,
"loss/crossentropy": 2.4012755006551743,
"loss/hidden": 3.35859375,
"loss/jsd": 0.08356887567788363,
"loss/logits": 0.0,
"step": 3910
},
{
"epoch": 0.196,
"grad_norm": 16.75,
"grad_norm_var": 2.569775390625,
"learning_rate": 0.0001,
"loss": 4.2516,
"loss/crossentropy": 2.4133204758167266,
"loss/hidden": 3.405078125,
"loss/jsd": 0.08416441585868598,
"loss/logits": 0.0,
"step": 3920
},
{
"epoch": 0.1965,
"grad_norm": 16.125,
"grad_norm_var": 4.249072265625,
"learning_rate": 0.0001,
"loss": 4.2424,
"loss/crossentropy": 2.2017408296465875,
"loss/hidden": 3.434375,
"loss/jsd": 0.08481362634338438,
"loss/logits": 0.0,
"step": 3930
},
{
"epoch": 0.197,
"grad_norm": 18.0,
"grad_norm_var": 13.563541666666667,
"learning_rate": 0.0001,
"loss": 4.2145,
"loss/crossentropy": 2.1327252730727198,
"loss/hidden": 3.358203125,
"loss/jsd": 0.08263032594695688,
"loss/logits": 0.0,
"step": 3940
},
{
"epoch": 0.1975,
"grad_norm": 15.9375,
"grad_norm_var": 13.279801432291666,
"learning_rate": 0.0001,
"loss": 4.271,
"loss/crossentropy": 2.3732340067625044,
"loss/hidden": 3.38984375,
"loss/jsd": 0.09080582885071635,
"loss/logits": 0.0,
"step": 3950
},
{
"epoch": 0.198,
"grad_norm": 14.125,
"grad_norm_var": 3.5541015625,
"learning_rate": 0.0001,
"loss": 4.3277,
"loss/crossentropy": 2.2829252019524575,
"loss/hidden": 3.504296875,
"loss/jsd": 0.09264815384522081,
"loss/logits": 0.0,
"step": 3960
},
{
"epoch": 0.1985,
"grad_norm": 20.125,
"grad_norm_var": 5.493212890625,
"learning_rate": 0.0001,
"loss": 4.3215,
"loss/crossentropy": 2.284733434021473,
"loss/hidden": 3.394921875,
"loss/jsd": 0.08987429440021515,
"loss/logits": 0.0,
"step": 3970
},
{
"epoch": 0.199,
"grad_norm": 17.0,
"grad_norm_var": 5.512223307291666,
"learning_rate": 0.0001,
"loss": 4.2933,
"loss/crossentropy": 2.2337097018957137,
"loss/hidden": 3.3640625,
"loss/jsd": 0.0808649729937315,
"loss/logits": 0.0,
"step": 3980
},
{
"epoch": 0.1995,
"grad_norm": 16.625,
"grad_norm_var": 12.917122395833333,
"learning_rate": 0.0001,
"loss": 4.2148,
"loss/crossentropy": 2.3057729706168173,
"loss/hidden": 3.411328125,
"loss/jsd": 0.08738104859367013,
"loss/logits": 0.0,
"step": 3990
},
{
"epoch": 0.2,
"grad_norm": 15.9375,
"grad_norm_var": 5.007145182291667,
"learning_rate": 0.0001,
"loss": 4.217,
"loss/crossentropy": 2.3626988530158997,
"loss/hidden": 3.44296875,
"loss/jsd": 0.09443312305957079,
"loss/logits": 0.0,
"step": 4000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1430040128035226e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}