checkpoint / checkpoint-7550 /trainer_state.json
fmrdvcerf's picture
Upload folder using huggingface_hub
e46b92c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.94375,
"eval_steps": 500,
"global_step": 7550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00125,
"grad_norm": 15.714543342590332,
"learning_rate": 4.994375e-05,
"loss": 2.8924,
"step": 10
},
{
"epoch": 0.0025,
"grad_norm": 10.94906234741211,
"learning_rate": 4.988125e-05,
"loss": 2.5952,
"step": 20
},
{
"epoch": 0.00375,
"grad_norm": 15.514859199523926,
"learning_rate": 4.981875e-05,
"loss": 2.4666,
"step": 30
},
{
"epoch": 0.005,
"grad_norm": 9.053750991821289,
"learning_rate": 4.975625000000001e-05,
"loss": 2.3325,
"step": 40
},
{
"epoch": 0.00625,
"grad_norm": 19.168121337890625,
"learning_rate": 4.969375e-05,
"loss": 2.3066,
"step": 50
},
{
"epoch": 0.0075,
"grad_norm": 9.255040168762207,
"learning_rate": 4.9631250000000004e-05,
"loss": 2.2425,
"step": 60
},
{
"epoch": 0.00875,
"grad_norm": 8.124181747436523,
"learning_rate": 4.956875e-05,
"loss": 2.1237,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 7.121690273284912,
"learning_rate": 4.950625000000001e-05,
"loss": 1.9941,
"step": 80
},
{
"epoch": 0.01125,
"grad_norm": 8.191390991210938,
"learning_rate": 4.944375e-05,
"loss": 2.118,
"step": 90
},
{
"epoch": 0.0125,
"grad_norm": 7.51083517074585,
"learning_rate": 4.9381250000000004e-05,
"loss": 1.9932,
"step": 100
},
{
"epoch": 0.01375,
"grad_norm": 7.706765174865723,
"learning_rate": 4.931875e-05,
"loss": 2.2066,
"step": 110
},
{
"epoch": 0.015,
"grad_norm": 6.421011447906494,
"learning_rate": 4.925625e-05,
"loss": 2.0439,
"step": 120
},
{
"epoch": 0.01625,
"grad_norm": 17.403257369995117,
"learning_rate": 4.9193750000000007e-05,
"loss": 2.1501,
"step": 130
},
{
"epoch": 0.0175,
"grad_norm": 5.682934284210205,
"learning_rate": 4.913125e-05,
"loss": 2.0993,
"step": 140
},
{
"epoch": 0.01875,
"grad_norm": 8.931119918823242,
"learning_rate": 4.9068750000000003e-05,
"loss": 2.0383,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 5.6062140464782715,
"learning_rate": 4.900625e-05,
"loss": 1.9334,
"step": 160
},
{
"epoch": 0.02125,
"grad_norm": 8.099666595458984,
"learning_rate": 4.894375000000001e-05,
"loss": 1.9423,
"step": 170
},
{
"epoch": 0.0225,
"grad_norm": 5.474426746368408,
"learning_rate": 4.888125e-05,
"loss": 2.0548,
"step": 180
},
{
"epoch": 0.02375,
"grad_norm": 6.269449710845947,
"learning_rate": 4.8818750000000004e-05,
"loss": 1.9097,
"step": 190
},
{
"epoch": 0.025,
"grad_norm": 6.422362804412842,
"learning_rate": 4.875625e-05,
"loss": 1.9114,
"step": 200
},
{
"epoch": 0.02625,
"grad_norm": 8.737632751464844,
"learning_rate": 4.869375000000001e-05,
"loss": 2.0777,
"step": 210
},
{
"epoch": 0.0275,
"grad_norm": 7.279562950134277,
"learning_rate": 4.863125e-05,
"loss": 1.9173,
"step": 220
},
{
"epoch": 0.02875,
"grad_norm": 7.346338272094727,
"learning_rate": 4.8568750000000005e-05,
"loss": 1.7516,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 7.4580888748168945,
"learning_rate": 4.850625e-05,
"loss": 1.9487,
"step": 240
},
{
"epoch": 0.03125,
"grad_norm": 6.269708633422852,
"learning_rate": 4.844375e-05,
"loss": 1.8654,
"step": 250
},
{
"epoch": 0.0325,
"grad_norm": 5.809817790985107,
"learning_rate": 4.838125e-05,
"loss": 1.9751,
"step": 260
},
{
"epoch": 0.03375,
"grad_norm": 11.987732887268066,
"learning_rate": 4.831875e-05,
"loss": 2.0062,
"step": 270
},
{
"epoch": 0.035,
"grad_norm": 8.748228073120117,
"learning_rate": 4.8256250000000004e-05,
"loss": 2.1293,
"step": 280
},
{
"epoch": 0.03625,
"grad_norm": 5.571599960327148,
"learning_rate": 4.819375e-05,
"loss": 1.9504,
"step": 290
},
{
"epoch": 0.0375,
"grad_norm": 5.935739517211914,
"learning_rate": 4.813125e-05,
"loss": 1.8399,
"step": 300
},
{
"epoch": 0.03875,
"grad_norm": 5.676118850708008,
"learning_rate": 4.806875e-05,
"loss": 1.9019,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 8.595135688781738,
"learning_rate": 4.8006250000000005e-05,
"loss": 2.0299,
"step": 320
},
{
"epoch": 0.04125,
"grad_norm": 5.606544017791748,
"learning_rate": 4.794375e-05,
"loss": 1.8806,
"step": 330
},
{
"epoch": 0.0425,
"grad_norm": 7.973094463348389,
"learning_rate": 4.788125e-05,
"loss": 1.7285,
"step": 340
},
{
"epoch": 0.04375,
"grad_norm": 9.09721851348877,
"learning_rate": 4.781875e-05,
"loss": 1.9773,
"step": 350
},
{
"epoch": 0.045,
"grad_norm": 9.41321849822998,
"learning_rate": 4.7756250000000005e-05,
"loss": 1.9834,
"step": 360
},
{
"epoch": 0.04625,
"grad_norm": 4.810961723327637,
"learning_rate": 4.7693750000000004e-05,
"loss": 1.9862,
"step": 370
},
{
"epoch": 0.0475,
"grad_norm": 6.283267498016357,
"learning_rate": 4.763125e-05,
"loss": 1.8544,
"step": 380
},
{
"epoch": 0.04875,
"grad_norm": 9.017960548400879,
"learning_rate": 4.756875e-05,
"loss": 1.8689,
"step": 390
},
{
"epoch": 0.05,
"grad_norm": 6.013952732086182,
"learning_rate": 4.750625e-05,
"loss": 1.798,
"step": 400
},
{
"epoch": 0.05125,
"grad_norm": 5.1610026359558105,
"learning_rate": 4.7443750000000005e-05,
"loss": 1.8669,
"step": 410
},
{
"epoch": 0.0525,
"grad_norm": 5.481388092041016,
"learning_rate": 4.738125e-05,
"loss": 1.9778,
"step": 420
},
{
"epoch": 0.05375,
"grad_norm": 8.239945411682129,
"learning_rate": 4.731875e-05,
"loss": 1.6991,
"step": 430
},
{
"epoch": 0.055,
"grad_norm": 5.641376972198486,
"learning_rate": 4.725625e-05,
"loss": 1.7848,
"step": 440
},
{
"epoch": 0.05625,
"grad_norm": 7.561056613922119,
"learning_rate": 4.7193750000000005e-05,
"loss": 1.7594,
"step": 450
},
{
"epoch": 0.0575,
"grad_norm": 4.920119762420654,
"learning_rate": 4.7131250000000004e-05,
"loss": 1.8387,
"step": 460
},
{
"epoch": 0.05875,
"grad_norm": 6.90638542175293,
"learning_rate": 4.706875e-05,
"loss": 1.8461,
"step": 470
},
{
"epoch": 0.06,
"grad_norm": 6.640336036682129,
"learning_rate": 4.700625e-05,
"loss": 1.7583,
"step": 480
},
{
"epoch": 0.06125,
"grad_norm": 5.126943588256836,
"learning_rate": 4.6943750000000006e-05,
"loss": 1.9002,
"step": 490
},
{
"epoch": 0.0625,
"grad_norm": 4.264902591705322,
"learning_rate": 4.6881250000000005e-05,
"loss": 1.9068,
"step": 500
},
{
"epoch": 0.06375,
"grad_norm": 7.165338039398193,
"learning_rate": 4.681875e-05,
"loss": 1.7812,
"step": 510
},
{
"epoch": 0.065,
"grad_norm": 7.0703277587890625,
"learning_rate": 4.675625e-05,
"loss": 1.698,
"step": 520
},
{
"epoch": 0.06625,
"grad_norm": 6.396975040435791,
"learning_rate": 4.669375e-05,
"loss": 1.7182,
"step": 530
},
{
"epoch": 0.0675,
"grad_norm": 8.975642204284668,
"learning_rate": 4.6631250000000005e-05,
"loss": 1.8448,
"step": 540
},
{
"epoch": 0.06875,
"grad_norm": 5.405190467834473,
"learning_rate": 4.656875e-05,
"loss": 1.7348,
"step": 550
},
{
"epoch": 0.07,
"grad_norm": 5.298381328582764,
"learning_rate": 4.650625e-05,
"loss": 1.6221,
"step": 560
},
{
"epoch": 0.07125,
"grad_norm": 4.854972839355469,
"learning_rate": 4.644375e-05,
"loss": 1.7367,
"step": 570
},
{
"epoch": 0.0725,
"grad_norm": 5.756941795349121,
"learning_rate": 4.6381250000000006e-05,
"loss": 1.801,
"step": 580
},
{
"epoch": 0.07375,
"grad_norm": 4.7184600830078125,
"learning_rate": 4.631875e-05,
"loss": 1.6898,
"step": 590
},
{
"epoch": 0.075,
"grad_norm": 5.140761852264404,
"learning_rate": 4.625625e-05,
"loss": 1.6612,
"step": 600
},
{
"epoch": 0.07625,
"grad_norm": 3.7104735374450684,
"learning_rate": 4.619375e-05,
"loss": 1.624,
"step": 610
},
{
"epoch": 0.0775,
"grad_norm": 6.2447896003723145,
"learning_rate": 4.613125000000001e-05,
"loss": 1.7355,
"step": 620
},
{
"epoch": 0.07875,
"grad_norm": 9.780672073364258,
"learning_rate": 4.6068750000000005e-05,
"loss": 1.6192,
"step": 630
},
{
"epoch": 0.08,
"grad_norm": 5.958286285400391,
"learning_rate": 4.6006250000000004e-05,
"loss": 1.6157,
"step": 640
},
{
"epoch": 0.08125,
"grad_norm": 5.786264419555664,
"learning_rate": 4.594375e-05,
"loss": 1.633,
"step": 650
},
{
"epoch": 0.0825,
"grad_norm": 4.75607967376709,
"learning_rate": 4.588125e-05,
"loss": 1.8139,
"step": 660
},
{
"epoch": 0.08375,
"grad_norm": 4.077645301818848,
"learning_rate": 4.5818750000000006e-05,
"loss": 1.6889,
"step": 670
},
{
"epoch": 0.085,
"grad_norm": 5.341221332550049,
"learning_rate": 4.575625e-05,
"loss": 1.7112,
"step": 680
},
{
"epoch": 0.08625,
"grad_norm": 5.421123027801514,
"learning_rate": 4.569375e-05,
"loss": 1.729,
"step": 690
},
{
"epoch": 0.0875,
"grad_norm": 5.6531829833984375,
"learning_rate": 4.563125e-05,
"loss": 1.7805,
"step": 700
},
{
"epoch": 0.08875,
"grad_norm": 4.264986515045166,
"learning_rate": 4.5568750000000006e-05,
"loss": 1.5703,
"step": 710
},
{
"epoch": 0.09,
"grad_norm": 5.658288955688477,
"learning_rate": 4.550625e-05,
"loss": 1.8264,
"step": 720
},
{
"epoch": 0.09125,
"grad_norm": 5.032812118530273,
"learning_rate": 4.5443750000000003e-05,
"loss": 1.6834,
"step": 730
},
{
"epoch": 0.0925,
"grad_norm": 4.175335884094238,
"learning_rate": 4.538125e-05,
"loss": 1.8537,
"step": 740
},
{
"epoch": 0.09375,
"grad_norm": 6.916208267211914,
"learning_rate": 4.531875000000001e-05,
"loss": 1.7556,
"step": 750
},
{
"epoch": 0.095,
"grad_norm": 6.064156532287598,
"learning_rate": 4.525625e-05,
"loss": 1.7132,
"step": 760
},
{
"epoch": 0.09625,
"grad_norm": 6.488204002380371,
"learning_rate": 4.5193750000000004e-05,
"loss": 1.6907,
"step": 770
},
{
"epoch": 0.0975,
"grad_norm": 5.419294357299805,
"learning_rate": 4.513125e-05,
"loss": 1.6911,
"step": 780
},
{
"epoch": 0.09875,
"grad_norm": 6.069253921508789,
"learning_rate": 4.506875e-05,
"loss": 1.7586,
"step": 790
},
{
"epoch": 0.1,
"grad_norm": 5.28116512298584,
"learning_rate": 4.500625e-05,
"loss": 1.7632,
"step": 800
},
{
"epoch": 0.10125,
"grad_norm": 4.091380596160889,
"learning_rate": 4.494375e-05,
"loss": 1.6792,
"step": 810
},
{
"epoch": 0.1025,
"grad_norm": 5.594090938568115,
"learning_rate": 4.488125e-05,
"loss": 1.8079,
"step": 820
},
{
"epoch": 0.10375,
"grad_norm": 5.238066673278809,
"learning_rate": 4.481875e-05,
"loss": 1.7537,
"step": 830
},
{
"epoch": 0.105,
"grad_norm": 4.945870399475098,
"learning_rate": 4.475625e-05,
"loss": 1.7236,
"step": 840
},
{
"epoch": 0.10625,
"grad_norm": 47.752830505371094,
"learning_rate": 4.469375e-05,
"loss": 1.7715,
"step": 850
},
{
"epoch": 0.1075,
"grad_norm": 6.474725246429443,
"learning_rate": 4.4631250000000004e-05,
"loss": 1.6062,
"step": 860
},
{
"epoch": 0.10875,
"grad_norm": 5.710610866546631,
"learning_rate": 4.456875e-05,
"loss": 1.6496,
"step": 870
},
{
"epoch": 0.11,
"grad_norm": 5.289515495300293,
"learning_rate": 4.450625000000001e-05,
"loss": 1.7847,
"step": 880
},
{
"epoch": 0.11125,
"grad_norm": 4.040447235107422,
"learning_rate": 4.444375e-05,
"loss": 1.5642,
"step": 890
},
{
"epoch": 0.1125,
"grad_norm": 4.721045970916748,
"learning_rate": 4.4381250000000005e-05,
"loss": 1.6424,
"step": 900
},
{
"epoch": 0.11375,
"grad_norm": 4.430590629577637,
"learning_rate": 4.431875e-05,
"loss": 1.5693,
"step": 910
},
{
"epoch": 0.115,
"grad_norm": 4.155664443969727,
"learning_rate": 4.425625e-05,
"loss": 1.5653,
"step": 920
},
{
"epoch": 0.11625,
"grad_norm": 5.376486778259277,
"learning_rate": 4.419375e-05,
"loss": 1.5292,
"step": 930
},
{
"epoch": 0.1175,
"grad_norm": 5.888033390045166,
"learning_rate": 4.413125e-05,
"loss": 1.5894,
"step": 940
},
{
"epoch": 0.11875,
"grad_norm": 4.666552543640137,
"learning_rate": 4.4068750000000004e-05,
"loss": 1.6117,
"step": 950
},
{
"epoch": 0.12,
"grad_norm": 4.565912246704102,
"learning_rate": 4.400625e-05,
"loss": 1.7492,
"step": 960
},
{
"epoch": 0.12125,
"grad_norm": 4.324479579925537,
"learning_rate": 4.394375e-05,
"loss": 1.6294,
"step": 970
},
{
"epoch": 0.1225,
"grad_norm": 5.823368072509766,
"learning_rate": 4.388125e-05,
"loss": 1.6953,
"step": 980
},
{
"epoch": 0.12375,
"grad_norm": 6.85033655166626,
"learning_rate": 4.3818750000000005e-05,
"loss": 1.5723,
"step": 990
},
{
"epoch": 0.125,
"grad_norm": 6.246133327484131,
"learning_rate": 4.375625e-05,
"loss": 1.6042,
"step": 1000
},
{
"epoch": 0.12625,
"grad_norm": 4.677844047546387,
"learning_rate": 4.369375e-05,
"loss": 1.6391,
"step": 1010
},
{
"epoch": 0.1275,
"grad_norm": 6.899301528930664,
"learning_rate": 4.363125e-05,
"loss": 1.5956,
"step": 1020
},
{
"epoch": 0.12875,
"grad_norm": 4.530190944671631,
"learning_rate": 4.3568750000000005e-05,
"loss": 1.537,
"step": 1030
},
{
"epoch": 0.13,
"grad_norm": 5.287406921386719,
"learning_rate": 4.3506250000000004e-05,
"loss": 1.7437,
"step": 1040
},
{
"epoch": 0.13125,
"grad_norm": 3.828369140625,
"learning_rate": 4.344375e-05,
"loss": 1.6047,
"step": 1050
},
{
"epoch": 0.1325,
"grad_norm": 4.649002552032471,
"learning_rate": 4.338125e-05,
"loss": 1.5932,
"step": 1060
},
{
"epoch": 0.13375,
"grad_norm": 6.695621967315674,
"learning_rate": 4.331875e-05,
"loss": 1.7681,
"step": 1070
},
{
"epoch": 0.135,
"grad_norm": 3.446563243865967,
"learning_rate": 4.3256250000000004e-05,
"loss": 1.622,
"step": 1080
},
{
"epoch": 0.13625,
"grad_norm": 4.040626525878906,
"learning_rate": 4.3193749999999996e-05,
"loss": 1.5857,
"step": 1090
},
{
"epoch": 0.1375,
"grad_norm": 3.937739372253418,
"learning_rate": 4.313125e-05,
"loss": 1.613,
"step": 1100
},
{
"epoch": 0.13875,
"grad_norm": 3.4581360816955566,
"learning_rate": 4.306875e-05,
"loss": 1.5218,
"step": 1110
},
{
"epoch": 0.14,
"grad_norm": 4.91287899017334,
"learning_rate": 4.3006250000000005e-05,
"loss": 1.6676,
"step": 1120
},
{
"epoch": 0.14125,
"grad_norm": 5.508735179901123,
"learning_rate": 4.2943750000000004e-05,
"loss": 1.673,
"step": 1130
},
{
"epoch": 0.1425,
"grad_norm": 6.8148512840271,
"learning_rate": 4.288125e-05,
"loss": 1.6635,
"step": 1140
},
{
"epoch": 0.14375,
"grad_norm": 4.404072284698486,
"learning_rate": 4.281875e-05,
"loss": 1.6963,
"step": 1150
},
{
"epoch": 0.145,
"grad_norm": 4.8719682693481445,
"learning_rate": 4.2756250000000006e-05,
"loss": 1.554,
"step": 1160
},
{
"epoch": 0.14625,
"grad_norm": 4.299533843994141,
"learning_rate": 4.2693750000000004e-05,
"loss": 1.6114,
"step": 1170
},
{
"epoch": 0.1475,
"grad_norm": 4.48207426071167,
"learning_rate": 4.263125e-05,
"loss": 1.5773,
"step": 1180
},
{
"epoch": 0.14875,
"grad_norm": 3.618351697921753,
"learning_rate": 4.256875e-05,
"loss": 1.4916,
"step": 1190
},
{
"epoch": 0.15,
"grad_norm": 4.393692970275879,
"learning_rate": 4.250625e-05,
"loss": 1.6461,
"step": 1200
},
{
"epoch": 0.15125,
"grad_norm": 3.9668004512786865,
"learning_rate": 4.2443750000000005e-05,
"loss": 1.5421,
"step": 1210
},
{
"epoch": 0.1525,
"grad_norm": 5.683198928833008,
"learning_rate": 4.238125e-05,
"loss": 1.6221,
"step": 1220
},
{
"epoch": 0.15375,
"grad_norm": 4.367931365966797,
"learning_rate": 4.231875e-05,
"loss": 1.6046,
"step": 1230
},
{
"epoch": 0.155,
"grad_norm": 4.630238056182861,
"learning_rate": 4.225625e-05,
"loss": 1.5636,
"step": 1240
},
{
"epoch": 0.15625,
"grad_norm": 3.8488829135894775,
"learning_rate": 4.2193750000000006e-05,
"loss": 1.5471,
"step": 1250
},
{
"epoch": 0.1575,
"grad_norm": 3.6925644874572754,
"learning_rate": 4.213125e-05,
"loss": 1.5385,
"step": 1260
},
{
"epoch": 0.15875,
"grad_norm": 7.884620189666748,
"learning_rate": 4.206875e-05,
"loss": 1.4549,
"step": 1270
},
{
"epoch": 0.16,
"grad_norm": 4.773382186889648,
"learning_rate": 4.200625e-05,
"loss": 1.6672,
"step": 1280
},
{
"epoch": 0.16125,
"grad_norm": 3.8848724365234375,
"learning_rate": 4.1943750000000006e-05,
"loss": 1.4247,
"step": 1290
},
{
"epoch": 0.1625,
"grad_norm": 5.548094272613525,
"learning_rate": 4.188125e-05,
"loss": 1.817,
"step": 1300
},
{
"epoch": 0.16375,
"grad_norm": 5.077624320983887,
"learning_rate": 4.1818750000000003e-05,
"loss": 1.6165,
"step": 1310
},
{
"epoch": 0.165,
"grad_norm": 4.995692729949951,
"learning_rate": 4.175625e-05,
"loss": 1.5932,
"step": 1320
},
{
"epoch": 0.16625,
"grad_norm": 4.223556041717529,
"learning_rate": 4.169375e-05,
"loss": 1.6204,
"step": 1330
},
{
"epoch": 0.1675,
"grad_norm": 5.034037113189697,
"learning_rate": 4.163125e-05,
"loss": 1.5817,
"step": 1340
},
{
"epoch": 0.16875,
"grad_norm": 6.067126274108887,
"learning_rate": 4.1568750000000004e-05,
"loss": 1.6194,
"step": 1350
},
{
"epoch": 0.17,
"grad_norm": 4.081786155700684,
"learning_rate": 4.150625e-05,
"loss": 1.4929,
"step": 1360
},
{
"epoch": 0.17125,
"grad_norm": 4.328601837158203,
"learning_rate": 4.144375e-05,
"loss": 1.6266,
"step": 1370
},
{
"epoch": 0.1725,
"grad_norm": 4.224490642547607,
"learning_rate": 4.1381250000000006e-05,
"loss": 1.577,
"step": 1380
},
{
"epoch": 0.17375,
"grad_norm": 3.7425456047058105,
"learning_rate": 4.131875e-05,
"loss": 1.4331,
"step": 1390
},
{
"epoch": 0.175,
"grad_norm": 4.163187503814697,
"learning_rate": 4.125625e-05,
"loss": 1.6496,
"step": 1400
},
{
"epoch": 0.17625,
"grad_norm": 4.357766628265381,
"learning_rate": 4.119375e-05,
"loss": 1.7169,
"step": 1410
},
{
"epoch": 0.1775,
"grad_norm": 4.384897232055664,
"learning_rate": 4.113125000000001e-05,
"loss": 1.4617,
"step": 1420
},
{
"epoch": 0.17875,
"grad_norm": 4.748656749725342,
"learning_rate": 4.106875e-05,
"loss": 1.5784,
"step": 1430
},
{
"epoch": 0.18,
"grad_norm": 4.227325439453125,
"learning_rate": 4.1006250000000004e-05,
"loss": 1.5098,
"step": 1440
},
{
"epoch": 0.18125,
"grad_norm": 4.830343723297119,
"learning_rate": 4.094375e-05,
"loss": 1.618,
"step": 1450
},
{
"epoch": 0.1825,
"grad_norm": 4.403887748718262,
"learning_rate": 4.088125e-05,
"loss": 1.5866,
"step": 1460
},
{
"epoch": 0.18375,
"grad_norm": 5.818870544433594,
"learning_rate": 4.081875e-05,
"loss": 1.4826,
"step": 1470
},
{
"epoch": 0.185,
"grad_norm": 6.949367046356201,
"learning_rate": 4.0756250000000005e-05,
"loss": 1.4832,
"step": 1480
},
{
"epoch": 0.18625,
"grad_norm": 4.030486583709717,
"learning_rate": 4.069375e-05,
"loss": 1.4703,
"step": 1490
},
{
"epoch": 0.1875,
"grad_norm": 4.741464614868164,
"learning_rate": 4.063125e-05,
"loss": 1.7756,
"step": 1500
},
{
"epoch": 0.18875,
"grad_norm": 4.840798377990723,
"learning_rate": 4.056875e-05,
"loss": 1.6653,
"step": 1510
},
{
"epoch": 0.19,
"grad_norm": 4.910340309143066,
"learning_rate": 4.050625e-05,
"loss": 1.4131,
"step": 1520
},
{
"epoch": 0.19125,
"grad_norm": 5.179189205169678,
"learning_rate": 4.0443750000000004e-05,
"loss": 1.5972,
"step": 1530
},
{
"epoch": 0.1925,
"grad_norm": 5.435876369476318,
"learning_rate": 4.038125e-05,
"loss": 1.5848,
"step": 1540
},
{
"epoch": 0.19375,
"grad_norm": 3.8866443634033203,
"learning_rate": 4.031875e-05,
"loss": 1.5638,
"step": 1550
},
{
"epoch": 0.195,
"grad_norm": 4.297860145568848,
"learning_rate": 4.025625e-05,
"loss": 1.4118,
"step": 1560
},
{
"epoch": 0.19625,
"grad_norm": 5.706923484802246,
"learning_rate": 4.0193750000000005e-05,
"loss": 1.4316,
"step": 1570
},
{
"epoch": 0.1975,
"grad_norm": 4.5453925132751465,
"learning_rate": 4.013125e-05,
"loss": 1.4754,
"step": 1580
},
{
"epoch": 0.19875,
"grad_norm": 4.322735786437988,
"learning_rate": 4.006875e-05,
"loss": 1.5779,
"step": 1590
},
{
"epoch": 0.2,
"grad_norm": 4.953495979309082,
"learning_rate": 4.000625e-05,
"loss": 1.6211,
"step": 1600
},
{
"epoch": 0.20125,
"grad_norm": 3.003465414047241,
"learning_rate": 3.9943750000000005e-05,
"loss": 1.5667,
"step": 1610
},
{
"epoch": 0.2025,
"grad_norm": 4.02094030380249,
"learning_rate": 3.9881250000000004e-05,
"loss": 1.5953,
"step": 1620
},
{
"epoch": 0.20375,
"grad_norm": 3.9984161853790283,
"learning_rate": 3.981875e-05,
"loss": 1.5122,
"step": 1630
},
{
"epoch": 0.205,
"grad_norm": 4.243444442749023,
"learning_rate": 3.975625e-05,
"loss": 1.6517,
"step": 1640
},
{
"epoch": 0.20625,
"grad_norm": 4.29213809967041,
"learning_rate": 3.969375e-05,
"loss": 1.5576,
"step": 1650
},
{
"epoch": 0.2075,
"grad_norm": 4.561241149902344,
"learning_rate": 3.9631250000000004e-05,
"loss": 1.5213,
"step": 1660
},
{
"epoch": 0.20875,
"grad_norm": 4.34321403503418,
"learning_rate": 3.956875e-05,
"loss": 1.4204,
"step": 1670
},
{
"epoch": 0.21,
"grad_norm": 4.353539943695068,
"learning_rate": 3.950625e-05,
"loss": 1.5922,
"step": 1680
},
{
"epoch": 0.21125,
"grad_norm": 4.75934362411499,
"learning_rate": 3.944375e-05,
"loss": 1.4769,
"step": 1690
},
{
"epoch": 0.2125,
"grad_norm": 4.053194999694824,
"learning_rate": 3.9381250000000005e-05,
"loss": 1.5125,
"step": 1700
},
{
"epoch": 0.21375,
"grad_norm": 5.373641490936279,
"learning_rate": 3.9318750000000004e-05,
"loss": 1.6501,
"step": 1710
},
{
"epoch": 0.215,
"grad_norm": 3.5294227600097656,
"learning_rate": 3.925625e-05,
"loss": 1.5721,
"step": 1720
},
{
"epoch": 0.21625,
"grad_norm": 3.870500326156616,
"learning_rate": 3.919375e-05,
"loss": 1.5217,
"step": 1730
},
{
"epoch": 0.2175,
"grad_norm": 27.053773880004883,
"learning_rate": 3.9131250000000006e-05,
"loss": 1.5216,
"step": 1740
},
{
"epoch": 0.21875,
"grad_norm": 8.008138656616211,
"learning_rate": 3.9068750000000004e-05,
"loss": 1.6043,
"step": 1750
},
{
"epoch": 0.22,
"grad_norm": 5.010753631591797,
"learning_rate": 3.900625e-05,
"loss": 1.5854,
"step": 1760
},
{
"epoch": 0.22125,
"grad_norm": 5.048765659332275,
"learning_rate": 3.894375e-05,
"loss": 1.5564,
"step": 1770
},
{
"epoch": 0.2225,
"grad_norm": 5.92832612991333,
"learning_rate": 3.888125e-05,
"loss": 1.6321,
"step": 1780
},
{
"epoch": 0.22375,
"grad_norm": 3.242619037628174,
"learning_rate": 3.8818750000000005e-05,
"loss": 1.5837,
"step": 1790
},
{
"epoch": 0.225,
"grad_norm": 6.22261381149292,
"learning_rate": 3.875625e-05,
"loss": 1.5758,
"step": 1800
},
{
"epoch": 0.22625,
"grad_norm": 4.630067348480225,
"learning_rate": 3.869375e-05,
"loss": 1.379,
"step": 1810
},
{
"epoch": 0.2275,
"grad_norm": 5.385432720184326,
"learning_rate": 3.863125e-05,
"loss": 1.6319,
"step": 1820
},
{
"epoch": 0.22875,
"grad_norm": 3.7115695476531982,
"learning_rate": 3.8568750000000006e-05,
"loss": 1.5246,
"step": 1830
},
{
"epoch": 0.23,
"grad_norm": 18.826236724853516,
"learning_rate": 3.850625e-05,
"loss": 1.5025,
"step": 1840
},
{
"epoch": 0.23125,
"grad_norm": 4.666762828826904,
"learning_rate": 3.844375e-05,
"loss": 1.5511,
"step": 1850
},
{
"epoch": 0.2325,
"grad_norm": 4.544824600219727,
"learning_rate": 3.838125e-05,
"loss": 1.5163,
"step": 1860
},
{
"epoch": 0.23375,
"grad_norm": 6.902198314666748,
"learning_rate": 3.8318750000000006e-05,
"loss": 1.5313,
"step": 1870
},
{
"epoch": 0.235,
"grad_norm": 5.414902687072754,
"learning_rate": 3.8256250000000005e-05,
"loss": 1.4708,
"step": 1880
},
{
"epoch": 0.23625,
"grad_norm": 7.254164218902588,
"learning_rate": 3.8193750000000003e-05,
"loss": 1.4522,
"step": 1890
},
{
"epoch": 0.2375,
"grad_norm": 2.95312237739563,
"learning_rate": 3.813125e-05,
"loss": 1.3787,
"step": 1900
},
{
"epoch": 0.23875,
"grad_norm": 3.5326123237609863,
"learning_rate": 3.806875e-05,
"loss": 1.3401,
"step": 1910
},
{
"epoch": 0.24,
"grad_norm": 4.167004585266113,
"learning_rate": 3.8006250000000006e-05,
"loss": 1.4521,
"step": 1920
},
{
"epoch": 0.24125,
"grad_norm": 4.5269646644592285,
"learning_rate": 3.794375e-05,
"loss": 1.5217,
"step": 1930
},
{
"epoch": 0.2425,
"grad_norm": 3.9108593463897705,
"learning_rate": 3.788125e-05,
"loss": 1.4758,
"step": 1940
},
{
"epoch": 0.24375,
"grad_norm": 5.435783386230469,
"learning_rate": 3.781875e-05,
"loss": 1.5579,
"step": 1950
},
{
"epoch": 0.245,
"grad_norm": 4.055588722229004,
"learning_rate": 3.7756250000000006e-05,
"loss": 1.4664,
"step": 1960
},
{
"epoch": 0.24625,
"grad_norm": 3.8323545455932617,
"learning_rate": 3.769375e-05,
"loss": 1.4167,
"step": 1970
},
{
"epoch": 0.2475,
"grad_norm": 3.928833484649658,
"learning_rate": 3.763125e-05,
"loss": 1.538,
"step": 1980
},
{
"epoch": 0.24875,
"grad_norm": 5.081177711486816,
"learning_rate": 3.756875e-05,
"loss": 1.5143,
"step": 1990
},
{
"epoch": 0.25,
"grad_norm": 4.005138874053955,
"learning_rate": 3.750625000000001e-05,
"loss": 1.4391,
"step": 2000
},
{
"epoch": 0.25125,
"grad_norm": 7.569023132324219,
"learning_rate": 3.744375e-05,
"loss": 1.5854,
"step": 2010
},
{
"epoch": 0.2525,
"grad_norm": 3.234931230545044,
"learning_rate": 3.7381250000000004e-05,
"loss": 1.5288,
"step": 2020
},
{
"epoch": 0.25375,
"grad_norm": 4.21964168548584,
"learning_rate": 3.731875e-05,
"loss": 1.5045,
"step": 2030
},
{
"epoch": 0.255,
"grad_norm": 5.102002143859863,
"learning_rate": 3.725625e-05,
"loss": 1.5022,
"step": 2040
},
{
"epoch": 0.25625,
"grad_norm": 7.323031902313232,
"learning_rate": 3.719375e-05,
"loss": 1.8256,
"step": 2050
},
{
"epoch": 0.2575,
"grad_norm": 3.650108575820923,
"learning_rate": 3.713125e-05,
"loss": 1.3734,
"step": 2060
},
{
"epoch": 0.25875,
"grad_norm": 3.9725680351257324,
"learning_rate": 3.706875e-05,
"loss": 1.6534,
"step": 2070
},
{
"epoch": 0.26,
"grad_norm": 3.5736091136932373,
"learning_rate": 3.700625e-05,
"loss": 1.5137,
"step": 2080
},
{
"epoch": 0.26125,
"grad_norm": 3.700044631958008,
"learning_rate": 3.694375e-05,
"loss": 1.4202,
"step": 2090
},
{
"epoch": 0.2625,
"grad_norm": 4.076671600341797,
"learning_rate": 3.688125e-05,
"loss": 1.5527,
"step": 2100
},
{
"epoch": 0.26375,
"grad_norm": 4.140468597412109,
"learning_rate": 3.6818750000000004e-05,
"loss": 1.4233,
"step": 2110
},
{
"epoch": 0.265,
"grad_norm": 4.703122615814209,
"learning_rate": 3.675625e-05,
"loss": 1.4997,
"step": 2120
},
{
"epoch": 0.26625,
"grad_norm": 5.30742883682251,
"learning_rate": 3.669375000000001e-05,
"loss": 1.3517,
"step": 2130
},
{
"epoch": 0.2675,
"grad_norm": 6.19927453994751,
"learning_rate": 3.663125e-05,
"loss": 1.4677,
"step": 2140
},
{
"epoch": 0.26875,
"grad_norm": 5.471877098083496,
"learning_rate": 3.6568750000000005e-05,
"loss": 1.3718,
"step": 2150
},
{
"epoch": 0.27,
"grad_norm": 5.80817985534668,
"learning_rate": 3.650625e-05,
"loss": 1.5756,
"step": 2160
},
{
"epoch": 0.27125,
"grad_norm": 4.3927717208862305,
"learning_rate": 3.644375e-05,
"loss": 1.4272,
"step": 2170
},
{
"epoch": 0.2725,
"grad_norm": 4.650943279266357,
"learning_rate": 3.638125e-05,
"loss": 1.5826,
"step": 2180
},
{
"epoch": 0.27375,
"grad_norm": 2.954941511154175,
"learning_rate": 3.631875e-05,
"loss": 1.4817,
"step": 2190
},
{
"epoch": 0.275,
"grad_norm": 3.7205264568328857,
"learning_rate": 3.6256250000000004e-05,
"loss": 1.3662,
"step": 2200
},
{
"epoch": 0.27625,
"grad_norm": 4.387423038482666,
"learning_rate": 3.619375e-05,
"loss": 1.4639,
"step": 2210
},
{
"epoch": 0.2775,
"grad_norm": 2.9313790798187256,
"learning_rate": 3.613125e-05,
"loss": 1.5223,
"step": 2220
},
{
"epoch": 0.27875,
"grad_norm": 2.299884796142578,
"learning_rate": 3.606875e-05,
"loss": 1.4924,
"step": 2230
},
{
"epoch": 0.28,
"grad_norm": 6.7828497886657715,
"learning_rate": 3.6006250000000004e-05,
"loss": 1.4188,
"step": 2240
},
{
"epoch": 0.28125,
"grad_norm": 3.3752660751342773,
"learning_rate": 3.594375e-05,
"loss": 1.4604,
"step": 2250
},
{
"epoch": 0.2825,
"grad_norm": 6.302489757537842,
"learning_rate": 3.588125e-05,
"loss": 1.4211,
"step": 2260
},
{
"epoch": 0.28375,
"grad_norm": 26.017749786376953,
"learning_rate": 3.581875e-05,
"loss": 1.415,
"step": 2270
},
{
"epoch": 0.285,
"grad_norm": 3.502596139907837,
"learning_rate": 3.5756250000000005e-05,
"loss": 1.4949,
"step": 2280
},
{
"epoch": 0.28625,
"grad_norm": 5.614380359649658,
"learning_rate": 3.5693750000000004e-05,
"loss": 1.4836,
"step": 2290
},
{
"epoch": 0.2875,
"grad_norm": 3.932626485824585,
"learning_rate": 3.563125e-05,
"loss": 1.3806,
"step": 2300
},
{
"epoch": 0.28875,
"grad_norm": 2.99294114112854,
"learning_rate": 3.556875e-05,
"loss": 1.5548,
"step": 2310
},
{
"epoch": 0.29,
"grad_norm": 4.608867168426514,
"learning_rate": 3.550625e-05,
"loss": 1.4296,
"step": 2320
},
{
"epoch": 0.29125,
"grad_norm": 5.113489627838135,
"learning_rate": 3.5443750000000004e-05,
"loss": 1.3717,
"step": 2330
},
{
"epoch": 0.2925,
"grad_norm": 3.7373545169830322,
"learning_rate": 3.5381249999999996e-05,
"loss": 1.4817,
"step": 2340
},
{
"epoch": 0.29375,
"grad_norm": 5.348628997802734,
"learning_rate": 3.531875e-05,
"loss": 1.4436,
"step": 2350
},
{
"epoch": 0.295,
"grad_norm": 4.982232093811035,
"learning_rate": 3.525625e-05,
"loss": 1.3956,
"step": 2360
},
{
"epoch": 0.29625,
"grad_norm": 9.024496078491211,
"learning_rate": 3.5193750000000005e-05,
"loss": 1.4792,
"step": 2370
},
{
"epoch": 0.2975,
"grad_norm": 4.083111763000488,
"learning_rate": 3.5131250000000004e-05,
"loss": 1.5715,
"step": 2380
},
{
"epoch": 0.29875,
"grad_norm": 3.5770645141601562,
"learning_rate": 3.506875e-05,
"loss": 1.4707,
"step": 2390
},
{
"epoch": 0.3,
"grad_norm": 4.641442775726318,
"learning_rate": 3.500625e-05,
"loss": 1.362,
"step": 2400
},
{
"epoch": 0.30125,
"grad_norm": 3.524186849594116,
"learning_rate": 3.4943750000000006e-05,
"loss": 1.4686,
"step": 2410
},
{
"epoch": 0.3025,
"grad_norm": 4.681451797485352,
"learning_rate": 3.4881250000000004e-05,
"loss": 1.4739,
"step": 2420
},
{
"epoch": 0.30375,
"grad_norm": 5.3212785720825195,
"learning_rate": 3.481875e-05,
"loss": 1.5322,
"step": 2430
},
{
"epoch": 0.305,
"grad_norm": 4.161744117736816,
"learning_rate": 3.475625e-05,
"loss": 1.3801,
"step": 2440
},
{
"epoch": 0.30625,
"grad_norm": 4.7510151863098145,
"learning_rate": 3.469375e-05,
"loss": 1.4092,
"step": 2450
},
{
"epoch": 0.3075,
"grad_norm": 3.8327901363372803,
"learning_rate": 3.4631250000000005e-05,
"loss": 1.4072,
"step": 2460
},
{
"epoch": 0.30875,
"grad_norm": 3.7004222869873047,
"learning_rate": 3.456875e-05,
"loss": 1.4626,
"step": 2470
},
{
"epoch": 0.31,
"grad_norm": 4.66420841217041,
"learning_rate": 3.450625e-05,
"loss": 1.3397,
"step": 2480
},
{
"epoch": 0.31125,
"grad_norm": 5.986914157867432,
"learning_rate": 3.444375e-05,
"loss": 1.4815,
"step": 2490
},
{
"epoch": 0.3125,
"grad_norm": 3.43625807762146,
"learning_rate": 3.4381250000000006e-05,
"loss": 1.4322,
"step": 2500
},
{
"epoch": 0.31375,
"grad_norm": 4.031944274902344,
"learning_rate": 3.431875e-05,
"loss": 1.4035,
"step": 2510
},
{
"epoch": 0.315,
"grad_norm": 3.607931613922119,
"learning_rate": 3.425625e-05,
"loss": 1.5213,
"step": 2520
},
{
"epoch": 0.31625,
"grad_norm": 5.655627727508545,
"learning_rate": 3.419375e-05,
"loss": 1.4377,
"step": 2530
},
{
"epoch": 0.3175,
"grad_norm": 4.682887077331543,
"learning_rate": 3.4131250000000006e-05,
"loss": 1.2967,
"step": 2540
},
{
"epoch": 0.31875,
"grad_norm": 4.796265125274658,
"learning_rate": 3.406875e-05,
"loss": 1.3274,
"step": 2550
},
{
"epoch": 0.32,
"grad_norm": 3.3428382873535156,
"learning_rate": 3.400625e-05,
"loss": 1.3762,
"step": 2560
},
{
"epoch": 0.32125,
"grad_norm": 4.231629371643066,
"learning_rate": 3.394375e-05,
"loss": 1.3922,
"step": 2570
},
{
"epoch": 0.3225,
"grad_norm": 3.5596585273742676,
"learning_rate": 3.388125e-05,
"loss": 1.3427,
"step": 2580
},
{
"epoch": 0.32375,
"grad_norm": 4.1195783615112305,
"learning_rate": 3.381875e-05,
"loss": 1.362,
"step": 2590
},
{
"epoch": 0.325,
"grad_norm": 4.000838279724121,
"learning_rate": 3.375625e-05,
"loss": 1.4245,
"step": 2600
},
{
"epoch": 0.32625,
"grad_norm": 5.7205939292907715,
"learning_rate": 3.369375e-05,
"loss": 1.526,
"step": 2610
},
{
"epoch": 0.3275,
"grad_norm": 4.729959011077881,
"learning_rate": 3.363125e-05,
"loss": 1.3265,
"step": 2620
},
{
"epoch": 0.32875,
"grad_norm": 3.846036434173584,
"learning_rate": 3.3568750000000006e-05,
"loss": 1.3523,
"step": 2630
},
{
"epoch": 0.33,
"grad_norm": 3.625514507293701,
"learning_rate": 3.350625e-05,
"loss": 1.3483,
"step": 2640
},
{
"epoch": 0.33125,
"grad_norm": 3.985917568206787,
"learning_rate": 3.344375e-05,
"loss": 1.3951,
"step": 2650
},
{
"epoch": 0.3325,
"grad_norm": 5.314172267913818,
"learning_rate": 3.338125e-05,
"loss": 1.4334,
"step": 2660
},
{
"epoch": 0.33375,
"grad_norm": 5.172106742858887,
"learning_rate": 3.331875000000001e-05,
"loss": 1.333,
"step": 2670
},
{
"epoch": 0.335,
"grad_norm": 3.5582571029663086,
"learning_rate": 3.325625e-05,
"loss": 1.4376,
"step": 2680
},
{
"epoch": 0.33625,
"grad_norm": 3.68792462348938,
"learning_rate": 3.3193750000000004e-05,
"loss": 1.4401,
"step": 2690
},
{
"epoch": 0.3375,
"grad_norm": 3.366680860519409,
"learning_rate": 3.313125e-05,
"loss": 1.3825,
"step": 2700
},
{
"epoch": 0.33875,
"grad_norm": 4.318718910217285,
"learning_rate": 3.306875e-05,
"loss": 1.4478,
"step": 2710
},
{
"epoch": 0.34,
"grad_norm": 5.707590103149414,
"learning_rate": 3.300625e-05,
"loss": 1.4966,
"step": 2720
},
{
"epoch": 0.34125,
"grad_norm": 3.624086380004883,
"learning_rate": 3.294375e-05,
"loss": 1.419,
"step": 2730
},
{
"epoch": 0.3425,
"grad_norm": 4.7848711013793945,
"learning_rate": 3.288125e-05,
"loss": 1.29,
"step": 2740
},
{
"epoch": 0.34375,
"grad_norm": 4.0258612632751465,
"learning_rate": 3.281875e-05,
"loss": 1.479,
"step": 2750
},
{
"epoch": 0.345,
"grad_norm": 4.456843852996826,
"learning_rate": 3.275625e-05,
"loss": 1.4191,
"step": 2760
},
{
"epoch": 0.34625,
"grad_norm": 4.327670097351074,
"learning_rate": 3.269375e-05,
"loss": 1.3777,
"step": 2770
},
{
"epoch": 0.3475,
"grad_norm": 3.2591614723205566,
"learning_rate": 3.2631250000000004e-05,
"loss": 1.2713,
"step": 2780
},
{
"epoch": 0.34875,
"grad_norm": 4.323492527008057,
"learning_rate": 3.256875e-05,
"loss": 1.3162,
"step": 2790
},
{
"epoch": 0.35,
"grad_norm": 4.254138946533203,
"learning_rate": 3.250625e-05,
"loss": 1.5309,
"step": 2800
},
{
"epoch": 0.35125,
"grad_norm": 3.481466054916382,
"learning_rate": 3.244375e-05,
"loss": 1.6049,
"step": 2810
},
{
"epoch": 0.3525,
"grad_norm": 3.48063063621521,
"learning_rate": 3.2381250000000004e-05,
"loss": 1.5601,
"step": 2820
},
{
"epoch": 0.35375,
"grad_norm": 3.9832093715667725,
"learning_rate": 3.231875e-05,
"loss": 1.4256,
"step": 2830
},
{
"epoch": 0.355,
"grad_norm": 3.842890739440918,
"learning_rate": 3.225625e-05,
"loss": 1.3579,
"step": 2840
},
{
"epoch": 0.35625,
"grad_norm": 4.633380889892578,
"learning_rate": 3.219375e-05,
"loss": 1.3881,
"step": 2850
},
{
"epoch": 0.3575,
"grad_norm": 6.086498260498047,
"learning_rate": 3.213125e-05,
"loss": 1.4811,
"step": 2860
},
{
"epoch": 0.35875,
"grad_norm": 4.031968593597412,
"learning_rate": 3.2068750000000004e-05,
"loss": 1.3418,
"step": 2870
},
{
"epoch": 0.36,
"grad_norm": 4.838329315185547,
"learning_rate": 3.200625e-05,
"loss": 1.4378,
"step": 2880
},
{
"epoch": 0.36125,
"grad_norm": 3.499248743057251,
"learning_rate": 3.194375e-05,
"loss": 1.2617,
"step": 2890
},
{
"epoch": 0.3625,
"grad_norm": 4.68066930770874,
"learning_rate": 3.188125e-05,
"loss": 1.4701,
"step": 2900
},
{
"epoch": 0.36375,
"grad_norm": 3.8823728561401367,
"learning_rate": 3.1818750000000004e-05,
"loss": 1.364,
"step": 2910
},
{
"epoch": 0.365,
"grad_norm": 3.7089786529541016,
"learning_rate": 3.175625e-05,
"loss": 1.4843,
"step": 2920
},
{
"epoch": 0.36625,
"grad_norm": 3.1307108402252197,
"learning_rate": 3.169375e-05,
"loss": 1.3714,
"step": 2930
},
{
"epoch": 0.3675,
"grad_norm": 4.351153373718262,
"learning_rate": 3.163125e-05,
"loss": 1.4312,
"step": 2940
},
{
"epoch": 0.36875,
"grad_norm": 4.765021800994873,
"learning_rate": 3.1568750000000005e-05,
"loss": 1.6201,
"step": 2950
},
{
"epoch": 0.37,
"grad_norm": 3.538285493850708,
"learning_rate": 3.1506250000000003e-05,
"loss": 1.3805,
"step": 2960
},
{
"epoch": 0.37125,
"grad_norm": 4.136841773986816,
"learning_rate": 3.144375e-05,
"loss": 1.3558,
"step": 2970
},
{
"epoch": 0.3725,
"grad_norm": 4.298130512237549,
"learning_rate": 3.138125e-05,
"loss": 1.5242,
"step": 2980
},
{
"epoch": 0.37375,
"grad_norm": 3.6436102390289307,
"learning_rate": 3.131875e-05,
"loss": 1.3199,
"step": 2990
},
{
"epoch": 0.375,
"grad_norm": 4.527806758880615,
"learning_rate": 3.1256250000000004e-05,
"loss": 1.531,
"step": 3000
},
{
"epoch": 0.37625,
"grad_norm": 5.912485122680664,
"learning_rate": 3.119375e-05,
"loss": 1.444,
"step": 3010
},
{
"epoch": 0.3775,
"grad_norm": 5.101160049438477,
"learning_rate": 3.113125e-05,
"loss": 1.5674,
"step": 3020
},
{
"epoch": 0.37875,
"grad_norm": 5.113125324249268,
"learning_rate": 3.106875e-05,
"loss": 1.3053,
"step": 3030
},
{
"epoch": 0.38,
"grad_norm": 3.990057945251465,
"learning_rate": 3.1006250000000005e-05,
"loss": 1.2181,
"step": 3040
},
{
"epoch": 0.38125,
"grad_norm": 6.3468780517578125,
"learning_rate": 3.0943749999999997e-05,
"loss": 1.4149,
"step": 3050
},
{
"epoch": 0.3825,
"grad_norm": 4.712606430053711,
"learning_rate": 3.088125e-05,
"loss": 1.4128,
"step": 3060
},
{
"epoch": 0.38375,
"grad_norm": 5.313744068145752,
"learning_rate": 3.081875e-05,
"loss": 1.4877,
"step": 3070
},
{
"epoch": 0.385,
"grad_norm": 4.46605110168457,
"learning_rate": 3.0756250000000006e-05,
"loss": 1.4314,
"step": 3080
},
{
"epoch": 0.38625,
"grad_norm": 3.6377127170562744,
"learning_rate": 3.069375e-05,
"loss": 1.3966,
"step": 3090
},
{
"epoch": 0.3875,
"grad_norm": 4.34388542175293,
"learning_rate": 3.063125e-05,
"loss": 1.2802,
"step": 3100
},
{
"epoch": 0.38875,
"grad_norm": 11.9617338180542,
"learning_rate": 3.056875e-05,
"loss": 1.4014,
"step": 3110
},
{
"epoch": 0.39,
"grad_norm": 3.758890390396118,
"learning_rate": 3.0506250000000003e-05,
"loss": 1.5544,
"step": 3120
},
{
"epoch": 0.39125,
"grad_norm": 4.928178310394287,
"learning_rate": 3.0443750000000005e-05,
"loss": 1.4504,
"step": 3130
},
{
"epoch": 0.3925,
"grad_norm": 4.8397722244262695,
"learning_rate": 3.038125e-05,
"loss": 1.3976,
"step": 3140
},
{
"epoch": 0.39375,
"grad_norm": 4.685599327087402,
"learning_rate": 3.0318750000000002e-05,
"loss": 1.3353,
"step": 3150
},
{
"epoch": 0.395,
"grad_norm": 6.802610397338867,
"learning_rate": 3.0256250000000004e-05,
"loss": 1.3951,
"step": 3160
},
{
"epoch": 0.39625,
"grad_norm": 5.310746669769287,
"learning_rate": 3.0193750000000005e-05,
"loss": 1.3754,
"step": 3170
},
{
"epoch": 0.3975,
"grad_norm": 3.733003616333008,
"learning_rate": 3.013125e-05,
"loss": 1.4712,
"step": 3180
},
{
"epoch": 0.39875,
"grad_norm": 3.518083333969116,
"learning_rate": 3.0068750000000002e-05,
"loss": 1.425,
"step": 3190
},
{
"epoch": 0.4,
"grad_norm": 3.223477840423584,
"learning_rate": 3.000625e-05,
"loss": 1.4198,
"step": 3200
},
{
"epoch": 0.40125,
"grad_norm": 6.2975029945373535,
"learning_rate": 2.9943750000000003e-05,
"loss": 1.4008,
"step": 3210
},
{
"epoch": 0.4025,
"grad_norm": 4.495896339416504,
"learning_rate": 2.9881249999999998e-05,
"loss": 1.608,
"step": 3220
},
{
"epoch": 0.40375,
"grad_norm": 3.413543701171875,
"learning_rate": 2.981875e-05,
"loss": 1.4011,
"step": 3230
},
{
"epoch": 0.405,
"grad_norm": 3.756793260574341,
"learning_rate": 2.975625e-05,
"loss": 1.5263,
"step": 3240
},
{
"epoch": 0.40625,
"grad_norm": 3.6322548389434814,
"learning_rate": 2.9693750000000003e-05,
"loss": 1.3671,
"step": 3250
},
{
"epoch": 0.4075,
"grad_norm": 3.74729061126709,
"learning_rate": 2.963125e-05,
"loss": 1.3878,
"step": 3260
},
{
"epoch": 0.40875,
"grad_norm": 3.154021978378296,
"learning_rate": 2.956875e-05,
"loss": 1.2886,
"step": 3270
},
{
"epoch": 0.41,
"grad_norm": 3.127899408340454,
"learning_rate": 2.9506250000000002e-05,
"loss": 1.542,
"step": 3280
},
{
"epoch": 0.41125,
"grad_norm": 2.8965258598327637,
"learning_rate": 2.9443750000000004e-05,
"loss": 1.2855,
"step": 3290
},
{
"epoch": 0.4125,
"grad_norm": 3.9691522121429443,
"learning_rate": 2.938125e-05,
"loss": 1.331,
"step": 3300
},
{
"epoch": 0.41375,
"grad_norm": 4.497001647949219,
"learning_rate": 2.931875e-05,
"loss": 1.2743,
"step": 3310
},
{
"epoch": 0.415,
"grad_norm": 4.911508083343506,
"learning_rate": 2.9256250000000003e-05,
"loss": 1.4838,
"step": 3320
},
{
"epoch": 0.41625,
"grad_norm": 4.168112754821777,
"learning_rate": 2.919375e-05,
"loss": 1.5264,
"step": 3330
},
{
"epoch": 0.4175,
"grad_norm": 4.177130699157715,
"learning_rate": 2.913125e-05,
"loss": 1.4769,
"step": 3340
},
{
"epoch": 0.41875,
"grad_norm": 3.3406238555908203,
"learning_rate": 2.9068750000000002e-05,
"loss": 1.3252,
"step": 3350
},
{
"epoch": 0.42,
"grad_norm": 3.403542995452881,
"learning_rate": 2.900625e-05,
"loss": 1.3591,
"step": 3360
},
{
"epoch": 0.42125,
"grad_norm": 3.7915780544281006,
"learning_rate": 2.8943750000000002e-05,
"loss": 1.4208,
"step": 3370
},
{
"epoch": 0.4225,
"grad_norm": 4.730687618255615,
"learning_rate": 2.8881250000000004e-05,
"loss": 1.3171,
"step": 3380
},
{
"epoch": 0.42375,
"grad_norm": 3.976198673248291,
"learning_rate": 2.881875e-05,
"loss": 1.3666,
"step": 3390
},
{
"epoch": 0.425,
"grad_norm": 3.474154472351074,
"learning_rate": 2.875625e-05,
"loss": 1.5027,
"step": 3400
},
{
"epoch": 0.42625,
"grad_norm": 4.456624984741211,
"learning_rate": 2.8693750000000003e-05,
"loss": 1.3966,
"step": 3410
},
{
"epoch": 0.4275,
"grad_norm": 3.5991241931915283,
"learning_rate": 2.8631250000000005e-05,
"loss": 1.3464,
"step": 3420
},
{
"epoch": 0.42875,
"grad_norm": 3.397467613220215,
"learning_rate": 2.856875e-05,
"loss": 1.3021,
"step": 3430
},
{
"epoch": 0.43,
"grad_norm": 4.232719898223877,
"learning_rate": 2.8506250000000002e-05,
"loss": 1.366,
"step": 3440
},
{
"epoch": 0.43125,
"grad_norm": 4.262751579284668,
"learning_rate": 2.8443750000000004e-05,
"loss": 1.4335,
"step": 3450
},
{
"epoch": 0.4325,
"grad_norm": 4.169719696044922,
"learning_rate": 2.8381250000000002e-05,
"loss": 1.403,
"step": 3460
},
{
"epoch": 0.43375,
"grad_norm": 4.9819159507751465,
"learning_rate": 2.831875e-05,
"loss": 1.3418,
"step": 3470
},
{
"epoch": 0.435,
"grad_norm": 3.556701421737671,
"learning_rate": 2.8256250000000002e-05,
"loss": 1.3712,
"step": 3480
},
{
"epoch": 0.43625,
"grad_norm": 3.9347524642944336,
"learning_rate": 2.819375e-05,
"loss": 1.5704,
"step": 3490
},
{
"epoch": 0.4375,
"grad_norm": 3.451732873916626,
"learning_rate": 2.8131250000000003e-05,
"loss": 1.3976,
"step": 3500
},
{
"epoch": 0.43875,
"grad_norm": 3.0148160457611084,
"learning_rate": 2.8068749999999998e-05,
"loss": 1.3539,
"step": 3510
},
{
"epoch": 0.44,
"grad_norm": 3.8727331161499023,
"learning_rate": 2.800625e-05,
"loss": 1.4348,
"step": 3520
},
{
"epoch": 0.44125,
"grad_norm": 4.594605445861816,
"learning_rate": 2.794375e-05,
"loss": 1.2116,
"step": 3530
},
{
"epoch": 0.4425,
"grad_norm": 4.683310031890869,
"learning_rate": 2.7881250000000003e-05,
"loss": 1.3442,
"step": 3540
},
{
"epoch": 0.44375,
"grad_norm": 3.2116706371307373,
"learning_rate": 2.781875e-05,
"loss": 1.3959,
"step": 3550
},
{
"epoch": 0.445,
"grad_norm": 3.98592472076416,
"learning_rate": 2.775625e-05,
"loss": 1.3782,
"step": 3560
},
{
"epoch": 0.44625,
"grad_norm": 2.84287691116333,
"learning_rate": 2.7693750000000002e-05,
"loss": 1.2789,
"step": 3570
},
{
"epoch": 0.4475,
"grad_norm": 2.849111795425415,
"learning_rate": 2.7631250000000004e-05,
"loss": 1.3003,
"step": 3580
},
{
"epoch": 0.44875,
"grad_norm": 3.8393287658691406,
"learning_rate": 2.756875e-05,
"loss": 1.422,
"step": 3590
},
{
"epoch": 0.45,
"grad_norm": 3.058866024017334,
"learning_rate": 2.750625e-05,
"loss": 1.4432,
"step": 3600
},
{
"epoch": 0.45125,
"grad_norm": 4.536365032196045,
"learning_rate": 2.7443750000000003e-05,
"loss": 1.4127,
"step": 3610
},
{
"epoch": 0.4525,
"grad_norm": 3.964500904083252,
"learning_rate": 2.738125e-05,
"loss": 1.4182,
"step": 3620
},
{
"epoch": 0.45375,
"grad_norm": 4.288209438323975,
"learning_rate": 2.7318750000000003e-05,
"loss": 1.2563,
"step": 3630
},
{
"epoch": 0.455,
"grad_norm": 3.3023056983947754,
"learning_rate": 2.725625e-05,
"loss": 1.3547,
"step": 3640
},
{
"epoch": 0.45625,
"grad_norm": 3.554124116897583,
"learning_rate": 2.719375e-05,
"loss": 1.4073,
"step": 3650
},
{
"epoch": 0.4575,
"grad_norm": 3.063807725906372,
"learning_rate": 2.7131250000000002e-05,
"loss": 1.4365,
"step": 3660
},
{
"epoch": 0.45875,
"grad_norm": 8.845410346984863,
"learning_rate": 2.7068750000000004e-05,
"loss": 1.2819,
"step": 3670
},
{
"epoch": 0.46,
"grad_norm": 4.73734712600708,
"learning_rate": 2.700625e-05,
"loss": 1.2654,
"step": 3680
},
{
"epoch": 0.46125,
"grad_norm": 3.227581262588501,
"learning_rate": 2.694375e-05,
"loss": 1.4912,
"step": 3690
},
{
"epoch": 0.4625,
"grad_norm": 3.327014923095703,
"learning_rate": 2.6881250000000003e-05,
"loss": 1.2713,
"step": 3700
},
{
"epoch": 0.46375,
"grad_norm": 4.055096626281738,
"learning_rate": 2.6818750000000005e-05,
"loss": 1.3,
"step": 3710
},
{
"epoch": 0.465,
"grad_norm": 3.03869366645813,
"learning_rate": 2.675625e-05,
"loss": 1.213,
"step": 3720
},
{
"epoch": 0.46625,
"grad_norm": 2.9507339000701904,
"learning_rate": 2.6693750000000002e-05,
"loss": 1.2247,
"step": 3730
},
{
"epoch": 0.4675,
"grad_norm": 3.0396885871887207,
"learning_rate": 2.6631250000000004e-05,
"loss": 1.3832,
"step": 3740
},
{
"epoch": 0.46875,
"grad_norm": 3.567950963973999,
"learning_rate": 2.6568750000000002e-05,
"loss": 1.4424,
"step": 3750
},
{
"epoch": 0.47,
"grad_norm": 3.8607802391052246,
"learning_rate": 2.650625e-05,
"loss": 1.3011,
"step": 3760
},
{
"epoch": 0.47125,
"grad_norm": 4.078023433685303,
"learning_rate": 2.644375e-05,
"loss": 1.4831,
"step": 3770
},
{
"epoch": 0.4725,
"grad_norm": 3.342250347137451,
"learning_rate": 2.638125e-05,
"loss": 1.3761,
"step": 3780
},
{
"epoch": 0.47375,
"grad_norm": 2.861462354660034,
"learning_rate": 2.6318750000000003e-05,
"loss": 1.3057,
"step": 3790
},
{
"epoch": 0.475,
"grad_norm": 4.583399772644043,
"learning_rate": 2.6256249999999998e-05,
"loss": 1.3889,
"step": 3800
},
{
"epoch": 0.47625,
"grad_norm": 4.215075969696045,
"learning_rate": 2.619375e-05,
"loss": 1.3721,
"step": 3810
},
{
"epoch": 0.4775,
"grad_norm": 4.96607780456543,
"learning_rate": 2.613125e-05,
"loss": 1.2088,
"step": 3820
},
{
"epoch": 0.47875,
"grad_norm": 4.239419937133789,
"learning_rate": 2.6068750000000003e-05,
"loss": 1.3942,
"step": 3830
},
{
"epoch": 0.48,
"grad_norm": 4.5754289627075195,
"learning_rate": 2.600625e-05,
"loss": 1.3388,
"step": 3840
},
{
"epoch": 0.48125,
"grad_norm": 4.335231781005859,
"learning_rate": 2.594375e-05,
"loss": 1.3362,
"step": 3850
},
{
"epoch": 0.4825,
"grad_norm": 3.0417494773864746,
"learning_rate": 2.5881250000000002e-05,
"loss": 1.2406,
"step": 3860
},
{
"epoch": 0.48375,
"grad_norm": 3.8516695499420166,
"learning_rate": 2.5818750000000004e-05,
"loss": 1.4081,
"step": 3870
},
{
"epoch": 0.485,
"grad_norm": 3.9781277179718018,
"learning_rate": 2.5756250000000003e-05,
"loss": 1.3636,
"step": 3880
},
{
"epoch": 0.48625,
"grad_norm": 3.7591724395751953,
"learning_rate": 2.569375e-05,
"loss": 1.4053,
"step": 3890
},
{
"epoch": 0.4875,
"grad_norm": 4.294608116149902,
"learning_rate": 2.563125e-05,
"loss": 1.4465,
"step": 3900
},
{
"epoch": 0.48875,
"grad_norm": 2.709139108657837,
"learning_rate": 2.556875e-05,
"loss": 1.469,
"step": 3910
},
{
"epoch": 0.49,
"grad_norm": 4.929773807525635,
"learning_rate": 2.5506250000000003e-05,
"loss": 1.2714,
"step": 3920
},
{
"epoch": 0.49125,
"grad_norm": 3.6654865741729736,
"learning_rate": 2.544375e-05,
"loss": 1.4664,
"step": 3930
},
{
"epoch": 0.4925,
"grad_norm": 3.2190186977386475,
"learning_rate": 2.538125e-05,
"loss": 1.4203,
"step": 3940
},
{
"epoch": 0.49375,
"grad_norm": 6.021998882293701,
"learning_rate": 2.5318750000000002e-05,
"loss": 1.3446,
"step": 3950
},
{
"epoch": 0.495,
"grad_norm": 4.783326148986816,
"learning_rate": 2.5256250000000004e-05,
"loss": 1.303,
"step": 3960
},
{
"epoch": 0.49625,
"grad_norm": 4.282038688659668,
"learning_rate": 2.519375e-05,
"loss": 1.4116,
"step": 3970
},
{
"epoch": 0.4975,
"grad_norm": 3.5954740047454834,
"learning_rate": 2.513125e-05,
"loss": 1.4053,
"step": 3980
},
{
"epoch": 0.49875,
"grad_norm": 4.234996795654297,
"learning_rate": 2.5068750000000003e-05,
"loss": 1.3921,
"step": 3990
},
{
"epoch": 0.5,
"grad_norm": 4.2734222412109375,
"learning_rate": 2.5006250000000005e-05,
"loss": 1.3818,
"step": 4000
},
{
"epoch": 0.50125,
"grad_norm": 2.8631579875946045,
"learning_rate": 2.4943750000000003e-05,
"loss": 1.1995,
"step": 4010
},
{
"epoch": 0.5025,
"grad_norm": 2.787076950073242,
"learning_rate": 2.4881250000000002e-05,
"loss": 1.4324,
"step": 4020
},
{
"epoch": 0.50375,
"grad_norm": 5.550398826599121,
"learning_rate": 2.481875e-05,
"loss": 1.3689,
"step": 4030
},
{
"epoch": 0.505,
"grad_norm": 3.542635679244995,
"learning_rate": 2.475625e-05,
"loss": 1.2885,
"step": 4040
},
{
"epoch": 0.50625,
"grad_norm": 6.562772750854492,
"learning_rate": 2.469375e-05,
"loss": 1.3545,
"step": 4050
},
{
"epoch": 0.5075,
"grad_norm": 4.4956889152526855,
"learning_rate": 2.463125e-05,
"loss": 1.3232,
"step": 4060
},
{
"epoch": 0.50875,
"grad_norm": 5.023864269256592,
"learning_rate": 2.456875e-05,
"loss": 1.2954,
"step": 4070
},
{
"epoch": 0.51,
"grad_norm": 3.3570520877838135,
"learning_rate": 2.450625e-05,
"loss": 1.2619,
"step": 4080
},
{
"epoch": 0.51125,
"grad_norm": 3.6277055740356445,
"learning_rate": 2.444375e-05,
"loss": 1.4095,
"step": 4090
},
{
"epoch": 0.5125,
"grad_norm": 3.271885871887207,
"learning_rate": 2.438125e-05,
"loss": 1.4898,
"step": 4100
},
{
"epoch": 0.51375,
"grad_norm": 3.166588306427002,
"learning_rate": 2.431875e-05,
"loss": 1.2367,
"step": 4110
},
{
"epoch": 0.515,
"grad_norm": 3.730806589126587,
"learning_rate": 2.425625e-05,
"loss": 1.2928,
"step": 4120
},
{
"epoch": 0.51625,
"grad_norm": 3.6622071266174316,
"learning_rate": 2.4193750000000002e-05,
"loss": 1.3213,
"step": 4130
},
{
"epoch": 0.5175,
"grad_norm": 3.9172027111053467,
"learning_rate": 2.4131250000000004e-05,
"loss": 1.262,
"step": 4140
},
{
"epoch": 0.51875,
"grad_norm": 3.6153948307037354,
"learning_rate": 2.4068750000000002e-05,
"loss": 1.1961,
"step": 4150
},
{
"epoch": 0.52,
"grad_norm": 3.5669710636138916,
"learning_rate": 2.400625e-05,
"loss": 1.286,
"step": 4160
},
{
"epoch": 0.52125,
"grad_norm": 2.944169044494629,
"learning_rate": 2.394375e-05,
"loss": 1.1684,
"step": 4170
},
{
"epoch": 0.5225,
"grad_norm": 5.035433769226074,
"learning_rate": 2.388125e-05,
"loss": 1.4153,
"step": 4180
},
{
"epoch": 0.52375,
"grad_norm": 4.437448501586914,
"learning_rate": 2.381875e-05,
"loss": 1.3649,
"step": 4190
},
{
"epoch": 0.525,
"grad_norm": 6.091770172119141,
"learning_rate": 2.375625e-05,
"loss": 1.3024,
"step": 4200
},
{
"epoch": 0.52625,
"grad_norm": 2.8936169147491455,
"learning_rate": 2.369375e-05,
"loss": 1.3005,
"step": 4210
},
{
"epoch": 0.5275,
"grad_norm": 3.831921100616455,
"learning_rate": 2.3631250000000002e-05,
"loss": 1.3541,
"step": 4220
},
{
"epoch": 0.52875,
"grad_norm": 3.6951687335968018,
"learning_rate": 2.356875e-05,
"loss": 1.4483,
"step": 4230
},
{
"epoch": 0.53,
"grad_norm": 3.1395816802978516,
"learning_rate": 2.3506250000000002e-05,
"loss": 1.368,
"step": 4240
},
{
"epoch": 0.53125,
"grad_norm": 3.2226712703704834,
"learning_rate": 2.344375e-05,
"loss": 1.302,
"step": 4250
},
{
"epoch": 0.5325,
"grad_norm": 4.4419660568237305,
"learning_rate": 2.3381250000000003e-05,
"loss": 1.3303,
"step": 4260
},
{
"epoch": 0.53375,
"grad_norm": 3.2927405834198,
"learning_rate": 2.331875e-05,
"loss": 1.4167,
"step": 4270
},
{
"epoch": 0.535,
"grad_norm": 4.213326454162598,
"learning_rate": 2.3256250000000003e-05,
"loss": 1.3116,
"step": 4280
},
{
"epoch": 0.53625,
"grad_norm": 3.944117546081543,
"learning_rate": 2.319375e-05,
"loss": 1.3415,
"step": 4290
},
{
"epoch": 0.5375,
"grad_norm": 5.39017391204834,
"learning_rate": 2.3131250000000003e-05,
"loss": 1.4055,
"step": 4300
},
{
"epoch": 0.53875,
"grad_norm": 5.432854175567627,
"learning_rate": 2.306875e-05,
"loss": 1.3829,
"step": 4310
},
{
"epoch": 0.54,
"grad_norm": 5.189695835113525,
"learning_rate": 2.300625e-05,
"loss": 1.3361,
"step": 4320
},
{
"epoch": 0.54125,
"grad_norm": 6.880331993103027,
"learning_rate": 2.294375e-05,
"loss": 1.5419,
"step": 4330
},
{
"epoch": 0.5425,
"grad_norm": 3.2537145614624023,
"learning_rate": 2.288125e-05,
"loss": 1.2588,
"step": 4340
},
{
"epoch": 0.54375,
"grad_norm": 3.7062385082244873,
"learning_rate": 2.281875e-05,
"loss": 1.3066,
"step": 4350
},
{
"epoch": 0.545,
"grad_norm": 3.2734427452087402,
"learning_rate": 2.275625e-05,
"loss": 1.2808,
"step": 4360
},
{
"epoch": 0.54625,
"grad_norm": 3.4768989086151123,
"learning_rate": 2.269375e-05,
"loss": 1.3208,
"step": 4370
},
{
"epoch": 0.5475,
"grad_norm": 4.7144670486450195,
"learning_rate": 2.263125e-05,
"loss": 1.4836,
"step": 4380
},
{
"epoch": 0.54875,
"grad_norm": 3.88132905960083,
"learning_rate": 2.2568750000000003e-05,
"loss": 1.3068,
"step": 4390
},
{
"epoch": 0.55,
"grad_norm": 3.8812150955200195,
"learning_rate": 2.250625e-05,
"loss": 1.2981,
"step": 4400
},
{
"epoch": 0.55125,
"grad_norm": 5.712122440338135,
"learning_rate": 2.2443750000000003e-05,
"loss": 1.2052,
"step": 4410
},
{
"epoch": 0.5525,
"grad_norm": 4.217561721801758,
"learning_rate": 2.2381250000000002e-05,
"loss": 1.3009,
"step": 4420
},
{
"epoch": 0.55375,
"grad_norm": 3.853726863861084,
"learning_rate": 2.2318750000000004e-05,
"loss": 1.3033,
"step": 4430
},
{
"epoch": 0.555,
"grad_norm": 4.639031410217285,
"learning_rate": 2.2256250000000002e-05,
"loss": 1.5494,
"step": 4440
},
{
"epoch": 0.55625,
"grad_norm": 3.084345579147339,
"learning_rate": 2.219375e-05,
"loss": 1.3974,
"step": 4450
},
{
"epoch": 0.5575,
"grad_norm": 3.7611162662506104,
"learning_rate": 2.213125e-05,
"loss": 1.3137,
"step": 4460
},
{
"epoch": 0.55875,
"grad_norm": 3.6951828002929688,
"learning_rate": 2.206875e-05,
"loss": 1.3461,
"step": 4470
},
{
"epoch": 0.56,
"grad_norm": 3.5445632934570312,
"learning_rate": 2.200625e-05,
"loss": 1.4106,
"step": 4480
},
{
"epoch": 0.56125,
"grad_norm": 3.625247001647949,
"learning_rate": 2.194375e-05,
"loss": 1.231,
"step": 4490
},
{
"epoch": 0.5625,
"grad_norm": 4.577424049377441,
"learning_rate": 2.188125e-05,
"loss": 1.2473,
"step": 4500
},
{
"epoch": 0.56375,
"grad_norm": 2.9018397331237793,
"learning_rate": 2.1818750000000002e-05,
"loss": 1.1463,
"step": 4510
},
{
"epoch": 0.565,
"grad_norm": 3.3070101737976074,
"learning_rate": 2.175625e-05,
"loss": 1.3716,
"step": 4520
},
{
"epoch": 0.56625,
"grad_norm": 2.9594733715057373,
"learning_rate": 2.1693750000000002e-05,
"loss": 1.3987,
"step": 4530
},
{
"epoch": 0.5675,
"grad_norm": 4.008158206939697,
"learning_rate": 2.163125e-05,
"loss": 1.3694,
"step": 4540
},
{
"epoch": 0.56875,
"grad_norm": 3.944383382797241,
"learning_rate": 2.1568750000000002e-05,
"loss": 1.2835,
"step": 4550
},
{
"epoch": 0.57,
"grad_norm": 3.6331794261932373,
"learning_rate": 2.150625e-05,
"loss": 1.3091,
"step": 4560
},
{
"epoch": 0.57125,
"grad_norm": 4.168713092803955,
"learning_rate": 2.1443750000000003e-05,
"loss": 1.5456,
"step": 4570
},
{
"epoch": 0.5725,
"grad_norm": 3.31856369972229,
"learning_rate": 2.138125e-05,
"loss": 1.2628,
"step": 4580
},
{
"epoch": 0.57375,
"grad_norm": 5.429656982421875,
"learning_rate": 2.131875e-05,
"loss": 1.1798,
"step": 4590
},
{
"epoch": 0.575,
"grad_norm": 5.1332268714904785,
"learning_rate": 2.1256249999999998e-05,
"loss": 1.4308,
"step": 4600
},
{
"epoch": 0.57625,
"grad_norm": 4.013575553894043,
"learning_rate": 2.119375e-05,
"loss": 1.1261,
"step": 4610
},
{
"epoch": 0.5775,
"grad_norm": 3.2173519134521484,
"learning_rate": 2.113125e-05,
"loss": 1.3259,
"step": 4620
},
{
"epoch": 0.57875,
"grad_norm": 4.379116058349609,
"learning_rate": 2.106875e-05,
"loss": 1.2595,
"step": 4630
},
{
"epoch": 0.58,
"grad_norm": 3.4463205337524414,
"learning_rate": 2.1006250000000002e-05,
"loss": 1.3184,
"step": 4640
},
{
"epoch": 0.58125,
"grad_norm": 4.147000312805176,
"learning_rate": 2.094375e-05,
"loss": 1.3683,
"step": 4650
},
{
"epoch": 0.5825,
"grad_norm": 4.752554893493652,
"learning_rate": 2.0881250000000003e-05,
"loss": 1.4316,
"step": 4660
},
{
"epoch": 0.58375,
"grad_norm": 3.6568074226379395,
"learning_rate": 2.081875e-05,
"loss": 1.3977,
"step": 4670
},
{
"epoch": 0.585,
"grad_norm": 3.955928325653076,
"learning_rate": 2.0756250000000003e-05,
"loss": 1.3277,
"step": 4680
},
{
"epoch": 0.58625,
"grad_norm": 3.560964345932007,
"learning_rate": 2.069375e-05,
"loss": 1.4668,
"step": 4690
},
{
"epoch": 0.5875,
"grad_norm": 3.2746620178222656,
"learning_rate": 2.0631250000000003e-05,
"loss": 1.3007,
"step": 4700
},
{
"epoch": 0.58875,
"grad_norm": 4.622394561767578,
"learning_rate": 2.0568750000000002e-05,
"loss": 1.2535,
"step": 4710
},
{
"epoch": 0.59,
"grad_norm": 3.711751699447632,
"learning_rate": 2.050625e-05,
"loss": 1.3777,
"step": 4720
},
{
"epoch": 0.59125,
"grad_norm": 4.493631362915039,
"learning_rate": 2.044375e-05,
"loss": 1.3629,
"step": 4730
},
{
"epoch": 0.5925,
"grad_norm": 3.8427581787109375,
"learning_rate": 2.038125e-05,
"loss": 1.315,
"step": 4740
},
{
"epoch": 0.59375,
"grad_norm": 3.4456775188446045,
"learning_rate": 2.031875e-05,
"loss": 1.2649,
"step": 4750
},
{
"epoch": 0.595,
"grad_norm": 4.129278659820557,
"learning_rate": 2.025625e-05,
"loss": 1.1598,
"step": 4760
},
{
"epoch": 0.59625,
"grad_norm": 3.5817270278930664,
"learning_rate": 2.019375e-05,
"loss": 1.3781,
"step": 4770
},
{
"epoch": 0.5975,
"grad_norm": 3.882089138031006,
"learning_rate": 2.013125e-05,
"loss": 1.2973,
"step": 4780
},
{
"epoch": 0.59875,
"grad_norm": 4.201085090637207,
"learning_rate": 2.006875e-05,
"loss": 1.3712,
"step": 4790
},
{
"epoch": 0.6,
"grad_norm": 4.771631240844727,
"learning_rate": 2.0006250000000002e-05,
"loss": 1.3552,
"step": 4800
},
{
"epoch": 0.60125,
"grad_norm": 3.188880205154419,
"learning_rate": 1.994375e-05,
"loss": 1.3319,
"step": 4810
},
{
"epoch": 0.6025,
"grad_norm": 5.565931797027588,
"learning_rate": 1.9881250000000002e-05,
"loss": 1.453,
"step": 4820
},
{
"epoch": 0.60375,
"grad_norm": 3.3195409774780273,
"learning_rate": 1.981875e-05,
"loss": 1.3035,
"step": 4830
},
{
"epoch": 0.605,
"grad_norm": 4.334782123565674,
"learning_rate": 1.9756250000000002e-05,
"loss": 1.1937,
"step": 4840
},
{
"epoch": 0.60625,
"grad_norm": 4.163855075836182,
"learning_rate": 1.969375e-05,
"loss": 1.2671,
"step": 4850
},
{
"epoch": 0.6075,
"grad_norm": 4.8429975509643555,
"learning_rate": 1.963125e-05,
"loss": 1.4048,
"step": 4860
},
{
"epoch": 0.60875,
"grad_norm": 3.570777177810669,
"learning_rate": 1.9568749999999998e-05,
"loss": 1.3251,
"step": 4870
},
{
"epoch": 0.61,
"grad_norm": 4.26336669921875,
"learning_rate": 1.950625e-05,
"loss": 1.3698,
"step": 4880
},
{
"epoch": 0.61125,
"grad_norm": 5.224381923675537,
"learning_rate": 1.944375e-05,
"loss": 1.4259,
"step": 4890
},
{
"epoch": 0.6125,
"grad_norm": 3.4501774311065674,
"learning_rate": 1.938125e-05,
"loss": 1.3284,
"step": 4900
},
{
"epoch": 0.61375,
"grad_norm": 4.366506099700928,
"learning_rate": 1.9318750000000002e-05,
"loss": 1.3818,
"step": 4910
},
{
"epoch": 0.615,
"grad_norm": 3.7246909141540527,
"learning_rate": 1.925625e-05,
"loss": 1.2934,
"step": 4920
},
{
"epoch": 0.61625,
"grad_norm": 4.3427348136901855,
"learning_rate": 1.9193750000000002e-05,
"loss": 1.377,
"step": 4930
},
{
"epoch": 0.6175,
"grad_norm": 3.3694660663604736,
"learning_rate": 1.913125e-05,
"loss": 1.3416,
"step": 4940
},
{
"epoch": 0.61875,
"grad_norm": 3.8398380279541016,
"learning_rate": 1.9068750000000003e-05,
"loss": 1.2429,
"step": 4950
},
{
"epoch": 0.62,
"grad_norm": 3.3248672485351562,
"learning_rate": 1.900625e-05,
"loss": 1.1935,
"step": 4960
},
{
"epoch": 0.62125,
"grad_norm": 4.807949066162109,
"learning_rate": 1.8943750000000003e-05,
"loss": 1.2884,
"step": 4970
},
{
"epoch": 0.6225,
"grad_norm": 3.617875814437866,
"learning_rate": 1.888125e-05,
"loss": 1.3156,
"step": 4980
},
{
"epoch": 0.62375,
"grad_norm": 3.635308265686035,
"learning_rate": 1.881875e-05,
"loss": 1.2937,
"step": 4990
},
{
"epoch": 0.625,
"grad_norm": 4.459296703338623,
"learning_rate": 1.8756250000000002e-05,
"loss": 1.3306,
"step": 5000
},
{
"epoch": 0.62625,
"grad_norm": 3.0861570835113525,
"learning_rate": 1.869375e-05,
"loss": 1.3047,
"step": 5010
},
{
"epoch": 0.6275,
"grad_norm": 2.831782341003418,
"learning_rate": 1.863125e-05,
"loss": 1.2269,
"step": 5020
},
{
"epoch": 0.62875,
"grad_norm": 3.3934264183044434,
"learning_rate": 1.856875e-05,
"loss": 1.3926,
"step": 5030
},
{
"epoch": 0.63,
"grad_norm": 3.5797278881073,
"learning_rate": 1.850625e-05,
"loss": 1.3056,
"step": 5040
},
{
"epoch": 0.63125,
"grad_norm": 3.5126845836639404,
"learning_rate": 1.844375e-05,
"loss": 1.2345,
"step": 5050
},
{
"epoch": 0.6325,
"grad_norm": 3.1061553955078125,
"learning_rate": 1.838125e-05,
"loss": 1.3201,
"step": 5060
},
{
"epoch": 0.63375,
"grad_norm": 2.7157793045043945,
"learning_rate": 1.831875e-05,
"loss": 1.2418,
"step": 5070
},
{
"epoch": 0.635,
"grad_norm": 5.214048385620117,
"learning_rate": 1.825625e-05,
"loss": 1.4307,
"step": 5080
},
{
"epoch": 0.63625,
"grad_norm": 3.1405720710754395,
"learning_rate": 1.8193750000000002e-05,
"loss": 1.1286,
"step": 5090
},
{
"epoch": 0.6375,
"grad_norm": 3.6989457607269287,
"learning_rate": 1.813125e-05,
"loss": 1.204,
"step": 5100
},
{
"epoch": 0.63875,
"grad_norm": 2.809293031692505,
"learning_rate": 1.8068750000000002e-05,
"loss": 1.2624,
"step": 5110
},
{
"epoch": 0.64,
"grad_norm": 3.5766420364379883,
"learning_rate": 1.800625e-05,
"loss": 1.3483,
"step": 5120
},
{
"epoch": 0.64125,
"grad_norm": 4.097106456756592,
"learning_rate": 1.7943750000000002e-05,
"loss": 1.4194,
"step": 5130
},
{
"epoch": 0.6425,
"grad_norm": 3.991610288619995,
"learning_rate": 1.788125e-05,
"loss": 1.37,
"step": 5140
},
{
"epoch": 0.64375,
"grad_norm": 3.1795196533203125,
"learning_rate": 1.781875e-05,
"loss": 1.355,
"step": 5150
},
{
"epoch": 0.645,
"grad_norm": 4.34062385559082,
"learning_rate": 1.775625e-05,
"loss": 1.2885,
"step": 5160
},
{
"epoch": 0.64625,
"grad_norm": 3.086254835128784,
"learning_rate": 1.769375e-05,
"loss": 1.3419,
"step": 5170
},
{
"epoch": 0.6475,
"grad_norm": 2.817337989807129,
"learning_rate": 1.763125e-05,
"loss": 1.2352,
"step": 5180
},
{
"epoch": 0.64875,
"grad_norm": 3.538144588470459,
"learning_rate": 1.756875e-05,
"loss": 1.4543,
"step": 5190
},
{
"epoch": 0.65,
"grad_norm": 3.363987445831299,
"learning_rate": 1.7506250000000002e-05,
"loss": 1.2738,
"step": 5200
},
{
"epoch": 0.65125,
"grad_norm": 3.583441734313965,
"learning_rate": 1.744375e-05,
"loss": 1.3378,
"step": 5210
},
{
"epoch": 0.6525,
"grad_norm": 5.197504997253418,
"learning_rate": 1.7381250000000002e-05,
"loss": 1.2725,
"step": 5220
},
{
"epoch": 0.65375,
"grad_norm": 3.349055528640747,
"learning_rate": 1.731875e-05,
"loss": 1.4023,
"step": 5230
},
{
"epoch": 0.655,
"grad_norm": 3.489291191101074,
"learning_rate": 1.7256250000000003e-05,
"loss": 1.2932,
"step": 5240
},
{
"epoch": 0.65625,
"grad_norm": 4.6650872230529785,
"learning_rate": 1.719375e-05,
"loss": 1.3444,
"step": 5250
},
{
"epoch": 0.6575,
"grad_norm": 2.737346649169922,
"learning_rate": 1.7131250000000003e-05,
"loss": 1.2761,
"step": 5260
},
{
"epoch": 0.65875,
"grad_norm": 2.2421553134918213,
"learning_rate": 1.706875e-05,
"loss": 1.3069,
"step": 5270
},
{
"epoch": 0.66,
"grad_norm": 3.4570610523223877,
"learning_rate": 1.700625e-05,
"loss": 1.3267,
"step": 5280
},
{
"epoch": 0.66125,
"grad_norm": 3.7983574867248535,
"learning_rate": 1.694375e-05,
"loss": 1.1993,
"step": 5290
},
{
"epoch": 0.6625,
"grad_norm": 3.105295181274414,
"learning_rate": 1.688125e-05,
"loss": 1.2937,
"step": 5300
},
{
"epoch": 0.66375,
"grad_norm": 3.038071870803833,
"learning_rate": 1.681875e-05,
"loss": 1.1928,
"step": 5310
},
{
"epoch": 0.665,
"grad_norm": 3.6429975032806396,
"learning_rate": 1.675625e-05,
"loss": 1.2931,
"step": 5320
},
{
"epoch": 0.66625,
"grad_norm": 3.5131030082702637,
"learning_rate": 1.669375e-05,
"loss": 1.3754,
"step": 5330
},
{
"epoch": 0.6675,
"grad_norm": 2.8932530879974365,
"learning_rate": 1.663125e-05,
"loss": 1.2501,
"step": 5340
},
{
"epoch": 0.66875,
"grad_norm": 3.6388654708862305,
"learning_rate": 1.656875e-05,
"loss": 1.2519,
"step": 5350
},
{
"epoch": 0.67,
"grad_norm": 2.79237699508667,
"learning_rate": 1.650625e-05,
"loss": 1.261,
"step": 5360
},
{
"epoch": 0.67125,
"grad_norm": 3.383009910583496,
"learning_rate": 1.644375e-05,
"loss": 1.371,
"step": 5370
},
{
"epoch": 0.6725,
"grad_norm": 3.6595306396484375,
"learning_rate": 1.6381250000000002e-05,
"loss": 1.2767,
"step": 5380
},
{
"epoch": 0.67375,
"grad_norm": 3.0263020992279053,
"learning_rate": 1.6318750000000004e-05,
"loss": 1.1588,
"step": 5390
},
{
"epoch": 0.675,
"grad_norm": 4.379304885864258,
"learning_rate": 1.6256250000000002e-05,
"loss": 1.2933,
"step": 5400
},
{
"epoch": 0.67625,
"grad_norm": 3.8622045516967773,
"learning_rate": 1.619375e-05,
"loss": 1.2027,
"step": 5410
},
{
"epoch": 0.6775,
"grad_norm": 3.141866683959961,
"learning_rate": 1.613125e-05,
"loss": 1.2224,
"step": 5420
},
{
"epoch": 0.67875,
"grad_norm": 6.858778476715088,
"learning_rate": 1.606875e-05,
"loss": 1.5491,
"step": 5430
},
{
"epoch": 0.68,
"grad_norm": 5.642153263092041,
"learning_rate": 1.600625e-05,
"loss": 1.29,
"step": 5440
},
{
"epoch": 0.68125,
"grad_norm": 3.560525894165039,
"learning_rate": 1.594375e-05,
"loss": 1.1829,
"step": 5450
},
{
"epoch": 0.6825,
"grad_norm": 2.9443717002868652,
"learning_rate": 1.588125e-05,
"loss": 1.1874,
"step": 5460
},
{
"epoch": 0.68375,
"grad_norm": 4.751156330108643,
"learning_rate": 1.581875e-05,
"loss": 1.2543,
"step": 5470
},
{
"epoch": 0.685,
"grad_norm": 4.42818546295166,
"learning_rate": 1.575625e-05,
"loss": 1.3246,
"step": 5480
},
{
"epoch": 0.68625,
"grad_norm": 3.2748584747314453,
"learning_rate": 1.5693750000000002e-05,
"loss": 1.3284,
"step": 5490
},
{
"epoch": 0.6875,
"grad_norm": 3.11030912399292,
"learning_rate": 1.563125e-05,
"loss": 1.374,
"step": 5500
},
{
"epoch": 0.68875,
"grad_norm": 3.4232919216156006,
"learning_rate": 1.5568750000000002e-05,
"loss": 1.2341,
"step": 5510
},
{
"epoch": 0.69,
"grad_norm": 3.561033248901367,
"learning_rate": 1.550625e-05,
"loss": 1.3163,
"step": 5520
},
{
"epoch": 0.69125,
"grad_norm": 3.259941816329956,
"learning_rate": 1.5443750000000003e-05,
"loss": 1.297,
"step": 5530
},
{
"epoch": 0.6925,
"grad_norm": 4.584996223449707,
"learning_rate": 1.538125e-05,
"loss": 1.2904,
"step": 5540
},
{
"epoch": 0.69375,
"grad_norm": 3.8913450241088867,
"learning_rate": 1.531875e-05,
"loss": 1.1971,
"step": 5550
},
{
"epoch": 0.695,
"grad_norm": 2.977058172225952,
"learning_rate": 1.525625e-05,
"loss": 1.2937,
"step": 5560
},
{
"epoch": 0.69625,
"grad_norm": 4.8936767578125,
"learning_rate": 1.5193750000000002e-05,
"loss": 1.2619,
"step": 5570
},
{
"epoch": 0.6975,
"grad_norm": 3.7824175357818604,
"learning_rate": 1.513125e-05,
"loss": 1.2624,
"step": 5580
},
{
"epoch": 0.69875,
"grad_norm": 3.056828498840332,
"learning_rate": 1.506875e-05,
"loss": 1.216,
"step": 5590
},
{
"epoch": 0.7,
"grad_norm": 2.8399548530578613,
"learning_rate": 1.5006249999999999e-05,
"loss": 1.4088,
"step": 5600
},
{
"epoch": 0.70125,
"grad_norm": 3.484537124633789,
"learning_rate": 1.494375e-05,
"loss": 1.4189,
"step": 5610
},
{
"epoch": 0.7025,
"grad_norm": 3.8352763652801514,
"learning_rate": 1.488125e-05,
"loss": 1.2755,
"step": 5620
},
{
"epoch": 0.70375,
"grad_norm": 5.0336785316467285,
"learning_rate": 1.4818750000000001e-05,
"loss": 1.2553,
"step": 5630
},
{
"epoch": 0.705,
"grad_norm": 3.4178173542022705,
"learning_rate": 1.4756250000000001e-05,
"loss": 1.2476,
"step": 5640
},
{
"epoch": 0.70625,
"grad_norm": 2.737694501876831,
"learning_rate": 1.4693750000000001e-05,
"loss": 1.2752,
"step": 5650
},
{
"epoch": 0.7075,
"grad_norm": 4.246181964874268,
"learning_rate": 1.4631250000000002e-05,
"loss": 1.2681,
"step": 5660
},
{
"epoch": 0.70875,
"grad_norm": 3.4891135692596436,
"learning_rate": 1.456875e-05,
"loss": 1.2727,
"step": 5670
},
{
"epoch": 0.71,
"grad_norm": 5.688715934753418,
"learning_rate": 1.4506250000000002e-05,
"loss": 1.2608,
"step": 5680
},
{
"epoch": 0.71125,
"grad_norm": 3.696350574493408,
"learning_rate": 1.444375e-05,
"loss": 1.2572,
"step": 5690
},
{
"epoch": 0.7125,
"grad_norm": 2.7734687328338623,
"learning_rate": 1.4381250000000002e-05,
"loss": 1.2112,
"step": 5700
},
{
"epoch": 0.71375,
"grad_norm": 3.8144760131835938,
"learning_rate": 1.431875e-05,
"loss": 1.1772,
"step": 5710
},
{
"epoch": 0.715,
"grad_norm": 3.2144532203674316,
"learning_rate": 1.4256250000000001e-05,
"loss": 1.1894,
"step": 5720
},
{
"epoch": 0.71625,
"grad_norm": 3.2612404823303223,
"learning_rate": 1.419375e-05,
"loss": 1.2391,
"step": 5730
},
{
"epoch": 0.7175,
"grad_norm": 3.1684000492095947,
"learning_rate": 1.4131250000000001e-05,
"loss": 1.4017,
"step": 5740
},
{
"epoch": 0.71875,
"grad_norm": 2.8268094062805176,
"learning_rate": 1.406875e-05,
"loss": 1.2547,
"step": 5750
},
{
"epoch": 0.72,
"grad_norm": 3.3832662105560303,
"learning_rate": 1.4006250000000002e-05,
"loss": 1.3259,
"step": 5760
},
{
"epoch": 0.72125,
"grad_norm": 4.300711154937744,
"learning_rate": 1.394375e-05,
"loss": 1.2589,
"step": 5770
},
{
"epoch": 0.7225,
"grad_norm": 3.825169801712036,
"learning_rate": 1.3881250000000002e-05,
"loss": 1.299,
"step": 5780
},
{
"epoch": 0.72375,
"grad_norm": 4.574991703033447,
"learning_rate": 1.381875e-05,
"loss": 1.2543,
"step": 5790
},
{
"epoch": 0.725,
"grad_norm": 3.1067469120025635,
"learning_rate": 1.375625e-05,
"loss": 1.2156,
"step": 5800
},
{
"epoch": 0.72625,
"grad_norm": 3.175403356552124,
"learning_rate": 1.3693749999999999e-05,
"loss": 1.1028,
"step": 5810
},
{
"epoch": 0.7275,
"grad_norm": 3.0259435176849365,
"learning_rate": 1.3631250000000001e-05,
"loss": 1.2328,
"step": 5820
},
{
"epoch": 0.72875,
"grad_norm": 3.6122825145721436,
"learning_rate": 1.356875e-05,
"loss": 1.2719,
"step": 5830
},
{
"epoch": 0.73,
"grad_norm": 3.8099982738494873,
"learning_rate": 1.3506250000000001e-05,
"loss": 1.2776,
"step": 5840
},
{
"epoch": 0.73125,
"grad_norm": 3.41595196723938,
"learning_rate": 1.344375e-05,
"loss": 1.3049,
"step": 5850
},
{
"epoch": 0.7325,
"grad_norm": 3.9172563552856445,
"learning_rate": 1.338125e-05,
"loss": 1.259,
"step": 5860
},
{
"epoch": 0.73375,
"grad_norm": 3.162015438079834,
"learning_rate": 1.3318749999999998e-05,
"loss": 1.2661,
"step": 5870
},
{
"epoch": 0.735,
"grad_norm": 3.8498268127441406,
"learning_rate": 1.325625e-05,
"loss": 1.2737,
"step": 5880
},
{
"epoch": 0.73625,
"grad_norm": 3.3922574520111084,
"learning_rate": 1.3193750000000002e-05,
"loss": 1.3626,
"step": 5890
},
{
"epoch": 0.7375,
"grad_norm": 3.3294966220855713,
"learning_rate": 1.313125e-05,
"loss": 1.1615,
"step": 5900
},
{
"epoch": 0.73875,
"grad_norm": 3.1682517528533936,
"learning_rate": 1.3068750000000003e-05,
"loss": 1.236,
"step": 5910
},
{
"epoch": 0.74,
"grad_norm": 4.264817714691162,
"learning_rate": 1.3006250000000001e-05,
"loss": 1.409,
"step": 5920
},
{
"epoch": 0.74125,
"grad_norm": 4.208788871765137,
"learning_rate": 1.2943750000000001e-05,
"loss": 1.315,
"step": 5930
},
{
"epoch": 0.7425,
"grad_norm": 3.256880044937134,
"learning_rate": 1.288125e-05,
"loss": 1.1969,
"step": 5940
},
{
"epoch": 0.74375,
"grad_norm": 3.2720675468444824,
"learning_rate": 1.2818750000000002e-05,
"loss": 1.1622,
"step": 5950
},
{
"epoch": 0.745,
"grad_norm": 3.5530762672424316,
"learning_rate": 1.275625e-05,
"loss": 1.1997,
"step": 5960
},
{
"epoch": 0.74625,
"grad_norm": 3.067060708999634,
"learning_rate": 1.2693750000000002e-05,
"loss": 1.251,
"step": 5970
},
{
"epoch": 0.7475,
"grad_norm": 2.7820942401885986,
"learning_rate": 1.263125e-05,
"loss": 1.1509,
"step": 5980
},
{
"epoch": 0.74875,
"grad_norm": 3.6107969284057617,
"learning_rate": 1.256875e-05,
"loss": 1.4085,
"step": 5990
},
{
"epoch": 0.75,
"grad_norm": 6.036477565765381,
"learning_rate": 1.250625e-05,
"loss": 1.327,
"step": 6000
},
{
"epoch": 0.75125,
"grad_norm": 6.355252742767334,
"learning_rate": 1.2443750000000001e-05,
"loss": 1.3521,
"step": 6010
},
{
"epoch": 0.7525,
"grad_norm": 3.9743545055389404,
"learning_rate": 1.2381250000000001e-05,
"loss": 1.2671,
"step": 6020
},
{
"epoch": 0.75375,
"grad_norm": 3.6817073822021484,
"learning_rate": 1.2318750000000001e-05,
"loss": 1.2116,
"step": 6030
},
{
"epoch": 0.755,
"grad_norm": 3.35996413230896,
"learning_rate": 1.2256250000000001e-05,
"loss": 1.2384,
"step": 6040
},
{
"epoch": 0.75625,
"grad_norm": 3.913332939147949,
"learning_rate": 1.2193750000000002e-05,
"loss": 1.1616,
"step": 6050
},
{
"epoch": 0.7575,
"grad_norm": 3.327204465866089,
"learning_rate": 1.213125e-05,
"loss": 1.397,
"step": 6060
},
{
"epoch": 0.75875,
"grad_norm": 3.389503240585327,
"learning_rate": 1.206875e-05,
"loss": 1.1839,
"step": 6070
},
{
"epoch": 0.76,
"grad_norm": 3.358041286468506,
"learning_rate": 1.200625e-05,
"loss": 1.2082,
"step": 6080
},
{
"epoch": 0.76125,
"grad_norm": 3.1447696685791016,
"learning_rate": 1.194375e-05,
"loss": 1.1562,
"step": 6090
},
{
"epoch": 0.7625,
"grad_norm": 3.44541072845459,
"learning_rate": 1.188125e-05,
"loss": 1.3382,
"step": 6100
},
{
"epoch": 0.76375,
"grad_norm": 3.0848042964935303,
"learning_rate": 1.1818750000000001e-05,
"loss": 1.3179,
"step": 6110
},
{
"epoch": 0.765,
"grad_norm": 3.651977300643921,
"learning_rate": 1.1756250000000001e-05,
"loss": 1.2659,
"step": 6120
},
{
"epoch": 0.76625,
"grad_norm": 3.152700662612915,
"learning_rate": 1.1693750000000001e-05,
"loss": 1.268,
"step": 6130
},
{
"epoch": 0.7675,
"grad_norm": 4.158604145050049,
"learning_rate": 1.163125e-05,
"loss": 1.2345,
"step": 6140
},
{
"epoch": 0.76875,
"grad_norm": 3.4470276832580566,
"learning_rate": 1.156875e-05,
"loss": 1.2949,
"step": 6150
},
{
"epoch": 0.77,
"grad_norm": 3.150104284286499,
"learning_rate": 1.150625e-05,
"loss": 1.2278,
"step": 6160
},
{
"epoch": 0.77125,
"grad_norm": 3.8590192794799805,
"learning_rate": 1.144375e-05,
"loss": 1.242,
"step": 6170
},
{
"epoch": 0.7725,
"grad_norm": 4.324366092681885,
"learning_rate": 1.138125e-05,
"loss": 1.3226,
"step": 6180
},
{
"epoch": 0.77375,
"grad_norm": 2.908385753631592,
"learning_rate": 1.131875e-05,
"loss": 1.2675,
"step": 6190
},
{
"epoch": 0.775,
"grad_norm": 3.7354936599731445,
"learning_rate": 1.1256250000000001e-05,
"loss": 1.4641,
"step": 6200
},
{
"epoch": 0.77625,
"grad_norm": 3.4952096939086914,
"learning_rate": 1.119375e-05,
"loss": 1.3276,
"step": 6210
},
{
"epoch": 0.7775,
"grad_norm": 3.402109384536743,
"learning_rate": 1.113125e-05,
"loss": 1.1241,
"step": 6220
},
{
"epoch": 0.77875,
"grad_norm": 3.307467460632324,
"learning_rate": 1.106875e-05,
"loss": 1.3648,
"step": 6230
},
{
"epoch": 0.78,
"grad_norm": 3.728865146636963,
"learning_rate": 1.100625e-05,
"loss": 1.1877,
"step": 6240
},
{
"epoch": 0.78125,
"grad_norm": 3.682426691055298,
"learning_rate": 1.094375e-05,
"loss": 1.1554,
"step": 6250
},
{
"epoch": 0.7825,
"grad_norm": 3.6852643489837646,
"learning_rate": 1.0881250000000002e-05,
"loss": 1.141,
"step": 6260
},
{
"epoch": 0.78375,
"grad_norm": 4.276542663574219,
"learning_rate": 1.081875e-05,
"loss": 1.3658,
"step": 6270
},
{
"epoch": 0.785,
"grad_norm": 3.071035385131836,
"learning_rate": 1.075625e-05,
"loss": 1.1546,
"step": 6280
},
{
"epoch": 0.78625,
"grad_norm": 4.381916522979736,
"learning_rate": 1.069375e-05,
"loss": 1.1487,
"step": 6290
},
{
"epoch": 0.7875,
"grad_norm": 3.135784864425659,
"learning_rate": 1.0631250000000001e-05,
"loss": 1.2275,
"step": 6300
},
{
"epoch": 0.78875,
"grad_norm": 3.404214859008789,
"learning_rate": 1.0568750000000001e-05,
"loss": 1.1391,
"step": 6310
},
{
"epoch": 0.79,
"grad_norm": 4.272989749908447,
"learning_rate": 1.0506250000000001e-05,
"loss": 1.3467,
"step": 6320
},
{
"epoch": 0.79125,
"grad_norm": 3.6154932975769043,
"learning_rate": 1.0443750000000001e-05,
"loss": 1.2993,
"step": 6330
},
{
"epoch": 0.7925,
"grad_norm": 3.6855859756469727,
"learning_rate": 1.038125e-05,
"loss": 1.1339,
"step": 6340
},
{
"epoch": 0.79375,
"grad_norm": 3.8834280967712402,
"learning_rate": 1.031875e-05,
"loss": 1.2207,
"step": 6350
},
{
"epoch": 0.795,
"grad_norm": 4.357527256011963,
"learning_rate": 1.025625e-05,
"loss": 1.266,
"step": 6360
},
{
"epoch": 0.79625,
"grad_norm": 4.210048198699951,
"learning_rate": 1.019375e-05,
"loss": 1.2289,
"step": 6370
},
{
"epoch": 0.7975,
"grad_norm": 5.4694647789001465,
"learning_rate": 1.013125e-05,
"loss": 1.1857,
"step": 6380
},
{
"epoch": 0.79875,
"grad_norm": 3.1147310733795166,
"learning_rate": 1.006875e-05,
"loss": 1.3103,
"step": 6390
},
{
"epoch": 0.8,
"grad_norm": 3.2928709983825684,
"learning_rate": 1.0006250000000001e-05,
"loss": 1.3642,
"step": 6400
},
{
"epoch": 0.80125,
"grad_norm": 3.124969244003296,
"learning_rate": 9.94375e-06,
"loss": 1.2237,
"step": 6410
},
{
"epoch": 0.8025,
"grad_norm": 4.003876686096191,
"learning_rate": 9.88125e-06,
"loss": 1.1267,
"step": 6420
},
{
"epoch": 0.80375,
"grad_norm": 3.974295139312744,
"learning_rate": 9.81875e-06,
"loss": 1.1949,
"step": 6430
},
{
"epoch": 0.805,
"grad_norm": 3.8797054290771484,
"learning_rate": 9.75625e-06,
"loss": 1.2467,
"step": 6440
},
{
"epoch": 0.80625,
"grad_norm": 3.5032153129577637,
"learning_rate": 9.69375e-06,
"loss": 1.1921,
"step": 6450
},
{
"epoch": 0.8075,
"grad_norm": 3.413506269454956,
"learning_rate": 9.63125e-06,
"loss": 1.2359,
"step": 6460
},
{
"epoch": 0.80875,
"grad_norm": 3.894541025161743,
"learning_rate": 9.56875e-06,
"loss": 1.3246,
"step": 6470
},
{
"epoch": 0.81,
"grad_norm": 3.6062092781066895,
"learning_rate": 9.50625e-06,
"loss": 1.2462,
"step": 6480
},
{
"epoch": 0.81125,
"grad_norm": 3.027904510498047,
"learning_rate": 9.44375e-06,
"loss": 1.1628,
"step": 6490
},
{
"epoch": 0.8125,
"grad_norm": 3.1570937633514404,
"learning_rate": 9.38125e-06,
"loss": 1.1969,
"step": 6500
},
{
"epoch": 0.81375,
"grad_norm": 2.943943977355957,
"learning_rate": 9.318750000000001e-06,
"loss": 1.2753,
"step": 6510
},
{
"epoch": 0.815,
"grad_norm": 3.1280055046081543,
"learning_rate": 9.256250000000001e-06,
"loss": 1.1306,
"step": 6520
},
{
"epoch": 0.81625,
"grad_norm": 4.8054399490356445,
"learning_rate": 9.193750000000002e-06,
"loss": 1.4028,
"step": 6530
},
{
"epoch": 0.8175,
"grad_norm": 3.8370373249053955,
"learning_rate": 9.131250000000002e-06,
"loss": 1.3408,
"step": 6540
},
{
"epoch": 0.81875,
"grad_norm": 3.5859014987945557,
"learning_rate": 9.06875e-06,
"loss": 1.2043,
"step": 6550
},
{
"epoch": 0.82,
"grad_norm": 3.415255069732666,
"learning_rate": 9.00625e-06,
"loss": 1.2217,
"step": 6560
},
{
"epoch": 0.82125,
"grad_norm": 3.6743905544281006,
"learning_rate": 8.94375e-06,
"loss": 1.1703,
"step": 6570
},
{
"epoch": 0.8225,
"grad_norm": 2.7219078540802,
"learning_rate": 8.88125e-06,
"loss": 1.2929,
"step": 6580
},
{
"epoch": 0.82375,
"grad_norm": 2.926729679107666,
"learning_rate": 8.818750000000001e-06,
"loss": 1.371,
"step": 6590
},
{
"epoch": 0.825,
"grad_norm": 3.4970710277557373,
"learning_rate": 8.756250000000001e-06,
"loss": 1.1985,
"step": 6600
},
{
"epoch": 0.82625,
"grad_norm": 3.802090644836426,
"learning_rate": 8.693750000000001e-06,
"loss": 1.2843,
"step": 6610
},
{
"epoch": 0.8275,
"grad_norm": 3.0677645206451416,
"learning_rate": 8.63125e-06,
"loss": 1.2791,
"step": 6620
},
{
"epoch": 0.82875,
"grad_norm": 3.903545379638672,
"learning_rate": 8.56875e-06,
"loss": 1.1844,
"step": 6630
},
{
"epoch": 0.83,
"grad_norm": 3.0674688816070557,
"learning_rate": 8.50625e-06,
"loss": 1.2111,
"step": 6640
},
{
"epoch": 0.83125,
"grad_norm": 3.7008626461029053,
"learning_rate": 8.44375e-06,
"loss": 1.2203,
"step": 6650
},
{
"epoch": 0.8325,
"grad_norm": 4.41727352142334,
"learning_rate": 8.38125e-06,
"loss": 1.3227,
"step": 6660
},
{
"epoch": 0.83375,
"grad_norm": 3.456376075744629,
"learning_rate": 8.31875e-06,
"loss": 1.2385,
"step": 6670
},
{
"epoch": 0.835,
"grad_norm": 4.11112117767334,
"learning_rate": 8.25625e-06,
"loss": 1.3041,
"step": 6680
},
{
"epoch": 0.83625,
"grad_norm": 2.9469385147094727,
"learning_rate": 8.19375e-06,
"loss": 1.2004,
"step": 6690
},
{
"epoch": 0.8375,
"grad_norm": 3.8115038871765137,
"learning_rate": 8.13125e-06,
"loss": 1.2884,
"step": 6700
},
{
"epoch": 0.83875,
"grad_norm": 3.3773956298828125,
"learning_rate": 8.06875e-06,
"loss": 1.0463,
"step": 6710
},
{
"epoch": 0.84,
"grad_norm": 3.9320790767669678,
"learning_rate": 8.00625e-06,
"loss": 1.2637,
"step": 6720
},
{
"epoch": 0.84125,
"grad_norm": 3.310523271560669,
"learning_rate": 7.94375e-06,
"loss": 1.3186,
"step": 6730
},
{
"epoch": 0.8425,
"grad_norm": 4.7285637855529785,
"learning_rate": 7.88125e-06,
"loss": 1.0737,
"step": 6740
},
{
"epoch": 0.84375,
"grad_norm": 4.357003688812256,
"learning_rate": 7.81875e-06,
"loss": 1.2484,
"step": 6750
},
{
"epoch": 0.845,
"grad_norm": 6.129393100738525,
"learning_rate": 7.75625e-06,
"loss": 1.2047,
"step": 6760
},
{
"epoch": 0.84625,
"grad_norm": 3.644521713256836,
"learning_rate": 7.69375e-06,
"loss": 1.2953,
"step": 6770
},
{
"epoch": 0.8475,
"grad_norm": 3.131911516189575,
"learning_rate": 7.63125e-06,
"loss": 1.167,
"step": 6780
},
{
"epoch": 0.84875,
"grad_norm": 3.4802305698394775,
"learning_rate": 7.568750000000001e-06,
"loss": 1.3353,
"step": 6790
},
{
"epoch": 0.85,
"grad_norm": 4.78393030166626,
"learning_rate": 7.506250000000001e-06,
"loss": 1.2363,
"step": 6800
},
{
"epoch": 0.85125,
"grad_norm": 3.6145455837249756,
"learning_rate": 7.4437500000000005e-06,
"loss": 1.1944,
"step": 6810
},
{
"epoch": 0.8525,
"grad_norm": 2.614313840866089,
"learning_rate": 7.381250000000001e-06,
"loss": 1.2045,
"step": 6820
},
{
"epoch": 0.85375,
"grad_norm": 2.96185564994812,
"learning_rate": 7.318750000000001e-06,
"loss": 1.103,
"step": 6830
},
{
"epoch": 0.855,
"grad_norm": 6.062429428100586,
"learning_rate": 7.25625e-06,
"loss": 1.2446,
"step": 6840
},
{
"epoch": 0.85625,
"grad_norm": 3.7602720260620117,
"learning_rate": 7.19375e-06,
"loss": 1.2243,
"step": 6850
},
{
"epoch": 0.8575,
"grad_norm": 3.405805826187134,
"learning_rate": 7.1312500000000005e-06,
"loss": 1.1315,
"step": 6860
},
{
"epoch": 0.85875,
"grad_norm": 3.2969017028808594,
"learning_rate": 7.068750000000001e-06,
"loss": 1.1796,
"step": 6870
},
{
"epoch": 0.86,
"grad_norm": 4.001396179199219,
"learning_rate": 7.00625e-06,
"loss": 1.3448,
"step": 6880
},
{
"epoch": 0.86125,
"grad_norm": 4.414263725280762,
"learning_rate": 6.94375e-06,
"loss": 1.2219,
"step": 6890
},
{
"epoch": 0.8625,
"grad_norm": 3.5935354232788086,
"learning_rate": 6.88125e-06,
"loss": 1.1148,
"step": 6900
},
{
"epoch": 0.86375,
"grad_norm": 3.3420941829681396,
"learning_rate": 6.81875e-06,
"loss": 1.2654,
"step": 6910
},
{
"epoch": 0.865,
"grad_norm": 4.038907527923584,
"learning_rate": 6.75625e-06,
"loss": 1.25,
"step": 6920
},
{
"epoch": 0.86625,
"grad_norm": 4.007872581481934,
"learning_rate": 6.69375e-06,
"loss": 1.3026,
"step": 6930
},
{
"epoch": 0.8675,
"grad_norm": 3.9287617206573486,
"learning_rate": 6.63125e-06,
"loss": 1.1596,
"step": 6940
},
{
"epoch": 0.86875,
"grad_norm": 2.911954402923584,
"learning_rate": 6.56875e-06,
"loss": 1.281,
"step": 6950
},
{
"epoch": 0.87,
"grad_norm": 3.543391704559326,
"learning_rate": 6.50625e-06,
"loss": 1.268,
"step": 6960
},
{
"epoch": 0.87125,
"grad_norm": 2.627988815307617,
"learning_rate": 6.44375e-06,
"loss": 1.2706,
"step": 6970
},
{
"epoch": 0.8725,
"grad_norm": 3.9405062198638916,
"learning_rate": 6.38125e-06,
"loss": 1.2318,
"step": 6980
},
{
"epoch": 0.87375,
"grad_norm": 5.287662506103516,
"learning_rate": 6.3187499999999994e-06,
"loss": 1.3652,
"step": 6990
},
{
"epoch": 0.875,
"grad_norm": 3.4925477504730225,
"learning_rate": 6.25625e-06,
"loss": 1.3112,
"step": 7000
},
{
"epoch": 0.87625,
"grad_norm": 4.140984058380127,
"learning_rate": 6.193750000000001e-06,
"loss": 1.1956,
"step": 7010
},
{
"epoch": 0.8775,
"grad_norm": 5.385605335235596,
"learning_rate": 6.13125e-06,
"loss": 1.2959,
"step": 7020
},
{
"epoch": 0.87875,
"grad_norm": 3.0660688877105713,
"learning_rate": 6.06875e-06,
"loss": 1.2465,
"step": 7030
},
{
"epoch": 0.88,
"grad_norm": 6.002776622772217,
"learning_rate": 6.00625e-06,
"loss": 1.4034,
"step": 7040
},
{
"epoch": 0.88125,
"grad_norm": 3.455930471420288,
"learning_rate": 5.94375e-06,
"loss": 1.2638,
"step": 7050
},
{
"epoch": 0.8825,
"grad_norm": 4.8663330078125,
"learning_rate": 5.88125e-06,
"loss": 1.241,
"step": 7060
},
{
"epoch": 0.88375,
"grad_norm": 3.752796173095703,
"learning_rate": 5.818750000000001e-06,
"loss": 1.2547,
"step": 7070
},
{
"epoch": 0.885,
"grad_norm": 3.7783405780792236,
"learning_rate": 5.75625e-06,
"loss": 1.2531,
"step": 7080
},
{
"epoch": 0.88625,
"grad_norm": 4.519382953643799,
"learning_rate": 5.69375e-06,
"loss": 1.2299,
"step": 7090
},
{
"epoch": 0.8875,
"grad_norm": 3.280031442642212,
"learning_rate": 5.6312500000000005e-06,
"loss": 1.1249,
"step": 7100
},
{
"epoch": 0.88875,
"grad_norm": 3.4675002098083496,
"learning_rate": 5.568750000000001e-06,
"loss": 1.1638,
"step": 7110
},
{
"epoch": 0.89,
"grad_norm": 3.768967390060425,
"learning_rate": 5.50625e-06,
"loss": 1.3985,
"step": 7120
},
{
"epoch": 0.89125,
"grad_norm": 4.724802017211914,
"learning_rate": 5.44375e-06,
"loss": 1.2324,
"step": 7130
},
{
"epoch": 0.8925,
"grad_norm": 4.179074287414551,
"learning_rate": 5.38125e-06,
"loss": 1.1886,
"step": 7140
},
{
"epoch": 0.89375,
"grad_norm": 4.108990669250488,
"learning_rate": 5.3187500000000005e-06,
"loss": 1.4765,
"step": 7150
},
{
"epoch": 0.895,
"grad_norm": 4.176460266113281,
"learning_rate": 5.25625e-06,
"loss": 1.3531,
"step": 7160
},
{
"epoch": 0.89625,
"grad_norm": 3.8916871547698975,
"learning_rate": 5.19375e-06,
"loss": 1.1835,
"step": 7170
},
{
"epoch": 0.8975,
"grad_norm": 3.0018982887268066,
"learning_rate": 5.13125e-06,
"loss": 1.2182,
"step": 7180
},
{
"epoch": 0.89875,
"grad_norm": 2.873230218887329,
"learning_rate": 5.06875e-06,
"loss": 1.2015,
"step": 7190
},
{
"epoch": 0.9,
"grad_norm": 3.587171792984009,
"learning_rate": 5.0062500000000006e-06,
"loss": 1.1995,
"step": 7200
},
{
"epoch": 0.90125,
"grad_norm": 3.585000514984131,
"learning_rate": 4.943750000000001e-06,
"loss": 1.2743,
"step": 7210
},
{
"epoch": 0.9025,
"grad_norm": 3.029139757156372,
"learning_rate": 4.88125e-06,
"loss": 1.2592,
"step": 7220
},
{
"epoch": 0.90375,
"grad_norm": 3.077986240386963,
"learning_rate": 4.81875e-06,
"loss": 1.1442,
"step": 7230
},
{
"epoch": 0.905,
"grad_norm": 2.580195426940918,
"learning_rate": 4.75625e-06,
"loss": 1.1531,
"step": 7240
},
{
"epoch": 0.90625,
"grad_norm": 4.1036272048950195,
"learning_rate": 4.693750000000001e-06,
"loss": 1.3263,
"step": 7250
},
{
"epoch": 0.9075,
"grad_norm": 2.9886510372161865,
"learning_rate": 4.63125e-06,
"loss": 1.2372,
"step": 7260
},
{
"epoch": 0.90875,
"grad_norm": 3.5167458057403564,
"learning_rate": 4.56875e-06,
"loss": 1.3199,
"step": 7270
},
{
"epoch": 0.91,
"grad_norm": 2.5231704711914062,
"learning_rate": 4.50625e-06,
"loss": 1.177,
"step": 7280
},
{
"epoch": 0.91125,
"grad_norm": 3.786348581314087,
"learning_rate": 4.44375e-06,
"loss": 1.1602,
"step": 7290
},
{
"epoch": 0.9125,
"grad_norm": 3.3437860012054443,
"learning_rate": 4.38125e-06,
"loss": 1.266,
"step": 7300
},
{
"epoch": 0.91375,
"grad_norm": 3.798862934112549,
"learning_rate": 4.31875e-06,
"loss": 1.1616,
"step": 7310
},
{
"epoch": 0.915,
"grad_norm": 3.9102814197540283,
"learning_rate": 4.25625e-06,
"loss": 1.1375,
"step": 7320
},
{
"epoch": 0.91625,
"grad_norm": 3.7560813426971436,
"learning_rate": 4.19375e-06,
"loss": 1.1178,
"step": 7330
},
{
"epoch": 0.9175,
"grad_norm": 3.210345983505249,
"learning_rate": 4.1312500000000005e-06,
"loss": 1.0262,
"step": 7340
},
{
"epoch": 0.91875,
"grad_norm": 3.9245500564575195,
"learning_rate": 4.068750000000001e-06,
"loss": 1.2043,
"step": 7350
},
{
"epoch": 0.92,
"grad_norm": 3.6340880393981934,
"learning_rate": 4.00625e-06,
"loss": 1.2074,
"step": 7360
},
{
"epoch": 0.92125,
"grad_norm": 2.729948043823242,
"learning_rate": 3.94375e-06,
"loss": 1.2154,
"step": 7370
},
{
"epoch": 0.9225,
"grad_norm": 5.572306156158447,
"learning_rate": 3.88125e-06,
"loss": 1.2146,
"step": 7380
},
{
"epoch": 0.92375,
"grad_norm": 3.3658738136291504,
"learning_rate": 3.8187500000000005e-06,
"loss": 1.2932,
"step": 7390
},
{
"epoch": 0.925,
"grad_norm": 3.831812620162964,
"learning_rate": 3.7562500000000002e-06,
"loss": 1.276,
"step": 7400
},
{
"epoch": 0.92625,
"grad_norm": 2.969461441040039,
"learning_rate": 3.69375e-06,
"loss": 1.1595,
"step": 7410
},
{
"epoch": 0.9275,
"grad_norm": 4.540526390075684,
"learning_rate": 3.6312499999999997e-06,
"loss": 1.2967,
"step": 7420
},
{
"epoch": 0.92875,
"grad_norm": 4.625833988189697,
"learning_rate": 3.56875e-06,
"loss": 1.1621,
"step": 7430
},
{
"epoch": 0.93,
"grad_norm": 3.144590139389038,
"learning_rate": 3.5062500000000005e-06,
"loss": 1.1817,
"step": 7440
},
{
"epoch": 0.93125,
"grad_norm": 4.031451225280762,
"learning_rate": 3.4437500000000003e-06,
"loss": 1.3139,
"step": 7450
},
{
"epoch": 0.9325,
"grad_norm": 3.656533718109131,
"learning_rate": 3.3812500000000004e-06,
"loss": 1.2257,
"step": 7460
},
{
"epoch": 0.93375,
"grad_norm": 3.497356653213501,
"learning_rate": 3.31875e-06,
"loss": 1.2571,
"step": 7470
},
{
"epoch": 0.935,
"grad_norm": 3.372255802154541,
"learning_rate": 3.2562500000000004e-06,
"loss": 1.2038,
"step": 7480
},
{
"epoch": 0.93625,
"grad_norm": 3.349648952484131,
"learning_rate": 3.19375e-06,
"loss": 1.2484,
"step": 7490
},
{
"epoch": 0.9375,
"grad_norm": 3.791111469268799,
"learning_rate": 3.1312500000000003e-06,
"loss": 1.2193,
"step": 7500
},
{
"epoch": 0.93875,
"grad_norm": 2.6422860622406006,
"learning_rate": 3.06875e-06,
"loss": 1.1635,
"step": 7510
},
{
"epoch": 0.94,
"grad_norm": 4.382972240447998,
"learning_rate": 3.0062500000000002e-06,
"loss": 1.2232,
"step": 7520
},
{
"epoch": 0.94125,
"grad_norm": 3.4872641563415527,
"learning_rate": 2.94375e-06,
"loss": 1.1796,
"step": 7530
},
{
"epoch": 0.9425,
"grad_norm": 3.1682002544403076,
"learning_rate": 2.88125e-06,
"loss": 1.2348,
"step": 7540
},
{
"epoch": 0.94375,
"grad_norm": 3.578993558883667,
"learning_rate": 2.8187500000000003e-06,
"loss": 1.2451,
"step": 7550
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1593221075369984e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}