| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.94375, | |
| "eval_steps": 500, | |
| "global_step": 7550, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00125, | |
| "grad_norm": 15.714543342590332, | |
| "learning_rate": 4.994375e-05, | |
| "loss": 2.8924, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 10.94906234741211, | |
| "learning_rate": 4.988125e-05, | |
| "loss": 2.5952, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00375, | |
| "grad_norm": 15.514859199523926, | |
| "learning_rate": 4.981875e-05, | |
| "loss": 2.4666, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 9.053750991821289, | |
| "learning_rate": 4.975625000000001e-05, | |
| "loss": 2.3325, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00625, | |
| "grad_norm": 19.168121337890625, | |
| "learning_rate": 4.969375e-05, | |
| "loss": 2.3066, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 9.255040168762207, | |
| "learning_rate": 4.9631250000000004e-05, | |
| "loss": 2.2425, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00875, | |
| "grad_norm": 8.124181747436523, | |
| "learning_rate": 4.956875e-05, | |
| "loss": 2.1237, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 7.121690273284912, | |
| "learning_rate": 4.950625000000001e-05, | |
| "loss": 1.9941, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01125, | |
| "grad_norm": 8.191390991210938, | |
| "learning_rate": 4.944375e-05, | |
| "loss": 2.118, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 7.51083517074585, | |
| "learning_rate": 4.9381250000000004e-05, | |
| "loss": 1.9932, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01375, | |
| "grad_norm": 7.706765174865723, | |
| "learning_rate": 4.931875e-05, | |
| "loss": 2.2066, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 6.421011447906494, | |
| "learning_rate": 4.925625e-05, | |
| "loss": 2.0439, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.01625, | |
| "grad_norm": 17.403257369995117, | |
| "learning_rate": 4.9193750000000007e-05, | |
| "loss": 2.1501, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 5.682934284210205, | |
| "learning_rate": 4.913125e-05, | |
| "loss": 2.0993, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.01875, | |
| "grad_norm": 8.931119918823242, | |
| "learning_rate": 4.9068750000000003e-05, | |
| "loss": 2.0383, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 5.6062140464782715, | |
| "learning_rate": 4.900625e-05, | |
| "loss": 1.9334, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.02125, | |
| "grad_norm": 8.099666595458984, | |
| "learning_rate": 4.894375000000001e-05, | |
| "loss": 1.9423, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 5.474426746368408, | |
| "learning_rate": 4.888125e-05, | |
| "loss": 2.0548, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.02375, | |
| "grad_norm": 6.269449710845947, | |
| "learning_rate": 4.8818750000000004e-05, | |
| "loss": 1.9097, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 6.422362804412842, | |
| "learning_rate": 4.875625e-05, | |
| "loss": 1.9114, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02625, | |
| "grad_norm": 8.737632751464844, | |
| "learning_rate": 4.869375000000001e-05, | |
| "loss": 2.0777, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 7.279562950134277, | |
| "learning_rate": 4.863125e-05, | |
| "loss": 1.9173, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.02875, | |
| "grad_norm": 7.346338272094727, | |
| "learning_rate": 4.8568750000000005e-05, | |
| "loss": 1.7516, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 7.4580888748168945, | |
| "learning_rate": 4.850625e-05, | |
| "loss": 1.9487, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03125, | |
| "grad_norm": 6.269708633422852, | |
| "learning_rate": 4.844375e-05, | |
| "loss": 1.8654, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 5.809817790985107, | |
| "learning_rate": 4.838125e-05, | |
| "loss": 1.9751, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.03375, | |
| "grad_norm": 11.987732887268066, | |
| "learning_rate": 4.831875e-05, | |
| "loss": 2.0062, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 8.748228073120117, | |
| "learning_rate": 4.8256250000000004e-05, | |
| "loss": 2.1293, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.03625, | |
| "grad_norm": 5.571599960327148, | |
| "learning_rate": 4.819375e-05, | |
| "loss": 1.9504, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 5.935739517211914, | |
| "learning_rate": 4.813125e-05, | |
| "loss": 1.8399, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03875, | |
| "grad_norm": 5.676118850708008, | |
| "learning_rate": 4.806875e-05, | |
| "loss": 1.9019, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 8.595135688781738, | |
| "learning_rate": 4.8006250000000005e-05, | |
| "loss": 2.0299, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04125, | |
| "grad_norm": 5.606544017791748, | |
| "learning_rate": 4.794375e-05, | |
| "loss": 1.8806, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 7.973094463348389, | |
| "learning_rate": 4.788125e-05, | |
| "loss": 1.7285, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.04375, | |
| "grad_norm": 9.09721851348877, | |
| "learning_rate": 4.781875e-05, | |
| "loss": 1.9773, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 9.41321849822998, | |
| "learning_rate": 4.7756250000000005e-05, | |
| "loss": 1.9834, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.04625, | |
| "grad_norm": 4.810961723327637, | |
| "learning_rate": 4.7693750000000004e-05, | |
| "loss": 1.9862, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 6.283267498016357, | |
| "learning_rate": 4.763125e-05, | |
| "loss": 1.8544, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.04875, | |
| "grad_norm": 9.017960548400879, | |
| "learning_rate": 4.756875e-05, | |
| "loss": 1.8689, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 6.013952732086182, | |
| "learning_rate": 4.750625e-05, | |
| "loss": 1.798, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05125, | |
| "grad_norm": 5.1610026359558105, | |
| "learning_rate": 4.7443750000000005e-05, | |
| "loss": 1.8669, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 5.481388092041016, | |
| "learning_rate": 4.738125e-05, | |
| "loss": 1.9778, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.05375, | |
| "grad_norm": 8.239945411682129, | |
| "learning_rate": 4.731875e-05, | |
| "loss": 1.6991, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 5.641376972198486, | |
| "learning_rate": 4.725625e-05, | |
| "loss": 1.7848, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.05625, | |
| "grad_norm": 7.561056613922119, | |
| "learning_rate": 4.7193750000000005e-05, | |
| "loss": 1.7594, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 4.920119762420654, | |
| "learning_rate": 4.7131250000000004e-05, | |
| "loss": 1.8387, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.05875, | |
| "grad_norm": 6.90638542175293, | |
| "learning_rate": 4.706875e-05, | |
| "loss": 1.8461, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 6.640336036682129, | |
| "learning_rate": 4.700625e-05, | |
| "loss": 1.7583, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.06125, | |
| "grad_norm": 5.126943588256836, | |
| "learning_rate": 4.6943750000000006e-05, | |
| "loss": 1.9002, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 4.264902591705322, | |
| "learning_rate": 4.6881250000000005e-05, | |
| "loss": 1.9068, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06375, | |
| "grad_norm": 7.165338039398193, | |
| "learning_rate": 4.681875e-05, | |
| "loss": 1.7812, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 7.0703277587890625, | |
| "learning_rate": 4.675625e-05, | |
| "loss": 1.698, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.06625, | |
| "grad_norm": 6.396975040435791, | |
| "learning_rate": 4.669375e-05, | |
| "loss": 1.7182, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 8.975642204284668, | |
| "learning_rate": 4.6631250000000005e-05, | |
| "loss": 1.8448, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.06875, | |
| "grad_norm": 5.405190467834473, | |
| "learning_rate": 4.656875e-05, | |
| "loss": 1.7348, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 5.298381328582764, | |
| "learning_rate": 4.650625e-05, | |
| "loss": 1.6221, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.07125, | |
| "grad_norm": 4.854972839355469, | |
| "learning_rate": 4.644375e-05, | |
| "loss": 1.7367, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 5.756941795349121, | |
| "learning_rate": 4.6381250000000006e-05, | |
| "loss": 1.801, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.07375, | |
| "grad_norm": 4.7184600830078125, | |
| "learning_rate": 4.631875e-05, | |
| "loss": 1.6898, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 5.140761852264404, | |
| "learning_rate": 4.625625e-05, | |
| "loss": 1.6612, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07625, | |
| "grad_norm": 3.7104735374450684, | |
| "learning_rate": 4.619375e-05, | |
| "loss": 1.624, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 6.2447896003723145, | |
| "learning_rate": 4.613125000000001e-05, | |
| "loss": 1.7355, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.07875, | |
| "grad_norm": 9.780672073364258, | |
| "learning_rate": 4.6068750000000005e-05, | |
| "loss": 1.6192, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 5.958286285400391, | |
| "learning_rate": 4.6006250000000004e-05, | |
| "loss": 1.6157, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.08125, | |
| "grad_norm": 5.786264419555664, | |
| "learning_rate": 4.594375e-05, | |
| "loss": 1.633, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 4.75607967376709, | |
| "learning_rate": 4.588125e-05, | |
| "loss": 1.8139, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.08375, | |
| "grad_norm": 4.077645301818848, | |
| "learning_rate": 4.5818750000000006e-05, | |
| "loss": 1.6889, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 5.341221332550049, | |
| "learning_rate": 4.575625e-05, | |
| "loss": 1.7112, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.08625, | |
| "grad_norm": 5.421123027801514, | |
| "learning_rate": 4.569375e-05, | |
| "loss": 1.729, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 5.6531829833984375, | |
| "learning_rate": 4.563125e-05, | |
| "loss": 1.7805, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.08875, | |
| "grad_norm": 4.264986515045166, | |
| "learning_rate": 4.5568750000000006e-05, | |
| "loss": 1.5703, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.658288955688477, | |
| "learning_rate": 4.550625e-05, | |
| "loss": 1.8264, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.09125, | |
| "grad_norm": 5.032812118530273, | |
| "learning_rate": 4.5443750000000003e-05, | |
| "loss": 1.6834, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 4.175335884094238, | |
| "learning_rate": 4.538125e-05, | |
| "loss": 1.8537, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.09375, | |
| "grad_norm": 6.916208267211914, | |
| "learning_rate": 4.531875000000001e-05, | |
| "loss": 1.7556, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 6.064156532287598, | |
| "learning_rate": 4.525625e-05, | |
| "loss": 1.7132, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.09625, | |
| "grad_norm": 6.488204002380371, | |
| "learning_rate": 4.5193750000000004e-05, | |
| "loss": 1.6907, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 5.419294357299805, | |
| "learning_rate": 4.513125e-05, | |
| "loss": 1.6911, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.09875, | |
| "grad_norm": 6.069253921508789, | |
| "learning_rate": 4.506875e-05, | |
| "loss": 1.7586, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.28116512298584, | |
| "learning_rate": 4.500625e-05, | |
| "loss": 1.7632, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10125, | |
| "grad_norm": 4.091380596160889, | |
| "learning_rate": 4.494375e-05, | |
| "loss": 1.6792, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 5.594090938568115, | |
| "learning_rate": 4.488125e-05, | |
| "loss": 1.8079, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.10375, | |
| "grad_norm": 5.238066673278809, | |
| "learning_rate": 4.481875e-05, | |
| "loss": 1.7537, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 4.945870399475098, | |
| "learning_rate": 4.475625e-05, | |
| "loss": 1.7236, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.10625, | |
| "grad_norm": 47.752830505371094, | |
| "learning_rate": 4.469375e-05, | |
| "loss": 1.7715, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 6.474725246429443, | |
| "learning_rate": 4.4631250000000004e-05, | |
| "loss": 1.6062, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.10875, | |
| "grad_norm": 5.710610866546631, | |
| "learning_rate": 4.456875e-05, | |
| "loss": 1.6496, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.289515495300293, | |
| "learning_rate": 4.450625000000001e-05, | |
| "loss": 1.7847, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.11125, | |
| "grad_norm": 4.040447235107422, | |
| "learning_rate": 4.444375e-05, | |
| "loss": 1.5642, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 4.721045970916748, | |
| "learning_rate": 4.4381250000000005e-05, | |
| "loss": 1.6424, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.11375, | |
| "grad_norm": 4.430590629577637, | |
| "learning_rate": 4.431875e-05, | |
| "loss": 1.5693, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 4.155664443969727, | |
| "learning_rate": 4.425625e-05, | |
| "loss": 1.5653, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.11625, | |
| "grad_norm": 5.376486778259277, | |
| "learning_rate": 4.419375e-05, | |
| "loss": 1.5292, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 5.888033390045166, | |
| "learning_rate": 4.413125e-05, | |
| "loss": 1.5894, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.11875, | |
| "grad_norm": 4.666552543640137, | |
| "learning_rate": 4.4068750000000004e-05, | |
| "loss": 1.6117, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.565912246704102, | |
| "learning_rate": 4.400625e-05, | |
| "loss": 1.7492, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.12125, | |
| "grad_norm": 4.324479579925537, | |
| "learning_rate": 4.394375e-05, | |
| "loss": 1.6294, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 5.823368072509766, | |
| "learning_rate": 4.388125e-05, | |
| "loss": 1.6953, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.12375, | |
| "grad_norm": 6.85033655166626, | |
| "learning_rate": 4.3818750000000005e-05, | |
| "loss": 1.5723, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 6.246133327484131, | |
| "learning_rate": 4.375625e-05, | |
| "loss": 1.6042, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.12625, | |
| "grad_norm": 4.677844047546387, | |
| "learning_rate": 4.369375e-05, | |
| "loss": 1.6391, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 6.899301528930664, | |
| "learning_rate": 4.363125e-05, | |
| "loss": 1.5956, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.12875, | |
| "grad_norm": 4.530190944671631, | |
| "learning_rate": 4.3568750000000005e-05, | |
| "loss": 1.537, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.287406921386719, | |
| "learning_rate": 4.3506250000000004e-05, | |
| "loss": 1.7437, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.13125, | |
| "grad_norm": 3.828369140625, | |
| "learning_rate": 4.344375e-05, | |
| "loss": 1.6047, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 4.649002552032471, | |
| "learning_rate": 4.338125e-05, | |
| "loss": 1.5932, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.13375, | |
| "grad_norm": 6.695621967315674, | |
| "learning_rate": 4.331875e-05, | |
| "loss": 1.7681, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 3.446563243865967, | |
| "learning_rate": 4.3256250000000004e-05, | |
| "loss": 1.622, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.13625, | |
| "grad_norm": 4.040626525878906, | |
| "learning_rate": 4.3193749999999996e-05, | |
| "loss": 1.5857, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 3.937739372253418, | |
| "learning_rate": 4.313125e-05, | |
| "loss": 1.613, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.13875, | |
| "grad_norm": 3.4581360816955566, | |
| "learning_rate": 4.306875e-05, | |
| "loss": 1.5218, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.91287899017334, | |
| "learning_rate": 4.3006250000000005e-05, | |
| "loss": 1.6676, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.14125, | |
| "grad_norm": 5.508735179901123, | |
| "learning_rate": 4.2943750000000004e-05, | |
| "loss": 1.673, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 6.8148512840271, | |
| "learning_rate": 4.288125e-05, | |
| "loss": 1.6635, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.14375, | |
| "grad_norm": 4.404072284698486, | |
| "learning_rate": 4.281875e-05, | |
| "loss": 1.6963, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 4.8719682693481445, | |
| "learning_rate": 4.2756250000000006e-05, | |
| "loss": 1.554, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.14625, | |
| "grad_norm": 4.299533843994141, | |
| "learning_rate": 4.2693750000000004e-05, | |
| "loss": 1.6114, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 4.48207426071167, | |
| "learning_rate": 4.263125e-05, | |
| "loss": 1.5773, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.14875, | |
| "grad_norm": 3.618351697921753, | |
| "learning_rate": 4.256875e-05, | |
| "loss": 1.4916, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.393692970275879, | |
| "learning_rate": 4.250625e-05, | |
| "loss": 1.6461, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15125, | |
| "grad_norm": 3.9668004512786865, | |
| "learning_rate": 4.2443750000000005e-05, | |
| "loss": 1.5421, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 5.683198928833008, | |
| "learning_rate": 4.238125e-05, | |
| "loss": 1.6221, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.15375, | |
| "grad_norm": 4.367931365966797, | |
| "learning_rate": 4.231875e-05, | |
| "loss": 1.6046, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 4.630238056182861, | |
| "learning_rate": 4.225625e-05, | |
| "loss": 1.5636, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 3.8488829135894775, | |
| "learning_rate": 4.2193750000000006e-05, | |
| "loss": 1.5471, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 3.6925644874572754, | |
| "learning_rate": 4.213125e-05, | |
| "loss": 1.5385, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.15875, | |
| "grad_norm": 7.884620189666748, | |
| "learning_rate": 4.206875e-05, | |
| "loss": 1.4549, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.773382186889648, | |
| "learning_rate": 4.200625e-05, | |
| "loss": 1.6672, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.16125, | |
| "grad_norm": 3.8848724365234375, | |
| "learning_rate": 4.1943750000000006e-05, | |
| "loss": 1.4247, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 5.548094272613525, | |
| "learning_rate": 4.188125e-05, | |
| "loss": 1.817, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.16375, | |
| "grad_norm": 5.077624320983887, | |
| "learning_rate": 4.1818750000000003e-05, | |
| "loss": 1.6165, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 4.995692729949951, | |
| "learning_rate": 4.175625e-05, | |
| "loss": 1.5932, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.16625, | |
| "grad_norm": 4.223556041717529, | |
| "learning_rate": 4.169375e-05, | |
| "loss": 1.6204, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 5.034037113189697, | |
| "learning_rate": 4.163125e-05, | |
| "loss": 1.5817, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.16875, | |
| "grad_norm": 6.067126274108887, | |
| "learning_rate": 4.1568750000000004e-05, | |
| "loss": 1.6194, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.081786155700684, | |
| "learning_rate": 4.150625e-05, | |
| "loss": 1.4929, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.17125, | |
| "grad_norm": 4.328601837158203, | |
| "learning_rate": 4.144375e-05, | |
| "loss": 1.6266, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 4.224490642547607, | |
| "learning_rate": 4.1381250000000006e-05, | |
| "loss": 1.577, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.17375, | |
| "grad_norm": 3.7425456047058105, | |
| "learning_rate": 4.131875e-05, | |
| "loss": 1.4331, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 4.163187503814697, | |
| "learning_rate": 4.125625e-05, | |
| "loss": 1.6496, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.17625, | |
| "grad_norm": 4.357766628265381, | |
| "learning_rate": 4.119375e-05, | |
| "loss": 1.7169, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 4.384897232055664, | |
| "learning_rate": 4.113125000000001e-05, | |
| "loss": 1.4617, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.17875, | |
| "grad_norm": 4.748656749725342, | |
| "learning_rate": 4.106875e-05, | |
| "loss": 1.5784, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.227325439453125, | |
| "learning_rate": 4.1006250000000004e-05, | |
| "loss": 1.5098, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.18125, | |
| "grad_norm": 4.830343723297119, | |
| "learning_rate": 4.094375e-05, | |
| "loss": 1.618, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 4.403887748718262, | |
| "learning_rate": 4.088125e-05, | |
| "loss": 1.5866, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.18375, | |
| "grad_norm": 5.818870544433594, | |
| "learning_rate": 4.081875e-05, | |
| "loss": 1.4826, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 6.949367046356201, | |
| "learning_rate": 4.0756250000000005e-05, | |
| "loss": 1.4832, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.18625, | |
| "grad_norm": 4.030486583709717, | |
| "learning_rate": 4.069375e-05, | |
| "loss": 1.4703, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 4.741464614868164, | |
| "learning_rate": 4.063125e-05, | |
| "loss": 1.7756, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.18875, | |
| "grad_norm": 4.840798377990723, | |
| "learning_rate": 4.056875e-05, | |
| "loss": 1.6653, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.910340309143066, | |
| "learning_rate": 4.050625e-05, | |
| "loss": 1.4131, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.19125, | |
| "grad_norm": 5.179189205169678, | |
| "learning_rate": 4.0443750000000004e-05, | |
| "loss": 1.5972, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 5.435876369476318, | |
| "learning_rate": 4.038125e-05, | |
| "loss": 1.5848, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.19375, | |
| "grad_norm": 3.8866443634033203, | |
| "learning_rate": 4.031875e-05, | |
| "loss": 1.5638, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 4.297860145568848, | |
| "learning_rate": 4.025625e-05, | |
| "loss": 1.4118, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.19625, | |
| "grad_norm": 5.706923484802246, | |
| "learning_rate": 4.0193750000000005e-05, | |
| "loss": 1.4316, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 4.5453925132751465, | |
| "learning_rate": 4.013125e-05, | |
| "loss": 1.4754, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.19875, | |
| "grad_norm": 4.322735786437988, | |
| "learning_rate": 4.006875e-05, | |
| "loss": 1.5779, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.953495979309082, | |
| "learning_rate": 4.000625e-05, | |
| "loss": 1.6211, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20125, | |
| "grad_norm": 3.003465414047241, | |
| "learning_rate": 3.9943750000000005e-05, | |
| "loss": 1.5667, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 4.02094030380249, | |
| "learning_rate": 3.9881250000000004e-05, | |
| "loss": 1.5953, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.20375, | |
| "grad_norm": 3.9984161853790283, | |
| "learning_rate": 3.981875e-05, | |
| "loss": 1.5122, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 4.243444442749023, | |
| "learning_rate": 3.975625e-05, | |
| "loss": 1.6517, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.20625, | |
| "grad_norm": 4.29213809967041, | |
| "learning_rate": 3.969375e-05, | |
| "loss": 1.5576, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 4.561241149902344, | |
| "learning_rate": 3.9631250000000004e-05, | |
| "loss": 1.5213, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.20875, | |
| "grad_norm": 4.34321403503418, | |
| "learning_rate": 3.956875e-05, | |
| "loss": 1.4204, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.353539943695068, | |
| "learning_rate": 3.950625e-05, | |
| "loss": 1.5922, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.21125, | |
| "grad_norm": 4.75934362411499, | |
| "learning_rate": 3.944375e-05, | |
| "loss": 1.4769, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 4.053194999694824, | |
| "learning_rate": 3.9381250000000005e-05, | |
| "loss": 1.5125, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.21375, | |
| "grad_norm": 5.373641490936279, | |
| "learning_rate": 3.9318750000000004e-05, | |
| "loss": 1.6501, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 3.5294227600097656, | |
| "learning_rate": 3.925625e-05, | |
| "loss": 1.5721, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.21625, | |
| "grad_norm": 3.870500326156616, | |
| "learning_rate": 3.919375e-05, | |
| "loss": 1.5217, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 27.053773880004883, | |
| "learning_rate": 3.9131250000000006e-05, | |
| "loss": 1.5216, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.21875, | |
| "grad_norm": 8.008138656616211, | |
| "learning_rate": 3.9068750000000004e-05, | |
| "loss": 1.6043, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.010753631591797, | |
| "learning_rate": 3.900625e-05, | |
| "loss": 1.5854, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.22125, | |
| "grad_norm": 5.048765659332275, | |
| "learning_rate": 3.894375e-05, | |
| "loss": 1.5564, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 5.92832612991333, | |
| "learning_rate": 3.888125e-05, | |
| "loss": 1.6321, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.22375, | |
| "grad_norm": 3.242619037628174, | |
| "learning_rate": 3.8818750000000005e-05, | |
| "loss": 1.5837, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 6.22261381149292, | |
| "learning_rate": 3.875625e-05, | |
| "loss": 1.5758, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.22625, | |
| "grad_norm": 4.630067348480225, | |
| "learning_rate": 3.869375e-05, | |
| "loss": 1.379, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.2275, | |
| "grad_norm": 5.385432720184326, | |
| "learning_rate": 3.863125e-05, | |
| "loss": 1.6319, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.22875, | |
| "grad_norm": 3.7115695476531982, | |
| "learning_rate": 3.8568750000000006e-05, | |
| "loss": 1.5246, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 18.826236724853516, | |
| "learning_rate": 3.850625e-05, | |
| "loss": 1.5025, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.23125, | |
| "grad_norm": 4.666762828826904, | |
| "learning_rate": 3.844375e-05, | |
| "loss": 1.5511, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2325, | |
| "grad_norm": 4.544824600219727, | |
| "learning_rate": 3.838125e-05, | |
| "loss": 1.5163, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.23375, | |
| "grad_norm": 6.902198314666748, | |
| "learning_rate": 3.8318750000000006e-05, | |
| "loss": 1.5313, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 5.414902687072754, | |
| "learning_rate": 3.8256250000000005e-05, | |
| "loss": 1.4708, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.23625, | |
| "grad_norm": 7.254164218902588, | |
| "learning_rate": 3.8193750000000003e-05, | |
| "loss": 1.4522, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 2.95312237739563, | |
| "learning_rate": 3.813125e-05, | |
| "loss": 1.3787, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.23875, | |
| "grad_norm": 3.5326123237609863, | |
| "learning_rate": 3.806875e-05, | |
| "loss": 1.3401, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.167004585266113, | |
| "learning_rate": 3.8006250000000006e-05, | |
| "loss": 1.4521, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.24125, | |
| "grad_norm": 4.5269646644592285, | |
| "learning_rate": 3.794375e-05, | |
| "loss": 1.5217, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2425, | |
| "grad_norm": 3.9108593463897705, | |
| "learning_rate": 3.788125e-05, | |
| "loss": 1.4758, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.24375, | |
| "grad_norm": 5.435783386230469, | |
| "learning_rate": 3.781875e-05, | |
| "loss": 1.5579, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 4.055588722229004, | |
| "learning_rate": 3.7756250000000006e-05, | |
| "loss": 1.4664, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.24625, | |
| "grad_norm": 3.8323545455932617, | |
| "learning_rate": 3.769375e-05, | |
| "loss": 1.4167, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.2475, | |
| "grad_norm": 3.928833484649658, | |
| "learning_rate": 3.763125e-05, | |
| "loss": 1.538, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.24875, | |
| "grad_norm": 5.081177711486816, | |
| "learning_rate": 3.756875e-05, | |
| "loss": 1.5143, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.005138874053955, | |
| "learning_rate": 3.750625000000001e-05, | |
| "loss": 1.4391, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.25125, | |
| "grad_norm": 7.569023132324219, | |
| "learning_rate": 3.744375e-05, | |
| "loss": 1.5854, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.2525, | |
| "grad_norm": 3.234931230545044, | |
| "learning_rate": 3.7381250000000004e-05, | |
| "loss": 1.5288, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.25375, | |
| "grad_norm": 4.21964168548584, | |
| "learning_rate": 3.731875e-05, | |
| "loss": 1.5045, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 5.102002143859863, | |
| "learning_rate": 3.725625e-05, | |
| "loss": 1.5022, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.25625, | |
| "grad_norm": 7.323031902313232, | |
| "learning_rate": 3.719375e-05, | |
| "loss": 1.8256, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.2575, | |
| "grad_norm": 3.650108575820923, | |
| "learning_rate": 3.713125e-05, | |
| "loss": 1.3734, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.25875, | |
| "grad_norm": 3.9725680351257324, | |
| "learning_rate": 3.706875e-05, | |
| "loss": 1.6534, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 3.5736091136932373, | |
| "learning_rate": 3.700625e-05, | |
| "loss": 1.5137, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.26125, | |
| "grad_norm": 3.700044631958008, | |
| "learning_rate": 3.694375e-05, | |
| "loss": 1.4202, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 4.076671600341797, | |
| "learning_rate": 3.688125e-05, | |
| "loss": 1.5527, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.26375, | |
| "grad_norm": 4.140468597412109, | |
| "learning_rate": 3.6818750000000004e-05, | |
| "loss": 1.4233, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 4.703122615814209, | |
| "learning_rate": 3.675625e-05, | |
| "loss": 1.4997, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.26625, | |
| "grad_norm": 5.30742883682251, | |
| "learning_rate": 3.669375000000001e-05, | |
| "loss": 1.3517, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.2675, | |
| "grad_norm": 6.19927453994751, | |
| "learning_rate": 3.663125e-05, | |
| "loss": 1.4677, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.26875, | |
| "grad_norm": 5.471877098083496, | |
| "learning_rate": 3.6568750000000005e-05, | |
| "loss": 1.3718, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.80817985534668, | |
| "learning_rate": 3.650625e-05, | |
| "loss": 1.5756, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.27125, | |
| "grad_norm": 4.3927717208862305, | |
| "learning_rate": 3.644375e-05, | |
| "loss": 1.4272, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.2725, | |
| "grad_norm": 4.650943279266357, | |
| "learning_rate": 3.638125e-05, | |
| "loss": 1.5826, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.27375, | |
| "grad_norm": 2.954941511154175, | |
| "learning_rate": 3.631875e-05, | |
| "loss": 1.4817, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 3.7205264568328857, | |
| "learning_rate": 3.6256250000000004e-05, | |
| "loss": 1.3662, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.27625, | |
| "grad_norm": 4.387423038482666, | |
| "learning_rate": 3.619375e-05, | |
| "loss": 1.4639, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.2775, | |
| "grad_norm": 2.9313790798187256, | |
| "learning_rate": 3.613125e-05, | |
| "loss": 1.5223, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.27875, | |
| "grad_norm": 2.299884796142578, | |
| "learning_rate": 3.606875e-05, | |
| "loss": 1.4924, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.7828497886657715, | |
| "learning_rate": 3.6006250000000004e-05, | |
| "loss": 1.4188, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.28125, | |
| "grad_norm": 3.3752660751342773, | |
| "learning_rate": 3.594375e-05, | |
| "loss": 1.4604, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.2825, | |
| "grad_norm": 6.302489757537842, | |
| "learning_rate": 3.588125e-05, | |
| "loss": 1.4211, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.28375, | |
| "grad_norm": 26.017749786376953, | |
| "learning_rate": 3.581875e-05, | |
| "loss": 1.415, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 3.502596139907837, | |
| "learning_rate": 3.5756250000000005e-05, | |
| "loss": 1.4949, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.28625, | |
| "grad_norm": 5.614380359649658, | |
| "learning_rate": 3.5693750000000004e-05, | |
| "loss": 1.4836, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 3.932626485824585, | |
| "learning_rate": 3.563125e-05, | |
| "loss": 1.3806, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.28875, | |
| "grad_norm": 2.99294114112854, | |
| "learning_rate": 3.556875e-05, | |
| "loss": 1.5548, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.608867168426514, | |
| "learning_rate": 3.550625e-05, | |
| "loss": 1.4296, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.29125, | |
| "grad_norm": 5.113489627838135, | |
| "learning_rate": 3.5443750000000004e-05, | |
| "loss": 1.3717, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.2925, | |
| "grad_norm": 3.7373545169830322, | |
| "learning_rate": 3.5381249999999996e-05, | |
| "loss": 1.4817, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.29375, | |
| "grad_norm": 5.348628997802734, | |
| "learning_rate": 3.531875e-05, | |
| "loss": 1.4436, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 4.982232093811035, | |
| "learning_rate": 3.525625e-05, | |
| "loss": 1.3956, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.29625, | |
| "grad_norm": 9.024496078491211, | |
| "learning_rate": 3.5193750000000005e-05, | |
| "loss": 1.4792, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.2975, | |
| "grad_norm": 4.083111763000488, | |
| "learning_rate": 3.5131250000000004e-05, | |
| "loss": 1.5715, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.29875, | |
| "grad_norm": 3.5770645141601562, | |
| "learning_rate": 3.506875e-05, | |
| "loss": 1.4707, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.641442775726318, | |
| "learning_rate": 3.500625e-05, | |
| "loss": 1.362, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.30125, | |
| "grad_norm": 3.524186849594116, | |
| "learning_rate": 3.4943750000000006e-05, | |
| "loss": 1.4686, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3025, | |
| "grad_norm": 4.681451797485352, | |
| "learning_rate": 3.4881250000000004e-05, | |
| "loss": 1.4739, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.30375, | |
| "grad_norm": 5.3212785720825195, | |
| "learning_rate": 3.481875e-05, | |
| "loss": 1.5322, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 4.161744117736816, | |
| "learning_rate": 3.475625e-05, | |
| "loss": 1.3801, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.30625, | |
| "grad_norm": 4.7510151863098145, | |
| "learning_rate": 3.469375e-05, | |
| "loss": 1.4092, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.3075, | |
| "grad_norm": 3.8327901363372803, | |
| "learning_rate": 3.4631250000000005e-05, | |
| "loss": 1.4072, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.30875, | |
| "grad_norm": 3.7004222869873047, | |
| "learning_rate": 3.456875e-05, | |
| "loss": 1.4626, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.66420841217041, | |
| "learning_rate": 3.450625e-05, | |
| "loss": 1.3397, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.31125, | |
| "grad_norm": 5.986914157867432, | |
| "learning_rate": 3.444375e-05, | |
| "loss": 1.4815, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 3.43625807762146, | |
| "learning_rate": 3.4381250000000006e-05, | |
| "loss": 1.4322, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.31375, | |
| "grad_norm": 4.031944274902344, | |
| "learning_rate": 3.431875e-05, | |
| "loss": 1.4035, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 3.607931613922119, | |
| "learning_rate": 3.425625e-05, | |
| "loss": 1.5213, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.31625, | |
| "grad_norm": 5.655627727508545, | |
| "learning_rate": 3.419375e-05, | |
| "loss": 1.4377, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.3175, | |
| "grad_norm": 4.682887077331543, | |
| "learning_rate": 3.4131250000000006e-05, | |
| "loss": 1.2967, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.31875, | |
| "grad_norm": 4.796265125274658, | |
| "learning_rate": 3.406875e-05, | |
| "loss": 1.3274, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 3.3428382873535156, | |
| "learning_rate": 3.400625e-05, | |
| "loss": 1.3762, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.32125, | |
| "grad_norm": 4.231629371643066, | |
| "learning_rate": 3.394375e-05, | |
| "loss": 1.3922, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.3225, | |
| "grad_norm": 3.5596585273742676, | |
| "learning_rate": 3.388125e-05, | |
| "loss": 1.3427, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.32375, | |
| "grad_norm": 4.1195783615112305, | |
| "learning_rate": 3.381875e-05, | |
| "loss": 1.362, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 4.000838279724121, | |
| "learning_rate": 3.375625e-05, | |
| "loss": 1.4245, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.32625, | |
| "grad_norm": 5.7205939292907715, | |
| "learning_rate": 3.369375e-05, | |
| "loss": 1.526, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.3275, | |
| "grad_norm": 4.729959011077881, | |
| "learning_rate": 3.363125e-05, | |
| "loss": 1.3265, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.32875, | |
| "grad_norm": 3.846036434173584, | |
| "learning_rate": 3.3568750000000006e-05, | |
| "loss": 1.3523, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 3.625514507293701, | |
| "learning_rate": 3.350625e-05, | |
| "loss": 1.3483, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.33125, | |
| "grad_norm": 3.985917568206787, | |
| "learning_rate": 3.344375e-05, | |
| "loss": 1.3951, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3325, | |
| "grad_norm": 5.314172267913818, | |
| "learning_rate": 3.338125e-05, | |
| "loss": 1.4334, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.33375, | |
| "grad_norm": 5.172106742858887, | |
| "learning_rate": 3.331875000000001e-05, | |
| "loss": 1.333, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 3.5582571029663086, | |
| "learning_rate": 3.325625e-05, | |
| "loss": 1.4376, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.33625, | |
| "grad_norm": 3.68792462348938, | |
| "learning_rate": 3.3193750000000004e-05, | |
| "loss": 1.4401, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 3.366680860519409, | |
| "learning_rate": 3.313125e-05, | |
| "loss": 1.3825, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.33875, | |
| "grad_norm": 4.318718910217285, | |
| "learning_rate": 3.306875e-05, | |
| "loss": 1.4478, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.707590103149414, | |
| "learning_rate": 3.300625e-05, | |
| "loss": 1.4966, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.34125, | |
| "grad_norm": 3.624086380004883, | |
| "learning_rate": 3.294375e-05, | |
| "loss": 1.419, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.3425, | |
| "grad_norm": 4.7848711013793945, | |
| "learning_rate": 3.288125e-05, | |
| "loss": 1.29, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.34375, | |
| "grad_norm": 4.0258612632751465, | |
| "learning_rate": 3.281875e-05, | |
| "loss": 1.479, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 4.456843852996826, | |
| "learning_rate": 3.275625e-05, | |
| "loss": 1.4191, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.34625, | |
| "grad_norm": 4.327670097351074, | |
| "learning_rate": 3.269375e-05, | |
| "loss": 1.3777, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.3475, | |
| "grad_norm": 3.2591614723205566, | |
| "learning_rate": 3.2631250000000004e-05, | |
| "loss": 1.2713, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.34875, | |
| "grad_norm": 4.323492527008057, | |
| "learning_rate": 3.256875e-05, | |
| "loss": 1.3162, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.254138946533203, | |
| "learning_rate": 3.250625e-05, | |
| "loss": 1.5309, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.35125, | |
| "grad_norm": 3.481466054916382, | |
| "learning_rate": 3.244375e-05, | |
| "loss": 1.6049, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.3525, | |
| "grad_norm": 3.48063063621521, | |
| "learning_rate": 3.2381250000000004e-05, | |
| "loss": 1.5601, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.35375, | |
| "grad_norm": 3.9832093715667725, | |
| "learning_rate": 3.231875e-05, | |
| "loss": 1.4256, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 3.842890739440918, | |
| "learning_rate": 3.225625e-05, | |
| "loss": 1.3579, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.35625, | |
| "grad_norm": 4.633380889892578, | |
| "learning_rate": 3.219375e-05, | |
| "loss": 1.3881, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.3575, | |
| "grad_norm": 6.086498260498047, | |
| "learning_rate": 3.213125e-05, | |
| "loss": 1.4811, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.35875, | |
| "grad_norm": 4.031968593597412, | |
| "learning_rate": 3.2068750000000004e-05, | |
| "loss": 1.3418, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.838329315185547, | |
| "learning_rate": 3.200625e-05, | |
| "loss": 1.4378, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.36125, | |
| "grad_norm": 3.499248743057251, | |
| "learning_rate": 3.194375e-05, | |
| "loss": 1.2617, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.3625, | |
| "grad_norm": 4.68066930770874, | |
| "learning_rate": 3.188125e-05, | |
| "loss": 1.4701, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.36375, | |
| "grad_norm": 3.8823728561401367, | |
| "learning_rate": 3.1818750000000004e-05, | |
| "loss": 1.364, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 3.7089786529541016, | |
| "learning_rate": 3.175625e-05, | |
| "loss": 1.4843, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.36625, | |
| "grad_norm": 3.1307108402252197, | |
| "learning_rate": 3.169375e-05, | |
| "loss": 1.3714, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.3675, | |
| "grad_norm": 4.351153373718262, | |
| "learning_rate": 3.163125e-05, | |
| "loss": 1.4312, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.36875, | |
| "grad_norm": 4.765021800994873, | |
| "learning_rate": 3.1568750000000005e-05, | |
| "loss": 1.6201, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 3.538285493850708, | |
| "learning_rate": 3.1506250000000003e-05, | |
| "loss": 1.3805, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.37125, | |
| "grad_norm": 4.136841773986816, | |
| "learning_rate": 3.144375e-05, | |
| "loss": 1.3558, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.3725, | |
| "grad_norm": 4.298130512237549, | |
| "learning_rate": 3.138125e-05, | |
| "loss": 1.5242, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.37375, | |
| "grad_norm": 3.6436102390289307, | |
| "learning_rate": 3.131875e-05, | |
| "loss": 1.3199, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 4.527806758880615, | |
| "learning_rate": 3.1256250000000004e-05, | |
| "loss": 1.531, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.37625, | |
| "grad_norm": 5.912485122680664, | |
| "learning_rate": 3.119375e-05, | |
| "loss": 1.444, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.3775, | |
| "grad_norm": 5.101160049438477, | |
| "learning_rate": 3.113125e-05, | |
| "loss": 1.5674, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.37875, | |
| "grad_norm": 5.113125324249268, | |
| "learning_rate": 3.106875e-05, | |
| "loss": 1.3053, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 3.990057945251465, | |
| "learning_rate": 3.1006250000000005e-05, | |
| "loss": 1.2181, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.38125, | |
| "grad_norm": 6.3468780517578125, | |
| "learning_rate": 3.0943749999999997e-05, | |
| "loss": 1.4149, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.3825, | |
| "grad_norm": 4.712606430053711, | |
| "learning_rate": 3.088125e-05, | |
| "loss": 1.4128, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.38375, | |
| "grad_norm": 5.313744068145752, | |
| "learning_rate": 3.081875e-05, | |
| "loss": 1.4877, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 4.46605110168457, | |
| "learning_rate": 3.0756250000000006e-05, | |
| "loss": 1.4314, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.38625, | |
| "grad_norm": 3.6377127170562744, | |
| "learning_rate": 3.069375e-05, | |
| "loss": 1.3966, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.3875, | |
| "grad_norm": 4.34388542175293, | |
| "learning_rate": 3.063125e-05, | |
| "loss": 1.2802, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.38875, | |
| "grad_norm": 11.9617338180542, | |
| "learning_rate": 3.056875e-05, | |
| "loss": 1.4014, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 3.758890390396118, | |
| "learning_rate": 3.0506250000000003e-05, | |
| "loss": 1.5544, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.39125, | |
| "grad_norm": 4.928178310394287, | |
| "learning_rate": 3.0443750000000005e-05, | |
| "loss": 1.4504, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.3925, | |
| "grad_norm": 4.8397722244262695, | |
| "learning_rate": 3.038125e-05, | |
| "loss": 1.3976, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.39375, | |
| "grad_norm": 4.685599327087402, | |
| "learning_rate": 3.0318750000000002e-05, | |
| "loss": 1.3353, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 6.802610397338867, | |
| "learning_rate": 3.0256250000000004e-05, | |
| "loss": 1.3951, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.39625, | |
| "grad_norm": 5.310746669769287, | |
| "learning_rate": 3.0193750000000005e-05, | |
| "loss": 1.3754, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.3975, | |
| "grad_norm": 3.733003616333008, | |
| "learning_rate": 3.013125e-05, | |
| "loss": 1.4712, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.39875, | |
| "grad_norm": 3.518083333969116, | |
| "learning_rate": 3.0068750000000002e-05, | |
| "loss": 1.425, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 3.223477840423584, | |
| "learning_rate": 3.000625e-05, | |
| "loss": 1.4198, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.40125, | |
| "grad_norm": 6.2975029945373535, | |
| "learning_rate": 2.9943750000000003e-05, | |
| "loss": 1.4008, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.4025, | |
| "grad_norm": 4.495896339416504, | |
| "learning_rate": 2.9881249999999998e-05, | |
| "loss": 1.608, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.40375, | |
| "grad_norm": 3.413543701171875, | |
| "learning_rate": 2.981875e-05, | |
| "loss": 1.4011, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 3.756793260574341, | |
| "learning_rate": 2.975625e-05, | |
| "loss": 1.5263, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.40625, | |
| "grad_norm": 3.6322548389434814, | |
| "learning_rate": 2.9693750000000003e-05, | |
| "loss": 1.3671, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.4075, | |
| "grad_norm": 3.74729061126709, | |
| "learning_rate": 2.963125e-05, | |
| "loss": 1.3878, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.40875, | |
| "grad_norm": 3.154021978378296, | |
| "learning_rate": 2.956875e-05, | |
| "loss": 1.2886, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 3.127899408340454, | |
| "learning_rate": 2.9506250000000002e-05, | |
| "loss": 1.542, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.41125, | |
| "grad_norm": 2.8965258598327637, | |
| "learning_rate": 2.9443750000000004e-05, | |
| "loss": 1.2855, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.4125, | |
| "grad_norm": 3.9691522121429443, | |
| "learning_rate": 2.938125e-05, | |
| "loss": 1.331, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.41375, | |
| "grad_norm": 4.497001647949219, | |
| "learning_rate": 2.931875e-05, | |
| "loss": 1.2743, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 4.911508083343506, | |
| "learning_rate": 2.9256250000000003e-05, | |
| "loss": 1.4838, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.41625, | |
| "grad_norm": 4.168112754821777, | |
| "learning_rate": 2.919375e-05, | |
| "loss": 1.5264, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.4175, | |
| "grad_norm": 4.177130699157715, | |
| "learning_rate": 2.913125e-05, | |
| "loss": 1.4769, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.41875, | |
| "grad_norm": 3.3406238555908203, | |
| "learning_rate": 2.9068750000000002e-05, | |
| "loss": 1.3252, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 3.403542995452881, | |
| "learning_rate": 2.900625e-05, | |
| "loss": 1.3591, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.42125, | |
| "grad_norm": 3.7915780544281006, | |
| "learning_rate": 2.8943750000000002e-05, | |
| "loss": 1.4208, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.4225, | |
| "grad_norm": 4.730687618255615, | |
| "learning_rate": 2.8881250000000004e-05, | |
| "loss": 1.3171, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.42375, | |
| "grad_norm": 3.976198673248291, | |
| "learning_rate": 2.881875e-05, | |
| "loss": 1.3666, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 3.474154472351074, | |
| "learning_rate": 2.875625e-05, | |
| "loss": 1.5027, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.42625, | |
| "grad_norm": 4.456624984741211, | |
| "learning_rate": 2.8693750000000003e-05, | |
| "loss": 1.3966, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.4275, | |
| "grad_norm": 3.5991241931915283, | |
| "learning_rate": 2.8631250000000005e-05, | |
| "loss": 1.3464, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.42875, | |
| "grad_norm": 3.397467613220215, | |
| "learning_rate": 2.856875e-05, | |
| "loss": 1.3021, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.232719898223877, | |
| "learning_rate": 2.8506250000000002e-05, | |
| "loss": 1.366, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.43125, | |
| "grad_norm": 4.262751579284668, | |
| "learning_rate": 2.8443750000000004e-05, | |
| "loss": 1.4335, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.4325, | |
| "grad_norm": 4.169719696044922, | |
| "learning_rate": 2.8381250000000002e-05, | |
| "loss": 1.403, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.43375, | |
| "grad_norm": 4.9819159507751465, | |
| "learning_rate": 2.831875e-05, | |
| "loss": 1.3418, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 3.556701421737671, | |
| "learning_rate": 2.8256250000000002e-05, | |
| "loss": 1.3712, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.43625, | |
| "grad_norm": 3.9347524642944336, | |
| "learning_rate": 2.819375e-05, | |
| "loss": 1.5704, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 3.451732873916626, | |
| "learning_rate": 2.8131250000000003e-05, | |
| "loss": 1.3976, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.43875, | |
| "grad_norm": 3.0148160457611084, | |
| "learning_rate": 2.8068749999999998e-05, | |
| "loss": 1.3539, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 3.8727331161499023, | |
| "learning_rate": 2.800625e-05, | |
| "loss": 1.4348, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.44125, | |
| "grad_norm": 4.594605445861816, | |
| "learning_rate": 2.794375e-05, | |
| "loss": 1.2116, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.4425, | |
| "grad_norm": 4.683310031890869, | |
| "learning_rate": 2.7881250000000003e-05, | |
| "loss": 1.3442, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.44375, | |
| "grad_norm": 3.2116706371307373, | |
| "learning_rate": 2.781875e-05, | |
| "loss": 1.3959, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 3.98592472076416, | |
| "learning_rate": 2.775625e-05, | |
| "loss": 1.3782, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.44625, | |
| "grad_norm": 2.84287691116333, | |
| "learning_rate": 2.7693750000000002e-05, | |
| "loss": 1.2789, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.4475, | |
| "grad_norm": 2.849111795425415, | |
| "learning_rate": 2.7631250000000004e-05, | |
| "loss": 1.3003, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.44875, | |
| "grad_norm": 3.8393287658691406, | |
| "learning_rate": 2.756875e-05, | |
| "loss": 1.422, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 3.058866024017334, | |
| "learning_rate": 2.750625e-05, | |
| "loss": 1.4432, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.45125, | |
| "grad_norm": 4.536365032196045, | |
| "learning_rate": 2.7443750000000003e-05, | |
| "loss": 1.4127, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.4525, | |
| "grad_norm": 3.964500904083252, | |
| "learning_rate": 2.738125e-05, | |
| "loss": 1.4182, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.45375, | |
| "grad_norm": 4.288209438323975, | |
| "learning_rate": 2.7318750000000003e-05, | |
| "loss": 1.2563, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 3.3023056983947754, | |
| "learning_rate": 2.725625e-05, | |
| "loss": 1.3547, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.45625, | |
| "grad_norm": 3.554124116897583, | |
| "learning_rate": 2.719375e-05, | |
| "loss": 1.4073, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.4575, | |
| "grad_norm": 3.063807725906372, | |
| "learning_rate": 2.7131250000000002e-05, | |
| "loss": 1.4365, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.45875, | |
| "grad_norm": 8.845410346984863, | |
| "learning_rate": 2.7068750000000004e-05, | |
| "loss": 1.2819, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.73734712600708, | |
| "learning_rate": 2.700625e-05, | |
| "loss": 1.2654, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.46125, | |
| "grad_norm": 3.227581262588501, | |
| "learning_rate": 2.694375e-05, | |
| "loss": 1.4912, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.4625, | |
| "grad_norm": 3.327014923095703, | |
| "learning_rate": 2.6881250000000003e-05, | |
| "loss": 1.2713, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.46375, | |
| "grad_norm": 4.055096626281738, | |
| "learning_rate": 2.6818750000000005e-05, | |
| "loss": 1.3, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 3.03869366645813, | |
| "learning_rate": 2.675625e-05, | |
| "loss": 1.213, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.46625, | |
| "grad_norm": 2.9507339000701904, | |
| "learning_rate": 2.6693750000000002e-05, | |
| "loss": 1.2247, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.4675, | |
| "grad_norm": 3.0396885871887207, | |
| "learning_rate": 2.6631250000000004e-05, | |
| "loss": 1.3832, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 3.567950963973999, | |
| "learning_rate": 2.6568750000000002e-05, | |
| "loss": 1.4424, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 3.8607802391052246, | |
| "learning_rate": 2.650625e-05, | |
| "loss": 1.3011, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.47125, | |
| "grad_norm": 4.078023433685303, | |
| "learning_rate": 2.644375e-05, | |
| "loss": 1.4831, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.4725, | |
| "grad_norm": 3.342250347137451, | |
| "learning_rate": 2.638125e-05, | |
| "loss": 1.3761, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.47375, | |
| "grad_norm": 2.861462354660034, | |
| "learning_rate": 2.6318750000000003e-05, | |
| "loss": 1.3057, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 4.583399772644043, | |
| "learning_rate": 2.6256249999999998e-05, | |
| "loss": 1.3889, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.47625, | |
| "grad_norm": 4.215075969696045, | |
| "learning_rate": 2.619375e-05, | |
| "loss": 1.3721, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.4775, | |
| "grad_norm": 4.96607780456543, | |
| "learning_rate": 2.613125e-05, | |
| "loss": 1.2088, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.47875, | |
| "grad_norm": 4.239419937133789, | |
| "learning_rate": 2.6068750000000003e-05, | |
| "loss": 1.3942, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.5754289627075195, | |
| "learning_rate": 2.600625e-05, | |
| "loss": 1.3388, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.48125, | |
| "grad_norm": 4.335231781005859, | |
| "learning_rate": 2.594375e-05, | |
| "loss": 1.3362, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.4825, | |
| "grad_norm": 3.0417494773864746, | |
| "learning_rate": 2.5881250000000002e-05, | |
| "loss": 1.2406, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.48375, | |
| "grad_norm": 3.8516695499420166, | |
| "learning_rate": 2.5818750000000004e-05, | |
| "loss": 1.4081, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 3.9781277179718018, | |
| "learning_rate": 2.5756250000000003e-05, | |
| "loss": 1.3636, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.48625, | |
| "grad_norm": 3.7591724395751953, | |
| "learning_rate": 2.569375e-05, | |
| "loss": 1.4053, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.4875, | |
| "grad_norm": 4.294608116149902, | |
| "learning_rate": 2.563125e-05, | |
| "loss": 1.4465, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.48875, | |
| "grad_norm": 2.709139108657837, | |
| "learning_rate": 2.556875e-05, | |
| "loss": 1.469, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.929773807525635, | |
| "learning_rate": 2.5506250000000003e-05, | |
| "loss": 1.2714, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.49125, | |
| "grad_norm": 3.6654865741729736, | |
| "learning_rate": 2.544375e-05, | |
| "loss": 1.4664, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.4925, | |
| "grad_norm": 3.2190186977386475, | |
| "learning_rate": 2.538125e-05, | |
| "loss": 1.4203, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.49375, | |
| "grad_norm": 6.021998882293701, | |
| "learning_rate": 2.5318750000000002e-05, | |
| "loss": 1.3446, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 4.783326148986816, | |
| "learning_rate": 2.5256250000000004e-05, | |
| "loss": 1.303, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.49625, | |
| "grad_norm": 4.282038688659668, | |
| "learning_rate": 2.519375e-05, | |
| "loss": 1.4116, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.4975, | |
| "grad_norm": 3.5954740047454834, | |
| "learning_rate": 2.513125e-05, | |
| "loss": 1.4053, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.49875, | |
| "grad_norm": 4.234996795654297, | |
| "learning_rate": 2.5068750000000003e-05, | |
| "loss": 1.3921, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.2734222412109375, | |
| "learning_rate": 2.5006250000000005e-05, | |
| "loss": 1.3818, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.50125, | |
| "grad_norm": 2.8631579875946045, | |
| "learning_rate": 2.4943750000000003e-05, | |
| "loss": 1.1995, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.5025, | |
| "grad_norm": 2.787076950073242, | |
| "learning_rate": 2.4881250000000002e-05, | |
| "loss": 1.4324, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.50375, | |
| "grad_norm": 5.550398826599121, | |
| "learning_rate": 2.481875e-05, | |
| "loss": 1.3689, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.505, | |
| "grad_norm": 3.542635679244995, | |
| "learning_rate": 2.475625e-05, | |
| "loss": 1.2885, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.50625, | |
| "grad_norm": 6.562772750854492, | |
| "learning_rate": 2.469375e-05, | |
| "loss": 1.3545, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.5075, | |
| "grad_norm": 4.4956889152526855, | |
| "learning_rate": 2.463125e-05, | |
| "loss": 1.3232, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.50875, | |
| "grad_norm": 5.023864269256592, | |
| "learning_rate": 2.456875e-05, | |
| "loss": 1.2954, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 3.3570520877838135, | |
| "learning_rate": 2.450625e-05, | |
| "loss": 1.2619, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.51125, | |
| "grad_norm": 3.6277055740356445, | |
| "learning_rate": 2.444375e-05, | |
| "loss": 1.4095, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.5125, | |
| "grad_norm": 3.271885871887207, | |
| "learning_rate": 2.438125e-05, | |
| "loss": 1.4898, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.51375, | |
| "grad_norm": 3.166588306427002, | |
| "learning_rate": 2.431875e-05, | |
| "loss": 1.2367, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.515, | |
| "grad_norm": 3.730806589126587, | |
| "learning_rate": 2.425625e-05, | |
| "loss": 1.2928, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.51625, | |
| "grad_norm": 3.6622071266174316, | |
| "learning_rate": 2.4193750000000002e-05, | |
| "loss": 1.3213, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.5175, | |
| "grad_norm": 3.9172027111053467, | |
| "learning_rate": 2.4131250000000004e-05, | |
| "loss": 1.262, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.51875, | |
| "grad_norm": 3.6153948307037354, | |
| "learning_rate": 2.4068750000000002e-05, | |
| "loss": 1.1961, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 3.5669710636138916, | |
| "learning_rate": 2.400625e-05, | |
| "loss": 1.286, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.52125, | |
| "grad_norm": 2.944169044494629, | |
| "learning_rate": 2.394375e-05, | |
| "loss": 1.1684, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.5225, | |
| "grad_norm": 5.035433769226074, | |
| "learning_rate": 2.388125e-05, | |
| "loss": 1.4153, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.52375, | |
| "grad_norm": 4.437448501586914, | |
| "learning_rate": 2.381875e-05, | |
| "loss": 1.3649, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 6.091770172119141, | |
| "learning_rate": 2.375625e-05, | |
| "loss": 1.3024, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.52625, | |
| "grad_norm": 2.8936169147491455, | |
| "learning_rate": 2.369375e-05, | |
| "loss": 1.3005, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.5275, | |
| "grad_norm": 3.831921100616455, | |
| "learning_rate": 2.3631250000000002e-05, | |
| "loss": 1.3541, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.52875, | |
| "grad_norm": 3.6951687335968018, | |
| "learning_rate": 2.356875e-05, | |
| "loss": 1.4483, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 3.1395816802978516, | |
| "learning_rate": 2.3506250000000002e-05, | |
| "loss": 1.368, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.53125, | |
| "grad_norm": 3.2226712703704834, | |
| "learning_rate": 2.344375e-05, | |
| "loss": 1.302, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.5325, | |
| "grad_norm": 4.4419660568237305, | |
| "learning_rate": 2.3381250000000003e-05, | |
| "loss": 1.3303, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.53375, | |
| "grad_norm": 3.2927405834198, | |
| "learning_rate": 2.331875e-05, | |
| "loss": 1.4167, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 4.213326454162598, | |
| "learning_rate": 2.3256250000000003e-05, | |
| "loss": 1.3116, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.53625, | |
| "grad_norm": 3.944117546081543, | |
| "learning_rate": 2.319375e-05, | |
| "loss": 1.3415, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.5375, | |
| "grad_norm": 5.39017391204834, | |
| "learning_rate": 2.3131250000000003e-05, | |
| "loss": 1.4055, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.53875, | |
| "grad_norm": 5.432854175567627, | |
| "learning_rate": 2.306875e-05, | |
| "loss": 1.3829, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 5.189695835113525, | |
| "learning_rate": 2.300625e-05, | |
| "loss": 1.3361, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.54125, | |
| "grad_norm": 6.880331993103027, | |
| "learning_rate": 2.294375e-05, | |
| "loss": 1.5419, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.5425, | |
| "grad_norm": 3.2537145614624023, | |
| "learning_rate": 2.288125e-05, | |
| "loss": 1.2588, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.54375, | |
| "grad_norm": 3.7062385082244873, | |
| "learning_rate": 2.281875e-05, | |
| "loss": 1.3066, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 3.2734427452087402, | |
| "learning_rate": 2.275625e-05, | |
| "loss": 1.2808, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.54625, | |
| "grad_norm": 3.4768989086151123, | |
| "learning_rate": 2.269375e-05, | |
| "loss": 1.3208, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.5475, | |
| "grad_norm": 4.7144670486450195, | |
| "learning_rate": 2.263125e-05, | |
| "loss": 1.4836, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.54875, | |
| "grad_norm": 3.88132905960083, | |
| "learning_rate": 2.2568750000000003e-05, | |
| "loss": 1.3068, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 3.8812150955200195, | |
| "learning_rate": 2.250625e-05, | |
| "loss": 1.2981, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.55125, | |
| "grad_norm": 5.712122440338135, | |
| "learning_rate": 2.2443750000000003e-05, | |
| "loss": 1.2052, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.5525, | |
| "grad_norm": 4.217561721801758, | |
| "learning_rate": 2.2381250000000002e-05, | |
| "loss": 1.3009, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.55375, | |
| "grad_norm": 3.853726863861084, | |
| "learning_rate": 2.2318750000000004e-05, | |
| "loss": 1.3033, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 4.639031410217285, | |
| "learning_rate": 2.2256250000000002e-05, | |
| "loss": 1.5494, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.55625, | |
| "grad_norm": 3.084345579147339, | |
| "learning_rate": 2.219375e-05, | |
| "loss": 1.3974, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.5575, | |
| "grad_norm": 3.7611162662506104, | |
| "learning_rate": 2.213125e-05, | |
| "loss": 1.3137, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.55875, | |
| "grad_norm": 3.6951828002929688, | |
| "learning_rate": 2.206875e-05, | |
| "loss": 1.3461, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 3.5445632934570312, | |
| "learning_rate": 2.200625e-05, | |
| "loss": 1.4106, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.56125, | |
| "grad_norm": 3.625247001647949, | |
| "learning_rate": 2.194375e-05, | |
| "loss": 1.231, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 4.577424049377441, | |
| "learning_rate": 2.188125e-05, | |
| "loss": 1.2473, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.56375, | |
| "grad_norm": 2.9018397331237793, | |
| "learning_rate": 2.1818750000000002e-05, | |
| "loss": 1.1463, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.565, | |
| "grad_norm": 3.3070101737976074, | |
| "learning_rate": 2.175625e-05, | |
| "loss": 1.3716, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.56625, | |
| "grad_norm": 2.9594733715057373, | |
| "learning_rate": 2.1693750000000002e-05, | |
| "loss": 1.3987, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.5675, | |
| "grad_norm": 4.008158206939697, | |
| "learning_rate": 2.163125e-05, | |
| "loss": 1.3694, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.56875, | |
| "grad_norm": 3.944383382797241, | |
| "learning_rate": 2.1568750000000002e-05, | |
| "loss": 1.2835, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 3.6331794261932373, | |
| "learning_rate": 2.150625e-05, | |
| "loss": 1.3091, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.57125, | |
| "grad_norm": 4.168713092803955, | |
| "learning_rate": 2.1443750000000003e-05, | |
| "loss": 1.5456, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.5725, | |
| "grad_norm": 3.31856369972229, | |
| "learning_rate": 2.138125e-05, | |
| "loss": 1.2628, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.57375, | |
| "grad_norm": 5.429656982421875, | |
| "learning_rate": 2.131875e-05, | |
| "loss": 1.1798, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 5.1332268714904785, | |
| "learning_rate": 2.1256249999999998e-05, | |
| "loss": 1.4308, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.57625, | |
| "grad_norm": 4.013575553894043, | |
| "learning_rate": 2.119375e-05, | |
| "loss": 1.1261, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.5775, | |
| "grad_norm": 3.2173519134521484, | |
| "learning_rate": 2.113125e-05, | |
| "loss": 1.3259, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.57875, | |
| "grad_norm": 4.379116058349609, | |
| "learning_rate": 2.106875e-05, | |
| "loss": 1.2595, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 3.4463205337524414, | |
| "learning_rate": 2.1006250000000002e-05, | |
| "loss": 1.3184, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.58125, | |
| "grad_norm": 4.147000312805176, | |
| "learning_rate": 2.094375e-05, | |
| "loss": 1.3683, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.5825, | |
| "grad_norm": 4.752554893493652, | |
| "learning_rate": 2.0881250000000003e-05, | |
| "loss": 1.4316, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.58375, | |
| "grad_norm": 3.6568074226379395, | |
| "learning_rate": 2.081875e-05, | |
| "loss": 1.3977, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.585, | |
| "grad_norm": 3.955928325653076, | |
| "learning_rate": 2.0756250000000003e-05, | |
| "loss": 1.3277, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.58625, | |
| "grad_norm": 3.560964345932007, | |
| "learning_rate": 2.069375e-05, | |
| "loss": 1.4668, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.5875, | |
| "grad_norm": 3.2746620178222656, | |
| "learning_rate": 2.0631250000000003e-05, | |
| "loss": 1.3007, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.58875, | |
| "grad_norm": 4.622394561767578, | |
| "learning_rate": 2.0568750000000002e-05, | |
| "loss": 1.2535, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 3.711751699447632, | |
| "learning_rate": 2.050625e-05, | |
| "loss": 1.3777, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.59125, | |
| "grad_norm": 4.493631362915039, | |
| "learning_rate": 2.044375e-05, | |
| "loss": 1.3629, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.5925, | |
| "grad_norm": 3.8427581787109375, | |
| "learning_rate": 2.038125e-05, | |
| "loss": 1.315, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.59375, | |
| "grad_norm": 3.4456775188446045, | |
| "learning_rate": 2.031875e-05, | |
| "loss": 1.2649, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.595, | |
| "grad_norm": 4.129278659820557, | |
| "learning_rate": 2.025625e-05, | |
| "loss": 1.1598, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.59625, | |
| "grad_norm": 3.5817270278930664, | |
| "learning_rate": 2.019375e-05, | |
| "loss": 1.3781, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.5975, | |
| "grad_norm": 3.882089138031006, | |
| "learning_rate": 2.013125e-05, | |
| "loss": 1.2973, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.59875, | |
| "grad_norm": 4.201085090637207, | |
| "learning_rate": 2.006875e-05, | |
| "loss": 1.3712, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 4.771631240844727, | |
| "learning_rate": 2.0006250000000002e-05, | |
| "loss": 1.3552, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.60125, | |
| "grad_norm": 3.188880205154419, | |
| "learning_rate": 1.994375e-05, | |
| "loss": 1.3319, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.6025, | |
| "grad_norm": 5.565931797027588, | |
| "learning_rate": 1.9881250000000002e-05, | |
| "loss": 1.453, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.60375, | |
| "grad_norm": 3.3195409774780273, | |
| "learning_rate": 1.981875e-05, | |
| "loss": 1.3035, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.605, | |
| "grad_norm": 4.334782123565674, | |
| "learning_rate": 1.9756250000000002e-05, | |
| "loss": 1.1937, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.60625, | |
| "grad_norm": 4.163855075836182, | |
| "learning_rate": 1.969375e-05, | |
| "loss": 1.2671, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.6075, | |
| "grad_norm": 4.8429975509643555, | |
| "learning_rate": 1.963125e-05, | |
| "loss": 1.4048, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.60875, | |
| "grad_norm": 3.570777177810669, | |
| "learning_rate": 1.9568749999999998e-05, | |
| "loss": 1.3251, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 4.26336669921875, | |
| "learning_rate": 1.950625e-05, | |
| "loss": 1.3698, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.61125, | |
| "grad_norm": 5.224381923675537, | |
| "learning_rate": 1.944375e-05, | |
| "loss": 1.4259, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.6125, | |
| "grad_norm": 3.4501774311065674, | |
| "learning_rate": 1.938125e-05, | |
| "loss": 1.3284, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.61375, | |
| "grad_norm": 4.366506099700928, | |
| "learning_rate": 1.9318750000000002e-05, | |
| "loss": 1.3818, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.615, | |
| "grad_norm": 3.7246909141540527, | |
| "learning_rate": 1.925625e-05, | |
| "loss": 1.2934, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.61625, | |
| "grad_norm": 4.3427348136901855, | |
| "learning_rate": 1.9193750000000002e-05, | |
| "loss": 1.377, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.6175, | |
| "grad_norm": 3.3694660663604736, | |
| "learning_rate": 1.913125e-05, | |
| "loss": 1.3416, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.61875, | |
| "grad_norm": 3.8398380279541016, | |
| "learning_rate": 1.9068750000000003e-05, | |
| "loss": 1.2429, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 3.3248672485351562, | |
| "learning_rate": 1.900625e-05, | |
| "loss": 1.1935, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.62125, | |
| "grad_norm": 4.807949066162109, | |
| "learning_rate": 1.8943750000000003e-05, | |
| "loss": 1.2884, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.6225, | |
| "grad_norm": 3.617875814437866, | |
| "learning_rate": 1.888125e-05, | |
| "loss": 1.3156, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.62375, | |
| "grad_norm": 3.635308265686035, | |
| "learning_rate": 1.881875e-05, | |
| "loss": 1.2937, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 4.459296703338623, | |
| "learning_rate": 1.8756250000000002e-05, | |
| "loss": 1.3306, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.62625, | |
| "grad_norm": 3.0861570835113525, | |
| "learning_rate": 1.869375e-05, | |
| "loss": 1.3047, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.6275, | |
| "grad_norm": 2.831782341003418, | |
| "learning_rate": 1.863125e-05, | |
| "loss": 1.2269, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.62875, | |
| "grad_norm": 3.3934264183044434, | |
| "learning_rate": 1.856875e-05, | |
| "loss": 1.3926, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 3.5797278881073, | |
| "learning_rate": 1.850625e-05, | |
| "loss": 1.3056, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.63125, | |
| "grad_norm": 3.5126845836639404, | |
| "learning_rate": 1.844375e-05, | |
| "loss": 1.2345, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.6325, | |
| "grad_norm": 3.1061553955078125, | |
| "learning_rate": 1.838125e-05, | |
| "loss": 1.3201, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.63375, | |
| "grad_norm": 2.7157793045043945, | |
| "learning_rate": 1.831875e-05, | |
| "loss": 1.2418, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.635, | |
| "grad_norm": 5.214048385620117, | |
| "learning_rate": 1.825625e-05, | |
| "loss": 1.4307, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.63625, | |
| "grad_norm": 3.1405720710754395, | |
| "learning_rate": 1.8193750000000002e-05, | |
| "loss": 1.1286, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.6375, | |
| "grad_norm": 3.6989457607269287, | |
| "learning_rate": 1.813125e-05, | |
| "loss": 1.204, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.63875, | |
| "grad_norm": 2.809293031692505, | |
| "learning_rate": 1.8068750000000002e-05, | |
| "loss": 1.2624, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 3.5766420364379883, | |
| "learning_rate": 1.800625e-05, | |
| "loss": 1.3483, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.64125, | |
| "grad_norm": 4.097106456756592, | |
| "learning_rate": 1.7943750000000002e-05, | |
| "loss": 1.4194, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.6425, | |
| "grad_norm": 3.991610288619995, | |
| "learning_rate": 1.788125e-05, | |
| "loss": 1.37, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.64375, | |
| "grad_norm": 3.1795196533203125, | |
| "learning_rate": 1.781875e-05, | |
| "loss": 1.355, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.645, | |
| "grad_norm": 4.34062385559082, | |
| "learning_rate": 1.775625e-05, | |
| "loss": 1.2885, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.64625, | |
| "grad_norm": 3.086254835128784, | |
| "learning_rate": 1.769375e-05, | |
| "loss": 1.3419, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.6475, | |
| "grad_norm": 2.817337989807129, | |
| "learning_rate": 1.763125e-05, | |
| "loss": 1.2352, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.64875, | |
| "grad_norm": 3.538144588470459, | |
| "learning_rate": 1.756875e-05, | |
| "loss": 1.4543, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 3.363987445831299, | |
| "learning_rate": 1.7506250000000002e-05, | |
| "loss": 1.2738, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.65125, | |
| "grad_norm": 3.583441734313965, | |
| "learning_rate": 1.744375e-05, | |
| "loss": 1.3378, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.6525, | |
| "grad_norm": 5.197504997253418, | |
| "learning_rate": 1.7381250000000002e-05, | |
| "loss": 1.2725, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.65375, | |
| "grad_norm": 3.349055528640747, | |
| "learning_rate": 1.731875e-05, | |
| "loss": 1.4023, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.655, | |
| "grad_norm": 3.489291191101074, | |
| "learning_rate": 1.7256250000000003e-05, | |
| "loss": 1.2932, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.65625, | |
| "grad_norm": 4.6650872230529785, | |
| "learning_rate": 1.719375e-05, | |
| "loss": 1.3444, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.6575, | |
| "grad_norm": 2.737346649169922, | |
| "learning_rate": 1.7131250000000003e-05, | |
| "loss": 1.2761, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.65875, | |
| "grad_norm": 2.2421553134918213, | |
| "learning_rate": 1.706875e-05, | |
| "loss": 1.3069, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 3.4570610523223877, | |
| "learning_rate": 1.700625e-05, | |
| "loss": 1.3267, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.66125, | |
| "grad_norm": 3.7983574867248535, | |
| "learning_rate": 1.694375e-05, | |
| "loss": 1.1993, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.6625, | |
| "grad_norm": 3.105295181274414, | |
| "learning_rate": 1.688125e-05, | |
| "loss": 1.2937, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.66375, | |
| "grad_norm": 3.038071870803833, | |
| "learning_rate": 1.681875e-05, | |
| "loss": 1.1928, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.665, | |
| "grad_norm": 3.6429975032806396, | |
| "learning_rate": 1.675625e-05, | |
| "loss": 1.2931, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.66625, | |
| "grad_norm": 3.5131030082702637, | |
| "learning_rate": 1.669375e-05, | |
| "loss": 1.3754, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.6675, | |
| "grad_norm": 2.8932530879974365, | |
| "learning_rate": 1.663125e-05, | |
| "loss": 1.2501, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.66875, | |
| "grad_norm": 3.6388654708862305, | |
| "learning_rate": 1.656875e-05, | |
| "loss": 1.2519, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.79237699508667, | |
| "learning_rate": 1.650625e-05, | |
| "loss": 1.261, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.67125, | |
| "grad_norm": 3.383009910583496, | |
| "learning_rate": 1.644375e-05, | |
| "loss": 1.371, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.6725, | |
| "grad_norm": 3.6595306396484375, | |
| "learning_rate": 1.6381250000000002e-05, | |
| "loss": 1.2767, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.67375, | |
| "grad_norm": 3.0263020992279053, | |
| "learning_rate": 1.6318750000000004e-05, | |
| "loss": 1.1588, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.675, | |
| "grad_norm": 4.379304885864258, | |
| "learning_rate": 1.6256250000000002e-05, | |
| "loss": 1.2933, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.67625, | |
| "grad_norm": 3.8622045516967773, | |
| "learning_rate": 1.619375e-05, | |
| "loss": 1.2027, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.6775, | |
| "grad_norm": 3.141866683959961, | |
| "learning_rate": 1.613125e-05, | |
| "loss": 1.2224, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.67875, | |
| "grad_norm": 6.858778476715088, | |
| "learning_rate": 1.606875e-05, | |
| "loss": 1.5491, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 5.642153263092041, | |
| "learning_rate": 1.600625e-05, | |
| "loss": 1.29, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.68125, | |
| "grad_norm": 3.560525894165039, | |
| "learning_rate": 1.594375e-05, | |
| "loss": 1.1829, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.6825, | |
| "grad_norm": 2.9443717002868652, | |
| "learning_rate": 1.588125e-05, | |
| "loss": 1.1874, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.68375, | |
| "grad_norm": 4.751156330108643, | |
| "learning_rate": 1.581875e-05, | |
| "loss": 1.2543, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.685, | |
| "grad_norm": 4.42818546295166, | |
| "learning_rate": 1.575625e-05, | |
| "loss": 1.3246, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.68625, | |
| "grad_norm": 3.2748584747314453, | |
| "learning_rate": 1.5693750000000002e-05, | |
| "loss": 1.3284, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.6875, | |
| "grad_norm": 3.11030912399292, | |
| "learning_rate": 1.563125e-05, | |
| "loss": 1.374, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.68875, | |
| "grad_norm": 3.4232919216156006, | |
| "learning_rate": 1.5568750000000002e-05, | |
| "loss": 1.2341, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 3.561033248901367, | |
| "learning_rate": 1.550625e-05, | |
| "loss": 1.3163, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.69125, | |
| "grad_norm": 3.259941816329956, | |
| "learning_rate": 1.5443750000000003e-05, | |
| "loss": 1.297, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.6925, | |
| "grad_norm": 4.584996223449707, | |
| "learning_rate": 1.538125e-05, | |
| "loss": 1.2904, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.69375, | |
| "grad_norm": 3.8913450241088867, | |
| "learning_rate": 1.531875e-05, | |
| "loss": 1.1971, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.695, | |
| "grad_norm": 2.977058172225952, | |
| "learning_rate": 1.525625e-05, | |
| "loss": 1.2937, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.69625, | |
| "grad_norm": 4.8936767578125, | |
| "learning_rate": 1.5193750000000002e-05, | |
| "loss": 1.2619, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.6975, | |
| "grad_norm": 3.7824175357818604, | |
| "learning_rate": 1.513125e-05, | |
| "loss": 1.2624, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.69875, | |
| "grad_norm": 3.056828498840332, | |
| "learning_rate": 1.506875e-05, | |
| "loss": 1.216, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.8399548530578613, | |
| "learning_rate": 1.5006249999999999e-05, | |
| "loss": 1.4088, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.70125, | |
| "grad_norm": 3.484537124633789, | |
| "learning_rate": 1.494375e-05, | |
| "loss": 1.4189, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.7025, | |
| "grad_norm": 3.8352763652801514, | |
| "learning_rate": 1.488125e-05, | |
| "loss": 1.2755, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.70375, | |
| "grad_norm": 5.0336785316467285, | |
| "learning_rate": 1.4818750000000001e-05, | |
| "loss": 1.2553, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.705, | |
| "grad_norm": 3.4178173542022705, | |
| "learning_rate": 1.4756250000000001e-05, | |
| "loss": 1.2476, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.70625, | |
| "grad_norm": 2.737694501876831, | |
| "learning_rate": 1.4693750000000001e-05, | |
| "loss": 1.2752, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.7075, | |
| "grad_norm": 4.246181964874268, | |
| "learning_rate": 1.4631250000000002e-05, | |
| "loss": 1.2681, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.70875, | |
| "grad_norm": 3.4891135692596436, | |
| "learning_rate": 1.456875e-05, | |
| "loss": 1.2727, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 5.688715934753418, | |
| "learning_rate": 1.4506250000000002e-05, | |
| "loss": 1.2608, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.71125, | |
| "grad_norm": 3.696350574493408, | |
| "learning_rate": 1.444375e-05, | |
| "loss": 1.2572, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.7125, | |
| "grad_norm": 2.7734687328338623, | |
| "learning_rate": 1.4381250000000002e-05, | |
| "loss": 1.2112, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.71375, | |
| "grad_norm": 3.8144760131835938, | |
| "learning_rate": 1.431875e-05, | |
| "loss": 1.1772, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.715, | |
| "grad_norm": 3.2144532203674316, | |
| "learning_rate": 1.4256250000000001e-05, | |
| "loss": 1.1894, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.71625, | |
| "grad_norm": 3.2612404823303223, | |
| "learning_rate": 1.419375e-05, | |
| "loss": 1.2391, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.7175, | |
| "grad_norm": 3.1684000492095947, | |
| "learning_rate": 1.4131250000000001e-05, | |
| "loss": 1.4017, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.71875, | |
| "grad_norm": 2.8268094062805176, | |
| "learning_rate": 1.406875e-05, | |
| "loss": 1.2547, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.3832662105560303, | |
| "learning_rate": 1.4006250000000002e-05, | |
| "loss": 1.3259, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.72125, | |
| "grad_norm": 4.300711154937744, | |
| "learning_rate": 1.394375e-05, | |
| "loss": 1.2589, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.7225, | |
| "grad_norm": 3.825169801712036, | |
| "learning_rate": 1.3881250000000002e-05, | |
| "loss": 1.299, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.72375, | |
| "grad_norm": 4.574991703033447, | |
| "learning_rate": 1.381875e-05, | |
| "loss": 1.2543, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.725, | |
| "grad_norm": 3.1067469120025635, | |
| "learning_rate": 1.375625e-05, | |
| "loss": 1.2156, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.72625, | |
| "grad_norm": 3.175403356552124, | |
| "learning_rate": 1.3693749999999999e-05, | |
| "loss": 1.1028, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.7275, | |
| "grad_norm": 3.0259435176849365, | |
| "learning_rate": 1.3631250000000001e-05, | |
| "loss": 1.2328, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.72875, | |
| "grad_norm": 3.6122825145721436, | |
| "learning_rate": 1.356875e-05, | |
| "loss": 1.2719, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 3.8099982738494873, | |
| "learning_rate": 1.3506250000000001e-05, | |
| "loss": 1.2776, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.73125, | |
| "grad_norm": 3.41595196723938, | |
| "learning_rate": 1.344375e-05, | |
| "loss": 1.3049, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.7325, | |
| "grad_norm": 3.9172563552856445, | |
| "learning_rate": 1.338125e-05, | |
| "loss": 1.259, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.73375, | |
| "grad_norm": 3.162015438079834, | |
| "learning_rate": 1.3318749999999998e-05, | |
| "loss": 1.2661, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.735, | |
| "grad_norm": 3.8498268127441406, | |
| "learning_rate": 1.325625e-05, | |
| "loss": 1.2737, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.73625, | |
| "grad_norm": 3.3922574520111084, | |
| "learning_rate": 1.3193750000000002e-05, | |
| "loss": 1.3626, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.7375, | |
| "grad_norm": 3.3294966220855713, | |
| "learning_rate": 1.313125e-05, | |
| "loss": 1.1615, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.73875, | |
| "grad_norm": 3.1682517528533936, | |
| "learning_rate": 1.3068750000000003e-05, | |
| "loss": 1.236, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 4.264817714691162, | |
| "learning_rate": 1.3006250000000001e-05, | |
| "loss": 1.409, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.74125, | |
| "grad_norm": 4.208788871765137, | |
| "learning_rate": 1.2943750000000001e-05, | |
| "loss": 1.315, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.7425, | |
| "grad_norm": 3.256880044937134, | |
| "learning_rate": 1.288125e-05, | |
| "loss": 1.1969, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.74375, | |
| "grad_norm": 3.2720675468444824, | |
| "learning_rate": 1.2818750000000002e-05, | |
| "loss": 1.1622, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.745, | |
| "grad_norm": 3.5530762672424316, | |
| "learning_rate": 1.275625e-05, | |
| "loss": 1.1997, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.74625, | |
| "grad_norm": 3.067060708999634, | |
| "learning_rate": 1.2693750000000002e-05, | |
| "loss": 1.251, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.7475, | |
| "grad_norm": 2.7820942401885986, | |
| "learning_rate": 1.263125e-05, | |
| "loss": 1.1509, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.74875, | |
| "grad_norm": 3.6107969284057617, | |
| "learning_rate": 1.256875e-05, | |
| "loss": 1.4085, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 6.036477565765381, | |
| "learning_rate": 1.250625e-05, | |
| "loss": 1.327, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.75125, | |
| "grad_norm": 6.355252742767334, | |
| "learning_rate": 1.2443750000000001e-05, | |
| "loss": 1.3521, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.7525, | |
| "grad_norm": 3.9743545055389404, | |
| "learning_rate": 1.2381250000000001e-05, | |
| "loss": 1.2671, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.75375, | |
| "grad_norm": 3.6817073822021484, | |
| "learning_rate": 1.2318750000000001e-05, | |
| "loss": 1.2116, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.755, | |
| "grad_norm": 3.35996413230896, | |
| "learning_rate": 1.2256250000000001e-05, | |
| "loss": 1.2384, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.75625, | |
| "grad_norm": 3.913332939147949, | |
| "learning_rate": 1.2193750000000002e-05, | |
| "loss": 1.1616, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.7575, | |
| "grad_norm": 3.327204465866089, | |
| "learning_rate": 1.213125e-05, | |
| "loss": 1.397, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.75875, | |
| "grad_norm": 3.389503240585327, | |
| "learning_rate": 1.206875e-05, | |
| "loss": 1.1839, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 3.358041286468506, | |
| "learning_rate": 1.200625e-05, | |
| "loss": 1.2082, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.76125, | |
| "grad_norm": 3.1447696685791016, | |
| "learning_rate": 1.194375e-05, | |
| "loss": 1.1562, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.7625, | |
| "grad_norm": 3.44541072845459, | |
| "learning_rate": 1.188125e-05, | |
| "loss": 1.3382, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.76375, | |
| "grad_norm": 3.0848042964935303, | |
| "learning_rate": 1.1818750000000001e-05, | |
| "loss": 1.3179, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.765, | |
| "grad_norm": 3.651977300643921, | |
| "learning_rate": 1.1756250000000001e-05, | |
| "loss": 1.2659, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.76625, | |
| "grad_norm": 3.152700662612915, | |
| "learning_rate": 1.1693750000000001e-05, | |
| "loss": 1.268, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.7675, | |
| "grad_norm": 4.158604145050049, | |
| "learning_rate": 1.163125e-05, | |
| "loss": 1.2345, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.76875, | |
| "grad_norm": 3.4470276832580566, | |
| "learning_rate": 1.156875e-05, | |
| "loss": 1.2949, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 3.150104284286499, | |
| "learning_rate": 1.150625e-05, | |
| "loss": 1.2278, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.77125, | |
| "grad_norm": 3.8590192794799805, | |
| "learning_rate": 1.144375e-05, | |
| "loss": 1.242, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.7725, | |
| "grad_norm": 4.324366092681885, | |
| "learning_rate": 1.138125e-05, | |
| "loss": 1.3226, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.77375, | |
| "grad_norm": 2.908385753631592, | |
| "learning_rate": 1.131875e-05, | |
| "loss": 1.2675, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.775, | |
| "grad_norm": 3.7354936599731445, | |
| "learning_rate": 1.1256250000000001e-05, | |
| "loss": 1.4641, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.77625, | |
| "grad_norm": 3.4952096939086914, | |
| "learning_rate": 1.119375e-05, | |
| "loss": 1.3276, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.7775, | |
| "grad_norm": 3.402109384536743, | |
| "learning_rate": 1.113125e-05, | |
| "loss": 1.1241, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.77875, | |
| "grad_norm": 3.307467460632324, | |
| "learning_rate": 1.106875e-05, | |
| "loss": 1.3648, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 3.728865146636963, | |
| "learning_rate": 1.100625e-05, | |
| "loss": 1.1877, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 3.682426691055298, | |
| "learning_rate": 1.094375e-05, | |
| "loss": 1.1554, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.7825, | |
| "grad_norm": 3.6852643489837646, | |
| "learning_rate": 1.0881250000000002e-05, | |
| "loss": 1.141, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.78375, | |
| "grad_norm": 4.276542663574219, | |
| "learning_rate": 1.081875e-05, | |
| "loss": 1.3658, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.785, | |
| "grad_norm": 3.071035385131836, | |
| "learning_rate": 1.075625e-05, | |
| "loss": 1.1546, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.78625, | |
| "grad_norm": 4.381916522979736, | |
| "learning_rate": 1.069375e-05, | |
| "loss": 1.1487, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.7875, | |
| "grad_norm": 3.135784864425659, | |
| "learning_rate": 1.0631250000000001e-05, | |
| "loss": 1.2275, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.78875, | |
| "grad_norm": 3.404214859008789, | |
| "learning_rate": 1.0568750000000001e-05, | |
| "loss": 1.1391, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.272989749908447, | |
| "learning_rate": 1.0506250000000001e-05, | |
| "loss": 1.3467, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.79125, | |
| "grad_norm": 3.6154932975769043, | |
| "learning_rate": 1.0443750000000001e-05, | |
| "loss": 1.2993, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.7925, | |
| "grad_norm": 3.6855859756469727, | |
| "learning_rate": 1.038125e-05, | |
| "loss": 1.1339, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.79375, | |
| "grad_norm": 3.8834280967712402, | |
| "learning_rate": 1.031875e-05, | |
| "loss": 1.2207, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.795, | |
| "grad_norm": 4.357527256011963, | |
| "learning_rate": 1.025625e-05, | |
| "loss": 1.266, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.79625, | |
| "grad_norm": 4.210048198699951, | |
| "learning_rate": 1.019375e-05, | |
| "loss": 1.2289, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.7975, | |
| "grad_norm": 5.4694647789001465, | |
| "learning_rate": 1.013125e-05, | |
| "loss": 1.1857, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.79875, | |
| "grad_norm": 3.1147310733795166, | |
| "learning_rate": 1.006875e-05, | |
| "loss": 1.3103, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.2928709983825684, | |
| "learning_rate": 1.0006250000000001e-05, | |
| "loss": 1.3642, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.80125, | |
| "grad_norm": 3.124969244003296, | |
| "learning_rate": 9.94375e-06, | |
| "loss": 1.2237, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.8025, | |
| "grad_norm": 4.003876686096191, | |
| "learning_rate": 9.88125e-06, | |
| "loss": 1.1267, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.80375, | |
| "grad_norm": 3.974295139312744, | |
| "learning_rate": 9.81875e-06, | |
| "loss": 1.1949, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.805, | |
| "grad_norm": 3.8797054290771484, | |
| "learning_rate": 9.75625e-06, | |
| "loss": 1.2467, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.80625, | |
| "grad_norm": 3.5032153129577637, | |
| "learning_rate": 9.69375e-06, | |
| "loss": 1.1921, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.8075, | |
| "grad_norm": 3.413506269454956, | |
| "learning_rate": 9.63125e-06, | |
| "loss": 1.2359, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.80875, | |
| "grad_norm": 3.894541025161743, | |
| "learning_rate": 9.56875e-06, | |
| "loss": 1.3246, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 3.6062092781066895, | |
| "learning_rate": 9.50625e-06, | |
| "loss": 1.2462, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.81125, | |
| "grad_norm": 3.027904510498047, | |
| "learning_rate": 9.44375e-06, | |
| "loss": 1.1628, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.8125, | |
| "grad_norm": 3.1570937633514404, | |
| "learning_rate": 9.38125e-06, | |
| "loss": 1.1969, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.81375, | |
| "grad_norm": 2.943943977355957, | |
| "learning_rate": 9.318750000000001e-06, | |
| "loss": 1.2753, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.815, | |
| "grad_norm": 3.1280055046081543, | |
| "learning_rate": 9.256250000000001e-06, | |
| "loss": 1.1306, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.81625, | |
| "grad_norm": 4.8054399490356445, | |
| "learning_rate": 9.193750000000002e-06, | |
| "loss": 1.4028, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.8175, | |
| "grad_norm": 3.8370373249053955, | |
| "learning_rate": 9.131250000000002e-06, | |
| "loss": 1.3408, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.81875, | |
| "grad_norm": 3.5859014987945557, | |
| "learning_rate": 9.06875e-06, | |
| "loss": 1.2043, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 3.415255069732666, | |
| "learning_rate": 9.00625e-06, | |
| "loss": 1.2217, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.82125, | |
| "grad_norm": 3.6743905544281006, | |
| "learning_rate": 8.94375e-06, | |
| "loss": 1.1703, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.8225, | |
| "grad_norm": 2.7219078540802, | |
| "learning_rate": 8.88125e-06, | |
| "loss": 1.2929, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.82375, | |
| "grad_norm": 2.926729679107666, | |
| "learning_rate": 8.818750000000001e-06, | |
| "loss": 1.371, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.825, | |
| "grad_norm": 3.4970710277557373, | |
| "learning_rate": 8.756250000000001e-06, | |
| "loss": 1.1985, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.82625, | |
| "grad_norm": 3.802090644836426, | |
| "learning_rate": 8.693750000000001e-06, | |
| "loss": 1.2843, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.8275, | |
| "grad_norm": 3.0677645206451416, | |
| "learning_rate": 8.63125e-06, | |
| "loss": 1.2791, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.82875, | |
| "grad_norm": 3.903545379638672, | |
| "learning_rate": 8.56875e-06, | |
| "loss": 1.1844, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 3.0674688816070557, | |
| "learning_rate": 8.50625e-06, | |
| "loss": 1.2111, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.83125, | |
| "grad_norm": 3.7008626461029053, | |
| "learning_rate": 8.44375e-06, | |
| "loss": 1.2203, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.8325, | |
| "grad_norm": 4.41727352142334, | |
| "learning_rate": 8.38125e-06, | |
| "loss": 1.3227, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.83375, | |
| "grad_norm": 3.456376075744629, | |
| "learning_rate": 8.31875e-06, | |
| "loss": 1.2385, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 0.835, | |
| "grad_norm": 4.11112117767334, | |
| "learning_rate": 8.25625e-06, | |
| "loss": 1.3041, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.83625, | |
| "grad_norm": 2.9469385147094727, | |
| "learning_rate": 8.19375e-06, | |
| "loss": 1.2004, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 0.8375, | |
| "grad_norm": 3.8115038871765137, | |
| "learning_rate": 8.13125e-06, | |
| "loss": 1.2884, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.83875, | |
| "grad_norm": 3.3773956298828125, | |
| "learning_rate": 8.06875e-06, | |
| "loss": 1.0463, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 3.9320790767669678, | |
| "learning_rate": 8.00625e-06, | |
| "loss": 1.2637, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.84125, | |
| "grad_norm": 3.310523271560669, | |
| "learning_rate": 7.94375e-06, | |
| "loss": 1.3186, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 0.8425, | |
| "grad_norm": 4.7285637855529785, | |
| "learning_rate": 7.88125e-06, | |
| "loss": 1.0737, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 0.84375, | |
| "grad_norm": 4.357003688812256, | |
| "learning_rate": 7.81875e-06, | |
| "loss": 1.2484, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.845, | |
| "grad_norm": 6.129393100738525, | |
| "learning_rate": 7.75625e-06, | |
| "loss": 1.2047, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 0.84625, | |
| "grad_norm": 3.644521713256836, | |
| "learning_rate": 7.69375e-06, | |
| "loss": 1.2953, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 0.8475, | |
| "grad_norm": 3.131911516189575, | |
| "learning_rate": 7.63125e-06, | |
| "loss": 1.167, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 0.84875, | |
| "grad_norm": 3.4802305698394775, | |
| "learning_rate": 7.568750000000001e-06, | |
| "loss": 1.3353, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 4.78393030166626, | |
| "learning_rate": 7.506250000000001e-06, | |
| "loss": 1.2363, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.85125, | |
| "grad_norm": 3.6145455837249756, | |
| "learning_rate": 7.4437500000000005e-06, | |
| "loss": 1.1944, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 0.8525, | |
| "grad_norm": 2.614313840866089, | |
| "learning_rate": 7.381250000000001e-06, | |
| "loss": 1.2045, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 0.85375, | |
| "grad_norm": 2.96185564994812, | |
| "learning_rate": 7.318750000000001e-06, | |
| "loss": 1.103, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 0.855, | |
| "grad_norm": 6.062429428100586, | |
| "learning_rate": 7.25625e-06, | |
| "loss": 1.2446, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 0.85625, | |
| "grad_norm": 3.7602720260620117, | |
| "learning_rate": 7.19375e-06, | |
| "loss": 1.2243, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.8575, | |
| "grad_norm": 3.405805826187134, | |
| "learning_rate": 7.1312500000000005e-06, | |
| "loss": 1.1315, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 0.85875, | |
| "grad_norm": 3.2969017028808594, | |
| "learning_rate": 7.068750000000001e-06, | |
| "loss": 1.1796, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 4.001396179199219, | |
| "learning_rate": 7.00625e-06, | |
| "loss": 1.3448, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 0.86125, | |
| "grad_norm": 4.414263725280762, | |
| "learning_rate": 6.94375e-06, | |
| "loss": 1.2219, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 0.8625, | |
| "grad_norm": 3.5935354232788086, | |
| "learning_rate": 6.88125e-06, | |
| "loss": 1.1148, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.86375, | |
| "grad_norm": 3.3420941829681396, | |
| "learning_rate": 6.81875e-06, | |
| "loss": 1.2654, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 0.865, | |
| "grad_norm": 4.038907527923584, | |
| "learning_rate": 6.75625e-06, | |
| "loss": 1.25, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 0.86625, | |
| "grad_norm": 4.007872581481934, | |
| "learning_rate": 6.69375e-06, | |
| "loss": 1.3026, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 0.8675, | |
| "grad_norm": 3.9287617206573486, | |
| "learning_rate": 6.63125e-06, | |
| "loss": 1.1596, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 0.86875, | |
| "grad_norm": 2.911954402923584, | |
| "learning_rate": 6.56875e-06, | |
| "loss": 1.281, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 3.543391704559326, | |
| "learning_rate": 6.50625e-06, | |
| "loss": 1.268, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 0.87125, | |
| "grad_norm": 2.627988815307617, | |
| "learning_rate": 6.44375e-06, | |
| "loss": 1.2706, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 0.8725, | |
| "grad_norm": 3.9405062198638916, | |
| "learning_rate": 6.38125e-06, | |
| "loss": 1.2318, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 0.87375, | |
| "grad_norm": 5.287662506103516, | |
| "learning_rate": 6.3187499999999994e-06, | |
| "loss": 1.3652, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 3.4925477504730225, | |
| "learning_rate": 6.25625e-06, | |
| "loss": 1.3112, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.87625, | |
| "grad_norm": 4.140984058380127, | |
| "learning_rate": 6.193750000000001e-06, | |
| "loss": 1.1956, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 0.8775, | |
| "grad_norm": 5.385605335235596, | |
| "learning_rate": 6.13125e-06, | |
| "loss": 1.2959, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 0.87875, | |
| "grad_norm": 3.0660688877105713, | |
| "learning_rate": 6.06875e-06, | |
| "loss": 1.2465, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 6.002776622772217, | |
| "learning_rate": 6.00625e-06, | |
| "loss": 1.4034, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.88125, | |
| "grad_norm": 3.455930471420288, | |
| "learning_rate": 5.94375e-06, | |
| "loss": 1.2638, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.8825, | |
| "grad_norm": 4.8663330078125, | |
| "learning_rate": 5.88125e-06, | |
| "loss": 1.241, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 0.88375, | |
| "grad_norm": 3.752796173095703, | |
| "learning_rate": 5.818750000000001e-06, | |
| "loss": 1.2547, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 0.885, | |
| "grad_norm": 3.7783405780792236, | |
| "learning_rate": 5.75625e-06, | |
| "loss": 1.2531, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 0.88625, | |
| "grad_norm": 4.519382953643799, | |
| "learning_rate": 5.69375e-06, | |
| "loss": 1.2299, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 0.8875, | |
| "grad_norm": 3.280031442642212, | |
| "learning_rate": 5.6312500000000005e-06, | |
| "loss": 1.1249, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.88875, | |
| "grad_norm": 3.4675002098083496, | |
| "learning_rate": 5.568750000000001e-06, | |
| "loss": 1.1638, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 3.768967390060425, | |
| "learning_rate": 5.50625e-06, | |
| "loss": 1.3985, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 0.89125, | |
| "grad_norm": 4.724802017211914, | |
| "learning_rate": 5.44375e-06, | |
| "loss": 1.2324, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 0.8925, | |
| "grad_norm": 4.179074287414551, | |
| "learning_rate": 5.38125e-06, | |
| "loss": 1.1886, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 0.89375, | |
| "grad_norm": 4.108990669250488, | |
| "learning_rate": 5.3187500000000005e-06, | |
| "loss": 1.4765, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.895, | |
| "grad_norm": 4.176460266113281, | |
| "learning_rate": 5.25625e-06, | |
| "loss": 1.3531, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 0.89625, | |
| "grad_norm": 3.8916871547698975, | |
| "learning_rate": 5.19375e-06, | |
| "loss": 1.1835, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 0.8975, | |
| "grad_norm": 3.0018982887268066, | |
| "learning_rate": 5.13125e-06, | |
| "loss": 1.2182, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 0.89875, | |
| "grad_norm": 2.873230218887329, | |
| "learning_rate": 5.06875e-06, | |
| "loss": 1.2015, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 3.587171792984009, | |
| "learning_rate": 5.0062500000000006e-06, | |
| "loss": 1.1995, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.90125, | |
| "grad_norm": 3.585000514984131, | |
| "learning_rate": 4.943750000000001e-06, | |
| "loss": 1.2743, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 0.9025, | |
| "grad_norm": 3.029139757156372, | |
| "learning_rate": 4.88125e-06, | |
| "loss": 1.2592, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 0.90375, | |
| "grad_norm": 3.077986240386963, | |
| "learning_rate": 4.81875e-06, | |
| "loss": 1.1442, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 0.905, | |
| "grad_norm": 2.580195426940918, | |
| "learning_rate": 4.75625e-06, | |
| "loss": 1.1531, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 0.90625, | |
| "grad_norm": 4.1036272048950195, | |
| "learning_rate": 4.693750000000001e-06, | |
| "loss": 1.3263, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.9075, | |
| "grad_norm": 2.9886510372161865, | |
| "learning_rate": 4.63125e-06, | |
| "loss": 1.2372, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.90875, | |
| "grad_norm": 3.5167458057403564, | |
| "learning_rate": 4.56875e-06, | |
| "loss": 1.3199, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.5231704711914062, | |
| "learning_rate": 4.50625e-06, | |
| "loss": 1.177, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.91125, | |
| "grad_norm": 3.786348581314087, | |
| "learning_rate": 4.44375e-06, | |
| "loss": 1.1602, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 0.9125, | |
| "grad_norm": 3.3437860012054443, | |
| "learning_rate": 4.38125e-06, | |
| "loss": 1.266, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.91375, | |
| "grad_norm": 3.798862934112549, | |
| "learning_rate": 4.31875e-06, | |
| "loss": 1.1616, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 0.915, | |
| "grad_norm": 3.9102814197540283, | |
| "learning_rate": 4.25625e-06, | |
| "loss": 1.1375, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.91625, | |
| "grad_norm": 3.7560813426971436, | |
| "learning_rate": 4.19375e-06, | |
| "loss": 1.1178, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 0.9175, | |
| "grad_norm": 3.210345983505249, | |
| "learning_rate": 4.1312500000000005e-06, | |
| "loss": 1.0262, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.91875, | |
| "grad_norm": 3.9245500564575195, | |
| "learning_rate": 4.068750000000001e-06, | |
| "loss": 1.2043, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 3.6340880393981934, | |
| "learning_rate": 4.00625e-06, | |
| "loss": 1.2074, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.92125, | |
| "grad_norm": 2.729948043823242, | |
| "learning_rate": 3.94375e-06, | |
| "loss": 1.2154, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 0.9225, | |
| "grad_norm": 5.572306156158447, | |
| "learning_rate": 3.88125e-06, | |
| "loss": 1.2146, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.92375, | |
| "grad_norm": 3.3658738136291504, | |
| "learning_rate": 3.8187500000000005e-06, | |
| "loss": 1.2932, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 0.925, | |
| "grad_norm": 3.831812620162964, | |
| "learning_rate": 3.7562500000000002e-06, | |
| "loss": 1.276, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.92625, | |
| "grad_norm": 2.969461441040039, | |
| "learning_rate": 3.69375e-06, | |
| "loss": 1.1595, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 0.9275, | |
| "grad_norm": 4.540526390075684, | |
| "learning_rate": 3.6312499999999997e-06, | |
| "loss": 1.2967, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.92875, | |
| "grad_norm": 4.625833988189697, | |
| "learning_rate": 3.56875e-06, | |
| "loss": 1.1621, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 3.144590139389038, | |
| "learning_rate": 3.5062500000000005e-06, | |
| "loss": 1.1817, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.93125, | |
| "grad_norm": 4.031451225280762, | |
| "learning_rate": 3.4437500000000003e-06, | |
| "loss": 1.3139, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.9325, | |
| "grad_norm": 3.656533718109131, | |
| "learning_rate": 3.3812500000000004e-06, | |
| "loss": 1.2257, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.93375, | |
| "grad_norm": 3.497356653213501, | |
| "learning_rate": 3.31875e-06, | |
| "loss": 1.2571, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 0.935, | |
| "grad_norm": 3.372255802154541, | |
| "learning_rate": 3.2562500000000004e-06, | |
| "loss": 1.2038, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.93625, | |
| "grad_norm": 3.349648952484131, | |
| "learning_rate": 3.19375e-06, | |
| "loss": 1.2484, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 3.791111469268799, | |
| "learning_rate": 3.1312500000000003e-06, | |
| "loss": 1.2193, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.93875, | |
| "grad_norm": 2.6422860622406006, | |
| "learning_rate": 3.06875e-06, | |
| "loss": 1.1635, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 4.382972240447998, | |
| "learning_rate": 3.0062500000000002e-06, | |
| "loss": 1.2232, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.94125, | |
| "grad_norm": 3.4872641563415527, | |
| "learning_rate": 2.94375e-06, | |
| "loss": 1.1796, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 0.9425, | |
| "grad_norm": 3.1682002544403076, | |
| "learning_rate": 2.88125e-06, | |
| "loss": 1.2348, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.94375, | |
| "grad_norm": 3.578993558883667, | |
| "learning_rate": 2.8187500000000003e-06, | |
| "loss": 1.2451, | |
| "step": 7550 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1593221075369984e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |