{
"best_global_step": 175000,
"best_metric": 0.0006260189693421125,
"best_model_checkpoint": "/data/bozos/models/f8d245da3b0d0e66db4c97688fe67d8c31303d4f662c4b64e5da18eb8964c893/checkpoints/checkpoint-175000",
"epoch": 4.08,
"eval_steps": 5000,
"global_step": 255000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 2.213568925857544,
"learning_rate": 0.00022758072642650628,
"loss": 1.732,
"step": 1000
},
{
"epoch": 0.032,
"grad_norm": 0.6694766879081726,
"learning_rate": 0.00022757708507668012,
"loss": 0.4826,
"step": 2000
},
{
"epoch": 0.048,
"grad_norm": 2.43005633354187,
"learning_rate": 0.00022757344372685396,
"loss": 0.1817,
"step": 3000
},
{
"epoch": 0.064,
"grad_norm": 0.4847993850708008,
"learning_rate": 0.0002275698023770278,
"loss": 0.1264,
"step": 4000
},
{
"epoch": 0.08,
"grad_norm": 0.162339448928833,
"learning_rate": 0.00022756616102720165,
"loss": 0.0816,
"step": 5000
},
{
"epoch": 0.08,
"eval_loss": 0.05317778140306473,
"eval_runtime": 27.3774,
"eval_samples_per_second": 36.526,
"eval_steps_per_second": 4.566,
"step": 5000
},
{
"epoch": 0.096,
"grad_norm": 0.13133646547794342,
"learning_rate": 0.0002275625196773755,
"loss": 0.0531,
"step": 6000
},
{
"epoch": 0.112,
"grad_norm": 0.16350027918815613,
"learning_rate": 0.00022755887832754933,
"loss": 0.0471,
"step": 7000
},
{
"epoch": 0.128,
"grad_norm": 0.15041467547416687,
"learning_rate": 0.00022755523697772317,
"loss": 0.0303,
"step": 8000
},
{
"epoch": 0.144,
"grad_norm": 0.22431999444961548,
"learning_rate": 0.000227551595627897,
"loss": 0.0213,
"step": 9000
},
{
"epoch": 0.16,
"grad_norm": 0.09702177345752716,
"learning_rate": 0.00022754795427807082,
"loss": 0.016,
"step": 10000
},
{
"epoch": 0.16,
"eval_loss": 0.011971595697104931,
"eval_runtime": 27.2488,
"eval_samples_per_second": 36.699,
"eval_steps_per_second": 4.587,
"step": 10000
},
{
"epoch": 0.176,
"grad_norm": 0.22383971512317657,
"learning_rate": 0.0002275443129282447,
"loss": 0.0182,
"step": 11000
},
{
"epoch": 0.192,
"grad_norm": 0.10483340919017792,
"learning_rate": 0.0002275406715784185,
"loss": 0.0099,
"step": 12000
},
{
"epoch": 0.208,
"grad_norm": 0.06974471360445023,
"learning_rate": 0.00022753703022859238,
"loss": 0.0102,
"step": 13000
},
{
"epoch": 0.224,
"grad_norm": 0.08179257810115814,
"learning_rate": 0.0002275333888787662,
"loss": 0.0095,
"step": 14000
},
{
"epoch": 0.24,
"grad_norm": 0.06491447985172272,
"learning_rate": 0.00022752974752894006,
"loss": 0.0108,
"step": 15000
},
{
"epoch": 0.24,
"eval_loss": 0.007058731280267239,
"eval_runtime": 27.4538,
"eval_samples_per_second": 36.425,
"eval_steps_per_second": 4.553,
"step": 15000
},
{
"epoch": 0.256,
"grad_norm": 0.07762598991394043,
"learning_rate": 0.00022752610617911387,
"loss": 0.0083,
"step": 16000
},
{
"epoch": 0.272,
"grad_norm": 0.09719238430261612,
"learning_rate": 0.00022752246482928774,
"loss": 0.0088,
"step": 17000
},
{
"epoch": 0.288,
"grad_norm": 0.0923658162355423,
"learning_rate": 0.00022751882347946155,
"loss": 0.0058,
"step": 18000
},
{
"epoch": 0.304,
"grad_norm": 0.07014696300029755,
"learning_rate": 0.00022751518212963542,
"loss": 0.0075,
"step": 19000
},
{
"epoch": 0.32,
"grad_norm": 0.060413043946027756,
"learning_rate": 0.00022751154077980924,
"loss": 0.006,
"step": 20000
},
{
"epoch": 0.32,
"eval_loss": 0.004977255128324032,
"eval_runtime": 27.1268,
"eval_samples_per_second": 36.864,
"eval_steps_per_second": 4.608,
"step": 20000
},
{
"epoch": 0.336,
"grad_norm": 0.07979925721883774,
"learning_rate": 0.0002275078994299831,
"loss": 0.0074,
"step": 21000
},
{
"epoch": 0.352,
"grad_norm": 0.07468965649604797,
"learning_rate": 0.00022750425808015692,
"loss": 0.0061,
"step": 22000
},
{
"epoch": 0.368,
"grad_norm": 0.11379896104335785,
"learning_rate": 0.00022750061673033079,
"loss": 0.0066,
"step": 23000
},
{
"epoch": 0.384,
"grad_norm": 0.14029712975025177,
"learning_rate": 0.0002274969753805046,
"loss": 0.0048,
"step": 24000
},
{
"epoch": 0.4,
"grad_norm": 0.025649070739746094,
"learning_rate": 0.00022749333403067844,
"loss": 0.0062,
"step": 25000
},
{
"epoch": 0.4,
"eval_loss": 0.00377740734256804,
"eval_runtime": 27.2272,
"eval_samples_per_second": 36.728,
"eval_steps_per_second": 4.591,
"step": 25000
},
{
"epoch": 0.416,
"grad_norm": 0.07835651934146881,
"learning_rate": 0.00022748969268085228,
"loss": 0.0057,
"step": 26000
},
{
"epoch": 0.432,
"grad_norm": 0.037621937692165375,
"learning_rate": 0.00022748605133102612,
"loss": 0.0043,
"step": 27000
},
{
"epoch": 0.448,
"grad_norm": 0.05530184134840965,
"learning_rate": 0.00022748240998119996,
"loss": 0.0078,
"step": 28000
},
{
"epoch": 0.464,
"grad_norm": 0.2537539601325989,
"learning_rate": 0.0002274787686313738,
"loss": 0.004,
"step": 29000
},
{
"epoch": 0.48,
"grad_norm": 0.08855901658535004,
"learning_rate": 0.00022747512728154765,
"loss": 0.0055,
"step": 30000
},
{
"epoch": 0.48,
"eval_loss": 0.00407881336286664,
"eval_runtime": 27.3314,
"eval_samples_per_second": 36.588,
"eval_steps_per_second": 4.573,
"step": 30000
},
{
"epoch": 0.496,
"grad_norm": 0.01860993541777134,
"learning_rate": 0.0002274714859317215,
"loss": 0.0044,
"step": 31000
},
{
"epoch": 0.512,
"grad_norm": 0.030549678951501846,
"learning_rate": 0.00022746784458189533,
"loss": 0.0062,
"step": 32000
},
{
"epoch": 0.528,
"grad_norm": 0.07974190264940262,
"learning_rate": 0.00022746420323206917,
"loss": 0.0044,
"step": 33000
},
{
"epoch": 0.544,
"grad_norm": 0.07146530598402023,
"learning_rate": 0.000227460561882243,
"loss": 0.0033,
"step": 34000
},
{
"epoch": 0.56,
"grad_norm": 0.03786474093794823,
"learning_rate": 0.00022745692053241685,
"loss": 0.0064,
"step": 35000
},
{
"epoch": 0.56,
"eval_loss": 0.002913910197094083,
"eval_runtime": 27.2895,
"eval_samples_per_second": 36.644,
"eval_steps_per_second": 4.581,
"step": 35000
},
{
"epoch": 0.576,
"grad_norm": 1.5708693265914917,
"learning_rate": 0.0002274532791825907,
"loss": 0.0048,
"step": 36000
},
{
"epoch": 0.592,
"grad_norm": 0.04259568825364113,
"learning_rate": 0.0002274496378327645,
"loss": 0.0027,
"step": 37000
},
{
"epoch": 0.608,
"grad_norm": 0.029481125995516777,
"learning_rate": 0.00022744599648293838,
"loss": 0.0049,
"step": 38000
},
{
"epoch": 0.624,
"grad_norm": 0.3993789553642273,
"learning_rate": 0.0002274423551331122,
"loss": 0.0048,
"step": 39000
},
{
"epoch": 0.64,
"grad_norm": 0.03810903802514076,
"learning_rate": 0.00022743871378328606,
"loss": 0.0027,
"step": 40000
},
{
"epoch": 0.64,
"eval_loss": 0.00209135003387928,
"eval_runtime": 27.0955,
"eval_samples_per_second": 36.906,
"eval_steps_per_second": 4.613,
"step": 40000
},
{
"epoch": 0.656,
"grad_norm": 0.027073705568909645,
"learning_rate": 0.00022743507243345987,
"loss": 0.0033,
"step": 41000
},
{
"epoch": 0.672,
"grad_norm": 0.04906334728002548,
"learning_rate": 0.00022743143108363374,
"loss": 0.0033,
"step": 42000
},
{
"epoch": 0.688,
"grad_norm": 0.05806988850235939,
"learning_rate": 0.00022742778973380755,
"loss": 0.0039,
"step": 43000
},
{
"epoch": 0.704,
"grad_norm": 0.022845715284347534,
"learning_rate": 0.00022742414838398142,
"loss": 0.0031,
"step": 44000
},
{
"epoch": 0.72,
"grad_norm": 0.06443994492292404,
"learning_rate": 0.00022742050703415524,
"loss": 0.0027,
"step": 45000
},
{
"epoch": 0.72,
"eval_loss": 0.002537691965699196,
"eval_runtime": 27.3857,
"eval_samples_per_second": 36.515,
"eval_steps_per_second": 4.564,
"step": 45000
},
{
"epoch": 0.736,
"grad_norm": 0.1143941730260849,
"learning_rate": 0.0002274168656843291,
"loss": 0.0034,
"step": 46000
},
{
"epoch": 0.752,
"grad_norm": 0.04524613544344902,
"learning_rate": 0.00022741322433450292,
"loss": 0.0034,
"step": 47000
},
{
"epoch": 0.768,
"grad_norm": 0.027965977787971497,
"learning_rate": 0.00022740958298467676,
"loss": 0.0031,
"step": 48000
},
{
"epoch": 0.784,
"grad_norm": 0.033201005309820175,
"learning_rate": 0.0002274059416348506,
"loss": 0.0032,
"step": 49000
},
{
"epoch": 0.8,
"grad_norm": 0.11329031735658646,
"learning_rate": 0.00022740230028502444,
"loss": 0.0034,
"step": 50000
},
{
"epoch": 0.8,
"eval_loss": 0.00293481582775712,
"eval_runtime": 27.0954,
"eval_samples_per_second": 36.907,
"eval_steps_per_second": 4.613,
"step": 50000
},
{
"epoch": 0.816,
"grad_norm": 0.08998037129640579,
"learning_rate": 0.00022739865893519828,
"loss": 0.003,
"step": 51000
},
{
"epoch": 0.832,
"grad_norm": 0.034506551921367645,
"learning_rate": 0.00022739501758537212,
"loss": 0.0039,
"step": 52000
},
{
"epoch": 0.848,
"grad_norm": 0.10205531865358353,
"learning_rate": 0.00022739137623554596,
"loss": 0.0031,
"step": 53000
},
{
"epoch": 0.864,
"grad_norm": 0.016757190227508545,
"learning_rate": 0.0002273877348857198,
"loss": 0.0024,
"step": 54000
},
{
"epoch": 0.88,
"grad_norm": 0.038212958723306656,
"learning_rate": 0.00022738409353589365,
"loss": 0.0035,
"step": 55000
},
{
"epoch": 0.88,
"eval_loss": 0.003101126756519079,
"eval_runtime": 27.2079,
"eval_samples_per_second": 36.754,
"eval_steps_per_second": 4.594,
"step": 55000
},
{
"epoch": 0.896,
"grad_norm": 0.20447635650634766,
"learning_rate": 0.0002273804521860675,
"loss": 0.0029,
"step": 56000
},
{
"epoch": 0.912,
"grad_norm": 0.029786735773086548,
"learning_rate": 0.00022737681083624133,
"loss": 0.0028,
"step": 57000
},
{
"epoch": 0.928,
"grad_norm": 0.1717972755432129,
"learning_rate": 0.00022737316948641517,
"loss": 0.0035,
"step": 58000
},
{
"epoch": 0.944,
"grad_norm": 0.051670778542757034,
"learning_rate": 0.000227369528136589,
"loss": 0.0026,
"step": 59000
},
{
"epoch": 0.96,
"grad_norm": 0.18315136432647705,
"learning_rate": 0.00022736588678676285,
"loss": 0.0021,
"step": 60000
},
{
"epoch": 0.96,
"eval_loss": 0.0022276523523032665,
"eval_runtime": 27.2912,
"eval_samples_per_second": 36.642,
"eval_steps_per_second": 4.58,
"step": 60000
},
{
"epoch": 0.976,
"grad_norm": 0.2137993574142456,
"learning_rate": 0.0002273622454369367,
"loss": 0.0031,
"step": 61000
},
{
"epoch": 0.992,
"grad_norm": 0.02584846317768097,
"learning_rate": 0.00022735860408711053,
"loss": 0.0033,
"step": 62000
},
{
"epoch": 1.008,
"grad_norm": 0.054690442979335785,
"learning_rate": 0.00022735496273728438,
"loss": 0.0024,
"step": 63000
},
{
"epoch": 1.024,
"grad_norm": 0.01702144928276539,
"learning_rate": 0.00022735132138745822,
"loss": 0.0032,
"step": 64000
},
{
"epoch": 1.04,
"grad_norm": 0.02373000793159008,
"learning_rate": 0.00022734768003763206,
"loss": 0.0032,
"step": 65000
},
{
"epoch": 1.04,
"eval_loss": 0.0020411296281963587,
"eval_runtime": 27.2492,
"eval_samples_per_second": 36.698,
"eval_steps_per_second": 4.587,
"step": 65000
},
{
"epoch": 1.056,
"grad_norm": 0.012987160123884678,
"learning_rate": 0.0002273440386878059,
"loss": 0.002,
"step": 66000
},
{
"epoch": 1.072,
"grad_norm": 0.029065946117043495,
"learning_rate": 0.0002273403973379797,
"loss": 0.0031,
"step": 67000
},
{
"epoch": 1.088,
"grad_norm": 0.17107020318508148,
"learning_rate": 0.00022733675598815358,
"loss": 0.0022,
"step": 68000
},
{
"epoch": 1.104,
"grad_norm": 0.019081389531493187,
"learning_rate": 0.0002273331146383274,
"loss": 0.0031,
"step": 69000
},
{
"epoch": 1.12,
"grad_norm": 0.008192900568246841,
"learning_rate": 0.00022732947328850126,
"loss": 0.0022,
"step": 70000
},
{
"epoch": 1.12,
"eval_loss": 0.0018265106482431293,
"eval_runtime": 27.5029,
"eval_samples_per_second": 36.36,
"eval_steps_per_second": 4.545,
"step": 70000
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.009845556691288948,
"learning_rate": 0.00022732583193867508,
"loss": 0.0033,
"step": 71000
},
{
"epoch": 1.152,
"grad_norm": 0.1637999713420868,
"learning_rate": 0.00022732219058884895,
"loss": 0.0024,
"step": 72000
},
{
"epoch": 1.168,
"grad_norm": 0.03215477615594864,
"learning_rate": 0.00022731854923902276,
"loss": 0.0026,
"step": 73000
},
{
"epoch": 1.184,
"grad_norm": 0.028977178037166595,
"learning_rate": 0.00022731490788919663,
"loss": 0.0023,
"step": 74000
},
{
"epoch": 1.2,
"grad_norm": 0.057766951620578766,
"learning_rate": 0.00022731126653937044,
"loss": 0.0034,
"step": 75000
},
{
"epoch": 1.2,
"eval_loss": 0.0021767348516732454,
"eval_runtime": 27.526,
"eval_samples_per_second": 36.329,
"eval_steps_per_second": 4.541,
"step": 75000
},
{
"epoch": 1.216,
"grad_norm": 0.00946386530995369,
"learning_rate": 0.0002273076251895443,
"loss": 0.0021,
"step": 76000
},
{
"epoch": 1.232,
"grad_norm": 0.12553413212299347,
"learning_rate": 0.00022730398383971812,
"loss": 0.0019,
"step": 77000
},
{
"epoch": 1.248,
"grad_norm": 0.0369916595518589,
"learning_rate": 0.000227300342489892,
"loss": 0.003,
"step": 78000
},
{
"epoch": 1.264,
"grad_norm": 0.19732122123241425,
"learning_rate": 0.0002272967011400658,
"loss": 0.0024,
"step": 79000
},
{
"epoch": 1.28,
"grad_norm": 0.02471228875219822,
"learning_rate": 0.00022729305979023967,
"loss": 0.0024,
"step": 80000
},
{
"epoch": 1.28,
"eval_loss": 0.002593559678643942,
"eval_runtime": 27.2923,
"eval_samples_per_second": 36.64,
"eval_steps_per_second": 4.58,
"step": 80000
},
{
"epoch": 1.296,
"grad_norm": 0.5299795269966125,
"learning_rate": 0.0002272894184404135,
"loss": 0.0019,
"step": 81000
},
{
"epoch": 1.312,
"grad_norm": 0.03472663834691048,
"learning_rate": 0.00022728577709058736,
"loss": 0.003,
"step": 82000
},
{
"epoch": 1.328,
"grad_norm": 0.09357739239931107,
"learning_rate": 0.00022728213574076117,
"loss": 0.0022,
"step": 83000
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.01810472272336483,
"learning_rate": 0.00022727849439093504,
"loss": 0.0019,
"step": 84000
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.024920647963881493,
"learning_rate": 0.00022727485304110885,
"loss": 0.0021,
"step": 85000
},
{
"epoch": 1.3599999999999999,
"eval_loss": 0.0022232765331864357,
"eval_runtime": 27.3645,
"eval_samples_per_second": 36.544,
"eval_steps_per_second": 4.568,
"step": 85000
},
{
"epoch": 1.376,
"grad_norm": 0.03085111826658249,
"learning_rate": 0.00022727121169128272,
"loss": 0.0023,
"step": 86000
},
{
"epoch": 1.392,
"grad_norm": 0.010742255486547947,
"learning_rate": 0.00022726757034145654,
"loss": 0.0019,
"step": 87000
},
{
"epoch": 1.408,
"grad_norm": 0.03559265285730362,
"learning_rate": 0.00022726392899163038,
"loss": 0.0022,
"step": 88000
},
{
"epoch": 1.424,
"grad_norm": 0.0898701399564743,
"learning_rate": 0.00022726028764180422,
"loss": 0.0028,
"step": 89000
},
{
"epoch": 1.44,
"grad_norm": 0.026589710265398026,
"learning_rate": 0.00022725664629197806,
"loss": 0.0016,
"step": 90000
},
{
"epoch": 1.44,
"eval_loss": 0.00150102109182626,
"eval_runtime": 27.6944,
"eval_samples_per_second": 36.108,
"eval_steps_per_second": 4.514,
"step": 90000
},
{
"epoch": 1.456,
"grad_norm": 0.016303159296512604,
"learning_rate": 0.0002272530049421519,
"loss": 0.0024,
"step": 91000
},
{
"epoch": 1.472,
"grad_norm": 0.01823027804493904,
"learning_rate": 0.00022724936359232574,
"loss": 0.0018,
"step": 92000
},
{
"epoch": 1.488,
"grad_norm": 0.15236489474773407,
"learning_rate": 0.00022724572224249958,
"loss": 0.0024,
"step": 93000
},
{
"epoch": 1.504,
"grad_norm": 0.03902558609843254,
"learning_rate": 0.00022724208089267342,
"loss": 0.0021,
"step": 94000
},
{
"epoch": 1.52,
"grad_norm": 0.020767396315932274,
"learning_rate": 0.00022723843954284726,
"loss": 0.002,
"step": 95000
},
{
"epoch": 1.52,
"eval_loss": 0.001406910945661366,
"eval_runtime": 27.6364,
"eval_samples_per_second": 36.184,
"eval_steps_per_second": 4.523,
"step": 95000
},
{
"epoch": 1.536,
"grad_norm": 0.09269700944423676,
"learning_rate": 0.0002272347981930211,
"loss": 0.0023,
"step": 96000
},
{
"epoch": 1.552,
"grad_norm": 0.04058321192860603,
"learning_rate": 0.00022723115684319495,
"loss": 0.0019,
"step": 97000
},
{
"epoch": 1.568,
"grad_norm": 0.04894057661294937,
"learning_rate": 0.0002272275154933688,
"loss": 0.0018,
"step": 98000
},
{
"epoch": 1.584,
"grad_norm": 0.04043205827474594,
"learning_rate": 0.00022722387414354263,
"loss": 0.0022,
"step": 99000
},
{
"epoch": 1.6,
"grad_norm": 0.1002797931432724,
"learning_rate": 0.00022722023279371647,
"loss": 0.002,
"step": 100000
},
{
"epoch": 1.6,
"eval_loss": 0.0017908032750710845,
"eval_runtime": 27.5843,
"eval_samples_per_second": 36.253,
"eval_steps_per_second": 4.532,
"step": 100000
},
{
"epoch": 1.616,
"grad_norm": 0.02161436155438423,
"learning_rate": 0.0002272165914438903,
"loss": 0.0018,
"step": 101000
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.010246573947370052,
"learning_rate": 0.00022721295009406415,
"loss": 0.002,
"step": 102000
},
{
"epoch": 1.6480000000000001,
"grad_norm": 0.06802576035261154,
"learning_rate": 0.000227209308744238,
"loss": 0.0015,
"step": 103000
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.013391965068876743,
"learning_rate": 0.00022720566739441183,
"loss": 0.0025,
"step": 104000
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.10946637392044067,
"learning_rate": 0.00022720202604458568,
"loss": 0.0018,
"step": 105000
},
{
"epoch": 1.6800000000000002,
"eval_loss": 0.0015562042826786637,
"eval_runtime": 27.7549,
"eval_samples_per_second": 36.03,
"eval_steps_per_second": 4.504,
"step": 105000
},
{
"epoch": 1.696,
"grad_norm": 0.028942033648490906,
"learning_rate": 0.00022719838469475952,
"loss": 0.002,
"step": 106000
},
{
"epoch": 1.712,
"grad_norm": 0.023039843887090683,
"learning_rate": 0.00022719474334493333,
"loss": 0.0014,
"step": 107000
},
{
"epoch": 1.728,
"grad_norm": 0.010488491505384445,
"learning_rate": 0.0002271911019951072,
"loss": 0.0016,
"step": 108000
},
{
"epoch": 1.744,
"grad_norm": 0.019485417753458023,
"learning_rate": 0.000227187460645281,
"loss": 0.002,
"step": 109000
},
{
"epoch": 1.76,
"grad_norm": 0.010597913525998592,
"learning_rate": 0.00022718381929545488,
"loss": 0.002,
"step": 110000
},
{
"epoch": 1.76,
"eval_loss": 0.000878525257576257,
"eval_runtime": 27.823,
"eval_samples_per_second": 35.941,
"eval_steps_per_second": 4.493,
"step": 110000
},
{
"epoch": 1.776,
"grad_norm": 0.02870281971991062,
"learning_rate": 0.0002271801779456287,
"loss": 0.0023,
"step": 111000
},
{
"epoch": 1.792,
"grad_norm": 0.041255537420511246,
"learning_rate": 0.00022717653659580256,
"loss": 0.0014,
"step": 112000
},
{
"epoch": 1.808,
"grad_norm": 0.04701690748333931,
"learning_rate": 0.00022717289524597638,
"loss": 0.0015,
"step": 113000
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.059342917054891586,
"learning_rate": 0.00022716925389615024,
"loss": 0.0028,
"step": 114000
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.040327928960323334,
"learning_rate": 0.00022716561254632406,
"loss": 0.0014,
"step": 115000
},
{
"epoch": 1.8399999999999999,
"eval_loss": 0.0016007705125957727,
"eval_runtime": 27.7178,
"eval_samples_per_second": 36.078,
"eval_steps_per_second": 4.51,
"step": 115000
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.018858684226870537,
"learning_rate": 0.00022716197119649793,
"loss": 0.002,
"step": 116000
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.026660999283194542,
"learning_rate": 0.00022715832984667174,
"loss": 0.0019,
"step": 117000
},
{
"epoch": 1.888,
"grad_norm": 0.08471547812223434,
"learning_rate": 0.0002271546884968456,
"loss": 0.0016,
"step": 118000
},
{
"epoch": 1.904,
"grad_norm": 0.03236541524529457,
"learning_rate": 0.00022715104714701942,
"loss": 0.0014,
"step": 119000
},
{
"epoch": 1.92,
"grad_norm": 0.015728328377008438,
"learning_rate": 0.0002271474057971933,
"loss": 0.0022,
"step": 120000
},
{
"epoch": 1.92,
"eval_loss": 0.0017823727102950215,
"eval_runtime": 27.6601,
"eval_samples_per_second": 36.153,
"eval_steps_per_second": 4.519,
"step": 120000
},
{
"epoch": 1.936,
"grad_norm": 0.2575147747993469,
"learning_rate": 0.0002271437644473671,
"loss": 0.0015,
"step": 121000
},
{
"epoch": 1.952,
"grad_norm": 0.03020591102540493,
"learning_rate": 0.00022714012309754097,
"loss": 0.0015,
"step": 122000
},
{
"epoch": 1.968,
"grad_norm": 0.011387010104954243,
"learning_rate": 0.0002271364817477148,
"loss": 0.0015,
"step": 123000
},
{
"epoch": 1.984,
"grad_norm": 0.033326998353004456,
"learning_rate": 0.00022713284039788866,
"loss": 0.0019,
"step": 124000
},
{
"epoch": 2.0,
"grad_norm": 0.234897643327713,
"learning_rate": 0.00022712919904806247,
"loss": 0.0029,
"step": 125000
},
{
"epoch": 2.0,
"eval_loss": 0.00910487212240696,
"eval_runtime": 27.7906,
"eval_samples_per_second": 35.983,
"eval_steps_per_second": 4.498,
"step": 125000
},
{
"epoch": 2.016,
"grad_norm": 0.05067163705825806,
"learning_rate": 0.0002271255576982363,
"loss": 0.0019,
"step": 126000
},
{
"epoch": 2.032,
"grad_norm": 0.015078851021826267,
"learning_rate": 0.00022712191634841015,
"loss": 0.0012,
"step": 127000
},
{
"epoch": 2.048,
"grad_norm": 0.03365013748407364,
"learning_rate": 0.000227118274998584,
"loss": 0.0018,
"step": 128000
},
{
"epoch": 2.064,
"grad_norm": 0.00802704505622387,
"learning_rate": 0.00022711463364875783,
"loss": 0.0013,
"step": 129000
},
{
"epoch": 2.08,
"grad_norm": 0.011523068882524967,
"learning_rate": 0.00022711099229893168,
"loss": 0.0021,
"step": 130000
},
{
"epoch": 2.08,
"eval_loss": 0.0009301243117079139,
"eval_runtime": 27.505,
"eval_samples_per_second": 36.357,
"eval_steps_per_second": 4.545,
"step": 130000
},
{
"epoch": 2.096,
"grad_norm": 0.012680677697062492,
"learning_rate": 0.00022710735094910552,
"loss": 0.0014,
"step": 131000
},
{
"epoch": 2.112,
"grad_norm": 0.0508689247071743,
"learning_rate": 0.00022710370959927936,
"loss": 0.002,
"step": 132000
},
{
"epoch": 2.128,
"grad_norm": 0.014830244705080986,
"learning_rate": 0.0002271000682494532,
"loss": 0.001,
"step": 133000
},
{
"epoch": 2.144,
"grad_norm": 0.028912167996168137,
"learning_rate": 0.00022709642689962704,
"loss": 0.0019,
"step": 134000
},
{
"epoch": 2.16,
"grad_norm": 0.06254349648952484,
"learning_rate": 0.00022709278554980088,
"loss": 0.0012,
"step": 135000
},
{
"epoch": 2.16,
"eval_loss": 0.0014802517835050821,
"eval_runtime": 27.695,
"eval_samples_per_second": 36.108,
"eval_steps_per_second": 4.513,
"step": 135000
},
{
"epoch": 2.176,
"grad_norm": 0.01877821609377861,
"learning_rate": 0.00022708914419997472,
"loss": 0.0015,
"step": 136000
},
{
"epoch": 2.192,
"grad_norm": 0.18786460161209106,
"learning_rate": 0.00022708550285014856,
"loss": 0.0018,
"step": 137000
},
{
"epoch": 2.208,
"grad_norm": 0.016280388459563255,
"learning_rate": 0.0002270818615003224,
"loss": 0.0015,
"step": 138000
},
{
"epoch": 2.224,
"grad_norm": 0.009028231725096703,
"learning_rate": 0.00022707822015049625,
"loss": 0.0022,
"step": 139000
},
{
"epoch": 2.24,
"grad_norm": 0.02473852038383484,
"learning_rate": 0.0002270745788006701,
"loss": 0.0011,
"step": 140000
},
{
"epoch": 2.24,
"eval_loss": 0.0011171329533681273,
"eval_runtime": 27.6717,
"eval_samples_per_second": 36.138,
"eval_steps_per_second": 4.517,
"step": 140000
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.015900999307632446,
"learning_rate": 0.00022707093745084393,
"loss": 0.0015,
"step": 141000
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.018436668440699577,
"learning_rate": 0.00022706729610101774,
"loss": 0.0015,
"step": 142000
},
{
"epoch": 2.288,
"grad_norm": 0.268839567899704,
"learning_rate": 0.0002270636547511916,
"loss": 0.0013,
"step": 143000
},
{
"epoch": 2.304,
"grad_norm": 0.024980826303362846,
"learning_rate": 0.00022706001340136542,
"loss": 0.0017,
"step": 144000
},
{
"epoch": 2.32,
"grad_norm": 0.025631515309214592,
"learning_rate": 0.00022705637205153926,
"loss": 0.0009,
"step": 145000
},
{
"epoch": 2.32,
"eval_loss": 0.0012023365125060081,
"eval_runtime": 27.5991,
"eval_samples_per_second": 36.233,
"eval_steps_per_second": 4.529,
"step": 145000
},
{
"epoch": 2.336,
"grad_norm": 0.010165953077375889,
"learning_rate": 0.0002270527307017131,
"loss": 0.0018,
"step": 146000
},
{
"epoch": 2.352,
"grad_norm": 0.012398986145853996,
"learning_rate": 0.00022704908935188695,
"loss": 0.001,
"step": 147000
},
{
"epoch": 2.368,
"grad_norm": 0.02246440201997757,
"learning_rate": 0.0002270454480020608,
"loss": 0.0025,
"step": 148000
},
{
"epoch": 2.384,
"grad_norm": 0.018412381410598755,
"learning_rate": 0.00022704180665223463,
"loss": 0.0008,
"step": 149000
},
{
"epoch": 2.4,
"grad_norm": 0.025599336251616478,
"learning_rate": 0.00022703816530240847,
"loss": 0.0025,
"step": 150000
},
{
"epoch": 2.4,
"eval_loss": 0.000995820271782577,
"eval_runtime": 27.7548,
"eval_samples_per_second": 36.03,
"eval_steps_per_second": 4.504,
"step": 150000
},
{
"epoch": 2.416,
"grad_norm": 0.03476562350988388,
"learning_rate": 0.0002270345239525823,
"loss": 0.0016,
"step": 151000
},
{
"epoch": 2.432,
"grad_norm": 0.002502072835341096,
"learning_rate": 0.00022703088260275615,
"loss": 0.001,
"step": 152000
},
{
"epoch": 2.448,
"grad_norm": 0.09545526653528214,
"learning_rate": 0.00022702724125293,
"loss": 0.0019,
"step": 153000
},
{
"epoch": 2.464,
"grad_norm": 0.026374874636530876,
"learning_rate": 0.00022702359990310383,
"loss": 0.0027,
"step": 154000
},
{
"epoch": 2.48,
"grad_norm": 0.02330603636801243,
"learning_rate": 0.00022701995855327768,
"loss": 0.0013,
"step": 155000
},
{
"epoch": 2.48,
"eval_loss": 0.0009146310039795935,
"eval_runtime": 27.6699,
"eval_samples_per_second": 36.14,
"eval_steps_per_second": 4.518,
"step": 155000
},
{
"epoch": 2.496,
"grad_norm": 0.042115718126297,
"learning_rate": 0.00022701631720345152,
"loss": 0.001,
"step": 156000
},
{
"epoch": 2.512,
"grad_norm": 0.006467332132160664,
"learning_rate": 0.00022701267585362536,
"loss": 0.0013,
"step": 157000
},
{
"epoch": 2.528,
"grad_norm": 0.039700523018836975,
"learning_rate": 0.0002270090345037992,
"loss": 0.0012,
"step": 158000
},
{
"epoch": 2.544,
"grad_norm": 0.006177098024636507,
"learning_rate": 0.00022700539315397304,
"loss": 0.0032,
"step": 159000
},
{
"epoch": 2.56,
"grad_norm": 0.016644610092043877,
"learning_rate": 0.00022700175180414688,
"loss": 0.0007,
"step": 160000
},
{
"epoch": 2.56,
"eval_loss": 0.0010344331385567784,
"eval_runtime": 27.8065,
"eval_samples_per_second": 35.963,
"eval_steps_per_second": 4.495,
"step": 160000
},
{
"epoch": 2.576,
"grad_norm": 0.01400495320558548,
"learning_rate": 0.00022699811045432072,
"loss": 0.0012,
"step": 161000
},
{
"epoch": 2.592,
"grad_norm": 0.016703518107533455,
"learning_rate": 0.00022699446910449456,
"loss": 0.0012,
"step": 162000
},
{
"epoch": 2.608,
"grad_norm": 0.006359017454087734,
"learning_rate": 0.0002269908277546684,
"loss": 0.0012,
"step": 163000
},
{
"epoch": 2.624,
"grad_norm": 0.01771441660821438,
"learning_rate": 0.00022698718640484222,
"loss": 0.0016,
"step": 164000
},
{
"epoch": 2.64,
"grad_norm": 0.01094936951994896,
"learning_rate": 0.0002269835450550161,
"loss": 0.0011,
"step": 165000
},
{
"epoch": 2.64,
"eval_loss": 0.0007599141681566834,
"eval_runtime": 27.7146,
"eval_samples_per_second": 36.082,
"eval_steps_per_second": 4.51,
"step": 165000
},
{
"epoch": 2.656,
"grad_norm": 0.09152177721261978,
"learning_rate": 0.0002269799037051899,
"loss": 0.0024,
"step": 166000
},
{
"epoch": 2.672,
"grad_norm": 0.012105804868042469,
"learning_rate": 0.00022697626235536377,
"loss": 0.0009,
"step": 167000
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.01530654076486826,
"learning_rate": 0.00022697262100553758,
"loss": 0.0011,
"step": 168000
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.031053414568305016,
"learning_rate": 0.00022696897965571145,
"loss": 0.0015,
"step": 169000
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.01557753887027502,
"learning_rate": 0.00022696533830588527,
"loss": 0.001,
"step": 170000
},
{
"epoch": 2.7199999999999998,
"eval_loss": 0.0008088626782409847,
"eval_runtime": 27.776,
"eval_samples_per_second": 36.002,
"eval_steps_per_second": 4.5,
"step": 170000
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.02831295132637024,
"learning_rate": 0.00022696169695605913,
"loss": 0.0014,
"step": 171000
},
{
"epoch": 2.752,
"grad_norm": 0.017672572284936905,
"learning_rate": 0.00022695805560623295,
"loss": 0.0011,
"step": 172000
},
{
"epoch": 2.768,
"grad_norm": 0.018164193257689476,
"learning_rate": 0.00022695441425640682,
"loss": 0.0019,
"step": 173000
},
{
"epoch": 2.784,
"grad_norm": 0.017383994534611702,
"learning_rate": 0.00022695077290658063,
"loss": 0.001,
"step": 174000
},
{
"epoch": 2.8,
"grad_norm": 0.006576849147677422,
"learning_rate": 0.0002269471315567545,
"loss": 0.0011,
"step": 175000
},
{
"epoch": 2.8,
"eval_loss": 0.0006260189693421125,
"eval_runtime": 27.3919,
"eval_samples_per_second": 36.507,
"eval_steps_per_second": 4.563,
"step": 175000
},
{
"epoch": 2.816,
"grad_norm": 0.019615883007645607,
"learning_rate": 0.0002269434902069283,
"loss": 0.0012,
"step": 176000
},
{
"epoch": 2.832,
"grad_norm": 0.03926165774464607,
"learning_rate": 0.00022693984885710218,
"loss": 0.0014,
"step": 177000
},
{
"epoch": 2.848,
"grad_norm": 0.021534917876124382,
"learning_rate": 0.000226936207507276,
"loss": 0.0012,
"step": 178000
},
{
"epoch": 2.864,
"grad_norm": 0.04047563299536705,
"learning_rate": 0.00022693256615744986,
"loss": 0.001,
"step": 179000
},
{
"epoch": 2.88,
"grad_norm": 0.04712160676717758,
"learning_rate": 0.00022692892480762368,
"loss": 0.0015,
"step": 180000
},
{
"epoch": 2.88,
"eval_loss": 0.0013630291214212775,
"eval_runtime": 27.4056,
"eval_samples_per_second": 36.489,
"eval_steps_per_second": 4.561,
"step": 180000
},
{
"epoch": 2.896,
"grad_norm": 0.21584591269493103,
"learning_rate": 0.00022692528345779754,
"loss": 0.0019,
"step": 181000
},
{
"epoch": 2.912,
"grad_norm": 0.015519549138844013,
"learning_rate": 0.00022692164210797136,
"loss": 0.0012,
"step": 182000
},
{
"epoch": 2.928,
"grad_norm": 0.0314391665160656,
"learning_rate": 0.00022691800075814523,
"loss": 0.0009,
"step": 183000
},
{
"epoch": 2.944,
"grad_norm": 0.16906876862049103,
"learning_rate": 0.00022691435940831904,
"loss": 0.0013,
"step": 184000
},
{
"epoch": 2.96,
"grad_norm": 0.04538990557193756,
"learning_rate": 0.00022691071805849288,
"loss": 0.001,
"step": 185000
},
{
"epoch": 2.96,
"eval_loss": 0.0014080323744565248,
"eval_runtime": 27.3828,
"eval_samples_per_second": 36.519,
"eval_steps_per_second": 4.565,
"step": 185000
},
{
"epoch": 2.976,
"grad_norm": 0.008023149333894253,
"learning_rate": 0.00022690707670866672,
"loss": 0.0013,
"step": 186000
},
{
"epoch": 2.992,
"grad_norm": 0.011926773004233837,
"learning_rate": 0.00022690343535884056,
"loss": 0.0012,
"step": 187000
},
{
"epoch": 3.008,
"grad_norm": 0.01701526716351509,
"learning_rate": 0.0002268997940090144,
"loss": 0.0011,
"step": 188000
},
{
"epoch": 3.024,
"grad_norm": 0.015581037849187851,
"learning_rate": 0.00022689615265918825,
"loss": 0.0013,
"step": 189000
},
{
"epoch": 3.04,
"grad_norm": 0.012046800926327705,
"learning_rate": 0.0002268925113093621,
"loss": 0.001,
"step": 190000
},
{
"epoch": 3.04,
"eval_loss": 0.0010119588114321232,
"eval_runtime": 27.6665,
"eval_samples_per_second": 36.145,
"eval_steps_per_second": 4.518,
"step": 190000
},
{
"epoch": 3.056,
"grad_norm": 0.009263888001441956,
"learning_rate": 0.00022688886995953593,
"loss": 0.001,
"step": 191000
},
{
"epoch": 3.072,
"grad_norm": 0.0538918599486351,
"learning_rate": 0.00022688522860970977,
"loss": 0.0012,
"step": 192000
},
{
"epoch": 3.088,
"grad_norm": 0.0521121546626091,
"learning_rate": 0.0002268815872598836,
"loss": 0.0017,
"step": 193000
},
{
"epoch": 3.104,
"grad_norm": 0.05000779777765274,
"learning_rate": 0.00022687794591005745,
"loss": 0.0008,
"step": 194000
},
{
"epoch": 3.12,
"grad_norm": 0.06467895954847336,
"learning_rate": 0.0002268743045602313,
"loss": 0.0011,
"step": 195000
},
{
"epoch": 3.12,
"eval_loss": 0.0008815609035082161,
"eval_runtime": 27.5652,
"eval_samples_per_second": 36.278,
"eval_steps_per_second": 4.535,
"step": 195000
},
{
"epoch": 3.136,
"grad_norm": 0.01422048918902874,
"learning_rate": 0.00022687066321040513,
"loss": 0.0011,
"step": 196000
},
{
"epoch": 3.152,
"grad_norm": 0.02482694387435913,
"learning_rate": 0.00022686702186057897,
"loss": 0.0011,
"step": 197000
},
{
"epoch": 3.168,
"grad_norm": 0.03517874330282211,
"learning_rate": 0.00022686338051075282,
"loss": 0.0017,
"step": 198000
},
{
"epoch": 3.184,
"grad_norm": 0.027310600504279137,
"learning_rate": 0.00022685973916092666,
"loss": 0.0008,
"step": 199000
},
{
"epoch": 3.2,
"grad_norm": 0.06521017849445343,
"learning_rate": 0.0002268560978111005,
"loss": 0.002,
"step": 200000
},
{
"epoch": 3.2,
"eval_loss": 0.00754576688632369,
"eval_runtime": 27.5143,
"eval_samples_per_second": 36.345,
"eval_steps_per_second": 4.543,
"step": 200000
},
{
"epoch": 3.216,
"grad_norm": 0.24959920346736908,
"learning_rate": 0.00022685245646127434,
"loss": 0.0008,
"step": 201000
},
{
"epoch": 3.232,
"grad_norm": 0.010456324554979801,
"learning_rate": 0.00022684881511144818,
"loss": 0.0011,
"step": 202000
},
{
"epoch": 3.248,
"grad_norm": 0.010797294788062572,
"learning_rate": 0.00022684517376162202,
"loss": 0.0011,
"step": 203000
},
{
"epoch": 3.2640000000000002,
"grad_norm": 0.04222773015499115,
"learning_rate": 0.00022684153241179584,
"loss": 0.001,
"step": 204000
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.03277302905917168,
"learning_rate": 0.0002268378910619697,
"loss": 0.0015,
"step": 205000
},
{
"epoch": 3.2800000000000002,
"eval_loss": 0.0007634469075128436,
"eval_runtime": 27.7742,
"eval_samples_per_second": 36.005,
"eval_steps_per_second": 4.501,
"step": 205000
},
{
"epoch": 3.296,
"grad_norm": 0.0069810631684958935,
"learning_rate": 0.00022683424971214352,
"loss": 0.001,
"step": 206000
},
{
"epoch": 3.312,
"grad_norm": 0.01147681474685669,
"learning_rate": 0.00022683060836231739,
"loss": 0.0009,
"step": 207000
},
{
"epoch": 3.328,
"grad_norm": 0.009766928851604462,
"learning_rate": 0.0002268269670124912,
"loss": 0.0019,
"step": 208000
},
{
"epoch": 3.344,
"grad_norm": 0.03460145741701126,
"learning_rate": 0.00022682332566266507,
"loss": 0.0008,
"step": 209000
},
{
"epoch": 3.36,
"grad_norm": 0.016247229650616646,
"learning_rate": 0.00022681968431283888,
"loss": 0.001,
"step": 210000
},
{
"epoch": 3.36,
"eval_loss": 0.0010268606711179018,
"eval_runtime": 26.9593,
"eval_samples_per_second": 37.093,
"eval_steps_per_second": 4.637,
"step": 210000
},
{
"epoch": 3.376,
"grad_norm": 0.012766228057444096,
"learning_rate": 0.00022681604296301275,
"loss": 0.0008,
"step": 211000
},
{
"epoch": 3.392,
"grad_norm": 0.005086794961243868,
"learning_rate": 0.00022681240161318656,
"loss": 0.0014,
"step": 212000
},
{
"epoch": 3.408,
"grad_norm": 0.028264038264751434,
"learning_rate": 0.00022680876026336043,
"loss": 0.0011,
"step": 213000
},
{
"epoch": 3.424,
"grad_norm": 0.05160939320921898,
"learning_rate": 0.00022680511891353425,
"loss": 0.0009,
"step": 214000
},
{
"epoch": 3.44,
"grad_norm": 0.02259020321071148,
"learning_rate": 0.00022680147756370811,
"loss": 0.0012,
"step": 215000
},
{
"epoch": 3.44,
"eval_loss": 0.0007602461846545339,
"eval_runtime": 26.8881,
"eval_samples_per_second": 37.191,
"eval_steps_per_second": 4.649,
"step": 215000
},
{
"epoch": 3.456,
"grad_norm": 0.03077981248497963,
"learning_rate": 0.00022679783621388193,
"loss": 0.0012,
"step": 216000
},
{
"epoch": 3.472,
"grad_norm": 0.027997983619570732,
"learning_rate": 0.0002267941948640558,
"loss": 0.0008,
"step": 217000
},
{
"epoch": 3.488,
"grad_norm": 0.009089149534702301,
"learning_rate": 0.0002267905535142296,
"loss": 0.0011,
"step": 218000
},
{
"epoch": 3.504,
"grad_norm": 0.09043902903795242,
"learning_rate": 0.00022678691216440348,
"loss": 0.0011,
"step": 219000
},
{
"epoch": 3.52,
"grad_norm": 0.06199198588728905,
"learning_rate": 0.0002267832708145773,
"loss": 0.0011,
"step": 220000
},
{
"epoch": 3.52,
"eval_loss": 0.001106478739529848,
"eval_runtime": 27.0055,
"eval_samples_per_second": 37.029,
"eval_steps_per_second": 4.629,
"step": 220000
},
{
"epoch": 3.536,
"grad_norm": 0.013115255162119865,
"learning_rate": 0.00022677962946475116,
"loss": 0.0015,
"step": 221000
},
{
"epoch": 3.552,
"grad_norm": 0.030206598341464996,
"learning_rate": 0.00022677598811492498,
"loss": 0.001,
"step": 222000
},
{
"epoch": 3.568,
"grad_norm": 0.014335270039737225,
"learning_rate": 0.00022677234676509882,
"loss": 0.0008,
"step": 223000
},
{
"epoch": 3.584,
"grad_norm": 0.04320364445447922,
"learning_rate": 0.00022676870541527266,
"loss": 0.001,
"step": 224000
},
{
"epoch": 3.6,
"grad_norm": 0.01011396199464798,
"learning_rate": 0.0002267650640654465,
"loss": 0.0014,
"step": 225000
},
{
"epoch": 3.6,
"eval_loss": 0.0008724904037080705,
"eval_runtime": 26.7598,
"eval_samples_per_second": 37.37,
"eval_steps_per_second": 4.671,
"step": 225000
},
{
"epoch": 3.616,
"grad_norm": 0.06343936175107956,
"learning_rate": 0.00022676142271562034,
"loss": 0.0009,
"step": 226000
},
{
"epoch": 3.632,
"grad_norm": 0.04553668946027756,
"learning_rate": 0.00022675778136579418,
"loss": 0.001,
"step": 227000
},
{
"epoch": 3.648,
"grad_norm": 0.0029150221962481737,
"learning_rate": 0.00022675414001596802,
"loss": 0.0018,
"step": 228000
},
{
"epoch": 3.664,
"grad_norm": 0.03533324971795082,
"learning_rate": 0.00022675049866614186,
"loss": 0.0017,
"step": 229000
},
{
"epoch": 3.68,
"grad_norm": 0.020134087651968002,
"learning_rate": 0.0002267468573163157,
"loss": 0.0013,
"step": 230000
},
{
"epoch": 3.68,
"eval_loss": 0.001037033973261714,
"eval_runtime": 27.1191,
"eval_samples_per_second": 36.874,
"eval_steps_per_second": 4.609,
"step": 230000
},
{
"epoch": 3.6959999999999997,
"grad_norm": 0.01976308599114418,
"learning_rate": 0.00022674321596648955,
"loss": 0.0009,
"step": 231000
},
{
"epoch": 3.7119999999999997,
"grad_norm": 0.05415629222989082,
"learning_rate": 0.00022673957461666339,
"loss": 0.0012,
"step": 232000
},
{
"epoch": 3.7279999999999998,
"grad_norm": 0.020477378740906715,
"learning_rate": 0.00022673593326683723,
"loss": 0.001,
"step": 233000
},
{
"epoch": 3.7439999999999998,
"grad_norm": 0.014153924770653248,
"learning_rate": 0.00022673229191701107,
"loss": 0.0017,
"step": 234000
},
{
"epoch": 3.76,
"grad_norm": 0.02030963823199272,
"learning_rate": 0.0002267286505671849,
"loss": 0.0007,
"step": 235000
},
{
"epoch": 3.76,
"eval_loss": 0.0007908450206741691,
"eval_runtime": 27.0159,
"eval_samples_per_second": 37.015,
"eval_steps_per_second": 4.627,
"step": 235000
},
{
"epoch": 3.776,
"grad_norm": 0.03953304514288902,
"learning_rate": 0.00022672500921735875,
"loss": 0.0008,
"step": 236000
},
{
"epoch": 3.792,
"grad_norm": 0.007172519341111183,
"learning_rate": 0.0002267213678675326,
"loss": 0.0016,
"step": 237000
},
{
"epoch": 3.808,
"grad_norm": 0.03694753348827362,
"learning_rate": 0.00022671772651770643,
"loss": 0.0008,
"step": 238000
},
{
"epoch": 3.824,
"grad_norm": 0.04899757727980614,
"learning_rate": 0.00022671408516788027,
"loss": 0.0011,
"step": 239000
},
{
"epoch": 3.84,
"grad_norm": 0.05499159172177315,
"learning_rate": 0.00022671044381805412,
"loss": 0.0013,
"step": 240000
},
{
"epoch": 3.84,
"eval_loss": 0.0008275896543636918,
"eval_runtime": 27.099,
"eval_samples_per_second": 36.902,
"eval_steps_per_second": 4.613,
"step": 240000
},
{
"epoch": 3.856,
"grad_norm": 0.02498927153646946,
"learning_rate": 0.00022670680246822796,
"loss": 0.0008,
"step": 241000
},
{
"epoch": 3.872,
"grad_norm": 0.02703891508281231,
"learning_rate": 0.00022670316111840177,
"loss": 0.0009,
"step": 242000
},
{
"epoch": 3.888,
"grad_norm": 0.010871395468711853,
"learning_rate": 0.00022669951976857564,
"loss": 0.0009,
"step": 243000
},
{
"epoch": 3.904,
"grad_norm": 0.006647611036896706,
"learning_rate": 0.00022669587841874945,
"loss": 0.0019,
"step": 244000
},
{
"epoch": 3.92,
"grad_norm": 0.11232209205627441,
"learning_rate": 0.00022669223706892332,
"loss": 0.0006,
"step": 245000
},
{
"epoch": 3.92,
"eval_loss": 0.004233696032315493,
"eval_runtime": 27.2042,
"eval_samples_per_second": 36.759,
"eval_steps_per_second": 4.595,
"step": 245000
},
{
"epoch": 3.936,
"grad_norm": 0.03585943579673767,
"learning_rate": 0.00022668859571909713,
"loss": 0.0012,
"step": 246000
},
{
"epoch": 3.952,
"grad_norm": 0.028422392904758453,
"learning_rate": 0.000226684954369271,
"loss": 0.0009,
"step": 247000
},
{
"epoch": 3.968,
"grad_norm": 0.029626131057739258,
"learning_rate": 0.00022668131301944482,
"loss": 0.0009,
"step": 248000
},
{
"epoch": 3.984,
"grad_norm": 0.01423815730959177,
"learning_rate": 0.00022667767166961866,
"loss": 0.0011,
"step": 249000
},
{
"epoch": 4.0,
"grad_norm": 0.028744470328092575,
"learning_rate": 0.0002266740303197925,
"loss": 0.0012,
"step": 250000
},
{
"epoch": 4.0,
"eval_loss": 0.0009512793621979654,
"eval_runtime": 27.0826,
"eval_samples_per_second": 36.924,
"eval_steps_per_second": 4.616,
"step": 250000
},
{
"epoch": 4.016,
"grad_norm": 0.05679468810558319,
"learning_rate": 0.00022667038896996634,
"loss": 0.0008,
"step": 251000
},
{
"epoch": 4.032,
"grad_norm": 0.01259209681302309,
"learning_rate": 0.00022666674762014018,
"loss": 0.0012,
"step": 252000
},
{
"epoch": 4.048,
"grad_norm": 0.02058994211256504,
"learning_rate": 0.00022666310627031402,
"loss": 0.0007,
"step": 253000
},
{
"epoch": 4.064,
"grad_norm": 0.028425488620996475,
"learning_rate": 0.00022665946492048786,
"loss": 0.0017,
"step": 254000
},
{
"epoch": 4.08,
"grad_norm": 0.035576559603214264,
"learning_rate": 0.0002266558235706617,
"loss": 0.0008,
"step": 255000
},
{
"epoch": 4.08,
"eval_loss": 0.0006453625974245369,
"eval_runtime": 27.5028,
"eval_samples_per_second": 36.36,
"eval_steps_per_second": 4.545,
"step": 255000
}
],
"logging_steps": 1000,
"max_steps": 62500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1000,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9094798804101104e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}