| { | |
| "best_global_step": 2034, | |
| "best_metric": 0.018208853900432587, | |
| "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_cb_1757340215/checkpoint-2034", | |
| "epoch": 20.0, | |
| "eval_steps": 113, | |
| "global_step": 2260, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04424778761061947, | |
| "grad_norm": 176.08645629882812, | |
| "learning_rate": 8.849557522123894e-07, | |
| "loss": 9.371, | |
| "num_input_tokens_seen": 1520, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.08849557522123894, | |
| "grad_norm": 182.31874084472656, | |
| "learning_rate": 1.991150442477876e-06, | |
| "loss": 8.6136, | |
| "num_input_tokens_seen": 2976, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.13274336283185842, | |
| "grad_norm": 191.4988555908203, | |
| "learning_rate": 3.097345132743363e-06, | |
| "loss": 7.307, | |
| "num_input_tokens_seen": 4688, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.17699115044247787, | |
| "grad_norm": 121.74630737304688, | |
| "learning_rate": 4.2035398230088504e-06, | |
| "loss": 5.3146, | |
| "num_input_tokens_seen": 5776, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.22123893805309736, | |
| "grad_norm": 85.33639526367188, | |
| "learning_rate": 5.3097345132743365e-06, | |
| "loss": 3.4853, | |
| "num_input_tokens_seen": 6992, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.26548672566371684, | |
| "grad_norm": 104.62677001953125, | |
| "learning_rate": 6.415929203539823e-06, | |
| "loss": 2.6932, | |
| "num_input_tokens_seen": 8320, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.30973451327433627, | |
| "grad_norm": 76.95153045654297, | |
| "learning_rate": 7.52212389380531e-06, | |
| "loss": 1.4263, | |
| "num_input_tokens_seen": 9536, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.35398230088495575, | |
| "grad_norm": 91.25277709960938, | |
| "learning_rate": 8.628318584070797e-06, | |
| "loss": 0.6123, | |
| "num_input_tokens_seen": 11072, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.39823008849557523, | |
| "grad_norm": 65.90292358398438, | |
| "learning_rate": 9.734513274336284e-06, | |
| "loss": 0.6019, | |
| "num_input_tokens_seen": 12320, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4424778761061947, | |
| "grad_norm": 67.72266387939453, | |
| "learning_rate": 1.0840707964601771e-05, | |
| "loss": 0.5444, | |
| "num_input_tokens_seen": 13616, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.48672566371681414, | |
| "grad_norm": 75.33143615722656, | |
| "learning_rate": 1.1946902654867258e-05, | |
| "loss": 0.5862, | |
| "num_input_tokens_seen": 15104, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5309734513274337, | |
| "grad_norm": 29.605249404907227, | |
| "learning_rate": 1.3053097345132745e-05, | |
| "loss": 0.3168, | |
| "num_input_tokens_seen": 16352, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5752212389380531, | |
| "grad_norm": 8.359716415405273, | |
| "learning_rate": 1.415929203539823e-05, | |
| "loss": 0.1929, | |
| "num_input_tokens_seen": 17888, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6194690265486725, | |
| "grad_norm": 149.0347442626953, | |
| "learning_rate": 1.5265486725663717e-05, | |
| "loss": 0.7495, | |
| "num_input_tokens_seen": 19008, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6637168141592921, | |
| "grad_norm": 10.16899299621582, | |
| "learning_rate": 1.6371681415929206e-05, | |
| "loss": 0.2649, | |
| "num_input_tokens_seen": 20480, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7079646017699115, | |
| "grad_norm": 26.192371368408203, | |
| "learning_rate": 1.747787610619469e-05, | |
| "loss": 0.4134, | |
| "num_input_tokens_seen": 22000, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7522123893805309, | |
| "grad_norm": 6.6128435134887695, | |
| "learning_rate": 1.858407079646018e-05, | |
| "loss": 0.2021, | |
| "num_input_tokens_seen": 23456, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.7964601769911505, | |
| "grad_norm": 31.164657592773438, | |
| "learning_rate": 1.9690265486725665e-05, | |
| "loss": 0.6914, | |
| "num_input_tokens_seen": 25280, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8407079646017699, | |
| "grad_norm": 27.877477645874023, | |
| "learning_rate": 2.079646017699115e-05, | |
| "loss": 0.17, | |
| "num_input_tokens_seen": 26496, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8849557522123894, | |
| "grad_norm": 20.50126075744629, | |
| "learning_rate": 2.190265486725664e-05, | |
| "loss": 0.1746, | |
| "num_input_tokens_seen": 28016, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9292035398230089, | |
| "grad_norm": 14.361896514892578, | |
| "learning_rate": 2.3008849557522124e-05, | |
| "loss": 0.418, | |
| "num_input_tokens_seen": 29344, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.9734513274336283, | |
| "grad_norm": 25.596282958984375, | |
| "learning_rate": 2.411504424778761e-05, | |
| "loss": 0.1978, | |
| "num_input_tokens_seen": 30496, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.46471935510635376, | |
| "eval_runtime": 0.7074, | |
| "eval_samples_per_second": 35.343, | |
| "eval_steps_per_second": 18.378, | |
| "num_input_tokens_seen": 31064, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.0176991150442478, | |
| "grad_norm": 35.75157928466797, | |
| "learning_rate": 2.5221238938053098e-05, | |
| "loss": 0.3202, | |
| "num_input_tokens_seen": 31560, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.0619469026548674, | |
| "grad_norm": 4.021816730499268, | |
| "learning_rate": 2.6327433628318586e-05, | |
| "loss": 1.3669, | |
| "num_input_tokens_seen": 32744, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.1061946902654867, | |
| "grad_norm": 14.25241756439209, | |
| "learning_rate": 2.743362831858407e-05, | |
| "loss": 0.3348, | |
| "num_input_tokens_seen": 34440, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.1504424778761062, | |
| "grad_norm": 15.869715690612793, | |
| "learning_rate": 2.853982300884956e-05, | |
| "loss": 0.2621, | |
| "num_input_tokens_seen": 35992, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.1946902654867257, | |
| "grad_norm": 9.137112617492676, | |
| "learning_rate": 2.964601769911505e-05, | |
| "loss": 0.4748, | |
| "num_input_tokens_seen": 37480, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.238938053097345, | |
| "grad_norm": 15.987156867980957, | |
| "learning_rate": 3.075221238938053e-05, | |
| "loss": 0.1724, | |
| "num_input_tokens_seen": 38776, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.2831858407079646, | |
| "grad_norm": 35.1132698059082, | |
| "learning_rate": 3.185840707964602e-05, | |
| "loss": 0.2813, | |
| "num_input_tokens_seen": 40408, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.3274336283185841, | |
| "grad_norm": 17.494558334350586, | |
| "learning_rate": 3.296460176991151e-05, | |
| "loss": 0.8969, | |
| "num_input_tokens_seen": 41944, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.3716814159292037, | |
| "grad_norm": 92.76953125, | |
| "learning_rate": 3.407079646017699e-05, | |
| "loss": 0.3488, | |
| "num_input_tokens_seen": 43576, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.415929203539823, | |
| "grad_norm": 30.726852416992188, | |
| "learning_rate": 3.517699115044248e-05, | |
| "loss": 0.3747, | |
| "num_input_tokens_seen": 44984, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.4601769911504425, | |
| "grad_norm": 10.402463912963867, | |
| "learning_rate": 3.628318584070797e-05, | |
| "loss": 0.2144, | |
| "num_input_tokens_seen": 46424, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.504424778761062, | |
| "grad_norm": 47.82248306274414, | |
| "learning_rate": 3.7389380530973455e-05, | |
| "loss": 0.3073, | |
| "num_input_tokens_seen": 47800, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.5486725663716814, | |
| "grad_norm": 4.886204719543457, | |
| "learning_rate": 3.849557522123894e-05, | |
| "loss": 0.4033, | |
| "num_input_tokens_seen": 48776, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.592920353982301, | |
| "grad_norm": 42.30789566040039, | |
| "learning_rate": 3.9601769911504426e-05, | |
| "loss": 0.2292, | |
| "num_input_tokens_seen": 49960, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.6371681415929205, | |
| "grad_norm": 0.1658252775669098, | |
| "learning_rate": 4.0707964601769914e-05, | |
| "loss": 0.0061, | |
| "num_input_tokens_seen": 51624, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.6814159292035398, | |
| "grad_norm": 10.36694049835205, | |
| "learning_rate": 4.1814159292035396e-05, | |
| "loss": 0.3457, | |
| "num_input_tokens_seen": 53048, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.7256637168141593, | |
| "grad_norm": 19.44915008544922, | |
| "learning_rate": 4.2920353982300885e-05, | |
| "loss": 0.569, | |
| "num_input_tokens_seen": 54552, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.7699115044247788, | |
| "grad_norm": 13.492650985717773, | |
| "learning_rate": 4.4026548672566373e-05, | |
| "loss": 0.4705, | |
| "num_input_tokens_seen": 55656, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.8141592920353982, | |
| "grad_norm": 2.0458106994628906, | |
| "learning_rate": 4.5132743362831855e-05, | |
| "loss": 0.1993, | |
| "num_input_tokens_seen": 56984, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.8584070796460177, | |
| "grad_norm": 3.030862808227539, | |
| "learning_rate": 4.6238938053097344e-05, | |
| "loss": 0.3077, | |
| "num_input_tokens_seen": 58200, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.9026548672566372, | |
| "grad_norm": 3.6195731163024902, | |
| "learning_rate": 4.734513274336283e-05, | |
| "loss": 0.1787, | |
| "num_input_tokens_seen": 59544, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.9469026548672566, | |
| "grad_norm": 24.178558349609375, | |
| "learning_rate": 4.845132743362832e-05, | |
| "loss": 0.6463, | |
| "num_input_tokens_seen": 60968, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.991150442477876, | |
| "grad_norm": 12.566280364990234, | |
| "learning_rate": 4.955752212389381e-05, | |
| "loss": 0.3015, | |
| "num_input_tokens_seen": 62216, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.4196261763572693, | |
| "eval_runtime": 0.7085, | |
| "eval_samples_per_second": 35.285, | |
| "eval_steps_per_second": 18.348, | |
| "num_input_tokens_seen": 62304, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.0353982300884956, | |
| "grad_norm": 28.30797004699707, | |
| "learning_rate": 4.9999731620342936e-05, | |
| "loss": 0.3508, | |
| "num_input_tokens_seen": 63568, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.079646017699115, | |
| "grad_norm": 16.764211654663086, | |
| "learning_rate": 4.9998091543305845e-05, | |
| "loss": 0.2591, | |
| "num_input_tokens_seen": 64800, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.1238938053097347, | |
| "grad_norm": 17.578092575073242, | |
| "learning_rate": 4.999496058673635e-05, | |
| "loss": 0.3062, | |
| "num_input_tokens_seen": 66144, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.168141592920354, | |
| "grad_norm": 0.1362372487783432, | |
| "learning_rate": 4.999033893736386e-05, | |
| "loss": 0.0411, | |
| "num_input_tokens_seen": 67600, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.2123893805309733, | |
| "grad_norm": 14.771645545959473, | |
| "learning_rate": 4.99842268708223e-05, | |
| "loss": 0.4286, | |
| "num_input_tokens_seen": 69056, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.256637168141593, | |
| "grad_norm": 4.802849292755127, | |
| "learning_rate": 4.9976624751633725e-05, | |
| "loss": 0.9021, | |
| "num_input_tokens_seen": 70304, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.3008849557522124, | |
| "grad_norm": 8.362566947937012, | |
| "learning_rate": 4.996753303318648e-05, | |
| "loss": 0.1102, | |
| "num_input_tokens_seen": 71872, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.3451327433628317, | |
| "grad_norm": 1.8170663118362427, | |
| "learning_rate": 4.995695225770825e-05, | |
| "loss": 0.3114, | |
| "num_input_tokens_seen": 73520, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.3893805309734515, | |
| "grad_norm": 1.7204346656799316, | |
| "learning_rate": 4.994488305623365e-05, | |
| "loss": 0.2026, | |
| "num_input_tokens_seen": 75184, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.433628318584071, | |
| "grad_norm": 5.6229963302612305, | |
| "learning_rate": 4.993132614856666e-05, | |
| "loss": 0.095, | |
| "num_input_tokens_seen": 76592, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.47787610619469, | |
| "grad_norm": 7.6734771728515625, | |
| "learning_rate": 4.991628234323765e-05, | |
| "loss": 0.8076, | |
| "num_input_tokens_seen": 77632, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.52212389380531, | |
| "grad_norm": 3.230637550354004, | |
| "learning_rate": 4.9899752537455166e-05, | |
| "loss": 0.1888, | |
| "num_input_tokens_seen": 79104, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.566371681415929, | |
| "grad_norm": 0.5752978920936584, | |
| "learning_rate": 4.9881737717052436e-05, | |
| "loss": 0.1704, | |
| "num_input_tokens_seen": 80432, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.6106194690265485, | |
| "grad_norm": 6.58157205581665, | |
| "learning_rate": 4.9862238956428556e-05, | |
| "loss": 0.7013, | |
| "num_input_tokens_seen": 81744, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.6548672566371683, | |
| "grad_norm": 10.219573020935059, | |
| "learning_rate": 4.984125741848441e-05, | |
| "loss": 0.4427, | |
| "num_input_tokens_seen": 83104, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.6991150442477876, | |
| "grad_norm": 17.33927345275879, | |
| "learning_rate": 4.981879435455336e-05, | |
| "loss": 0.3118, | |
| "num_input_tokens_seen": 84336, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.7433628318584073, | |
| "grad_norm": 2.8333613872528076, | |
| "learning_rate": 4.9794851104326554e-05, | |
| "loss": 0.617, | |
| "num_input_tokens_seen": 85936, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.7876106194690267, | |
| "grad_norm": 6.623862266540527, | |
| "learning_rate": 4.976942909577307e-05, | |
| "loss": 0.1483, | |
| "num_input_tokens_seen": 87232, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.831858407079646, | |
| "grad_norm": 2.0592002868652344, | |
| "learning_rate": 4.974252984505475e-05, | |
| "loss": 0.1033, | |
| "num_input_tokens_seen": 88336, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.8761061946902657, | |
| "grad_norm": 0.33601754903793335, | |
| "learning_rate": 4.971415495643574e-05, | |
| "loss": 0.3888, | |
| "num_input_tokens_seen": 89584, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.920353982300885, | |
| "grad_norm": 7.770787239074707, | |
| "learning_rate": 4.968430612218687e-05, | |
| "loss": 0.0721, | |
| "num_input_tokens_seen": 91008, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.9646017699115044, | |
| "grad_norm": 1.1051758527755737, | |
| "learning_rate": 4.965298512248466e-05, | |
| "loss": 0.49, | |
| "num_input_tokens_seen": 92288, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.2181825488805771, | |
| "eval_runtime": 0.7048, | |
| "eval_samples_per_second": 35.47, | |
| "eval_steps_per_second": 18.444, | |
| "num_input_tokens_seen": 93232, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.0088495575221237, | |
| "grad_norm": 0.15708673000335693, | |
| "learning_rate": 4.962019382530521e-05, | |
| "loss": 0.1333, | |
| "num_input_tokens_seen": 93456, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.0530973451327434, | |
| "grad_norm": 0.3463290333747864, | |
| "learning_rate": 4.958593418631275e-05, | |
| "loss": 0.0663, | |
| "num_input_tokens_seen": 94752, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.0973451327433628, | |
| "grad_norm": 8.072488784790039, | |
| "learning_rate": 4.955020824874307e-05, | |
| "loss": 0.4001, | |
| "num_input_tokens_seen": 96000, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.1415929203539825, | |
| "grad_norm": 3.377310276031494, | |
| "learning_rate": 4.951301814328157e-05, | |
| "loss": 0.0907, | |
| "num_input_tokens_seen": 97568, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.185840707964602, | |
| "grad_norm": 16.744794845581055, | |
| "learning_rate": 4.947436608793624e-05, | |
| "loss": 0.1587, | |
| "num_input_tokens_seen": 98896, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.230088495575221, | |
| "grad_norm": 0.08063212782144547, | |
| "learning_rate": 4.9434254387905395e-05, | |
| "loss": 0.2449, | |
| "num_input_tokens_seen": 100336, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.274336283185841, | |
| "grad_norm": 6.403633117675781, | |
| "learning_rate": 4.9392685435440154e-05, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 101888, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.3185840707964602, | |
| "grad_norm": 6.990411281585693, | |
| "learning_rate": 4.93496617097018e-05, | |
| "loss": 0.3525, | |
| "num_input_tokens_seen": 103184, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.3628318584070795, | |
| "grad_norm": 1.9636709690093994, | |
| "learning_rate": 4.930518577661388e-05, | |
| "loss": 0.1136, | |
| "num_input_tokens_seen": 104672, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.4070796460176993, | |
| "grad_norm": 0.7355748414993286, | |
| "learning_rate": 4.925926028870923e-05, | |
| "loss": 0.1842, | |
| "num_input_tokens_seen": 105968, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.4513274336283186, | |
| "grad_norm": 1.4544111490249634, | |
| "learning_rate": 4.921188798497173e-05, | |
| "loss": 0.0577, | |
| "num_input_tokens_seen": 107472, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.495575221238938, | |
| "grad_norm": 0.02327471598982811, | |
| "learning_rate": 4.9163071690672973e-05, | |
| "loss": 0.1441, | |
| "num_input_tokens_seen": 108688, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.5398230088495577, | |
| "grad_norm": 5.047955513000488, | |
| "learning_rate": 4.911281431720378e-05, | |
| "loss": 0.3961, | |
| "num_input_tokens_seen": 110320, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.584070796460177, | |
| "grad_norm": 0.618489146232605, | |
| "learning_rate": 4.9061118861900537e-05, | |
| "loss": 0.3237, | |
| "num_input_tokens_seen": 111952, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.6283185840707963, | |
| "grad_norm": 0.40347716212272644, | |
| "learning_rate": 4.900798840786645e-05, | |
| "loss": 0.1157, | |
| "num_input_tokens_seen": 113152, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.672566371681416, | |
| "grad_norm": 10.295228958129883, | |
| "learning_rate": 4.8953426123787674e-05, | |
| "loss": 0.2442, | |
| "num_input_tokens_seen": 114592, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.7168141592920354, | |
| "grad_norm": 0.1771656572818756, | |
| "learning_rate": 4.889743526374432e-05, | |
| "loss": 0.6003, | |
| "num_input_tokens_seen": 115936, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.7610619469026547, | |
| "grad_norm": 0.21636876463890076, | |
| "learning_rate": 4.884001916701639e-05, | |
| "loss": 0.1804, | |
| "num_input_tokens_seen": 117232, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.8053097345132745, | |
| "grad_norm": 6.24064302444458, | |
| "learning_rate": 4.878118125788462e-05, | |
| "loss": 0.1424, | |
| "num_input_tokens_seen": 118448, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.849557522123894, | |
| "grad_norm": 5.579165458679199, | |
| "learning_rate": 4.872092504542629e-05, | |
| "loss": 0.8064, | |
| "num_input_tokens_seen": 119760, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.893805309734513, | |
| "grad_norm": 1.1224756240844727, | |
| "learning_rate": 4.865925412330586e-05, | |
| "loss": 0.0734, | |
| "num_input_tokens_seen": 121488, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.938053097345133, | |
| "grad_norm": 4.9014458656311035, | |
| "learning_rate": 4.859617216956074e-05, | |
| "loss": 0.2712, | |
| "num_input_tokens_seen": 122816, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.982300884955752, | |
| "grad_norm": 0.6472075581550598, | |
| "learning_rate": 4.8531682946381874e-05, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 124352, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.10101249814033508, | |
| "eval_runtime": 0.7115, | |
| "eval_samples_per_second": 35.137, | |
| "eval_steps_per_second": 18.271, | |
| "num_input_tokens_seen": 124680, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.0265486725663715, | |
| "grad_norm": 0.11688811331987381, | |
| "learning_rate": 4.846579029988939e-05, | |
| "loss": 0.5922, | |
| "num_input_tokens_seen": 125432, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.070796460176991, | |
| "grad_norm": 0.17728334665298462, | |
| "learning_rate": 4.8398498159903194e-05, | |
| "loss": 0.131, | |
| "num_input_tokens_seen": 126744, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.115044247787611, | |
| "grad_norm": 7.516114234924316, | |
| "learning_rate": 4.8329810539708625e-05, | |
| "loss": 0.0699, | |
| "num_input_tokens_seen": 128424, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.15929203539823, | |
| "grad_norm": 0.06694227457046509, | |
| "learning_rate": 4.825973153581709e-05, | |
| "loss": 0.0666, | |
| "num_input_tokens_seen": 129640, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.20353982300885, | |
| "grad_norm": 5.715642929077148, | |
| "learning_rate": 4.818826532772174e-05, | |
| "loss": 0.138, | |
| "num_input_tokens_seen": 131016, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.247787610619469, | |
| "grad_norm": 12.158321380615234, | |
| "learning_rate": 4.8115416177648234e-05, | |
| "loss": 0.1991, | |
| "num_input_tokens_seen": 132552, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.292035398230088, | |
| "grad_norm": 0.12277557700872421, | |
| "learning_rate": 4.804118843030049e-05, | |
| "loss": 0.3774, | |
| "num_input_tokens_seen": 133624, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.336283185840708, | |
| "grad_norm": 0.031609226018190384, | |
| "learning_rate": 4.796558651260165e-05, | |
| "loss": 0.0041, | |
| "num_input_tokens_seen": 135464, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.380530973451328, | |
| "grad_norm": 3.4474875926971436, | |
| "learning_rate": 4.7888614933429955e-05, | |
| "loss": 0.2738, | |
| "num_input_tokens_seen": 136664, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.424778761061947, | |
| "grad_norm": 12.919379234313965, | |
| "learning_rate": 4.781027828334994e-05, | |
| "loss": 0.418, | |
| "num_input_tokens_seen": 137752, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.469026548672566, | |
| "grad_norm": 0.014108425937592983, | |
| "learning_rate": 4.773058123433857e-05, | |
| "loss": 0.0716, | |
| "num_input_tokens_seen": 139000, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.513274336283186, | |
| "grad_norm": 6.61913537979126, | |
| "learning_rate": 4.7649528539506673e-05, | |
| "loss": 0.0242, | |
| "num_input_tokens_seen": 140184, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.557522123893805, | |
| "grad_norm": 0.04510444402694702, | |
| "learning_rate": 4.7567125032815394e-05, | |
| "loss": 0.0425, | |
| "num_input_tokens_seen": 141512, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.601769911504425, | |
| "grad_norm": 1.0070881843566895, | |
| "learning_rate": 4.7483375628787975e-05, | |
| "loss": 0.1949, | |
| "num_input_tokens_seen": 142888, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.646017699115045, | |
| "grad_norm": 6.574953079223633, | |
| "learning_rate": 4.739828532221661e-05, | |
| "loss": 0.128, | |
| "num_input_tokens_seen": 144312, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.6902654867256635, | |
| "grad_norm": 14.323014259338379, | |
| "learning_rate": 4.731185918786453e-05, | |
| "loss": 0.2598, | |
| "num_input_tokens_seen": 146008, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.734513274336283, | |
| "grad_norm": 8.124874114990234, | |
| "learning_rate": 4.722410238016343e-05, | |
| "loss": 0.3355, | |
| "num_input_tokens_seen": 147448, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.778761061946903, | |
| "grad_norm": 0.4788016676902771, | |
| "learning_rate": 4.7135020132905985e-05, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 148952, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.823008849557522, | |
| "grad_norm": 0.01158350519835949, | |
| "learning_rate": 4.7044617758933714e-05, | |
| "loss": 0.2511, | |
| "num_input_tokens_seen": 150472, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.867256637168142, | |
| "grad_norm": 37.700584411621094, | |
| "learning_rate": 4.695290064982018e-05, | |
| "loss": 0.5979, | |
| "num_input_tokens_seen": 151880, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.911504424778761, | |
| "grad_norm": 12.200066566467285, | |
| "learning_rate": 4.6859874275549376e-05, | |
| "loss": 0.1413, | |
| "num_input_tokens_seen": 153336, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.95575221238938, | |
| "grad_norm": 8.2332763671875, | |
| "learning_rate": 4.676554418418953e-05, | |
| "loss": 0.2624, | |
| "num_input_tokens_seen": 154584, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.3725515902042389, | |
| "learning_rate": 4.66699160015622e-05, | |
| "loss": 0.2766, | |
| "num_input_tokens_seen": 155672, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.10234539210796356, | |
| "eval_runtime": 0.7055, | |
| "eval_samples_per_second": 35.434, | |
| "eval_steps_per_second": 18.426, | |
| "num_input_tokens_seen": 155672, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 5.04424778761062, | |
| "grad_norm": 6.086857795715332, | |
| "learning_rate": 4.6572995430906784e-05, | |
| "loss": 0.161, | |
| "num_input_tokens_seen": 156920, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 5.088495575221239, | |
| "grad_norm": 1.3057224750518799, | |
| "learning_rate": 4.6474788252540323e-05, | |
| "loss": 0.0179, | |
| "num_input_tokens_seen": 158872, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 5.132743362831858, | |
| "grad_norm": 0.07508724927902222, | |
| "learning_rate": 4.637530032351284e-05, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 159960, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 5.176991150442478, | |
| "grad_norm": 0.41541609168052673, | |
| "learning_rate": 4.627453757725796e-05, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 161112, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 5.221238938053097, | |
| "grad_norm": 10.99107551574707, | |
| "learning_rate": 4.617250602323907e-05, | |
| "loss": 0.0434, | |
| "num_input_tokens_seen": 162472, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 5.265486725663717, | |
| "grad_norm": 1.7388826608657837, | |
| "learning_rate": 4.6069211746590926e-05, | |
| "loss": 0.2506, | |
| "num_input_tokens_seen": 164216, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 5.3097345132743365, | |
| "grad_norm": 2.5360372066497803, | |
| "learning_rate": 4.596466090775672e-05, | |
| "loss": 0.0283, | |
| "num_input_tokens_seen": 165832, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 5.353982300884955, | |
| "grad_norm": 0.5227421522140503, | |
| "learning_rate": 4.585885974212068e-05, | |
| "loss": 0.0081, | |
| "num_input_tokens_seen": 167000, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 5.398230088495575, | |
| "grad_norm": 16.39299774169922, | |
| "learning_rate": 4.575181455963619e-05, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 168456, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 5.442477876106195, | |
| "grad_norm": 0.012677973136305809, | |
| "learning_rate": 4.5643531744449474e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 169624, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 5.486725663716814, | |
| "grad_norm": 0.8441804647445679, | |
| "learning_rate": 4.553401775451882e-05, | |
| "loss": 0.0689, | |
| "num_input_tokens_seen": 170888, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 5.530973451327434, | |
| "grad_norm": 0.009645035490393639, | |
| "learning_rate": 4.542327912122949e-05, | |
| "loss": 0.0622, | |
| "num_input_tokens_seen": 172056, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 5.575221238938053, | |
| "grad_norm": 12.262785911560059, | |
| "learning_rate": 4.531132244900411e-05, | |
| "loss": 0.6379, | |
| "num_input_tokens_seen": 173448, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.619469026548672, | |
| "grad_norm": 0.2407040148973465, | |
| "learning_rate": 4.519815441490884e-05, | |
| "loss": 0.0084, | |
| "num_input_tokens_seen": 174872, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 5.663716814159292, | |
| "grad_norm": 1.2417314052581787, | |
| "learning_rate": 4.508378176825516e-05, | |
| "loss": 0.0051, | |
| "num_input_tokens_seen": 176488, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.707964601769912, | |
| "grad_norm": 0.4008258283138275, | |
| "learning_rate": 4.496821133019728e-05, | |
| "loss": 0.004, | |
| "num_input_tokens_seen": 177816, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 5.752212389380531, | |
| "grad_norm": 8.208776473999023, | |
| "learning_rate": 4.485144999332541e-05, | |
| "loss": 0.0227, | |
| "num_input_tokens_seen": 179352, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.79646017699115, | |
| "grad_norm": 0.11330597847700119, | |
| "learning_rate": 4.4733504721254625e-05, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 180504, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 5.84070796460177, | |
| "grad_norm": 13.392186164855957, | |
| "learning_rate": 4.461438254820959e-05, | |
| "loss": 0.3968, | |
| "num_input_tokens_seen": 181976, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.88495575221239, | |
| "grad_norm": 0.05890387296676636, | |
| "learning_rate": 4.449409057860504e-05, | |
| "loss": 0.321, | |
| "num_input_tokens_seen": 183000, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 5.929203539823009, | |
| "grad_norm": 0.124122753739357, | |
| "learning_rate": 4.4372635986622044e-05, | |
| "loss": 0.0571, | |
| "num_input_tokens_seen": 184344, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.9734513274336285, | |
| "grad_norm": 11.64846134185791, | |
| "learning_rate": 4.425002601578017e-05, | |
| "loss": 0.0905, | |
| "num_input_tokens_seen": 185976, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.10962831228971481, | |
| "eval_runtime": 0.7204, | |
| "eval_samples_per_second": 34.702, | |
| "eval_steps_per_second": 18.045, | |
| "num_input_tokens_seen": 186688, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 6.017699115044247, | |
| "grad_norm": 0.014174363575875759, | |
| "learning_rate": 4.4126267978505486e-05, | |
| "loss": 0.0036, | |
| "num_input_tokens_seen": 187312, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 6.061946902654867, | |
| "grad_norm": 16.07929229736328, | |
| "learning_rate": 4.4001369255694416e-05, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 188512, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 6.106194690265487, | |
| "grad_norm": 0.041025493294000626, | |
| "learning_rate": 4.387533729627359e-05, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 189616, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 6.150442477876107, | |
| "grad_norm": 0.010337191633880138, | |
| "learning_rate": 4.374817961675553e-05, | |
| "loss": 0.0386, | |
| "num_input_tokens_seen": 191264, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 6.1946902654867255, | |
| "grad_norm": 0.05837007984519005, | |
| "learning_rate": 4.3619903800790465e-05, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 192576, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 6.238938053097345, | |
| "grad_norm": 0.5997312664985657, | |
| "learning_rate": 4.3490517498713924e-05, | |
| "loss": 0.0175, | |
| "num_input_tokens_seen": 193920, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 6.283185840707965, | |
| "grad_norm": 0.010935655795037746, | |
| "learning_rate": 4.336002842709057e-05, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 195216, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 6.327433628318584, | |
| "grad_norm": 7.356991767883301, | |
| "learning_rate": 4.3228444368253925e-05, | |
| "loss": 0.0111, | |
| "num_input_tokens_seen": 196640, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 6.371681415929204, | |
| "grad_norm": 0.005636111833155155, | |
| "learning_rate": 4.309577316984228e-05, | |
| "loss": 0.008, | |
| "num_input_tokens_seen": 197696, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 6.415929203539823, | |
| "grad_norm": 0.05741603672504425, | |
| "learning_rate": 4.2962022744330616e-05, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 199040, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 6.460176991150442, | |
| "grad_norm": 0.011010805144906044, | |
| "learning_rate": 4.282720106855876e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 200928, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 6.504424778761062, | |
| "grad_norm": 0.013609147630631924, | |
| "learning_rate": 4.269131618325559e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 202784, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 6.548672566371682, | |
| "grad_norm": 16.62497901916504, | |
| "learning_rate": 4.255437619255955e-05, | |
| "loss": 0.0541, | |
| "num_input_tokens_seen": 204080, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 6.592920353982301, | |
| "grad_norm": 4.816155433654785, | |
| "learning_rate": 4.241638926353526e-05, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 205296, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 6.6371681415929205, | |
| "grad_norm": 2.4015004634857178, | |
| "learning_rate": 4.2277363625686475e-05, | |
| "loss": 0.0094, | |
| "num_input_tokens_seen": 206768, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 6.68141592920354, | |
| "grad_norm": 0.004668759182095528, | |
| "learning_rate": 4.213730757046528e-05, | |
| "loss": 0.0044, | |
| "num_input_tokens_seen": 208048, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 6.725663716814159, | |
| "grad_norm": 0.017873145639896393, | |
| "learning_rate": 4.199622945077755e-05, | |
| "loss": 0.0683, | |
| "num_input_tokens_seen": 209200, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 6.769911504424779, | |
| "grad_norm": 0.277229368686676, | |
| "learning_rate": 4.185413768048483e-05, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 210448, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 6.814159292035399, | |
| "grad_norm": 54.36215591430664, | |
| "learning_rate": 4.1711040733902526e-05, | |
| "loss": 0.8871, | |
| "num_input_tokens_seen": 211984, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.8584070796460175, | |
| "grad_norm": 0.006391752976924181, | |
| "learning_rate": 4.1566947145294474e-05, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 213328, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 6.902654867256637, | |
| "grad_norm": 17.785545349121094, | |
| "learning_rate": 4.142186550836399e-05, | |
| "loss": 0.202, | |
| "num_input_tokens_seen": 214400, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.946902654867257, | |
| "grad_norm": 0.0706261619925499, | |
| "learning_rate": 4.127580447574131e-05, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 215888, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 6.991150442477876, | |
| "grad_norm": 0.0091946329921484, | |
| "learning_rate": 4.1128772758467604e-05, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 217664, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.2586836516857147, | |
| "eval_runtime": 0.7093, | |
| "eval_samples_per_second": 35.248, | |
| "eval_steps_per_second": 18.329, | |
| "num_input_tokens_seen": 217736, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 7.035398230088496, | |
| "grad_norm": 0.23946355283260345, | |
| "learning_rate": 4.098077912547536e-05, | |
| "loss": 0.1797, | |
| "num_input_tokens_seen": 218840, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 7.079646017699115, | |
| "grad_norm": 0.8961653709411621, | |
| "learning_rate": 4.0831832403065526e-05, | |
| "loss": 0.0118, | |
| "num_input_tokens_seen": 219992, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 7.123893805309734, | |
| "grad_norm": 0.005731819197535515, | |
| "learning_rate": 4.068194147438101e-05, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 221160, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 7.168141592920354, | |
| "grad_norm": 0.0060266111977398396, | |
| "learning_rate": 4.0531115278876934e-05, | |
| "loss": 0.0097, | |
| "num_input_tokens_seen": 222696, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 7.212389380530974, | |
| "grad_norm": 0.01002022996544838, | |
| "learning_rate": 4.0379362811787504e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 224088, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 7.256637168141593, | |
| "grad_norm": 0.1268666535615921, | |
| "learning_rate": 4.022669312358949e-05, | |
| "loss": 0.1595, | |
| "num_input_tokens_seen": 225432, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 7.300884955752212, | |
| "grad_norm": 0.004778804257512093, | |
| "learning_rate": 4.007311531946252e-05, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 226696, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 7.345132743362832, | |
| "grad_norm": 0.0037855140399187803, | |
| "learning_rate": 3.9918638558745966e-05, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 228168, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 7.389380530973451, | |
| "grad_norm": 0.01771862432360649, | |
| "learning_rate": 3.976327205439279e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 229528, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 7.433628318584071, | |
| "grad_norm": 0.10408202558755875, | |
| "learning_rate": 3.9607025072419986e-05, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 231208, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 7.477876106194691, | |
| "grad_norm": 0.0022122100926935673, | |
| "learning_rate": 3.9449906931356005e-05, | |
| "loss": 0.0024, | |
| "num_input_tokens_seen": 232392, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 7.522123893805309, | |
| "grad_norm": 0.0038752174004912376, | |
| "learning_rate": 3.929192700168501e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 233480, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 7.566371681415929, | |
| "grad_norm": 0.001984543865546584, | |
| "learning_rate": 3.9133094705287984e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 235256, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 7.610619469026549, | |
| "grad_norm": 0.00306964130140841, | |
| "learning_rate": 3.897341951488087e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 236744, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 7.654867256637168, | |
| "grad_norm": 0.0018930825171992183, | |
| "learning_rate": 3.8812910953449555e-05, | |
| "loss": 0.1634, | |
| "num_input_tokens_seen": 237880, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 7.699115044247788, | |
| "grad_norm": 0.6990119814872742, | |
| "learning_rate": 3.865157859368196e-05, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 239544, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 7.743362831858407, | |
| "grad_norm": 0.013935576193034649, | |
| "learning_rate": 3.848943205739711e-05, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 241048, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 7.787610619469026, | |
| "grad_norm": 0.00296784657984972, | |
| "learning_rate": 3.832648101497134e-05, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 242408, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 7.831858407079646, | |
| "grad_norm": 0.018667805939912796, | |
| "learning_rate": 3.8162735184761476e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 243688, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 7.876106194690266, | |
| "grad_norm": 0.0245062168687582, | |
| "learning_rate": 3.799820433252529e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 245320, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 7.920353982300885, | |
| "grad_norm": 0.00318564148619771, | |
| "learning_rate": 3.783289827083905e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 246584, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 7.964601769911504, | |
| "grad_norm": 0.00905859749764204, | |
| "learning_rate": 3.766682685851234e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 248024, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.03978044167160988, | |
| "eval_runtime": 0.7103, | |
| "eval_samples_per_second": 35.199, | |
| "eval_steps_per_second": 18.303, | |
| "num_input_tokens_seen": 248784, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 8.008849557522124, | |
| "grad_norm": 0.010085551999509335, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 249024, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 8.053097345132743, | |
| "grad_norm": 0.33434563875198364, | |
| "learning_rate": 3.733242764481154e-05, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 250672, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 8.097345132743364, | |
| "grad_norm": 0.0033214823342859745, | |
| "learning_rate": 3.716411978691766e-05, | |
| "loss": 0.0033, | |
| "num_input_tokens_seen": 252368, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 8.141592920353983, | |
| "grad_norm": 0.0026391008868813515, | |
| "learning_rate": 3.699508646415424e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 253920, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 8.185840707964601, | |
| "grad_norm": 0.003199717728421092, | |
| "learning_rate": 3.6825337757623696e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 255248, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 8.230088495575222, | |
| "grad_norm": 0.002210139762610197, | |
| "learning_rate": 3.665488379109377e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 256336, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 8.274336283185841, | |
| "grad_norm": 0.0021523365285247564, | |
| "learning_rate": 3.648373473039368e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 257648, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 8.31858407079646, | |
| "grad_norm": 0.0015277840429916978, | |
| "learning_rate": 3.631190078280791e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 258752, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 8.36283185840708, | |
| "grad_norm": 0.002127046464011073, | |
| "learning_rate": 3.613939219646739e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 260144, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 8.4070796460177, | |
| "grad_norm": 0.003957969136536121, | |
| "learning_rate": 3.596621925973835e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 261328, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 8.451327433628318, | |
| "grad_norm": 0.0043671284802258015, | |
| "learning_rate": 3.579239230060867e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 262784, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 8.495575221238939, | |
| "grad_norm": 0.0023445875849574804, | |
| "learning_rate": 3.5617921686071995e-05, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 264448, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 8.539823008849558, | |
| "grad_norm": 0.9188032746315002, | |
| "learning_rate": 3.544281782150936e-05, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 265920, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 8.584070796460177, | |
| "grad_norm": 0.0022221386898308992, | |
| "learning_rate": 3.526709115006871e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 266992, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 8.628318584070797, | |
| "grad_norm": 0.017859293147921562, | |
| "learning_rate": 3.5090752152041975e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 268688, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 8.672566371681416, | |
| "grad_norm": 0.0011308812536299229, | |
| "learning_rate": 3.491381134424012e-05, | |
| "loss": 0.0228, | |
| "num_input_tokens_seen": 270000, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 8.716814159292035, | |
| "grad_norm": 0.002236367203295231, | |
| "learning_rate": 3.4736279279365876e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 271056, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 8.761061946902656, | |
| "grad_norm": 0.030419211834669113, | |
| "learning_rate": 3.455816654538438e-05, | |
| "loss": 0.0089, | |
| "num_input_tokens_seen": 272608, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 8.805309734513274, | |
| "grad_norm": 0.0011954925721511245, | |
| "learning_rate": 3.437948376489172e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 274000, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 8.849557522123893, | |
| "grad_norm": 0.008035533130168915, | |
| "learning_rate": 3.420024159448142e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 275200, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 8.893805309734514, | |
| "grad_norm": 0.033750034868717194, | |
| "learning_rate": 3.402045072410886e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 276352, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 8.938053097345133, | |
| "grad_norm": 0.0011004299158230424, | |
| "learning_rate": 3.3840121876453734e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 277952, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 8.982300884955752, | |
| "grad_norm": 0.0018102293834090233, | |
| "learning_rate": 3.365926580628057e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 279328, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.09988843649625778, | |
| "eval_runtime": 0.7112, | |
| "eval_samples_per_second": 35.152, | |
| "eval_steps_per_second": 18.279, | |
| "num_input_tokens_seen": 279688, | |
| "step": 1017 | |
| }, | |
| { | |
| "epoch": 9.026548672566372, | |
| "grad_norm": 0.0009696983615867794, | |
| "learning_rate": 3.3477893299797304e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 280456, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 9.070796460176991, | |
| "grad_norm": 0.0008515130612067878, | |
| "learning_rate": 3.3296015174011984e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 281752, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 9.11504424778761, | |
| "grad_norm": 0.0012484554899856448, | |
| "learning_rate": 3.311364227608768e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 283464, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 9.15929203539823, | |
| "grad_norm": 0.0006809367914684117, | |
| "learning_rate": 3.293078548269553e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 284760, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 9.20353982300885, | |
| "grad_norm": 0.0011875375639647245, | |
| "learning_rate": 3.2747455699366056e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 286104, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 9.247787610619469, | |
| "grad_norm": 0.0012038415297865868, | |
| "learning_rate": 3.256366385983879e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 287672, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 9.29203539823009, | |
| "grad_norm": 0.006046834401786327, | |
| "learning_rate": 3.237942092541018e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 288920, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 9.336283185840708, | |
| "grad_norm": 0.0012190825073048472, | |
| "learning_rate": 3.219473788427984e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 289960, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 9.380530973451327, | |
| "grad_norm": 0.0028741054702550173, | |
| "learning_rate": 3.2009625750895224e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 291544, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 9.424778761061948, | |
| "grad_norm": 0.024399342015385628, | |
| "learning_rate": 3.182409556529476e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 292808, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 9.469026548672566, | |
| "grad_norm": 0.0008577514672651887, | |
| "learning_rate": 3.163815839244937e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 293912, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 9.513274336283185, | |
| "grad_norm": 0.0012534806737676263, | |
| "learning_rate": 3.14518253216026e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 295416, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 9.557522123893806, | |
| "grad_norm": 0.0012791682966053486, | |
| "learning_rate": 3.126510746560925e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 297016, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 9.601769911504425, | |
| "grad_norm": 0.0010787008795887232, | |
| "learning_rate": 3.107801596027261e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 298168, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 9.646017699115044, | |
| "grad_norm": 0.0010192421032115817, | |
| "learning_rate": 3.0890561963680306e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 299672, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 9.690265486725664, | |
| "grad_norm": 0.0038640201091766357, | |
| "learning_rate": 3.0702756655538835e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 301080, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 9.734513274336283, | |
| "grad_norm": 0.0006423942977562547, | |
| "learning_rate": 3.051461123650685e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 302472, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 9.778761061946902, | |
| "grad_norm": 0.01918724551796913, | |
| "learning_rate": 3.032613692752711e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 303592, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 9.823008849557523, | |
| "grad_norm": 0.0008666754583828151, | |
| "learning_rate": 3.0137344969157284e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 305224, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 9.867256637168142, | |
| "grad_norm": 0.005792871117591858, | |
| "learning_rate": 2.9948246620899557e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 306200, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 9.91150442477876, | |
| "grad_norm": 0.0005233255214989185, | |
| "learning_rate": 2.9758853160529148e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 307880, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 9.955752212389381, | |
| "grad_norm": 0.000557672290597111, | |
| "learning_rate": 2.9569175883421672e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 309240, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.0006190123967826366, | |
| "learning_rate": 2.93792261018795e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 310504, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.02271426096558571, | |
| "eval_runtime": 0.7078, | |
| "eval_samples_per_second": 35.321, | |
| "eval_steps_per_second": 18.367, | |
| "num_input_tokens_seen": 310504, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 10.044247787610619, | |
| "grad_norm": 0.0008720250916667283, | |
| "learning_rate": 2.9189015144457087e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 311800, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 10.08849557522124, | |
| "grad_norm": 0.0008014932973310351, | |
| "learning_rate": 2.8998554355285355e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 313064, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 10.132743362831858, | |
| "grad_norm": 0.0009775793878361583, | |
| "learning_rate": 2.8807855093395126e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 314456, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 10.176991150442477, | |
| "grad_norm": 0.000823335547465831, | |
| "learning_rate": 2.8616928732039684e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 315928, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 10.221238938053098, | |
| "grad_norm": 0.0010703738080337644, | |
| "learning_rate": 2.8425786658016423e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 317528, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 10.265486725663717, | |
| "grad_norm": 0.0008935982477851212, | |
| "learning_rate": 2.8234440270987837e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 318952, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 10.309734513274336, | |
| "grad_norm": 0.0008601442677900195, | |
| "learning_rate": 2.804290098280155e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 320312, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 10.353982300884956, | |
| "grad_norm": 0.001831409870646894, | |
| "learning_rate": 2.7851180216809796e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 321560, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 10.398230088495575, | |
| "grad_norm": 0.0011935007059946656, | |
| "learning_rate": 2.765928940718806e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 322984, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 10.442477876106194, | |
| "grad_norm": 0.00106055848300457, | |
| "learning_rate": 2.7467239998253214e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 324120, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 10.486725663716815, | |
| "grad_norm": 0.0005716433515772223, | |
| "learning_rate": 2.7275043443780934e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 325480, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 10.530973451327434, | |
| "grad_norm": 0.0005892731714993715, | |
| "learning_rate": 2.708271120632262e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 326680, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 10.575221238938052, | |
| "grad_norm": 0.0006973208510316908, | |
| "learning_rate": 2.6890254756521778e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 327832, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 10.619469026548673, | |
| "grad_norm": 0.006061312276870012, | |
| "learning_rate": 2.6697685572429886e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 328968, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 10.663716814159292, | |
| "grad_norm": 0.0005946594756096601, | |
| "learning_rate": 2.65050151388219e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 330312, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 10.70796460176991, | |
| "grad_norm": 0.0018679506611078978, | |
| "learning_rate": 2.6312254946511217e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 331704, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 10.752212389380531, | |
| "grad_norm": 0.0007243392756208777, | |
| "learning_rate": 2.6119416491664472e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 333016, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 10.79646017699115, | |
| "grad_norm": 0.0011644094483926892, | |
| "learning_rate": 2.5926511275115827e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 334840, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 10.84070796460177, | |
| "grad_norm": 0.0010138576617464423, | |
| "learning_rate": 2.57335508016811e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 336280, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 10.88495575221239, | |
| "grad_norm": 0.0016166451387107372, | |
| "learning_rate": 2.5540546579471624e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 337720, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 10.929203539823009, | |
| "grad_norm": 0.01318308338522911, | |
| "learning_rate": 2.5347510119207878e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 339032, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 10.973451327433628, | |
| "grad_norm": 0.0007939252536743879, | |
| "learning_rate": 2.515445293353304e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 340552, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.021070513874292374, | |
| "eval_runtime": 0.7097, | |
| "eval_samples_per_second": 35.227, | |
| "eval_steps_per_second": 18.318, | |
| "num_input_tokens_seen": 341152, | |
| "step": 1243 | |
| }, | |
| { | |
| "epoch": 11.017699115044248, | |
| "grad_norm": 0.0005229383241385221, | |
| "learning_rate": 2.4961386536326307e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 341728, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 11.061946902654867, | |
| "grad_norm": 0.0010782463941723108, | |
| "learning_rate": 2.4768322442016278e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 343360, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 11.106194690265486, | |
| "grad_norm": 0.0020232030656188726, | |
| "learning_rate": 2.457527216489421e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 344912, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 11.150442477876107, | |
| "grad_norm": 0.0005589700886048377, | |
| "learning_rate": 2.438224721842728e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 346176, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 11.194690265486726, | |
| "grad_norm": 0.0008123366278596222, | |
| "learning_rate": 2.4189259114571984e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 348096, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 11.238938053097344, | |
| "grad_norm": 0.0008835430489853024, | |
| "learning_rate": 2.39963193630875e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 349408, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 11.283185840707965, | |
| "grad_norm": 0.0046989452093839645, | |
| "learning_rate": 2.3803439470849335e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 350800, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 11.327433628318584, | |
| "grad_norm": 0.0011660694144666195, | |
| "learning_rate": 2.361063094116293e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 352064, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 11.371681415929203, | |
| "grad_norm": 0.0008970863418653607, | |
| "learning_rate": 2.3417905273077756e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 353424, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 11.415929203539823, | |
| "grad_norm": 0.00047139974776655436, | |
| "learning_rate": 2.32252739607014e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 354704, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 11.460176991150442, | |
| "grad_norm": 0.0005186158232390881, | |
| "learning_rate": 2.3032748492514116e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 355920, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 11.504424778761061, | |
| "grad_norm": 0.0005980245769023895, | |
| "learning_rate": 2.2840340350683622e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 357168, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 11.548672566371682, | |
| "grad_norm": 0.004304022993892431, | |
| "learning_rate": 2.2648061010380346e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 358448, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 11.5929203539823, | |
| "grad_norm": 0.00046522877528332174, | |
| "learning_rate": 2.2455921939093e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 359856, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 11.63716814159292, | |
| "grad_norm": 0.0010373727418482304, | |
| "learning_rate": 2.2263934595944716e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 361216, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 11.68141592920354, | |
| "grad_norm": 0.0005117251421324909, | |
| "learning_rate": 2.207211043100958e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 362656, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 11.725663716814159, | |
| "grad_norm": 0.0005031274631619453, | |
| "learning_rate": 2.188046088462979e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 363888, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 11.769911504424778, | |
| "grad_norm": 0.013536173850297928, | |
| "learning_rate": 2.1688997386733316e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 365232, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 11.814159292035399, | |
| "grad_norm": 0.00047480713692493737, | |
| "learning_rate": 2.1497731356152286e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 366720, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 11.858407079646017, | |
| "grad_norm": 0.0026002004742622375, | |
| "learning_rate": 2.1306674199941872e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 367824, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 11.902654867256636, | |
| "grad_norm": 0.0013884420040994883, | |
| "learning_rate": 2.1115837312700088e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 369184, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 11.946902654867257, | |
| "grad_norm": 0.0004059988132212311, | |
| "learning_rate": 2.0925232075888143e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 370432, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 11.991150442477876, | |
| "grad_norm": 0.0004867562965955585, | |
| "learning_rate": 2.0734869857151666e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 371696, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.019818633794784546, | |
| "eval_runtime": 0.7123, | |
| "eval_samples_per_second": 35.099, | |
| "eval_steps_per_second": 18.252, | |
| "num_input_tokens_seen": 371768, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 12.035398230088495, | |
| "grad_norm": 0.000515097810421139, | |
| "learning_rate": 2.054476200964278e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 372952, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 12.079646017699115, | |
| "grad_norm": 0.002132029039785266, | |
| "learning_rate": 2.035491987134294e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 374264, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 12.123893805309734, | |
| "grad_norm": 0.0006297901272773743, | |
| "learning_rate": 2.0165354764386807e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 375528, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 12.168141592920353, | |
| "grad_norm": 0.0023943870328366756, | |
| "learning_rate": 1.997607799438694e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 377016, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 12.212389380530974, | |
| "grad_norm": 0.0013272215146571398, | |
| "learning_rate": 1.978710084975959e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 378472, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 12.256637168141593, | |
| "grad_norm": 0.001000846503302455, | |
| "learning_rate": 1.9598434601051386e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 379736, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 12.300884955752213, | |
| "grad_norm": 0.0004203008720651269, | |
| "learning_rate": 1.941009050026726e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 381160, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 12.345132743362832, | |
| "grad_norm": 0.0007252345094457269, | |
| "learning_rate": 1.922207978019928e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 382232, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 12.389380530973451, | |
| "grad_norm": 0.0018995037535205483, | |
| "learning_rate": 1.903441365375681e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 383384, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 12.43362831858407, | |
| "grad_norm": 0.0005581756122410297, | |
| "learning_rate": 1.884710331329772e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 385000, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 12.47787610619469, | |
| "grad_norm": 0.0008921206463128328, | |
| "learning_rate": 1.8660159929960914e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 386712, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 12.52212389380531, | |
| "grad_norm": 0.004135303199291229, | |
| "learning_rate": 1.847359465300006e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 388360, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 12.56637168141593, | |
| "grad_norm": 0.0005593369132839143, | |
| "learning_rate": 1.828741860911867e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 389672, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 12.610619469026549, | |
| "grad_norm": 0.0005592005327343941, | |
| "learning_rate": 1.8101642901806486e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 391224, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 12.654867256637168, | |
| "grad_norm": 0.0009430416394025087, | |
| "learning_rate": 1.791627861067731e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 392616, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 12.699115044247787, | |
| "grad_norm": 0.005889066495001316, | |
| "learning_rate": 1.7731336790808146e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 393800, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 12.743362831858407, | |
| "grad_norm": 0.000711196509655565, | |
| "learning_rate": 1.7546828472079992e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 395064, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 12.787610619469026, | |
| "grad_norm": 0.00040520497714169323, | |
| "learning_rate": 1.7362764658519877e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 396392, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 12.831858407079647, | |
| "grad_norm": 0.00030975393019616604, | |
| "learning_rate": 1.7179156327644724e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 397720, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 12.876106194690266, | |
| "grad_norm": 0.0036038346588611603, | |
| "learning_rate": 1.699601442980655e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 398968, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 12.920353982300885, | |
| "grad_norm": 0.0005937079549767077, | |
| "learning_rate": 1.6813349887539443e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 400696, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 12.964601769911503, | |
| "grad_norm": 0.00041711816447786987, | |
| "learning_rate": 1.663117359490814e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 402168, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.01962355710566044, | |
| "eval_runtime": 0.7175, | |
| "eval_samples_per_second": 34.844, | |
| "eval_steps_per_second": 18.119, | |
| "num_input_tokens_seen": 402896, | |
| "step": 1469 | |
| }, | |
| { | |
| "epoch": 13.008849557522124, | |
| "grad_norm": 0.0004908296396024525, | |
| "learning_rate": 1.6449496416858284e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 403168, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 13.053097345132743, | |
| "grad_norm": 0.0007329536601901054, | |
| "learning_rate": 1.6268329188568468e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 404752, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 13.097345132743364, | |
| "grad_norm": 0.0009706666460260749, | |
| "learning_rate": 1.6087682714804002e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 406064, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 13.141592920353983, | |
| "grad_norm": 0.0008216256974264979, | |
| "learning_rate": 1.5907567769272568e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 407472, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 13.185840707964601, | |
| "grad_norm": 0.0008931403863243759, | |
| "learning_rate": 1.5727995093981598e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 408672, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 13.230088495575222, | |
| "grad_norm": 0.00038248588680289686, | |
| "learning_rate": 1.5548975398597718e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 409968, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 13.274336283185841, | |
| "grad_norm": 0.000629936985205859, | |
| "learning_rate": 1.537051935980794e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 411456, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 13.31858407079646, | |
| "grad_norm": 0.001056081848219037, | |
| "learning_rate": 1.5192637620682981e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 412864, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 13.36283185840708, | |
| "grad_norm": 0.0005708714015781879, | |
| "learning_rate": 1.5015340790042446e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 414272, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 13.4070796460177, | |
| "grad_norm": 0.00035775068681687117, | |
| "learning_rate": 1.4838639441822183e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 415536, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 13.451327433628318, | |
| "grad_norm": 0.00041552510811015964, | |
| "learning_rate": 1.46625441144436e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 416896, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 13.495575221238939, | |
| "grad_norm": 0.0004293840902391821, | |
| "learning_rate": 1.4487065310185202e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 418192, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 13.539823008849558, | |
| "grad_norm": 0.008226204663515091, | |
| "learning_rate": 1.4312213494556218e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 419424, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 13.584070796460177, | |
| "grad_norm": 0.0005483218701556325, | |
| "learning_rate": 1.4137999095672444e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 421056, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 13.628318584070797, | |
| "grad_norm": 0.0009848109912127256, | |
| "learning_rate": 1.3964432503634281e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 422256, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 13.672566371681416, | |
| "grad_norm": 0.0012349269818514585, | |
| "learning_rate": 1.3791524069907141e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 423408, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 13.716814159292035, | |
| "grad_norm": 0.0006271946476772428, | |
| "learning_rate": 1.361928410670403e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 424496, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 13.761061946902656, | |
| "grad_norm": 0.0007682631839998066, | |
| "learning_rate": 1.3447722886370565e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 426352, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 13.805309734513274, | |
| "grad_norm": 0.002840487053617835, | |
| "learning_rate": 1.3276850640772288e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 427440, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 13.849557522123893, | |
| "grad_norm": 0.0005562572623603046, | |
| "learning_rate": 1.3106677560684494e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 428784, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 13.893805309734514, | |
| "grad_norm": 0.0003185897076036781, | |
| "learning_rate": 1.2937213795184434e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 430240, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 13.938053097345133, | |
| "grad_norm": 0.0009106624638661742, | |
| "learning_rate": 1.2768469451046029e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 431664, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 13.982300884955752, | |
| "grad_norm": 0.0005448419833555818, | |
| "learning_rate": 1.2600454592137062e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 433472, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.019971443340182304, | |
| "eval_runtime": 0.7266, | |
| "eval_samples_per_second": 34.408, | |
| "eval_steps_per_second": 17.892, | |
| "num_input_tokens_seen": 433768, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 14.026548672566372, | |
| "grad_norm": 0.007204546593129635, | |
| "learning_rate": 1.2433179238819077e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 434536, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 14.070796460176991, | |
| "grad_norm": 0.0004600539105013013, | |
| "learning_rate": 1.2266653367349657e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 435688, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 14.11504424778761, | |
| "grad_norm": 0.00044398586032912135, | |
| "learning_rate": 1.2100886909287478e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 437224, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 14.15929203539823, | |
| "grad_norm": 0.0004702652804553509, | |
| "learning_rate": 1.1935889750900034e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 438600, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 14.20353982300885, | |
| "grad_norm": 0.00046526207006536424, | |
| "learning_rate": 1.1771671732573976e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 439832, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 14.247787610619469, | |
| "grad_norm": 0.0005686290678568184, | |
| "learning_rate": 1.1608242648228257e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 441048, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 14.29203539823009, | |
| "grad_norm": 0.0004068968119099736, | |
| "learning_rate": 1.1445612244729984e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 442872, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 14.336283185840708, | |
| "grad_norm": 0.001695982995443046, | |
| "learning_rate": 1.1283790221313208e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 444360, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 14.380530973451327, | |
| "grad_norm": 0.0037256069481372833, | |
| "learning_rate": 1.1122786229000356e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 445688, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 14.424778761061948, | |
| "grad_norm": 0.00461566960439086, | |
| "learning_rate": 1.0962609870026724e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 447256, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 14.469026548672566, | |
| "grad_norm": 0.0006255768821574748, | |
| "learning_rate": 1.0803270697267764e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 448712, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 14.513274336283185, | |
| "grad_norm": 0.00046136006130836904, | |
| "learning_rate": 1.0644778213669385e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 450072, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 14.557522123893806, | |
| "grad_norm": 0.0002965346211567521, | |
| "learning_rate": 1.0487141871681142e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 451432, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 14.601769911504425, | |
| "grad_norm": 0.000347623456036672, | |
| "learning_rate": 1.0330371072692565e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 452696, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 14.646017699115044, | |
| "grad_norm": 0.0006429420900531113, | |
| "learning_rate": 1.0174475166472417e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 454056, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 14.690265486725664, | |
| "grad_norm": 0.002083728089928627, | |
| "learning_rate": 1.0019463450611103e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 455592, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 14.734513274336283, | |
| "grad_norm": 0.00048280772170983255, | |
| "learning_rate": 9.865345169966114e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 456680, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 14.778761061946902, | |
| "grad_norm": 0.0007308169733732939, | |
| "learning_rate": 9.71212951611074e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 458248, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 14.823008849557523, | |
| "grad_norm": 0.0008597771520726383, | |
| "learning_rate": 9.559825626785837e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 459720, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 14.867256637168142, | |
| "grad_norm": 0.0030042710714042187, | |
| "learning_rate": 9.40844258535487e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 461096, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 14.91150442477876, | |
| "grad_norm": 0.001321359071880579, | |
| "learning_rate": 9.257989420262151e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 462536, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 14.955752212389381, | |
| "grad_norm": 0.0003227783308830112, | |
| "learning_rate": 9.108475104494475e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 463832, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.0005668445373885334, | |
| "learning_rate": 8.959908555045846e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 464816, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.018581919372081757, | |
| "eval_runtime": 0.7209, | |
| "eval_samples_per_second": 34.678, | |
| "eval_steps_per_second": 18.033, | |
| "num_input_tokens_seen": 464816, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 15.044247787610619, | |
| "grad_norm": 0.00046278457739390433, | |
| "learning_rate": 8.812298632385784e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 466240, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 15.08849557522124, | |
| "grad_norm": 0.0005744256195612252, | |
| "learning_rate": 8.66565413993082e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 467344, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 15.132743362831858, | |
| "grad_norm": 0.0002654808049555868, | |
| "learning_rate": 8.519983823519496e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 468432, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 15.176991150442477, | |
| "grad_norm": 0.00036680474295280874, | |
| "learning_rate": 8.375296370890749e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 470224, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 15.221238938053098, | |
| "grad_norm": 0.00044989073649048805, | |
| "learning_rate": 8.231600411165757e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 471552, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 15.265486725663717, | |
| "grad_norm": 0.0006426925538107753, | |
| "learning_rate": 8.088904514333384e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 472896, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 15.309734513274336, | |
| "grad_norm": 0.0004558507935144007, | |
| "learning_rate": 7.947217190738945e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 474384, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 15.353982300884956, | |
| "grad_norm": 0.00033109524520114064, | |
| "learning_rate": 7.806546890576753e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 475456, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 15.398230088495575, | |
| "grad_norm": 0.0010439999168738723, | |
| "learning_rate": 7.666902003386104e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 477136, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 15.442477876106194, | |
| "grad_norm": 0.00024651194689795375, | |
| "learning_rate": 7.528290857550943e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 478672, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 15.486725663716815, | |
| "grad_norm": 0.00045066667371429503, | |
| "learning_rate": 7.390721719803137e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 480096, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 15.530973451327434, | |
| "grad_norm": 0.00037901801988482475, | |
| "learning_rate": 7.254202794729484e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 481904, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 15.575221238938052, | |
| "grad_norm": 0.0010054173180833459, | |
| "learning_rate": 7.11874222428238e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 483440, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 15.619469026548673, | |
| "grad_norm": 0.0004434712463989854, | |
| "learning_rate": 6.9843480872942294e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 485056, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 15.663716814159292, | |
| "grad_norm": 0.00036231096601113677, | |
| "learning_rate": 6.851028398995607e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 486608, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 15.70796460176991, | |
| "grad_norm": 0.00084352632984519, | |
| "learning_rate": 6.718791110537287e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 487888, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 15.752212389380531, | |
| "grad_norm": 0.0006180101190693676, | |
| "learning_rate": 6.587644108515986e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 489184, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 15.79646017699115, | |
| "grad_norm": 0.00047136572538875043, | |
| "learning_rate": 6.457595214504042e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 490528, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 15.84070796460177, | |
| "grad_norm": 0.001999986357986927, | |
| "learning_rate": 6.328652184582884e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 491680, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 15.88495575221239, | |
| "grad_norm": 0.0006776810041628778, | |
| "learning_rate": 6.200822708880563e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 493152, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 15.929203539823009, | |
| "grad_norm": 0.0005173166864551604, | |
| "learning_rate": 6.074114411112997e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 494400, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 15.973451327433628, | |
| "grad_norm": 0.0004017841420136392, | |
| "learning_rate": 5.948534848129378e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 495584, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.019682612270116806, | |
| "eval_runtime": 0.7212, | |
| "eval_samples_per_second": 34.662, | |
| "eval_steps_per_second": 18.024, | |
| "num_input_tokens_seen": 496216, | |
| "step": 1808 | |
| }, | |
| { | |
| "epoch": 16.01769911504425, | |
| "grad_norm": 0.0007653535576537251, | |
| "learning_rate": 5.824091509461449e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 496680, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 16.061946902654867, | |
| "grad_norm": 0.00038566955481655896, | |
| "learning_rate": 5.7007918168768405e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 497992, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 16.106194690265486, | |
| "grad_norm": 0.0003379808913450688, | |
| "learning_rate": 5.5786431239364365e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 499272, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 16.150442477876105, | |
| "grad_norm": 0.0006492345710285008, | |
| "learning_rate": 5.457652715555781e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 500776, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 16.194690265486727, | |
| "grad_norm": 0.002473029075190425, | |
| "learning_rate": 5.337827807570689e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 501928, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 16.238938053097346, | |
| "grad_norm": 0.0006469090585596859, | |
| "learning_rate": 5.219175546306784e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 502936, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 16.283185840707965, | |
| "grad_norm": 0.00036444375291466713, | |
| "learning_rate": 5.1017030081533914e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 504200, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 16.327433628318584, | |
| "grad_norm": 0.0003275285707786679, | |
| "learning_rate": 4.985417199141443e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 505368, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 16.371681415929203, | |
| "grad_norm": 0.00047016076860018075, | |
| "learning_rate": 4.870325054525673e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 506792, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 16.41592920353982, | |
| "grad_norm": 0.0005479489336721599, | |
| "learning_rate": 4.7564334383709745e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 508184, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 16.460176991150444, | |
| "grad_norm": 0.0007782442844472826, | |
| "learning_rate": 4.6437491431430556e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 509704, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 16.504424778761063, | |
| "grad_norm": 0.0004585021815728396, | |
| "learning_rate": 4.5322788893033155e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 511272, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 16.548672566371682, | |
| "grad_norm": 0.0005866039427928627, | |
| "learning_rate": 4.422029324908061e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 512744, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 16.5929203539823, | |
| "grad_norm": 0.00041383542702533305, | |
| "learning_rate": 4.313007025211985e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 514264, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 16.63716814159292, | |
| "grad_norm": 0.0014405195834115148, | |
| "learning_rate": 4.205218492276055e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 515720, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 16.68141592920354, | |
| "grad_norm": 0.0037912712432444096, | |
| "learning_rate": 4.098670154579715e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 517080, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 16.72566371681416, | |
| "grad_norm": 0.001533073023892939, | |
| "learning_rate": 3.9933683666374986e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 518568, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 16.76991150442478, | |
| "grad_norm": 0.007198888808488846, | |
| "learning_rate": 3.889319408620021e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 520008, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 16.8141592920354, | |
| "grad_norm": 0.0009076311835087836, | |
| "learning_rate": 3.7865294859794926e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 521368, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 16.858407079646017, | |
| "grad_norm": 0.0002917584788519889, | |
| "learning_rate": 3.68500472907955e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 522648, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 16.902654867256636, | |
| "grad_norm": 0.0003024785837624222, | |
| "learning_rate": 3.584751192829705e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 524072, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 16.946902654867255, | |
| "grad_norm": 0.0003079813322983682, | |
| "learning_rate": 3.4857748563242006e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 525640, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 16.991150442477878, | |
| "grad_norm": 0.0002828809665516019, | |
| "learning_rate": 3.388081622485431e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 527192, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.019199641421437263, | |
| "eval_runtime": 0.7112, | |
| "eval_samples_per_second": 35.151, | |
| "eval_steps_per_second": 18.278, | |
| "num_input_tokens_seen": 527360, | |
| "step": 1921 | |
| }, | |
| { | |
| "epoch": 17.035398230088497, | |
| "grad_norm": 0.00035240757279098034, | |
| "learning_rate": 3.2916773177118778e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 528400, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 17.079646017699115, | |
| "grad_norm": 0.0005224815104156733, | |
| "learning_rate": 3.1965676915306384e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 529952, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 17.123893805309734, | |
| "grad_norm": 0.0007481279317289591, | |
| "learning_rate": 3.102758416254545e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 531248, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 17.168141592920353, | |
| "grad_norm": 0.0003523877530824393, | |
| "learning_rate": 3.010255086643818e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 532720, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 17.212389380530972, | |
| "grad_norm": 0.00042034246143884957, | |
| "learning_rate": 2.919063219572438e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 534176, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 17.256637168141594, | |
| "grad_norm": 0.0007219575345516205, | |
| "learning_rate": 2.829188253699111e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 535792, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 17.300884955752213, | |
| "grad_norm": 0.0003039201837964356, | |
| "learning_rate": 2.7406355491429086e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 537088, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 17.345132743362832, | |
| "grad_norm": 0.0003757340309675783, | |
| "learning_rate": 2.653410387163574e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 538624, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 17.38938053097345, | |
| "grad_norm": 0.0015457504196092486, | |
| "learning_rate": 2.567517969846575e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 539968, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 17.43362831858407, | |
| "grad_norm": 0.0003834764356724918, | |
| "learning_rate": 2.482963419792844e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 541024, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 17.47787610619469, | |
| "grad_norm": 0.00040257559157907963, | |
| "learning_rate": 2.399751779813264e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 542304, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 17.52212389380531, | |
| "grad_norm": 0.00033592438558116555, | |
| "learning_rate": 2.317888012627914e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 543712, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 17.56637168141593, | |
| "grad_norm": 0.0005625042249448597, | |
| "learning_rate": 2.2373770005700955e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 545024, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 17.61061946902655, | |
| "grad_norm": 0.0005863187252543867, | |
| "learning_rate": 2.1582235452951682e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 546416, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 17.654867256637168, | |
| "grad_norm": 0.0003696437634062022, | |
| "learning_rate": 2.0804323674941563e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 547760, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 17.699115044247787, | |
| "grad_norm": 0.00047242920845746994, | |
| "learning_rate": 2.0040081066122043e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 548880, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 17.743362831858406, | |
| "grad_norm": 0.0029493607580661774, | |
| "learning_rate": 1.9289553205719317e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 550192, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 17.787610619469028, | |
| "grad_norm": 0.00043380033457651734, | |
| "learning_rate": 1.8552784855015215e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 551424, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 17.831858407079647, | |
| "grad_norm": 0.0005246605142019689, | |
| "learning_rate": 1.7829819954678361e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 552704, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 17.876106194690266, | |
| "grad_norm": 0.0019084580708295107, | |
| "learning_rate": 1.7120701622143132e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 554432, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 17.920353982300885, | |
| "grad_norm": 0.0005461287801153958, | |
| "learning_rate": 1.6425472149038361e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 556032, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 17.964601769911503, | |
| "grad_norm": 0.00038796328590251505, | |
| "learning_rate": 1.5744172998664902e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 557168, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.018208853900432587, | |
| "eval_runtime": 0.7295, | |
| "eval_samples_per_second": 34.269, | |
| "eval_steps_per_second": 17.82, | |
| "num_input_tokens_seen": 558088, | |
| "step": 2034 | |
| }, | |
| { | |
| "epoch": 18.008849557522122, | |
| "grad_norm": 0.0016791054513305426, | |
| "learning_rate": 1.5076844803522922e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 558408, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 18.053097345132745, | |
| "grad_norm": 0.00732705183327198, | |
| "learning_rate": 1.4423527362888546e-06, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 559960, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 18.097345132743364, | |
| "grad_norm": 0.000439478870248422, | |
| "learning_rate": 1.3784259640440279e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 561272, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 18.141592920353983, | |
| "grad_norm": 0.0011805107351392508, | |
| "learning_rate": 1.3159079761934923e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 563064, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 18.1858407079646, | |
| "grad_norm": 0.00039607463986612856, | |
| "learning_rate": 1.2548025012934367e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 564504, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 18.23008849557522, | |
| "grad_norm": 0.00045017743832431734, | |
| "learning_rate": 1.195113183658131e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 565816, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 18.27433628318584, | |
| "grad_norm": 0.00038782545016147196, | |
| "learning_rate": 1.1368435831426021e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 567048, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 18.31858407079646, | |
| "grad_norm": 0.00046747527085244656, | |
| "learning_rate": 1.0799971749303333e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 568552, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 18.36283185840708, | |
| "grad_norm": 0.00045611089444719255, | |
| "learning_rate": 1.0245773493259946e-06, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 570088, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 18.4070796460177, | |
| "grad_norm": 0.0003884553152602166, | |
| "learning_rate": 9.705874115532532e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 571400, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 18.451327433628318, | |
| "grad_norm": 0.0006371684139594436, | |
| "learning_rate": 9.180305815576301e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 572648, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 18.495575221238937, | |
| "grad_norm": 0.00032609282061457634, | |
| "learning_rate": 8.669099938144992e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 573976, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 18.539823008849556, | |
| "grad_norm": 0.0037040063180029392, | |
| "learning_rate": 8.172286971421167e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 575560, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 18.58407079646018, | |
| "grad_norm": 0.00041839800542220473, | |
| "learning_rate": 7.689896545198111e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 577096, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 18.628318584070797, | |
| "grad_norm": 0.001043095369823277, | |
| "learning_rate": 7.221957429112469e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 578440, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 18.672566371681416, | |
| "grad_norm": 0.000590807176195085, | |
| "learning_rate": 6.768497530928785e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 579592, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 18.716814159292035, | |
| "grad_norm": 0.000375599367544055, | |
| "learning_rate": 6.329543894874779e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 580984, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 18.761061946902654, | |
| "grad_norm": 0.004237520508468151, | |
| "learning_rate": 5.905122700028576e-07, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 582424, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 18.805309734513273, | |
| "grad_norm": 0.0015916775446385145, | |
| "learning_rate": 5.49525925875738e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 583608, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 18.849557522123895, | |
| "grad_norm": 0.0024985643103718758, | |
| "learning_rate": 5.099978015207868e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 584568, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 18.893805309734514, | |
| "grad_norm": 0.000398988340748474, | |
| "learning_rate": 4.719302543848225e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 586232, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 18.938053097345133, | |
| "grad_norm": 0.0004915536846965551, | |
| "learning_rate": 4.3532555480624295e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 587608, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 18.98230088495575, | |
| "grad_norm": 0.0018118071602657437, | |
| "learning_rate": 4.001858858795893e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 588744, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.0190139003098011, | |
| "eval_runtime": 0.7256, | |
| "eval_samples_per_second": 34.455, | |
| "eval_steps_per_second": 17.916, | |
| "num_input_tokens_seen": 589072, | |
| "step": 2147 | |
| }, | |
| { | |
| "epoch": 19.02654867256637, | |
| "grad_norm": 0.003615348832681775, | |
| "learning_rate": 3.665133433253809e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 589776, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 19.07079646017699, | |
| "grad_norm": 0.00036754633765667677, | |
| "learning_rate": 3.34309935365093e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 591200, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 19.115044247787612, | |
| "grad_norm": 0.0005496389348991215, | |
| "learning_rate": 3.03577582601422e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 592464, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 19.15929203539823, | |
| "grad_norm": 0.0004596963117364794, | |
| "learning_rate": 2.743181179037047e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 593936, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 19.20353982300885, | |
| "grad_norm": 0.00025676190853118896, | |
| "learning_rate": 2.465332862986447e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 595232, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 19.24778761061947, | |
| "grad_norm": 0.0004909674171358347, | |
| "learning_rate": 2.2022474486620427e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 596592, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 19.292035398230087, | |
| "grad_norm": 0.00031084747752174735, | |
| "learning_rate": 1.953940626408024e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 598176, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 19.336283185840706, | |
| "grad_norm": 0.001011427491903305, | |
| "learning_rate": 1.720427205177233e-07, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 599792, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 19.38053097345133, | |
| "grad_norm": 0.0016426928341388702, | |
| "learning_rate": 1.5017211116479802e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 601072, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 19.424778761061948, | |
| "grad_norm": 0.00022410904057323933, | |
| "learning_rate": 1.297835389393598e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 602384, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 19.469026548672566, | |
| "grad_norm": 0.0016083059599623084, | |
| "learning_rate": 1.1087821981042856e-07, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 603840, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 19.513274336283185, | |
| "grad_norm": 0.0005845269188284874, | |
| "learning_rate": 9.345728128621611e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 604944, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 19.557522123893804, | |
| "grad_norm": 0.00038981102989055216, | |
| "learning_rate": 7.752176234685771e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 606224, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 19.601769911504427, | |
| "grad_norm": 0.001588311162777245, | |
| "learning_rate": 6.307261338246718e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 607632, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 19.646017699115045, | |
| "grad_norm": 0.005621492862701416, | |
| "learning_rate": 5.011069613644892e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 608960, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 19.690265486725664, | |
| "grad_norm": 0.0005275033763609827, | |
| "learning_rate": 3.8636783654100174e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 610400, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 19.734513274336283, | |
| "grad_norm": 0.0027882629074156284, | |
| "learning_rate": 2.865156023650617e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 611648, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 19.778761061946902, | |
| "grad_norm": 0.0033670312259346247, | |
| "learning_rate": 2.0155621399742254e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 612880, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 19.82300884955752, | |
| "grad_norm": 0.000658839417155832, | |
| "learning_rate": 1.31494738393384e-08, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 614560, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 19.86725663716814, | |
| "grad_norm": 0.0017916634678840637, | |
| "learning_rate": 7.633535400070057e-09, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 615856, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 19.911504424778762, | |
| "grad_norm": 0.0003976623120252043, | |
| "learning_rate": 3.6081350510447365e-09, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 617136, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 19.95575221238938, | |
| "grad_norm": 0.00045023686834611, | |
| "learning_rate": 1.0735128660649406e-09, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 618640, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.0003346599987708032, | |
| "learning_rate": 2.982000932294504e-11, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 620240, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.01861300691962242, | |
| "eval_runtime": 0.7164, | |
| "eval_samples_per_second": 34.898, | |
| "eval_steps_per_second": 18.147, | |
| "num_input_tokens_seen": 620240, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "num_input_tokens_seen": 620240, | |
| "step": 2260, | |
| "total_flos": 2.792912687136768e+16, | |
| "train_loss": 0.167221273584263, | |
| "train_runtime": 256.8958, | |
| "train_samples_per_second": 17.517, | |
| "train_steps_per_second": 8.797 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2260, | |
| "num_input_tokens_seen": 620240, | |
| "num_train_epochs": 20, | |
| "save_steps": 113, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.792912687136768e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |