| { | |
| "best_global_step": 499760, | |
| "best_metric": 1.0584163665771484, | |
| "best_model_checkpoint": "/media/user/Expansion1/opus-mt-en-zhtw-google-translate3/checkpoint-499760", | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 499760, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005002401152553225, | |
| "grad_norm": 9.338144302368164, | |
| "learning_rate": 4.997503801824876e-05, | |
| "loss": 2.3751, | |
| "num_input_tokens_seen": 281752, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01000480230510645, | |
| "grad_norm": 10.810979843139648, | |
| "learning_rate": 4.9950026012485996e-05, | |
| "loss": 2.2026, | |
| "num_input_tokens_seen": 552352, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.015007203457659676, | |
| "grad_norm": 9.087040901184082, | |
| "learning_rate": 4.992501400672323e-05, | |
| "loss": 2.1448, | |
| "num_input_tokens_seen": 823176, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0200096046102129, | |
| "grad_norm": 8.617157936096191, | |
| "learning_rate": 4.990000200096046e-05, | |
| "loss": 2.0951, | |
| "num_input_tokens_seen": 1102760, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02501200576276613, | |
| "grad_norm": 7.297477722167969, | |
| "learning_rate": 4.9874989995197695e-05, | |
| "loss": 2.0308, | |
| "num_input_tokens_seen": 1378560, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.030014406915319352, | |
| "grad_norm": 8.311019897460938, | |
| "learning_rate": 4.984997798943493e-05, | |
| "loss": 2.0307, | |
| "num_input_tokens_seen": 1653488, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03501680806787258, | |
| "grad_norm": 8.287640571594238, | |
| "learning_rate": 4.9824965983672164e-05, | |
| "loss": 1.9855, | |
| "num_input_tokens_seen": 1923368, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0400192092204258, | |
| "grad_norm": 9.764960289001465, | |
| "learning_rate": 4.97999539779094e-05, | |
| "loss": 1.9682, | |
| "num_input_tokens_seen": 2199816, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04502161037297903, | |
| "grad_norm": 7.692084312438965, | |
| "learning_rate": 4.9774941972146634e-05, | |
| "loss": 1.9222, | |
| "num_input_tokens_seen": 2467792, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.05002401152553226, | |
| "grad_norm": 7.139247417449951, | |
| "learning_rate": 4.974992996638387e-05, | |
| "loss": 1.9266, | |
| "num_input_tokens_seen": 2731792, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.05502641267808548, | |
| "grad_norm": 7.6170244216918945, | |
| "learning_rate": 4.97249179606211e-05, | |
| "loss": 1.9024, | |
| "num_input_tokens_seen": 3004120, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.060028813830638704, | |
| "grad_norm": 12.332016944885254, | |
| "learning_rate": 4.969990595485833e-05, | |
| "loss": 1.8823, | |
| "num_input_tokens_seen": 3284568, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.06503121498319193, | |
| "grad_norm": 7.665128231048584, | |
| "learning_rate": 4.967489394909557e-05, | |
| "loss": 1.8642, | |
| "num_input_tokens_seen": 3554600, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.07003361613574516, | |
| "grad_norm": 10.934691429138184, | |
| "learning_rate": 4.96498819433328e-05, | |
| "loss": 1.8556, | |
| "num_input_tokens_seen": 3824936, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.07503601728829838, | |
| "grad_norm": 7.880730152130127, | |
| "learning_rate": 4.962486993757004e-05, | |
| "loss": 1.8606, | |
| "num_input_tokens_seen": 4099080, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.0800384184408516, | |
| "grad_norm": 7.548530578613281, | |
| "learning_rate": 4.959985793180727e-05, | |
| "loss": 1.83, | |
| "num_input_tokens_seen": 4366808, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.08504081959340483, | |
| "grad_norm": 7.900990009307861, | |
| "learning_rate": 4.957484592604451e-05, | |
| "loss": 1.8031, | |
| "num_input_tokens_seen": 4638816, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.09004322074595807, | |
| "grad_norm": 8.125676155090332, | |
| "learning_rate": 4.9549833920281736e-05, | |
| "loss": 1.8455, | |
| "num_input_tokens_seen": 4915000, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.09504562189851129, | |
| "grad_norm": 7.727709770202637, | |
| "learning_rate": 4.952482191451897e-05, | |
| "loss": 1.8024, | |
| "num_input_tokens_seen": 5188672, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.10004802305106451, | |
| "grad_norm": 5.897092342376709, | |
| "learning_rate": 4.9499809908756206e-05, | |
| "loss": 1.7928, | |
| "num_input_tokens_seen": 5468592, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.10505042420361774, | |
| "grad_norm": 11.170528411865234, | |
| "learning_rate": 4.947479790299344e-05, | |
| "loss": 1.7868, | |
| "num_input_tokens_seen": 5733256, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.11005282535617096, | |
| "grad_norm": 8.682831764221191, | |
| "learning_rate": 4.944978589723067e-05, | |
| "loss": 1.7878, | |
| "num_input_tokens_seen": 6008088, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.11505522650872418, | |
| "grad_norm": 7.914422988891602, | |
| "learning_rate": 4.942477389146791e-05, | |
| "loss": 1.7355, | |
| "num_input_tokens_seen": 6274960, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.12005762766127741, | |
| "grad_norm": 8.685178756713867, | |
| "learning_rate": 4.9399761885705145e-05, | |
| "loss": 1.7744, | |
| "num_input_tokens_seen": 6554288, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.12506002881383063, | |
| "grad_norm": 7.942957401275635, | |
| "learning_rate": 4.9374749879942374e-05, | |
| "loss": 1.7293, | |
| "num_input_tokens_seen": 6832880, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.13006242996638387, | |
| "grad_norm": 6.650600910186768, | |
| "learning_rate": 4.934973787417961e-05, | |
| "loss": 1.7523, | |
| "num_input_tokens_seen": 7107168, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.13506483111893708, | |
| "grad_norm": 7.683079242706299, | |
| "learning_rate": 4.9324725868416844e-05, | |
| "loss": 1.7432, | |
| "num_input_tokens_seen": 7377472, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.14006723227149032, | |
| "grad_norm": 8.168213844299316, | |
| "learning_rate": 4.929971386265408e-05, | |
| "loss": 1.745, | |
| "num_input_tokens_seen": 7653272, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.14506963342404355, | |
| "grad_norm": 8.087789535522461, | |
| "learning_rate": 4.927470185689131e-05, | |
| "loss": 1.7179, | |
| "num_input_tokens_seen": 7926040, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.15007203457659676, | |
| "grad_norm": 8.388677597045898, | |
| "learning_rate": 4.924968985112854e-05, | |
| "loss": 1.7219, | |
| "num_input_tokens_seen": 8195288, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.15507443572915, | |
| "grad_norm": 8.354930877685547, | |
| "learning_rate": 4.922467784536578e-05, | |
| "loss": 1.7376, | |
| "num_input_tokens_seen": 8469792, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.1600768368817032, | |
| "grad_norm": 8.638579368591309, | |
| "learning_rate": 4.919966583960301e-05, | |
| "loss": 1.701, | |
| "num_input_tokens_seen": 8735800, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.16507923803425645, | |
| "grad_norm": 6.771655559539795, | |
| "learning_rate": 4.9174653833840247e-05, | |
| "loss": 1.719, | |
| "num_input_tokens_seen": 9005256, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.17008163918680966, | |
| "grad_norm": 12.017413139343262, | |
| "learning_rate": 4.914964182807748e-05, | |
| "loss": 1.7029, | |
| "num_input_tokens_seen": 9279816, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.1750840403393629, | |
| "grad_norm": 7.177635669708252, | |
| "learning_rate": 4.9124629822314716e-05, | |
| "loss": 1.6925, | |
| "num_input_tokens_seen": 9548800, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.18008644149191613, | |
| "grad_norm": 6.606298446655273, | |
| "learning_rate": 4.9099617816551945e-05, | |
| "loss": 1.6957, | |
| "num_input_tokens_seen": 9826416, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.18508884264446934, | |
| "grad_norm": 6.026829242706299, | |
| "learning_rate": 4.907460581078918e-05, | |
| "loss": 1.7022, | |
| "num_input_tokens_seen": 10095920, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.19009124379702258, | |
| "grad_norm": 8.743913650512695, | |
| "learning_rate": 4.9049593805026415e-05, | |
| "loss": 1.6904, | |
| "num_input_tokens_seen": 10371760, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.1950936449495758, | |
| "grad_norm": 9.37678050994873, | |
| "learning_rate": 4.902458179926365e-05, | |
| "loss": 1.6617, | |
| "num_input_tokens_seen": 10639680, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.20009604610212903, | |
| "grad_norm": 7.834632396697998, | |
| "learning_rate": 4.8999569793500885e-05, | |
| "loss": 1.6631, | |
| "num_input_tokens_seen": 10910752, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.20509844725468224, | |
| "grad_norm": 7.523416519165039, | |
| "learning_rate": 4.897455778773812e-05, | |
| "loss": 1.6526, | |
| "num_input_tokens_seen": 11174272, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.21010084840723547, | |
| "grad_norm": 7.593217372894287, | |
| "learning_rate": 4.894954578197535e-05, | |
| "loss": 1.6618, | |
| "num_input_tokens_seen": 11447056, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.2151032495597887, | |
| "grad_norm": 7.984575271606445, | |
| "learning_rate": 4.892453377621258e-05, | |
| "loss": 1.6668, | |
| "num_input_tokens_seen": 11715856, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.22010565071234192, | |
| "grad_norm": 7.122634410858154, | |
| "learning_rate": 4.889952177044982e-05, | |
| "loss": 1.6697, | |
| "num_input_tokens_seen": 11994600, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.22510805186489516, | |
| "grad_norm": 6.745737552642822, | |
| "learning_rate": 4.887450976468705e-05, | |
| "loss": 1.6471, | |
| "num_input_tokens_seen": 12264272, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.23011045301744837, | |
| "grad_norm": 6.742521286010742, | |
| "learning_rate": 4.884949775892428e-05, | |
| "loss": 1.6588, | |
| "num_input_tokens_seen": 12538760, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.2351128541700016, | |
| "grad_norm": 8.37658977508545, | |
| "learning_rate": 4.882448575316152e-05, | |
| "loss": 1.667, | |
| "num_input_tokens_seen": 12814464, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.24011525532255482, | |
| "grad_norm": 7.458651065826416, | |
| "learning_rate": 4.879947374739876e-05, | |
| "loss": 1.6459, | |
| "num_input_tokens_seen": 13083456, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.24511765647510805, | |
| "grad_norm": 10.364368438720703, | |
| "learning_rate": 4.8774461741635986e-05, | |
| "loss": 1.651, | |
| "num_input_tokens_seen": 13361568, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.25012005762766126, | |
| "grad_norm": 6.404083251953125, | |
| "learning_rate": 4.874944973587322e-05, | |
| "loss": 1.6271, | |
| "num_input_tokens_seen": 13638952, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.2551224587802145, | |
| "grad_norm": 7.239497184753418, | |
| "learning_rate": 4.8724437730110456e-05, | |
| "loss": 1.6325, | |
| "num_input_tokens_seen": 13909896, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.26012485993276774, | |
| "grad_norm": 9.5720796585083, | |
| "learning_rate": 4.869942572434769e-05, | |
| "loss": 1.6227, | |
| "num_input_tokens_seen": 14182688, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.265127261085321, | |
| "grad_norm": 7.255125045776367, | |
| "learning_rate": 4.867441371858492e-05, | |
| "loss": 1.6328, | |
| "num_input_tokens_seen": 14455888, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.27012966223787416, | |
| "grad_norm": 7.990500450134277, | |
| "learning_rate": 4.8649401712822154e-05, | |
| "loss": 1.6315, | |
| "num_input_tokens_seen": 14726048, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.2751320633904274, | |
| "grad_norm": 7.787556171417236, | |
| "learning_rate": 4.8624389707059396e-05, | |
| "loss": 1.6357, | |
| "num_input_tokens_seen": 15005208, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.28013446454298063, | |
| "grad_norm": 6.046635627746582, | |
| "learning_rate": 4.8599377701296624e-05, | |
| "loss": 1.6173, | |
| "num_input_tokens_seen": 15272768, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.28513686569553387, | |
| "grad_norm": 6.547093391418457, | |
| "learning_rate": 4.857436569553386e-05, | |
| "loss": 1.6067, | |
| "num_input_tokens_seen": 15539248, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.2901392668480871, | |
| "grad_norm": 8.07209587097168, | |
| "learning_rate": 4.8549353689771094e-05, | |
| "loss": 1.6018, | |
| "num_input_tokens_seen": 15810072, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.2951416680006403, | |
| "grad_norm": 7.229666709899902, | |
| "learning_rate": 4.852434168400833e-05, | |
| "loss": 1.6238, | |
| "num_input_tokens_seen": 16082968, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.3001440691531935, | |
| "grad_norm": 6.863572597503662, | |
| "learning_rate": 4.849932967824556e-05, | |
| "loss": 1.6104, | |
| "num_input_tokens_seen": 16354784, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.30514647030574676, | |
| "grad_norm": 8.546486854553223, | |
| "learning_rate": 4.847431767248279e-05, | |
| "loss": 1.5961, | |
| "num_input_tokens_seen": 16623816, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.3101488714583, | |
| "grad_norm": 6.493512153625488, | |
| "learning_rate": 4.844930566672003e-05, | |
| "loss": 1.5977, | |
| "num_input_tokens_seen": 16896624, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.3151512726108532, | |
| "grad_norm": 7.902426242828369, | |
| "learning_rate": 4.842429366095726e-05, | |
| "loss": 1.6049, | |
| "num_input_tokens_seen": 17168672, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.3201536737634064, | |
| "grad_norm": 8.033360481262207, | |
| "learning_rate": 4.83992816551945e-05, | |
| "loss": 1.581, | |
| "num_input_tokens_seen": 17433848, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.32515607491595966, | |
| "grad_norm": 7.9239325523376465, | |
| "learning_rate": 4.837426964943173e-05, | |
| "loss": 1.6029, | |
| "num_input_tokens_seen": 17704920, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.3301584760685129, | |
| "grad_norm": 6.995474815368652, | |
| "learning_rate": 4.834925764366897e-05, | |
| "loss": 1.576, | |
| "num_input_tokens_seen": 17969776, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.33516087722106613, | |
| "grad_norm": 8.305245399475098, | |
| "learning_rate": 4.8324245637906195e-05, | |
| "loss": 1.5909, | |
| "num_input_tokens_seen": 18239784, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.3401632783736193, | |
| "grad_norm": 6.429056167602539, | |
| "learning_rate": 4.829923363214343e-05, | |
| "loss": 1.5742, | |
| "num_input_tokens_seen": 18507688, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.34516567952617255, | |
| "grad_norm": 7.208921432495117, | |
| "learning_rate": 4.8274221626380665e-05, | |
| "loss": 1.573, | |
| "num_input_tokens_seen": 18776368, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.3501680806787258, | |
| "grad_norm": 7.433680057525635, | |
| "learning_rate": 4.82492096206179e-05, | |
| "loss": 1.588, | |
| "num_input_tokens_seen": 19050584, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.35517048183127903, | |
| "grad_norm": 6.901820182800293, | |
| "learning_rate": 4.8224197614855135e-05, | |
| "loss": 1.58, | |
| "num_input_tokens_seen": 19333968, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.36017288298383227, | |
| "grad_norm": 8.789533615112305, | |
| "learning_rate": 4.819918560909237e-05, | |
| "loss": 1.5758, | |
| "num_input_tokens_seen": 19612632, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.36517528413638545, | |
| "grad_norm": 7.546513557434082, | |
| "learning_rate": 4.8174173603329605e-05, | |
| "loss": 1.5763, | |
| "num_input_tokens_seen": 19883800, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.3701776852889387, | |
| "grad_norm": 6.489349842071533, | |
| "learning_rate": 4.814916159756683e-05, | |
| "loss": 1.5866, | |
| "num_input_tokens_seen": 20158632, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.3751800864414919, | |
| "grad_norm": 7.696920871734619, | |
| "learning_rate": 4.812414959180407e-05, | |
| "loss": 1.5858, | |
| "num_input_tokens_seen": 20430440, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.38018248759404516, | |
| "grad_norm": 6.559112071990967, | |
| "learning_rate": 4.80991375860413e-05, | |
| "loss": 1.5596, | |
| "num_input_tokens_seen": 20703976, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.38518488874659834, | |
| "grad_norm": 8.480047225952148, | |
| "learning_rate": 4.807412558027853e-05, | |
| "loss": 1.5533, | |
| "num_input_tokens_seen": 20972048, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.3901872898991516, | |
| "grad_norm": 5.950156211853027, | |
| "learning_rate": 4.804911357451577e-05, | |
| "loss": 1.5642, | |
| "num_input_tokens_seen": 21240912, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.3951896910517048, | |
| "grad_norm": 5.799604892730713, | |
| "learning_rate": 4.802410156875301e-05, | |
| "loss": 1.5598, | |
| "num_input_tokens_seen": 21507728, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.40019209220425805, | |
| "grad_norm": 5.980112075805664, | |
| "learning_rate": 4.7999089562990236e-05, | |
| "loss": 1.5609, | |
| "num_input_tokens_seen": 21783160, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.4051944933568113, | |
| "grad_norm": 5.936067581176758, | |
| "learning_rate": 4.797407755722747e-05, | |
| "loss": 1.5523, | |
| "num_input_tokens_seen": 22052792, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.4101968945093645, | |
| "grad_norm": 9.166204452514648, | |
| "learning_rate": 4.7949065551464706e-05, | |
| "loss": 1.5452, | |
| "num_input_tokens_seen": 22324072, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.4151992956619177, | |
| "grad_norm": 8.854216575622559, | |
| "learning_rate": 4.792405354570194e-05, | |
| "loss": 1.5516, | |
| "num_input_tokens_seen": 22590624, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.42020169681447095, | |
| "grad_norm": 9.261016845703125, | |
| "learning_rate": 4.789904153993917e-05, | |
| "loss": 1.5374, | |
| "num_input_tokens_seen": 22862424, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.4252040979670242, | |
| "grad_norm": 7.714609622955322, | |
| "learning_rate": 4.7874029534176404e-05, | |
| "loss": 1.544, | |
| "num_input_tokens_seen": 23128888, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.4302064991195774, | |
| "grad_norm": 5.665945529937744, | |
| "learning_rate": 4.784901752841364e-05, | |
| "loss": 1.5532, | |
| "num_input_tokens_seen": 23407576, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.4352089002721306, | |
| "grad_norm": 6.948183536529541, | |
| "learning_rate": 4.7824005522650874e-05, | |
| "loss": 1.5467, | |
| "num_input_tokens_seen": 23683544, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.44021130142468384, | |
| "grad_norm": 5.725684642791748, | |
| "learning_rate": 4.779899351688811e-05, | |
| "loss": 1.5295, | |
| "num_input_tokens_seen": 23948640, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.4452137025772371, | |
| "grad_norm": 6.168211936950684, | |
| "learning_rate": 4.7773981511125344e-05, | |
| "loss": 1.5477, | |
| "num_input_tokens_seen": 24212584, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.4502161037297903, | |
| "grad_norm": 6.778971195220947, | |
| "learning_rate": 4.774896950536258e-05, | |
| "loss": 1.5134, | |
| "num_input_tokens_seen": 24481104, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.4552185048823435, | |
| "grad_norm": 7.2399210929870605, | |
| "learning_rate": 4.772395749959981e-05, | |
| "loss": 1.5447, | |
| "num_input_tokens_seen": 24756992, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.46022090603489674, | |
| "grad_norm": 6.476212024688721, | |
| "learning_rate": 4.769894549383704e-05, | |
| "loss": 1.5361, | |
| "num_input_tokens_seen": 25020392, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.46522330718745, | |
| "grad_norm": 10.64287281036377, | |
| "learning_rate": 4.767393348807428e-05, | |
| "loss": 1.5409, | |
| "num_input_tokens_seen": 25296728, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.4702257083400032, | |
| "grad_norm": 7.746724605560303, | |
| "learning_rate": 4.764892148231151e-05, | |
| "loss": 1.4953, | |
| "num_input_tokens_seen": 25562256, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.47522810949255645, | |
| "grad_norm": 6.38646125793457, | |
| "learning_rate": 4.762390947654875e-05, | |
| "loss": 1.5518, | |
| "num_input_tokens_seen": 25833464, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.48023051064510963, | |
| "grad_norm": 6.2214555740356445, | |
| "learning_rate": 4.759889747078598e-05, | |
| "loss": 1.5375, | |
| "num_input_tokens_seen": 26108816, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.48523291179766287, | |
| "grad_norm": 7.317322731018066, | |
| "learning_rate": 4.757388546502322e-05, | |
| "loss": 1.5449, | |
| "num_input_tokens_seen": 26385360, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.4902353129502161, | |
| "grad_norm": 6.4762701988220215, | |
| "learning_rate": 4.7548873459260445e-05, | |
| "loss": 1.5179, | |
| "num_input_tokens_seen": 26656488, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.49523771410276934, | |
| "grad_norm": 7.051132678985596, | |
| "learning_rate": 4.752386145349768e-05, | |
| "loss": 1.5213, | |
| "num_input_tokens_seen": 26925744, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.5002401152553225, | |
| "grad_norm": 8.628856658935547, | |
| "learning_rate": 4.7498849447734915e-05, | |
| "loss": 1.5517, | |
| "num_input_tokens_seen": 27202296, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.5052425164078758, | |
| "grad_norm": 6.756930351257324, | |
| "learning_rate": 4.747383744197215e-05, | |
| "loss": 1.5061, | |
| "num_input_tokens_seen": 27469216, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.510244917560429, | |
| "grad_norm": 8.543140411376953, | |
| "learning_rate": 4.7448825436209385e-05, | |
| "loss": 1.5323, | |
| "num_input_tokens_seen": 27746208, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.5152473187129822, | |
| "grad_norm": 7.62526273727417, | |
| "learning_rate": 4.742381343044662e-05, | |
| "loss": 1.5284, | |
| "num_input_tokens_seen": 28024152, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.5202497198655355, | |
| "grad_norm": 6.117633819580078, | |
| "learning_rate": 4.7398801424683855e-05, | |
| "loss": 1.5304, | |
| "num_input_tokens_seen": 28300312, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.5252521210180887, | |
| "grad_norm": 6.280879497528076, | |
| "learning_rate": 4.737378941892108e-05, | |
| "loss": 1.4873, | |
| "num_input_tokens_seen": 28573472, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.530254522170642, | |
| "grad_norm": 6.297519683837891, | |
| "learning_rate": 4.734877741315832e-05, | |
| "loss": 1.512, | |
| "num_input_tokens_seen": 28846960, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.5352569233231952, | |
| "grad_norm": 7.927740097045898, | |
| "learning_rate": 4.732376540739555e-05, | |
| "loss": 1.5303, | |
| "num_input_tokens_seen": 29120648, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.5402593244757483, | |
| "grad_norm": 6.746880054473877, | |
| "learning_rate": 4.729875340163278e-05, | |
| "loss": 1.5187, | |
| "num_input_tokens_seen": 29389712, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.5452617256283016, | |
| "grad_norm": 6.765636920928955, | |
| "learning_rate": 4.7273741395870016e-05, | |
| "loss": 1.502, | |
| "num_input_tokens_seen": 29661920, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.5502641267808548, | |
| "grad_norm": 4.868513107299805, | |
| "learning_rate": 4.724872939010726e-05, | |
| "loss": 1.499, | |
| "num_input_tokens_seen": 29933584, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.555266527933408, | |
| "grad_norm": 6.332437515258789, | |
| "learning_rate": 4.722371738434449e-05, | |
| "loss": 1.4815, | |
| "num_input_tokens_seen": 30200096, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.5602689290859613, | |
| "grad_norm": 6.429714679718018, | |
| "learning_rate": 4.719870537858172e-05, | |
| "loss": 1.5138, | |
| "num_input_tokens_seen": 30475600, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.5652713302385145, | |
| "grad_norm": 6.9108991622924805, | |
| "learning_rate": 4.7173693372818956e-05, | |
| "loss": 1.5218, | |
| "num_input_tokens_seen": 30750536, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.5702737313910677, | |
| "grad_norm": 7.187946796417236, | |
| "learning_rate": 4.714868136705619e-05, | |
| "loss": 1.4937, | |
| "num_input_tokens_seen": 31023928, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.575276132543621, | |
| "grad_norm": 7.880980014801025, | |
| "learning_rate": 4.712366936129342e-05, | |
| "loss": 1.5149, | |
| "num_input_tokens_seen": 31295016, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.5802785336961742, | |
| "grad_norm": 6.050008773803711, | |
| "learning_rate": 4.7098657355530654e-05, | |
| "loss": 1.5172, | |
| "num_input_tokens_seen": 31571344, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.5852809348487273, | |
| "grad_norm": 5.153658390045166, | |
| "learning_rate": 4.707364534976789e-05, | |
| "loss": 1.499, | |
| "num_input_tokens_seen": 31847512, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.5902833360012806, | |
| "grad_norm": 8.292108535766602, | |
| "learning_rate": 4.704863334400513e-05, | |
| "loss": 1.4897, | |
| "num_input_tokens_seen": 32118720, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.5952857371538338, | |
| "grad_norm": 5.900440216064453, | |
| "learning_rate": 4.702362133824236e-05, | |
| "loss": 1.5195, | |
| "num_input_tokens_seen": 32397744, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.600288138306387, | |
| "grad_norm": 7.023585796356201, | |
| "learning_rate": 4.6998609332479594e-05, | |
| "loss": 1.4755, | |
| "num_input_tokens_seen": 32671912, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.6052905394589403, | |
| "grad_norm": 7.419212818145752, | |
| "learning_rate": 4.697359732671683e-05, | |
| "loss": 1.4844, | |
| "num_input_tokens_seen": 32941344, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.6102929406114935, | |
| "grad_norm": 7.654923915863037, | |
| "learning_rate": 4.694858532095406e-05, | |
| "loss": 1.497, | |
| "num_input_tokens_seen": 33221672, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.6152953417640468, | |
| "grad_norm": 6.979129791259766, | |
| "learning_rate": 4.692357331519129e-05, | |
| "loss": 1.4855, | |
| "num_input_tokens_seen": 33489080, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.6202977429166, | |
| "grad_norm": 6.450369834899902, | |
| "learning_rate": 4.689856130942853e-05, | |
| "loss": 1.4945, | |
| "num_input_tokens_seen": 33763488, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.6253001440691532, | |
| "grad_norm": 6.070815563201904, | |
| "learning_rate": 4.687354930366576e-05, | |
| "loss": 1.4991, | |
| "num_input_tokens_seen": 34031328, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.6303025452217064, | |
| "grad_norm": 5.402656078338623, | |
| "learning_rate": 4.6848537297903e-05, | |
| "loss": 1.4957, | |
| "num_input_tokens_seen": 34304608, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.6353049463742596, | |
| "grad_norm": 10.961112022399902, | |
| "learning_rate": 4.682352529214023e-05, | |
| "loss": 1.4767, | |
| "num_input_tokens_seen": 34569600, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.6403073475268128, | |
| "grad_norm": 7.523622035980225, | |
| "learning_rate": 4.679851328637747e-05, | |
| "loss": 1.4953, | |
| "num_input_tokens_seen": 34838720, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.6453097486793661, | |
| "grad_norm": 7.367971420288086, | |
| "learning_rate": 4.6773501280614695e-05, | |
| "loss": 1.4959, | |
| "num_input_tokens_seen": 35110952, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.6503121498319193, | |
| "grad_norm": 7.122432231903076, | |
| "learning_rate": 4.674848927485193e-05, | |
| "loss": 1.4617, | |
| "num_input_tokens_seen": 35379336, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.6553145509844726, | |
| "grad_norm": 6.941073894500732, | |
| "learning_rate": 4.6723477269089165e-05, | |
| "loss": 1.4828, | |
| "num_input_tokens_seen": 35654144, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.6603169521370258, | |
| "grad_norm": 7.309379577636719, | |
| "learning_rate": 4.669846526332639e-05, | |
| "loss": 1.4695, | |
| "num_input_tokens_seen": 35922592, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.665319353289579, | |
| "grad_norm": 8.28540325164795, | |
| "learning_rate": 4.6673453257563635e-05, | |
| "loss": 1.4853, | |
| "num_input_tokens_seen": 36195592, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.6703217544421323, | |
| "grad_norm": 6.311332702636719, | |
| "learning_rate": 4.664844125180087e-05, | |
| "loss": 1.478, | |
| "num_input_tokens_seen": 36471728, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.6753241555946854, | |
| "grad_norm": 6.863243579864502, | |
| "learning_rate": 4.6623429246038105e-05, | |
| "loss": 1.4935, | |
| "num_input_tokens_seen": 36738232, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.6803265567472386, | |
| "grad_norm": 5.535435199737549, | |
| "learning_rate": 4.659841724027533e-05, | |
| "loss": 1.4689, | |
| "num_input_tokens_seen": 37003552, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.6853289578997919, | |
| "grad_norm": 7.348452568054199, | |
| "learning_rate": 4.657340523451257e-05, | |
| "loss": 1.4802, | |
| "num_input_tokens_seen": 37273624, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.6903313590523451, | |
| "grad_norm": 5.919636249542236, | |
| "learning_rate": 4.65483932287498e-05, | |
| "loss": 1.468, | |
| "num_input_tokens_seen": 37542216, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.6953337602048983, | |
| "grad_norm": 6.997893333435059, | |
| "learning_rate": 4.652338122298703e-05, | |
| "loss": 1.4711, | |
| "num_input_tokens_seen": 37817712, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.7003361613574516, | |
| "grad_norm": 7.683621883392334, | |
| "learning_rate": 4.6498369217224266e-05, | |
| "loss": 1.4467, | |
| "num_input_tokens_seen": 38086368, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.7053385625100048, | |
| "grad_norm": 5.56058931350708, | |
| "learning_rate": 4.647335721146151e-05, | |
| "loss": 1.4631, | |
| "num_input_tokens_seen": 38364016, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.7103409636625581, | |
| "grad_norm": 5.151466369628906, | |
| "learning_rate": 4.644834520569874e-05, | |
| "loss": 1.4776, | |
| "num_input_tokens_seen": 38639544, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.7153433648151113, | |
| "grad_norm": 7.764716625213623, | |
| "learning_rate": 4.642333319993597e-05, | |
| "loss": 1.4629, | |
| "num_input_tokens_seen": 38900248, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.7203457659676645, | |
| "grad_norm": 7.205192565917969, | |
| "learning_rate": 4.6398321194173206e-05, | |
| "loss": 1.4699, | |
| "num_input_tokens_seen": 39177440, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.7253481671202177, | |
| "grad_norm": 6.734379768371582, | |
| "learning_rate": 4.637330918841044e-05, | |
| "loss": 1.4649, | |
| "num_input_tokens_seen": 39442976, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.7303505682727709, | |
| "grad_norm": 6.191771507263184, | |
| "learning_rate": 4.634829718264767e-05, | |
| "loss": 1.4764, | |
| "num_input_tokens_seen": 39715104, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.7353529694253241, | |
| "grad_norm": 7.378221035003662, | |
| "learning_rate": 4.6323285176884904e-05, | |
| "loss": 1.4515, | |
| "num_input_tokens_seen": 39990848, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.7403553705778774, | |
| "grad_norm": 6.436953067779541, | |
| "learning_rate": 4.629827317112214e-05, | |
| "loss": 1.4495, | |
| "num_input_tokens_seen": 40258280, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.7453577717304306, | |
| "grad_norm": 5.954966068267822, | |
| "learning_rate": 4.6273261165359374e-05, | |
| "loss": 1.4497, | |
| "num_input_tokens_seen": 40535544, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.7503601728829838, | |
| "grad_norm": 6.085744857788086, | |
| "learning_rate": 4.624824915959661e-05, | |
| "loss": 1.4428, | |
| "num_input_tokens_seen": 40804552, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.7553625740355371, | |
| "grad_norm": 6.737603664398193, | |
| "learning_rate": 4.6223237153833844e-05, | |
| "loss": 1.4394, | |
| "num_input_tokens_seen": 41072144, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.7603649751880903, | |
| "grad_norm": 5.9119439125061035, | |
| "learning_rate": 4.619822514807108e-05, | |
| "loss": 1.4408, | |
| "num_input_tokens_seen": 41341080, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.7653673763406436, | |
| "grad_norm": 7.842981815338135, | |
| "learning_rate": 4.617321314230831e-05, | |
| "loss": 1.4538, | |
| "num_input_tokens_seen": 41613760, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.7703697774931967, | |
| "grad_norm": 7.999574184417725, | |
| "learning_rate": 4.614820113654554e-05, | |
| "loss": 1.451, | |
| "num_input_tokens_seen": 41886512, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.7753721786457499, | |
| "grad_norm": 5.851772308349609, | |
| "learning_rate": 4.612318913078278e-05, | |
| "loss": 1.4317, | |
| "num_input_tokens_seen": 42154824, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.7803745797983032, | |
| "grad_norm": 7.43974494934082, | |
| "learning_rate": 4.609817712502001e-05, | |
| "loss": 1.4707, | |
| "num_input_tokens_seen": 42425080, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.7853769809508564, | |
| "grad_norm": 6.566989898681641, | |
| "learning_rate": 4.607316511925725e-05, | |
| "loss": 1.4725, | |
| "num_input_tokens_seen": 42695896, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.7903793821034096, | |
| "grad_norm": 6.765398979187012, | |
| "learning_rate": 4.604815311349448e-05, | |
| "loss": 1.4454, | |
| "num_input_tokens_seen": 42961864, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.7953817832559629, | |
| "grad_norm": 5.989940643310547, | |
| "learning_rate": 4.602314110773172e-05, | |
| "loss": 1.4551, | |
| "num_input_tokens_seen": 43237224, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.8003841844085161, | |
| "grad_norm": 8.16629409790039, | |
| "learning_rate": 4.5998129101968945e-05, | |
| "loss": 1.4354, | |
| "num_input_tokens_seen": 43512872, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.8053865855610693, | |
| "grad_norm": 6.704333305358887, | |
| "learning_rate": 4.597311709620618e-05, | |
| "loss": 1.4481, | |
| "num_input_tokens_seen": 43779448, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.8103889867136226, | |
| "grad_norm": 6.2965593338012695, | |
| "learning_rate": 4.5948105090443415e-05, | |
| "loss": 1.4443, | |
| "num_input_tokens_seen": 44053296, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.8153913878661757, | |
| "grad_norm": 6.224064350128174, | |
| "learning_rate": 4.592309308468064e-05, | |
| "loss": 1.47, | |
| "num_input_tokens_seen": 44328488, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.820393789018729, | |
| "grad_norm": 6.873196601867676, | |
| "learning_rate": 4.589808107891788e-05, | |
| "loss": 1.4429, | |
| "num_input_tokens_seen": 44604152, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.8253961901712822, | |
| "grad_norm": 6.774177551269531, | |
| "learning_rate": 4.587306907315512e-05, | |
| "loss": 1.4427, | |
| "num_input_tokens_seen": 44880240, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.8303985913238354, | |
| "grad_norm": 7.543479919433594, | |
| "learning_rate": 4.5848057067392355e-05, | |
| "loss": 1.4559, | |
| "num_input_tokens_seen": 45150264, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.8354009924763887, | |
| "grad_norm": 6.445783615112305, | |
| "learning_rate": 4.582304506162958e-05, | |
| "loss": 1.4215, | |
| "num_input_tokens_seen": 45419664, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.8404033936289419, | |
| "grad_norm": 8.083765029907227, | |
| "learning_rate": 4.579803305586682e-05, | |
| "loss": 1.4636, | |
| "num_input_tokens_seen": 45691768, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.8454057947814951, | |
| "grad_norm": 6.205325126647949, | |
| "learning_rate": 4.577302105010405e-05, | |
| "loss": 1.4386, | |
| "num_input_tokens_seen": 45965288, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.8504081959340484, | |
| "grad_norm": 5.954364776611328, | |
| "learning_rate": 4.574800904434128e-05, | |
| "loss": 1.454, | |
| "num_input_tokens_seen": 46239520, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.8554105970866016, | |
| "grad_norm": 7.476288318634033, | |
| "learning_rate": 4.5722997038578516e-05, | |
| "loss": 1.4375, | |
| "num_input_tokens_seen": 46506456, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.8604129982391548, | |
| "grad_norm": 9.656715393066406, | |
| "learning_rate": 4.569798503281575e-05, | |
| "loss": 1.4293, | |
| "num_input_tokens_seen": 46775832, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.865415399391708, | |
| "grad_norm": 5.440873622894287, | |
| "learning_rate": 4.567297302705299e-05, | |
| "loss": 1.428, | |
| "num_input_tokens_seen": 47053184, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.8704178005442612, | |
| "grad_norm": 6.26190710067749, | |
| "learning_rate": 4.564796102129022e-05, | |
| "loss": 1.4175, | |
| "num_input_tokens_seen": 47326160, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.8754202016968144, | |
| "grad_norm": 5.701922416687012, | |
| "learning_rate": 4.5622949015527456e-05, | |
| "loss": 1.4323, | |
| "num_input_tokens_seen": 47596624, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.8804226028493677, | |
| "grad_norm": 7.687532901763916, | |
| "learning_rate": 4.559793700976469e-05, | |
| "loss": 1.4403, | |
| "num_input_tokens_seen": 47866072, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.8854250040019209, | |
| "grad_norm": 4.988935470581055, | |
| "learning_rate": 4.557292500400192e-05, | |
| "loss": 1.445, | |
| "num_input_tokens_seen": 48153664, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.8904274051544742, | |
| "grad_norm": 5.36391544342041, | |
| "learning_rate": 4.5547912998239154e-05, | |
| "loss": 1.43, | |
| "num_input_tokens_seen": 48432248, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.8954298063070274, | |
| "grad_norm": 7.618863105773926, | |
| "learning_rate": 4.552290099247639e-05, | |
| "loss": 1.4122, | |
| "num_input_tokens_seen": 48705584, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.9004322074595806, | |
| "grad_norm": 9.530303001403809, | |
| "learning_rate": 4.5497888986713624e-05, | |
| "loss": 1.4392, | |
| "num_input_tokens_seen": 48985152, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.9054346086121339, | |
| "grad_norm": 7.428534030914307, | |
| "learning_rate": 4.547287698095086e-05, | |
| "loss": 1.4289, | |
| "num_input_tokens_seen": 49262664, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.910437009764687, | |
| "grad_norm": 7.3600287437438965, | |
| "learning_rate": 4.5447864975188094e-05, | |
| "loss": 1.4242, | |
| "num_input_tokens_seen": 49536480, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.9154394109172402, | |
| "grad_norm": 5.594141960144043, | |
| "learning_rate": 4.542285296942533e-05, | |
| "loss": 1.4266, | |
| "num_input_tokens_seen": 49809336, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 0.9204418120697935, | |
| "grad_norm": 6.597540378570557, | |
| "learning_rate": 4.539784096366256e-05, | |
| "loss": 1.4216, | |
| "num_input_tokens_seen": 50082984, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.9254442132223467, | |
| "grad_norm": 8.180904388427734, | |
| "learning_rate": 4.537282895789979e-05, | |
| "loss": 1.4374, | |
| "num_input_tokens_seen": 50358512, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 0.9304466143749, | |
| "grad_norm": 7.512216567993164, | |
| "learning_rate": 4.534781695213703e-05, | |
| "loss": 1.4178, | |
| "num_input_tokens_seen": 50634216, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.9354490155274532, | |
| "grad_norm": 6.1448283195495605, | |
| "learning_rate": 4.532280494637426e-05, | |
| "loss": 1.399, | |
| "num_input_tokens_seen": 50902088, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 0.9404514166800064, | |
| "grad_norm": 6.424488544464111, | |
| "learning_rate": 4.52977929406115e-05, | |
| "loss": 1.4115, | |
| "num_input_tokens_seen": 51174136, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.9454538178325597, | |
| "grad_norm": 5.398598670959473, | |
| "learning_rate": 4.527278093484873e-05, | |
| "loss": 1.4146, | |
| "num_input_tokens_seen": 51453752, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 0.9504562189851129, | |
| "grad_norm": 6.272931098937988, | |
| "learning_rate": 4.524776892908597e-05, | |
| "loss": 1.4166, | |
| "num_input_tokens_seen": 51728016, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.955458620137666, | |
| "grad_norm": 6.412170886993408, | |
| "learning_rate": 4.5222756923323195e-05, | |
| "loss": 1.4241, | |
| "num_input_tokens_seen": 51995272, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 0.9604610212902193, | |
| "grad_norm": 7.181222438812256, | |
| "learning_rate": 4.519774491756043e-05, | |
| "loss": 1.4166, | |
| "num_input_tokens_seen": 52276888, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.9654634224427725, | |
| "grad_norm": 6.848874092102051, | |
| "learning_rate": 4.5172732911797665e-05, | |
| "loss": 1.4028, | |
| "num_input_tokens_seen": 52555928, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 0.9704658235953257, | |
| "grad_norm": 6.6588568687438965, | |
| "learning_rate": 4.51477209060349e-05, | |
| "loss": 1.3993, | |
| "num_input_tokens_seen": 52824472, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.975468224747879, | |
| "grad_norm": 7.8776373863220215, | |
| "learning_rate": 4.512270890027213e-05, | |
| "loss": 1.4204, | |
| "num_input_tokens_seen": 53098176, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 0.9804706259004322, | |
| "grad_norm": 5.281984806060791, | |
| "learning_rate": 4.509769689450937e-05, | |
| "loss": 1.4191, | |
| "num_input_tokens_seen": 53379376, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.9854730270529855, | |
| "grad_norm": 8.383103370666504, | |
| "learning_rate": 4.5072684888746605e-05, | |
| "loss": 1.4232, | |
| "num_input_tokens_seen": 53654608, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 0.9904754282055387, | |
| "grad_norm": 5.8474626541137695, | |
| "learning_rate": 4.504767288298383e-05, | |
| "loss": 1.4099, | |
| "num_input_tokens_seen": 53931080, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.9954778293580919, | |
| "grad_norm": 6.058784008026123, | |
| "learning_rate": 4.502266087722107e-05, | |
| "loss": 1.3993, | |
| "num_input_tokens_seen": 54204800, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.2487133741378784, | |
| "eval_runtime": 187.129, | |
| "eval_samples_per_second": 1068.274, | |
| "eval_steps_per_second": 133.539, | |
| "num_input_tokens_seen": 54454616, | |
| "step": 99952 | |
| }, | |
| { | |
| "epoch": 1.000480230510645, | |
| "grad_norm": 5.304110050201416, | |
| "learning_rate": 4.49976488714583e-05, | |
| "loss": 1.3882, | |
| "num_input_tokens_seen": 54481288, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 1.0054826316631984, | |
| "grad_norm": 7.098052501678467, | |
| "learning_rate": 4.497263686569553e-05, | |
| "loss": 1.2973, | |
| "num_input_tokens_seen": 54749928, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 1.0104850328157515, | |
| "grad_norm": 7.15824031829834, | |
| "learning_rate": 4.4947624859932766e-05, | |
| "loss": 1.3323, | |
| "num_input_tokens_seen": 55027920, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 1.0154874339683049, | |
| "grad_norm": 6.138706684112549, | |
| "learning_rate": 4.492261285417e-05, | |
| "loss": 1.3195, | |
| "num_input_tokens_seen": 55303960, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 1.020489835120858, | |
| "grad_norm": 8.01395320892334, | |
| "learning_rate": 4.4897600848407236e-05, | |
| "loss": 1.2913, | |
| "num_input_tokens_seen": 55577184, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 1.0254922362734111, | |
| "grad_norm": 7.413015842437744, | |
| "learning_rate": 4.487258884264447e-05, | |
| "loss": 1.3284, | |
| "num_input_tokens_seen": 55851192, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 1.0304946374259645, | |
| "grad_norm": 6.665005207061768, | |
| "learning_rate": 4.4847576836881706e-05, | |
| "loss": 1.3239, | |
| "num_input_tokens_seen": 56125184, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 1.0354970385785176, | |
| "grad_norm": 6.208978652954102, | |
| "learning_rate": 4.482256483111894e-05, | |
| "loss": 1.3198, | |
| "num_input_tokens_seen": 56399640, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 1.040499439731071, | |
| "grad_norm": 6.494995594024658, | |
| "learning_rate": 4.479755282535617e-05, | |
| "loss": 1.3036, | |
| "num_input_tokens_seen": 56672752, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 1.045501840883624, | |
| "grad_norm": 7.3449625968933105, | |
| "learning_rate": 4.4772540819593404e-05, | |
| "loss": 1.3304, | |
| "num_input_tokens_seen": 56942744, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 1.0505042420361774, | |
| "grad_norm": 5.880083084106445, | |
| "learning_rate": 4.474752881383064e-05, | |
| "loss": 1.3273, | |
| "num_input_tokens_seen": 57223568, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.0555066431887306, | |
| "grad_norm": 7.793262004852295, | |
| "learning_rate": 4.4722516808067874e-05, | |
| "loss": 1.3364, | |
| "num_input_tokens_seen": 57501104, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 1.060509044341284, | |
| "grad_norm": 5.995269298553467, | |
| "learning_rate": 4.469750480230511e-05, | |
| "loss": 1.3157, | |
| "num_input_tokens_seen": 57774032, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 1.065511445493837, | |
| "grad_norm": 6.386702060699463, | |
| "learning_rate": 4.4672492796542344e-05, | |
| "loss": 1.2906, | |
| "num_input_tokens_seen": 58052328, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 1.0705138466463904, | |
| "grad_norm": 6.049729347229004, | |
| "learning_rate": 4.464748079077958e-05, | |
| "loss": 1.3073, | |
| "num_input_tokens_seen": 58325864, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 1.0755162477989435, | |
| "grad_norm": 6.0326433181762695, | |
| "learning_rate": 4.462246878501681e-05, | |
| "loss": 1.3223, | |
| "num_input_tokens_seen": 58605688, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 1.0805186489514966, | |
| "grad_norm": 7.254247188568115, | |
| "learning_rate": 4.459745677925404e-05, | |
| "loss": 1.3131, | |
| "num_input_tokens_seen": 58875792, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 1.08552105010405, | |
| "grad_norm": 5.334825038909912, | |
| "learning_rate": 4.457244477349128e-05, | |
| "loss": 1.3313, | |
| "num_input_tokens_seen": 59148200, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 1.090523451256603, | |
| "grad_norm": 5.982466697692871, | |
| "learning_rate": 4.454743276772851e-05, | |
| "loss": 1.3031, | |
| "num_input_tokens_seen": 59416680, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 1.0955258524091565, | |
| "grad_norm": 5.858680725097656, | |
| "learning_rate": 4.452242076196575e-05, | |
| "loss": 1.2964, | |
| "num_input_tokens_seen": 59680504, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 1.1005282535617096, | |
| "grad_norm": 7.001748085021973, | |
| "learning_rate": 4.449740875620298e-05, | |
| "loss": 1.3203, | |
| "num_input_tokens_seen": 59951112, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.105530654714263, | |
| "grad_norm": 7.0456013679504395, | |
| "learning_rate": 4.447239675044022e-05, | |
| "loss": 1.3229, | |
| "num_input_tokens_seen": 60223952, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 1.110533055866816, | |
| "grad_norm": 8.38005256652832, | |
| "learning_rate": 4.4447384744677446e-05, | |
| "loss": 1.3045, | |
| "num_input_tokens_seen": 60497304, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 1.1155354570193694, | |
| "grad_norm": 6.44760799407959, | |
| "learning_rate": 4.442237273891468e-05, | |
| "loss": 1.3298, | |
| "num_input_tokens_seen": 60770032, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 1.1205378581719225, | |
| "grad_norm": 7.661795616149902, | |
| "learning_rate": 4.4397360733151915e-05, | |
| "loss": 1.299, | |
| "num_input_tokens_seen": 61041904, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 1.1255402593244757, | |
| "grad_norm": 7.2505340576171875, | |
| "learning_rate": 4.437234872738915e-05, | |
| "loss": 1.3444, | |
| "num_input_tokens_seen": 61315792, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 1.130542660477029, | |
| "grad_norm": 8.16947078704834, | |
| "learning_rate": 4.434733672162638e-05, | |
| "loss": 1.313, | |
| "num_input_tokens_seen": 61591968, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 1.1355450616295821, | |
| "grad_norm": 6.221188068389893, | |
| "learning_rate": 4.4322324715863614e-05, | |
| "loss": 1.3266, | |
| "num_input_tokens_seen": 61862648, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 1.1405474627821355, | |
| "grad_norm": 5.967212677001953, | |
| "learning_rate": 4.4297312710100855e-05, | |
| "loss": 1.3521, | |
| "num_input_tokens_seen": 62138024, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 1.1455498639346886, | |
| "grad_norm": 6.872376441955566, | |
| "learning_rate": 4.4272300704338084e-05, | |
| "loss": 1.3218, | |
| "num_input_tokens_seen": 62414352, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 1.150552265087242, | |
| "grad_norm": 6.218190670013428, | |
| "learning_rate": 4.424728869857532e-05, | |
| "loss": 1.3306, | |
| "num_input_tokens_seen": 62689104, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.155554666239795, | |
| "grad_norm": 8.191985130310059, | |
| "learning_rate": 4.4222276692812553e-05, | |
| "loss": 1.3236, | |
| "num_input_tokens_seen": 62963216, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 1.1605570673923484, | |
| "grad_norm": 6.161906719207764, | |
| "learning_rate": 4.419726468704979e-05, | |
| "loss": 1.3258, | |
| "num_input_tokens_seen": 63235456, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 1.1655594685449016, | |
| "grad_norm": 7.158758640289307, | |
| "learning_rate": 4.417225268128702e-05, | |
| "loss": 1.3037, | |
| "num_input_tokens_seen": 63505248, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 1.1705618696974547, | |
| "grad_norm": 5.683105945587158, | |
| "learning_rate": 4.414724067552425e-05, | |
| "loss": 1.3154, | |
| "num_input_tokens_seen": 63772504, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 1.175564270850008, | |
| "grad_norm": 7.0123467445373535, | |
| "learning_rate": 4.4122228669761487e-05, | |
| "loss": 1.3043, | |
| "num_input_tokens_seen": 64045928, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 1.1805666720025612, | |
| "grad_norm": 5.434397220611572, | |
| "learning_rate": 4.409721666399872e-05, | |
| "loss": 1.3247, | |
| "num_input_tokens_seen": 64313624, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 1.1855690731551145, | |
| "grad_norm": 6.298323631286621, | |
| "learning_rate": 4.4072204658235956e-05, | |
| "loss": 1.3333, | |
| "num_input_tokens_seen": 64591384, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 1.1905714743076676, | |
| "grad_norm": 6.530762672424316, | |
| "learning_rate": 4.404719265247319e-05, | |
| "loss": 1.3324, | |
| "num_input_tokens_seen": 64864128, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 1.195573875460221, | |
| "grad_norm": 7.463630199432373, | |
| "learning_rate": 4.4022180646710426e-05, | |
| "loss": 1.314, | |
| "num_input_tokens_seen": 65134680, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 1.200576276612774, | |
| "grad_norm": 8.017274856567383, | |
| "learning_rate": 4.3997168640947655e-05, | |
| "loss": 1.3, | |
| "num_input_tokens_seen": 65400128, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.2055786777653275, | |
| "grad_norm": 6.083741188049316, | |
| "learning_rate": 4.397215663518489e-05, | |
| "loss": 1.3122, | |
| "num_input_tokens_seen": 65670200, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 1.2105810789178806, | |
| "grad_norm": 7.809543609619141, | |
| "learning_rate": 4.3947144629422125e-05, | |
| "loss": 1.316, | |
| "num_input_tokens_seen": 65935248, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 1.2155834800704337, | |
| "grad_norm": 6.627076148986816, | |
| "learning_rate": 4.392213262365936e-05, | |
| "loss": 1.3024, | |
| "num_input_tokens_seen": 66206584, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 1.220585881222987, | |
| "grad_norm": 5.432526111602783, | |
| "learning_rate": 4.3897120617896594e-05, | |
| "loss": 1.3181, | |
| "num_input_tokens_seen": 66476424, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 1.2255882823755402, | |
| "grad_norm": 5.557873249053955, | |
| "learning_rate": 4.387210861213383e-05, | |
| "loss": 1.3066, | |
| "num_input_tokens_seen": 66746568, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 1.2305906835280935, | |
| "grad_norm": 5.4136738777160645, | |
| "learning_rate": 4.384709660637106e-05, | |
| "loss": 1.3065, | |
| "num_input_tokens_seen": 67013472, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 1.2355930846806467, | |
| "grad_norm": 4.602624416351318, | |
| "learning_rate": 4.382208460060829e-05, | |
| "loss": 1.2921, | |
| "num_input_tokens_seen": 67284136, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 1.2405954858332, | |
| "grad_norm": 7.711009502410889, | |
| "learning_rate": 4.379707259484553e-05, | |
| "loss": 1.3104, | |
| "num_input_tokens_seen": 67555712, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 1.2455978869857531, | |
| "grad_norm": 5.971095561981201, | |
| "learning_rate": 4.377206058908276e-05, | |
| "loss": 1.3288, | |
| "num_input_tokens_seen": 67830816, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 1.2506002881383065, | |
| "grad_norm": 5.992773056030273, | |
| "learning_rate": 4.374704858331999e-05, | |
| "loss": 1.3372, | |
| "num_input_tokens_seen": 68113208, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 1.2556026892908596, | |
| "grad_norm": 7.2574238777160645, | |
| "learning_rate": 4.372203657755723e-05, | |
| "loss": 1.2964, | |
| "num_input_tokens_seen": 68376088, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 1.2606050904434127, | |
| "grad_norm": 4.974996566772461, | |
| "learning_rate": 4.369702457179447e-05, | |
| "loss": 1.3021, | |
| "num_input_tokens_seen": 68641168, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 1.265607491595966, | |
| "grad_norm": 5.745625019073486, | |
| "learning_rate": 4.3672012566031696e-05, | |
| "loss": 1.3217, | |
| "num_input_tokens_seen": 68909752, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 1.2706098927485192, | |
| "grad_norm": 6.78819465637207, | |
| "learning_rate": 4.364700056026893e-05, | |
| "loss": 1.3211, | |
| "num_input_tokens_seen": 69181824, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 1.2756122939010726, | |
| "grad_norm": 7.1991047859191895, | |
| "learning_rate": 4.3621988554506166e-05, | |
| "loss": 1.3175, | |
| "num_input_tokens_seen": 69448304, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 1.2806146950536257, | |
| "grad_norm": 5.636517524719238, | |
| "learning_rate": 4.35969765487434e-05, | |
| "loss": 1.308, | |
| "num_input_tokens_seen": 69724960, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 1.285617096206179, | |
| "grad_norm": 6.406187057495117, | |
| "learning_rate": 4.357196454298063e-05, | |
| "loss": 1.3225, | |
| "num_input_tokens_seen": 70004440, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 1.2906194973587322, | |
| "grad_norm": 5.746100902557373, | |
| "learning_rate": 4.3546952537217864e-05, | |
| "loss": 1.3084, | |
| "num_input_tokens_seen": 70276824, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 1.2956218985112855, | |
| "grad_norm": 5.6266584396362305, | |
| "learning_rate": 4.3521940531455105e-05, | |
| "loss": 1.3251, | |
| "num_input_tokens_seen": 70549080, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 1.3006242996638386, | |
| "grad_norm": 6.3568315505981445, | |
| "learning_rate": 4.3496928525692334e-05, | |
| "loss": 1.2909, | |
| "num_input_tokens_seen": 70822216, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 1.3056267008163918, | |
| "grad_norm": 6.566619873046875, | |
| "learning_rate": 4.347191651992957e-05, | |
| "loss": 1.3083, | |
| "num_input_tokens_seen": 71088152, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 1.310629101968945, | |
| "grad_norm": 8.060522079467773, | |
| "learning_rate": 4.3446904514166804e-05, | |
| "loss": 1.3124, | |
| "num_input_tokens_seen": 71354416, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 1.3156315031214985, | |
| "grad_norm": 7.366143226623535, | |
| "learning_rate": 4.342189250840404e-05, | |
| "loss": 1.317, | |
| "num_input_tokens_seen": 71630192, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 1.3206339042740516, | |
| "grad_norm": 6.985642910003662, | |
| "learning_rate": 4.339688050264127e-05, | |
| "loss": 1.3115, | |
| "num_input_tokens_seen": 71898288, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 1.3256363054266047, | |
| "grad_norm": 6.185933589935303, | |
| "learning_rate": 4.33718684968785e-05, | |
| "loss": 1.3227, | |
| "num_input_tokens_seen": 72177880, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 1.330638706579158, | |
| "grad_norm": 5.259435176849365, | |
| "learning_rate": 4.334685649111574e-05, | |
| "loss": 1.3202, | |
| "num_input_tokens_seen": 72456024, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 1.3356411077317112, | |
| "grad_norm": 6.163081169128418, | |
| "learning_rate": 4.332184448535297e-05, | |
| "loss": 1.3021, | |
| "num_input_tokens_seen": 72724464, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 1.3406435088842645, | |
| "grad_norm": 5.284718036651611, | |
| "learning_rate": 4.329683247959021e-05, | |
| "loss": 1.3063, | |
| "num_input_tokens_seen": 72991696, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 1.3456459100368177, | |
| "grad_norm": 6.016850471496582, | |
| "learning_rate": 4.327182047382744e-05, | |
| "loss": 1.3012, | |
| "num_input_tokens_seen": 73261048, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 1.3506483111893708, | |
| "grad_norm": 6.393965244293213, | |
| "learning_rate": 4.3246808468064677e-05, | |
| "loss": 1.2991, | |
| "num_input_tokens_seen": 73529952, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 1.3556507123419241, | |
| "grad_norm": 7.240478992462158, | |
| "learning_rate": 4.3221796462301905e-05, | |
| "loss": 1.3297, | |
| "num_input_tokens_seen": 73806208, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 1.3606531134944775, | |
| "grad_norm": 6.343556880950928, | |
| "learning_rate": 4.319678445653914e-05, | |
| "loss": 1.3228, | |
| "num_input_tokens_seen": 74076360, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 1.3656555146470306, | |
| "grad_norm": 5.717186450958252, | |
| "learning_rate": 4.3171772450776375e-05, | |
| "loss": 1.3018, | |
| "num_input_tokens_seen": 74350688, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 1.3706579157995837, | |
| "grad_norm": 5.872751235961914, | |
| "learning_rate": 4.314676044501361e-05, | |
| "loss": 1.3053, | |
| "num_input_tokens_seen": 74623168, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 1.375660316952137, | |
| "grad_norm": 6.422801971435547, | |
| "learning_rate": 4.3121748439250845e-05, | |
| "loss": 1.3107, | |
| "num_input_tokens_seen": 74892400, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 1.3806627181046902, | |
| "grad_norm": 5.038456439971924, | |
| "learning_rate": 4.309673643348808e-05, | |
| "loss": 1.3261, | |
| "num_input_tokens_seen": 75161376, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 1.3856651192572436, | |
| "grad_norm": 6.162600040435791, | |
| "learning_rate": 4.3071724427725315e-05, | |
| "loss": 1.2904, | |
| "num_input_tokens_seen": 75437000, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 1.3906675204097967, | |
| "grad_norm": 5.364713191986084, | |
| "learning_rate": 4.304671242196254e-05, | |
| "loss": 1.3162, | |
| "num_input_tokens_seen": 75711312, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 1.3956699215623498, | |
| "grad_norm": 6.959611415863037, | |
| "learning_rate": 4.302170041619978e-05, | |
| "loss": 1.3231, | |
| "num_input_tokens_seen": 75982336, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 1.4006723227149032, | |
| "grad_norm": 7.737590789794922, | |
| "learning_rate": 4.299668841043701e-05, | |
| "loss": 1.3175, | |
| "num_input_tokens_seen": 76261536, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 1.4056747238674565, | |
| "grad_norm": 5.541545391082764, | |
| "learning_rate": 4.297167640467424e-05, | |
| "loss": 1.3075, | |
| "num_input_tokens_seen": 76530928, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 1.4106771250200096, | |
| "grad_norm": 6.196156024932861, | |
| "learning_rate": 4.2946664398911476e-05, | |
| "loss": 1.3045, | |
| "num_input_tokens_seen": 76805928, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 1.4156795261725628, | |
| "grad_norm": 5.349905490875244, | |
| "learning_rate": 4.292165239314872e-05, | |
| "loss": 1.3223, | |
| "num_input_tokens_seen": 77083224, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 1.4206819273251161, | |
| "grad_norm": 5.8378586769104, | |
| "learning_rate": 4.2896640387385946e-05, | |
| "loss": 1.3025, | |
| "num_input_tokens_seen": 77352800, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 1.4256843284776692, | |
| "grad_norm": 6.061739921569824, | |
| "learning_rate": 4.287162838162318e-05, | |
| "loss": 1.3074, | |
| "num_input_tokens_seen": 77625328, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 1.4306867296302226, | |
| "grad_norm": 5.522953510284424, | |
| "learning_rate": 4.2846616375860416e-05, | |
| "loss": 1.3052, | |
| "num_input_tokens_seen": 77902368, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 1.4356891307827757, | |
| "grad_norm": 6.295720100402832, | |
| "learning_rate": 4.282160437009765e-05, | |
| "loss": 1.3118, | |
| "num_input_tokens_seen": 78177488, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 1.4406915319353288, | |
| "grad_norm": 6.575023651123047, | |
| "learning_rate": 4.279659236433488e-05, | |
| "loss": 1.3093, | |
| "num_input_tokens_seen": 78446712, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 1.4456939330878822, | |
| "grad_norm": 6.984113693237305, | |
| "learning_rate": 4.2771580358572114e-05, | |
| "loss": 1.3076, | |
| "num_input_tokens_seen": 78720880, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 1.4506963342404355, | |
| "grad_norm": 5.520240306854248, | |
| "learning_rate": 4.274656835280935e-05, | |
| "loss": 1.3001, | |
| "num_input_tokens_seen": 78987824, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 1.4556987353929887, | |
| "grad_norm": 8.607036590576172, | |
| "learning_rate": 4.2721556347046584e-05, | |
| "loss": 1.3129, | |
| "num_input_tokens_seen": 79265448, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 1.4607011365455418, | |
| "grad_norm": 5.851890563964844, | |
| "learning_rate": 4.269654434128382e-05, | |
| "loss": 1.283, | |
| "num_input_tokens_seen": 79533224, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 1.4657035376980951, | |
| "grad_norm": 6.837863922119141, | |
| "learning_rate": 4.2671532335521054e-05, | |
| "loss": 1.3191, | |
| "num_input_tokens_seen": 79806784, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 1.4707059388506483, | |
| "grad_norm": 8.558204650878906, | |
| "learning_rate": 4.264652032975829e-05, | |
| "loss": 1.3004, | |
| "num_input_tokens_seen": 80082392, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 1.4757083400032016, | |
| "grad_norm": 5.562234401702881, | |
| "learning_rate": 4.262150832399552e-05, | |
| "loss": 1.3127, | |
| "num_input_tokens_seen": 80357544, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 1.4807107411557547, | |
| "grad_norm": 6.331244945526123, | |
| "learning_rate": 4.259649631823275e-05, | |
| "loss": 1.2875, | |
| "num_input_tokens_seen": 80619480, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 1.4857131423083079, | |
| "grad_norm": 7.26661491394043, | |
| "learning_rate": 4.257148431246999e-05, | |
| "loss": 1.286, | |
| "num_input_tokens_seen": 80889016, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 1.4907155434608612, | |
| "grad_norm": 6.140303134918213, | |
| "learning_rate": 4.254647230670722e-05, | |
| "loss": 1.3209, | |
| "num_input_tokens_seen": 81158600, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 1.4957179446134146, | |
| "grad_norm": 6.452395439147949, | |
| "learning_rate": 4.252146030094446e-05, | |
| "loss": 1.3115, | |
| "num_input_tokens_seen": 81438680, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 1.5007203457659677, | |
| "grad_norm": 7.9884257316589355, | |
| "learning_rate": 4.249644829518169e-05, | |
| "loss": 1.2958, | |
| "num_input_tokens_seen": 81705824, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 1.5057227469185208, | |
| "grad_norm": 5.807667255401611, | |
| "learning_rate": 4.247143628941893e-05, | |
| "loss": 1.3309, | |
| "num_input_tokens_seen": 81978560, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 1.5107251480710742, | |
| "grad_norm": 6.487443447113037, | |
| "learning_rate": 4.2446424283656155e-05, | |
| "loss": 1.303, | |
| "num_input_tokens_seen": 82250552, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 1.5157275492236273, | |
| "grad_norm": 7.297651767730713, | |
| "learning_rate": 4.242141227789339e-05, | |
| "loss": 1.2961, | |
| "num_input_tokens_seen": 82528296, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 1.5207299503761806, | |
| "grad_norm": 6.434643268585205, | |
| "learning_rate": 4.2396400272130625e-05, | |
| "loss": 1.2926, | |
| "num_input_tokens_seen": 82791496, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 1.5257323515287338, | |
| "grad_norm": 6.918686389923096, | |
| "learning_rate": 4.237138826636785e-05, | |
| "loss": 1.2998, | |
| "num_input_tokens_seen": 83063776, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 1.530734752681287, | |
| "grad_norm": 5.594851493835449, | |
| "learning_rate": 4.2346376260605095e-05, | |
| "loss": 1.321, | |
| "num_input_tokens_seen": 83339208, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 1.5357371538338402, | |
| "grad_norm": 6.245510578155518, | |
| "learning_rate": 4.232136425484233e-05, | |
| "loss": 1.2743, | |
| "num_input_tokens_seen": 83610920, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 1.5407395549863936, | |
| "grad_norm": 6.392094612121582, | |
| "learning_rate": 4.2296352249079565e-05, | |
| "loss": 1.3062, | |
| "num_input_tokens_seen": 83883600, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 1.5457419561389467, | |
| "grad_norm": 6.538769245147705, | |
| "learning_rate": 4.227134024331679e-05, | |
| "loss": 1.3111, | |
| "num_input_tokens_seen": 84152704, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 1.5507443572914998, | |
| "grad_norm": 6.384563446044922, | |
| "learning_rate": 4.224632823755403e-05, | |
| "loss": 1.2767, | |
| "num_input_tokens_seen": 84425920, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 1.5557467584440532, | |
| "grad_norm": 6.407052040100098, | |
| "learning_rate": 4.222131623179126e-05, | |
| "loss": 1.2865, | |
| "num_input_tokens_seen": 84697904, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 1.5607491595966063, | |
| "grad_norm": 6.534234046936035, | |
| "learning_rate": 4.219630422602849e-05, | |
| "loss": 1.2817, | |
| "num_input_tokens_seen": 84968680, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 1.5657515607491597, | |
| "grad_norm": 5.641045093536377, | |
| "learning_rate": 4.2171292220265726e-05, | |
| "loss": 1.2963, | |
| "num_input_tokens_seen": 85238032, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 1.5707539619017128, | |
| "grad_norm": 6.242879867553711, | |
| "learning_rate": 4.214628021450297e-05, | |
| "loss": 1.2924, | |
| "num_input_tokens_seen": 85511200, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 1.575756363054266, | |
| "grad_norm": 6.90887451171875, | |
| "learning_rate": 4.21212682087402e-05, | |
| "loss": 1.2968, | |
| "num_input_tokens_seen": 85785448, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 1.5807587642068193, | |
| "grad_norm": 7.269606590270996, | |
| "learning_rate": 4.209625620297743e-05, | |
| "loss": 1.2845, | |
| "num_input_tokens_seen": 86056312, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 1.5857611653593726, | |
| "grad_norm": 5.152353763580322, | |
| "learning_rate": 4.2071244197214666e-05, | |
| "loss": 1.2909, | |
| "num_input_tokens_seen": 86333312, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 1.5907635665119257, | |
| "grad_norm": 6.0240631103515625, | |
| "learning_rate": 4.20462321914519e-05, | |
| "loss": 1.2923, | |
| "num_input_tokens_seen": 86610480, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 1.5957659676644789, | |
| "grad_norm": 7.361881256103516, | |
| "learning_rate": 4.202122018568913e-05, | |
| "loss": 1.2759, | |
| "num_input_tokens_seen": 86882480, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 1.6007683688170322, | |
| "grad_norm": 5.192800521850586, | |
| "learning_rate": 4.1996208179926364e-05, | |
| "loss": 1.298, | |
| "num_input_tokens_seen": 87144592, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 1.6057707699695856, | |
| "grad_norm": 7.1856369972229, | |
| "learning_rate": 4.19711961741636e-05, | |
| "loss": 1.2955, | |
| "num_input_tokens_seen": 87420328, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 1.6107731711221387, | |
| "grad_norm": 5.096145153045654, | |
| "learning_rate": 4.1946184168400834e-05, | |
| "loss": 1.2923, | |
| "num_input_tokens_seen": 87696968, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 1.6157755722746918, | |
| "grad_norm": 6.808541297912598, | |
| "learning_rate": 4.192117216263807e-05, | |
| "loss": 1.3044, | |
| "num_input_tokens_seen": 87977352, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 1.620777973427245, | |
| "grad_norm": 5.258007526397705, | |
| "learning_rate": 4.1896160156875304e-05, | |
| "loss": 1.3023, | |
| "num_input_tokens_seen": 88251864, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 1.6257803745797983, | |
| "grad_norm": 5.184575080871582, | |
| "learning_rate": 4.187114815111254e-05, | |
| "loss": 1.2594, | |
| "num_input_tokens_seen": 88521800, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 1.6307827757323516, | |
| "grad_norm": 5.858316421508789, | |
| "learning_rate": 4.184613614534977e-05, | |
| "loss": 1.2854, | |
| "num_input_tokens_seen": 88788776, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 1.6357851768849048, | |
| "grad_norm": 7.03213357925415, | |
| "learning_rate": 4.1821124139587e-05, | |
| "loss": 1.2745, | |
| "num_input_tokens_seen": 89054472, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 1.640787578037458, | |
| "grad_norm": 7.509394645690918, | |
| "learning_rate": 4.179611213382424e-05, | |
| "loss": 1.2932, | |
| "num_input_tokens_seen": 89332456, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 1.6457899791900112, | |
| "grad_norm": 7.114541530609131, | |
| "learning_rate": 4.177110012806147e-05, | |
| "loss": 1.2751, | |
| "num_input_tokens_seen": 89609920, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 1.6507923803425646, | |
| "grad_norm": 14.539456367492676, | |
| "learning_rate": 4.174608812229871e-05, | |
| "loss": 1.3115, | |
| "num_input_tokens_seen": 89875856, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 1.6557947814951177, | |
| "grad_norm": 5.730625629425049, | |
| "learning_rate": 4.172107611653594e-05, | |
| "loss": 1.2938, | |
| "num_input_tokens_seen": 90148472, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 1.6607971826476708, | |
| "grad_norm": 5.901363849639893, | |
| "learning_rate": 4.169606411077318e-05, | |
| "loss": 1.2895, | |
| "num_input_tokens_seen": 90424400, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 1.665799583800224, | |
| "grad_norm": 5.94663667678833, | |
| "learning_rate": 4.1671052105010405e-05, | |
| "loss": 1.2988, | |
| "num_input_tokens_seen": 90702152, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 1.6708019849527773, | |
| "grad_norm": 5.720317363739014, | |
| "learning_rate": 4.164604009924764e-05, | |
| "loss": 1.2921, | |
| "num_input_tokens_seen": 90980904, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 1.6758043861053307, | |
| "grad_norm": 8.514877319335938, | |
| "learning_rate": 4.1621028093484875e-05, | |
| "loss": 1.2762, | |
| "num_input_tokens_seen": 91249200, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 1.6808067872578838, | |
| "grad_norm": 8.756369590759277, | |
| "learning_rate": 4.15960160877221e-05, | |
| "loss": 1.2898, | |
| "num_input_tokens_seen": 91523408, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 1.685809188410437, | |
| "grad_norm": 4.922306537628174, | |
| "learning_rate": 4.1571004081959345e-05, | |
| "loss": 1.2873, | |
| "num_input_tokens_seen": 91795848, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 1.6908115895629903, | |
| "grad_norm": 5.668425559997559, | |
| "learning_rate": 4.154599207619658e-05, | |
| "loss": 1.2662, | |
| "num_input_tokens_seen": 92067336, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 1.6958139907155436, | |
| "grad_norm": 6.631772518157959, | |
| "learning_rate": 4.1520980070433815e-05, | |
| "loss": 1.3048, | |
| "num_input_tokens_seen": 92339392, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 1.7008163918680967, | |
| "grad_norm": 6.489889144897461, | |
| "learning_rate": 4.149596806467104e-05, | |
| "loss": 1.2835, | |
| "num_input_tokens_seen": 92613216, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 1.7058187930206499, | |
| "grad_norm": 6.344711780548096, | |
| "learning_rate": 4.147095605890828e-05, | |
| "loss": 1.2918, | |
| "num_input_tokens_seen": 92890872, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 1.710821194173203, | |
| "grad_norm": 7.276896953582764, | |
| "learning_rate": 4.144594405314551e-05, | |
| "loss": 1.303, | |
| "num_input_tokens_seen": 93161528, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 1.7158235953257563, | |
| "grad_norm": 6.139397144317627, | |
| "learning_rate": 4.142093204738274e-05, | |
| "loss": 1.2884, | |
| "num_input_tokens_seen": 93434024, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 1.7208259964783097, | |
| "grad_norm": 5.353676795959473, | |
| "learning_rate": 4.1395920041619976e-05, | |
| "loss": 1.2865, | |
| "num_input_tokens_seen": 93712728, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 1.7258283976308628, | |
| "grad_norm": 7.979468822479248, | |
| "learning_rate": 4.137090803585721e-05, | |
| "loss": 1.2884, | |
| "num_input_tokens_seen": 93992640, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 1.730830798783416, | |
| "grad_norm": 5.386059761047363, | |
| "learning_rate": 4.134589603009445e-05, | |
| "loss": 1.2875, | |
| "num_input_tokens_seen": 94262344, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 1.7358331999359693, | |
| "grad_norm": 4.8488311767578125, | |
| "learning_rate": 4.132088402433168e-05, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 94536416, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 1.7408356010885226, | |
| "grad_norm": 7.375112056732178, | |
| "learning_rate": 4.1295872018568916e-05, | |
| "loss": 1.276, | |
| "num_input_tokens_seen": 94806528, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 1.7458380022410758, | |
| "grad_norm": 4.830787181854248, | |
| "learning_rate": 4.127086001280615e-05, | |
| "loss": 1.2681, | |
| "num_input_tokens_seen": 95076832, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 1.750840403393629, | |
| "grad_norm": 5.590123653411865, | |
| "learning_rate": 4.124584800704338e-05, | |
| "loss": 1.273, | |
| "num_input_tokens_seen": 95342672, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 1.755842804546182, | |
| "grad_norm": 5.334784984588623, | |
| "learning_rate": 4.1220836001280614e-05, | |
| "loss": 1.29, | |
| "num_input_tokens_seen": 95613968, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 1.7608452056987354, | |
| "grad_norm": 5.795757293701172, | |
| "learning_rate": 4.119582399551785e-05, | |
| "loss": 1.2683, | |
| "num_input_tokens_seen": 95880488, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 1.7658476068512887, | |
| "grad_norm": 5.436016082763672, | |
| "learning_rate": 4.1170811989755084e-05, | |
| "loss": 1.2665, | |
| "num_input_tokens_seen": 96148192, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 1.7708500080038418, | |
| "grad_norm": 7.753640174865723, | |
| "learning_rate": 4.114579998399232e-05, | |
| "loss": 1.3018, | |
| "num_input_tokens_seen": 96422808, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 1.775852409156395, | |
| "grad_norm": 6.833972454071045, | |
| "learning_rate": 4.1120787978229554e-05, | |
| "loss": 1.2731, | |
| "num_input_tokens_seen": 96691008, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 1.7808548103089483, | |
| "grad_norm": 5.354393482208252, | |
| "learning_rate": 4.109577597246679e-05, | |
| "loss": 1.2683, | |
| "num_input_tokens_seen": 96964040, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 1.7858572114615017, | |
| "grad_norm": 5.666247367858887, | |
| "learning_rate": 4.107076396670402e-05, | |
| "loss": 1.3028, | |
| "num_input_tokens_seen": 97244192, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 1.7908596126140548, | |
| "grad_norm": 5.841219902038574, | |
| "learning_rate": 4.104575196094125e-05, | |
| "loss": 1.2637, | |
| "num_input_tokens_seen": 97515856, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 1.795862013766608, | |
| "grad_norm": 6.097582817077637, | |
| "learning_rate": 4.102073995517849e-05, | |
| "loss": 1.2602, | |
| "num_input_tokens_seen": 97784032, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 1.800864414919161, | |
| "grad_norm": 6.291224002838135, | |
| "learning_rate": 4.099572794941572e-05, | |
| "loss": 1.2792, | |
| "num_input_tokens_seen": 98061744, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 1.8058668160717144, | |
| "grad_norm": 6.529845237731934, | |
| "learning_rate": 4.097071594365296e-05, | |
| "loss": 1.2751, | |
| "num_input_tokens_seen": 98333416, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 1.8108692172242677, | |
| "grad_norm": 5.767446041107178, | |
| "learning_rate": 4.094570393789019e-05, | |
| "loss": 1.2924, | |
| "num_input_tokens_seen": 98611352, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 1.8158716183768209, | |
| "grad_norm": 11.748208999633789, | |
| "learning_rate": 4.092069193212743e-05, | |
| "loss": 1.2656, | |
| "num_input_tokens_seen": 98875048, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 1.820874019529374, | |
| "grad_norm": 6.215290069580078, | |
| "learning_rate": 4.0895679926364655e-05, | |
| "loss": 1.2854, | |
| "num_input_tokens_seen": 99142440, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 1.8258764206819273, | |
| "grad_norm": 4.965378284454346, | |
| "learning_rate": 4.087066792060189e-05, | |
| "loss": 1.3011, | |
| "num_input_tokens_seen": 99420240, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 1.8308788218344807, | |
| "grad_norm": 4.903427600860596, | |
| "learning_rate": 4.0845655914839125e-05, | |
| "loss": 1.29, | |
| "num_input_tokens_seen": 99700544, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 1.8358812229870338, | |
| "grad_norm": 7.428767204284668, | |
| "learning_rate": 4.082064390907635e-05, | |
| "loss": 1.2784, | |
| "num_input_tokens_seen": 99976104, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 1.840883624139587, | |
| "grad_norm": 5.334924697875977, | |
| "learning_rate": 4.079563190331359e-05, | |
| "loss": 1.2617, | |
| "num_input_tokens_seen": 100248360, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 1.84588602529214, | |
| "grad_norm": 5.380727291107178, | |
| "learning_rate": 4.077061989755083e-05, | |
| "loss": 1.2824, | |
| "num_input_tokens_seen": 100520712, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 1.8508884264446934, | |
| "grad_norm": 6.993951320648193, | |
| "learning_rate": 4.0745607891788065e-05, | |
| "loss": 1.3006, | |
| "num_input_tokens_seen": 100795856, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 1.8558908275972468, | |
| "grad_norm": 6.079780578613281, | |
| "learning_rate": 4.072059588602529e-05, | |
| "loss": 1.2844, | |
| "num_input_tokens_seen": 101069232, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 1.8608932287498, | |
| "grad_norm": 5.772866725921631, | |
| "learning_rate": 4.069558388026253e-05, | |
| "loss": 1.269, | |
| "num_input_tokens_seen": 101347816, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 1.865895629902353, | |
| "grad_norm": 6.067032337188721, | |
| "learning_rate": 4.067057187449976e-05, | |
| "loss": 1.2753, | |
| "num_input_tokens_seen": 101618784, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 1.8708980310549064, | |
| "grad_norm": 8.178043365478516, | |
| "learning_rate": 4.064555986873699e-05, | |
| "loss": 1.2694, | |
| "num_input_tokens_seen": 101889416, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 1.8759004322074597, | |
| "grad_norm": 5.999898433685303, | |
| "learning_rate": 4.0620547862974226e-05, | |
| "loss": 1.2774, | |
| "num_input_tokens_seen": 102163040, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 1.8809028333600128, | |
| "grad_norm": 7.069881439208984, | |
| "learning_rate": 4.059553585721146e-05, | |
| "loss": 1.2848, | |
| "num_input_tokens_seen": 102431528, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 1.885905234512566, | |
| "grad_norm": 5.21435546875, | |
| "learning_rate": 4.05705238514487e-05, | |
| "loss": 1.2599, | |
| "num_input_tokens_seen": 102705520, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 1.890907635665119, | |
| "grad_norm": 6.542243003845215, | |
| "learning_rate": 4.054551184568593e-05, | |
| "loss": 1.283, | |
| "num_input_tokens_seen": 102981304, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 1.8959100368176725, | |
| "grad_norm": 6.719133377075195, | |
| "learning_rate": 4.0520499839923166e-05, | |
| "loss": 1.2879, | |
| "num_input_tokens_seen": 103259112, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 1.9009124379702258, | |
| "grad_norm": 6.38728666305542, | |
| "learning_rate": 4.04954878341604e-05, | |
| "loss": 1.2548, | |
| "num_input_tokens_seen": 103534888, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 1.905914839122779, | |
| "grad_norm": 5.428126811981201, | |
| "learning_rate": 4.047047582839763e-05, | |
| "loss": 1.28, | |
| "num_input_tokens_seen": 103803600, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 1.910917240275332, | |
| "grad_norm": 5.377976894378662, | |
| "learning_rate": 4.0445463822634864e-05, | |
| "loss": 1.2658, | |
| "num_input_tokens_seen": 104075160, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 1.9159196414278854, | |
| "grad_norm": 5.453880786895752, | |
| "learning_rate": 4.04204518168721e-05, | |
| "loss": 1.2643, | |
| "num_input_tokens_seen": 104349992, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 1.9209220425804387, | |
| "grad_norm": 5.114168167114258, | |
| "learning_rate": 4.0395439811109334e-05, | |
| "loss": 1.2769, | |
| "num_input_tokens_seen": 104621104, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 1.9259244437329919, | |
| "grad_norm": 5.22728157043457, | |
| "learning_rate": 4.037042780534657e-05, | |
| "loss": 1.2842, | |
| "num_input_tokens_seen": 104890976, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 1.930926844885545, | |
| "grad_norm": 5.4410881996154785, | |
| "learning_rate": 4.0345415799583804e-05, | |
| "loss": 1.2627, | |
| "num_input_tokens_seen": 105165152, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 1.9359292460380981, | |
| "grad_norm": 5.700538158416748, | |
| "learning_rate": 4.032040379382104e-05, | |
| "loss": 1.275, | |
| "num_input_tokens_seen": 105431920, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 1.9409316471906515, | |
| "grad_norm": 5.171668529510498, | |
| "learning_rate": 4.029539178805827e-05, | |
| "loss": 1.2852, | |
| "num_input_tokens_seen": 105709976, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 1.9459340483432048, | |
| "grad_norm": 7.026444911956787, | |
| "learning_rate": 4.02703797822955e-05, | |
| "loss": 1.2718, | |
| "num_input_tokens_seen": 105983472, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 1.950936449495758, | |
| "grad_norm": 6.670947074890137, | |
| "learning_rate": 4.024536777653274e-05, | |
| "loss": 1.2574, | |
| "num_input_tokens_seen": 106257632, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 1.955938850648311, | |
| "grad_norm": 5.797586441040039, | |
| "learning_rate": 4.022035577076997e-05, | |
| "loss": 1.2821, | |
| "num_input_tokens_seen": 106533272, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 1.9609412518008644, | |
| "grad_norm": 7.070456504821777, | |
| "learning_rate": 4.019534376500721e-05, | |
| "loss": 1.2749, | |
| "num_input_tokens_seen": 106804176, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 1.9659436529534178, | |
| "grad_norm": 5.074236869812012, | |
| "learning_rate": 4.017033175924444e-05, | |
| "loss": 1.2837, | |
| "num_input_tokens_seen": 107077264, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 1.970946054105971, | |
| "grad_norm": 5.952401161193848, | |
| "learning_rate": 4.014531975348168e-05, | |
| "loss": 1.2481, | |
| "num_input_tokens_seen": 107342400, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 1.975948455258524, | |
| "grad_norm": 6.63128662109375, | |
| "learning_rate": 4.0120307747718905e-05, | |
| "loss": 1.2769, | |
| "num_input_tokens_seen": 107619760, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 1.9809508564110772, | |
| "grad_norm": 6.601523399353027, | |
| "learning_rate": 4.009529574195614e-05, | |
| "loss": 1.275, | |
| "num_input_tokens_seen": 107898512, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 1.9859532575636305, | |
| "grad_norm": 6.857260227203369, | |
| "learning_rate": 4.0070283736193375e-05, | |
| "loss": 1.2508, | |
| "num_input_tokens_seen": 108163904, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 1.9909556587161839, | |
| "grad_norm": 5.871264934539795, | |
| "learning_rate": 4.004527173043061e-05, | |
| "loss": 1.2712, | |
| "num_input_tokens_seen": 108440480, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 1.995958059868737, | |
| "grad_norm": 7.567385673522949, | |
| "learning_rate": 4.002025972466784e-05, | |
| "loss": 1.2801, | |
| "num_input_tokens_seen": 108714992, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.1701077222824097, | |
| "eval_runtime": 186.6785, | |
| "eval_samples_per_second": 1070.851, | |
| "eval_steps_per_second": 133.861, | |
| "num_input_tokens_seen": 108935048, | |
| "step": 199904 | |
| }, | |
| { | |
| "epoch": 2.00096046102129, | |
| "grad_norm": 5.430812835693359, | |
| "learning_rate": 3.999524771890507e-05, | |
| "loss": 1.2538, | |
| "num_input_tokens_seen": 108986608, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 2.0059628621738432, | |
| "grad_norm": 7.064018249511719, | |
| "learning_rate": 3.9970235713142315e-05, | |
| "loss": 1.1538, | |
| "num_input_tokens_seen": 109256864, | |
| "step": 200500 | |
| }, | |
| { | |
| "epoch": 2.010965263326397, | |
| "grad_norm": 6.479573726654053, | |
| "learning_rate": 3.994522370737954e-05, | |
| "loss": 1.1664, | |
| "num_input_tokens_seen": 109523672, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 2.01596766447895, | |
| "grad_norm": 6.595979690551758, | |
| "learning_rate": 3.992021170161678e-05, | |
| "loss": 1.1338, | |
| "num_input_tokens_seen": 109791408, | |
| "step": 201500 | |
| }, | |
| { | |
| "epoch": 2.020970065631503, | |
| "grad_norm": 7.46008825302124, | |
| "learning_rate": 3.989519969585401e-05, | |
| "loss": 1.1799, | |
| "num_input_tokens_seen": 110064104, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 2.025972466784056, | |
| "grad_norm": 5.414816379547119, | |
| "learning_rate": 3.987018769009124e-05, | |
| "loss": 1.1688, | |
| "num_input_tokens_seen": 110335328, | |
| "step": 202500 | |
| }, | |
| { | |
| "epoch": 2.0309748679366098, | |
| "grad_norm": 7.442201137542725, | |
| "learning_rate": 3.9845175684328476e-05, | |
| "loss": 1.1804, | |
| "num_input_tokens_seen": 110611648, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 2.035977269089163, | |
| "grad_norm": 5.2355475425720215, | |
| "learning_rate": 3.982016367856571e-05, | |
| "loss": 1.1776, | |
| "num_input_tokens_seen": 110883064, | |
| "step": 203500 | |
| }, | |
| { | |
| "epoch": 2.040979670241716, | |
| "grad_norm": 7.008761882781982, | |
| "learning_rate": 3.9795151672802946e-05, | |
| "loss": 1.1622, | |
| "num_input_tokens_seen": 111162464, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 2.045982071394269, | |
| "grad_norm": 5.213141918182373, | |
| "learning_rate": 3.977013966704018e-05, | |
| "loss": 1.1863, | |
| "num_input_tokens_seen": 111434960, | |
| "step": 204500 | |
| }, | |
| { | |
| "epoch": 2.0509844725468223, | |
| "grad_norm": 6.3171000480651855, | |
| "learning_rate": 3.9745127661277416e-05, | |
| "loss": 1.1735, | |
| "num_input_tokens_seen": 111707896, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 2.055986873699376, | |
| "grad_norm": 5.790093898773193, | |
| "learning_rate": 3.972011565551465e-05, | |
| "loss": 1.1916, | |
| "num_input_tokens_seen": 111976120, | |
| "step": 205500 | |
| }, | |
| { | |
| "epoch": 2.060989274851929, | |
| "grad_norm": 5.817662239074707, | |
| "learning_rate": 3.969510364975188e-05, | |
| "loss": 1.1996, | |
| "num_input_tokens_seen": 112248384, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 2.065991676004482, | |
| "grad_norm": 7.098010063171387, | |
| "learning_rate": 3.9670091643989114e-05, | |
| "loss": 1.1698, | |
| "num_input_tokens_seen": 112525640, | |
| "step": 206500 | |
| }, | |
| { | |
| "epoch": 2.070994077157035, | |
| "grad_norm": 5.172534942626953, | |
| "learning_rate": 3.964507963822635e-05, | |
| "loss": 1.17, | |
| "num_input_tokens_seen": 112794848, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 2.0759964783095888, | |
| "grad_norm": 5.681086540222168, | |
| "learning_rate": 3.9620067632463584e-05, | |
| "loss": 1.1812, | |
| "num_input_tokens_seen": 113070744, | |
| "step": 207500 | |
| }, | |
| { | |
| "epoch": 2.080998879462142, | |
| "grad_norm": 6.1572489738464355, | |
| "learning_rate": 3.959505562670082e-05, | |
| "loss": 1.1733, | |
| "num_input_tokens_seen": 113352016, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 2.086001280614695, | |
| "grad_norm": 6.149631023406982, | |
| "learning_rate": 3.9570043620938054e-05, | |
| "loss": 1.1733, | |
| "num_input_tokens_seen": 113629168, | |
| "step": 208500 | |
| }, | |
| { | |
| "epoch": 2.091003681767248, | |
| "grad_norm": 4.973777770996094, | |
| "learning_rate": 3.954503161517529e-05, | |
| "loss": 1.1732, | |
| "num_input_tokens_seen": 113899440, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 2.0960060829198017, | |
| "grad_norm": 12.051576614379883, | |
| "learning_rate": 3.952001960941252e-05, | |
| "loss": 1.1974, | |
| "num_input_tokens_seen": 114170160, | |
| "step": 209500 | |
| }, | |
| { | |
| "epoch": 2.101008484072355, | |
| "grad_norm": 5.458679676055908, | |
| "learning_rate": 3.949500760364975e-05, | |
| "loss": 1.1664, | |
| "num_input_tokens_seen": 114441696, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 2.106010885224908, | |
| "grad_norm": 5.033444404602051, | |
| "learning_rate": 3.946999559788699e-05, | |
| "loss": 1.1781, | |
| "num_input_tokens_seen": 114715752, | |
| "step": 210500 | |
| }, | |
| { | |
| "epoch": 2.111013286377461, | |
| "grad_norm": 5.643963813781738, | |
| "learning_rate": 3.944498359212422e-05, | |
| "loss": 1.1849, | |
| "num_input_tokens_seen": 114996136, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 2.1160156875300142, | |
| "grad_norm": 6.656442165374756, | |
| "learning_rate": 3.941997158636145e-05, | |
| "loss": 1.1792, | |
| "num_input_tokens_seen": 115259352, | |
| "step": 211500 | |
| }, | |
| { | |
| "epoch": 2.121018088682568, | |
| "grad_norm": 5.712615013122559, | |
| "learning_rate": 3.939495958059869e-05, | |
| "loss": 1.1799, | |
| "num_input_tokens_seen": 115534944, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 2.126020489835121, | |
| "grad_norm": 8.317249298095703, | |
| "learning_rate": 3.936994757483593e-05, | |
| "loss": 1.1853, | |
| "num_input_tokens_seen": 115808536, | |
| "step": 212500 | |
| }, | |
| { | |
| "epoch": 2.131022890987674, | |
| "grad_norm": 6.112279415130615, | |
| "learning_rate": 3.9344935569073155e-05, | |
| "loss": 1.1822, | |
| "num_input_tokens_seen": 116087064, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 2.136025292140227, | |
| "grad_norm": 7.357901096343994, | |
| "learning_rate": 3.931992356331039e-05, | |
| "loss": 1.1866, | |
| "num_input_tokens_seen": 116365200, | |
| "step": 213500 | |
| }, | |
| { | |
| "epoch": 2.1410276932927808, | |
| "grad_norm": 5.3409929275512695, | |
| "learning_rate": 3.9294911557547625e-05, | |
| "loss": 1.1897, | |
| "num_input_tokens_seen": 116636120, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 2.146030094445334, | |
| "grad_norm": 7.562960624694824, | |
| "learning_rate": 3.926989955178486e-05, | |
| "loss": 1.1809, | |
| "num_input_tokens_seen": 116916360, | |
| "step": 214500 | |
| }, | |
| { | |
| "epoch": 2.151032495597887, | |
| "grad_norm": 5.4258503913879395, | |
| "learning_rate": 3.924488754602209e-05, | |
| "loss": 1.1871, | |
| "num_input_tokens_seen": 117184808, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 2.15603489675044, | |
| "grad_norm": 6.741093158721924, | |
| "learning_rate": 3.9219875540259324e-05, | |
| "loss": 1.176, | |
| "num_input_tokens_seen": 117454928, | |
| "step": 215500 | |
| }, | |
| { | |
| "epoch": 2.1610372979029933, | |
| "grad_norm": 6.085869789123535, | |
| "learning_rate": 3.9194863534496565e-05, | |
| "loss": 1.1789, | |
| "num_input_tokens_seen": 117722352, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 2.166039699055547, | |
| "grad_norm": 6.9086151123046875, | |
| "learning_rate": 3.9169851528733793e-05, | |
| "loss": 1.189, | |
| "num_input_tokens_seen": 117990704, | |
| "step": 216500 | |
| }, | |
| { | |
| "epoch": 2.1710421002081, | |
| "grad_norm": 5.497861385345459, | |
| "learning_rate": 3.914483952297103e-05, | |
| "loss": 1.1768, | |
| "num_input_tokens_seen": 118255368, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 2.176044501360653, | |
| "grad_norm": 8.487640380859375, | |
| "learning_rate": 3.9119827517208263e-05, | |
| "loss": 1.1743, | |
| "num_input_tokens_seen": 118525432, | |
| "step": 217500 | |
| }, | |
| { | |
| "epoch": 2.181046902513206, | |
| "grad_norm": 5.8003435134887695, | |
| "learning_rate": 3.90948155114455e-05, | |
| "loss": 1.1697, | |
| "num_input_tokens_seen": 118799496, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 2.18604930366576, | |
| "grad_norm": 7.726077079772949, | |
| "learning_rate": 3.9069803505682727e-05, | |
| "loss": 1.1784, | |
| "num_input_tokens_seen": 119074944, | |
| "step": 218500 | |
| }, | |
| { | |
| "epoch": 2.191051704818313, | |
| "grad_norm": 5.625581741333008, | |
| "learning_rate": 3.904479149991996e-05, | |
| "loss": 1.1856, | |
| "num_input_tokens_seen": 119353528, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 2.196054105970866, | |
| "grad_norm": 5.582902908325195, | |
| "learning_rate": 3.9019779494157196e-05, | |
| "loss": 1.1832, | |
| "num_input_tokens_seen": 119627520, | |
| "step": 219500 | |
| }, | |
| { | |
| "epoch": 2.201056507123419, | |
| "grad_norm": 5.2057671546936035, | |
| "learning_rate": 3.899476748839443e-05, | |
| "loss": 1.1888, | |
| "num_input_tokens_seen": 119894432, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 2.2060589082759723, | |
| "grad_norm": 6.18375825881958, | |
| "learning_rate": 3.8969755482631666e-05, | |
| "loss": 1.1636, | |
| "num_input_tokens_seen": 120165872, | |
| "step": 220500 | |
| }, | |
| { | |
| "epoch": 2.211061309428526, | |
| "grad_norm": 7.083649158477783, | |
| "learning_rate": 3.89447434768689e-05, | |
| "loss": 1.1716, | |
| "num_input_tokens_seen": 120437360, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 2.216063710581079, | |
| "grad_norm": 6.966033458709717, | |
| "learning_rate": 3.891973147110613e-05, | |
| "loss": 1.1899, | |
| "num_input_tokens_seen": 120707824, | |
| "step": 221500 | |
| }, | |
| { | |
| "epoch": 2.221066111733632, | |
| "grad_norm": 4.439563751220703, | |
| "learning_rate": 3.8894719465343365e-05, | |
| "loss": 1.1714, | |
| "num_input_tokens_seen": 120977840, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 2.2260685128861852, | |
| "grad_norm": 6.870123386383057, | |
| "learning_rate": 3.88697074595806e-05, | |
| "loss": 1.1793, | |
| "num_input_tokens_seen": 121254560, | |
| "step": 222500 | |
| }, | |
| { | |
| "epoch": 2.231070914038739, | |
| "grad_norm": 8.789484024047852, | |
| "learning_rate": 3.8844695453817834e-05, | |
| "loss": 1.1851, | |
| "num_input_tokens_seen": 121523936, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 2.236073315191292, | |
| "grad_norm": 6.196369647979736, | |
| "learning_rate": 3.881968344805507e-05, | |
| "loss": 1.1857, | |
| "num_input_tokens_seen": 121795288, | |
| "step": 223500 | |
| }, | |
| { | |
| "epoch": 2.241075716343845, | |
| "grad_norm": 5.902594566345215, | |
| "learning_rate": 3.8794671442292304e-05, | |
| "loss": 1.2016, | |
| "num_input_tokens_seen": 122065592, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 2.246078117496398, | |
| "grad_norm": 6.811281681060791, | |
| "learning_rate": 3.876965943652954e-05, | |
| "loss": 1.1837, | |
| "num_input_tokens_seen": 122340584, | |
| "step": 224500 | |
| }, | |
| { | |
| "epoch": 2.2510805186489513, | |
| "grad_norm": 6.388464450836182, | |
| "learning_rate": 3.874464743076677e-05, | |
| "loss": 1.1816, | |
| "num_input_tokens_seen": 122612352, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 2.256082919801505, | |
| "grad_norm": 6.045330047607422, | |
| "learning_rate": 3.8719635425004e-05, | |
| "loss": 1.1835, | |
| "num_input_tokens_seen": 122878624, | |
| "step": 225500 | |
| }, | |
| { | |
| "epoch": 2.261085320954058, | |
| "grad_norm": 7.601827621459961, | |
| "learning_rate": 3.869462341924124e-05, | |
| "loss": 1.1766, | |
| "num_input_tokens_seen": 123143944, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 2.266087722106611, | |
| "grad_norm": 5.323575496673584, | |
| "learning_rate": 3.866961141347847e-05, | |
| "loss": 1.1913, | |
| "num_input_tokens_seen": 123415600, | |
| "step": 226500 | |
| }, | |
| { | |
| "epoch": 2.2710901232591643, | |
| "grad_norm": 6.938271522521973, | |
| "learning_rate": 3.86445994077157e-05, | |
| "loss": 1.1812, | |
| "num_input_tokens_seen": 123682608, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 2.276092524411718, | |
| "grad_norm": 5.9254021644592285, | |
| "learning_rate": 3.861958740195294e-05, | |
| "loss": 1.1856, | |
| "num_input_tokens_seen": 123954888, | |
| "step": 227500 | |
| }, | |
| { | |
| "epoch": 2.281094925564271, | |
| "grad_norm": 7.544998645782471, | |
| "learning_rate": 3.859457539619018e-05, | |
| "loss": 1.1957, | |
| "num_input_tokens_seen": 124230632, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 2.286097326716824, | |
| "grad_norm": 6.14992618560791, | |
| "learning_rate": 3.8569563390427406e-05, | |
| "loss": 1.1894, | |
| "num_input_tokens_seen": 124507352, | |
| "step": 228500 | |
| }, | |
| { | |
| "epoch": 2.291099727869377, | |
| "grad_norm": 5.440382957458496, | |
| "learning_rate": 3.854455138466464e-05, | |
| "loss": 1.1752, | |
| "num_input_tokens_seen": 124781440, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 2.2961021290219303, | |
| "grad_norm": 7.271317481994629, | |
| "learning_rate": 3.8519539378901876e-05, | |
| "loss": 1.1749, | |
| "num_input_tokens_seen": 125055624, | |
| "step": 229500 | |
| }, | |
| { | |
| "epoch": 2.301104530174484, | |
| "grad_norm": 5.141626834869385, | |
| "learning_rate": 3.849452737313911e-05, | |
| "loss": 1.18, | |
| "num_input_tokens_seen": 125329000, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 2.306106931327037, | |
| "grad_norm": 6.321171760559082, | |
| "learning_rate": 3.846951536737634e-05, | |
| "loss": 1.1741, | |
| "num_input_tokens_seen": 125605816, | |
| "step": 230500 | |
| }, | |
| { | |
| "epoch": 2.31110933247959, | |
| "grad_norm": 5.19276237487793, | |
| "learning_rate": 3.8444503361613574e-05, | |
| "loss": 1.1966, | |
| "num_input_tokens_seen": 125887328, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 2.3161117336321433, | |
| "grad_norm": 5.9422125816345215, | |
| "learning_rate": 3.841949135585081e-05, | |
| "loss": 1.1638, | |
| "num_input_tokens_seen": 126158768, | |
| "step": 231500 | |
| }, | |
| { | |
| "epoch": 2.321114134784697, | |
| "grad_norm": 5.361838340759277, | |
| "learning_rate": 3.8394479350088044e-05, | |
| "loss": 1.1737, | |
| "num_input_tokens_seen": 126429432, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 2.32611653593725, | |
| "grad_norm": 6.030839920043945, | |
| "learning_rate": 3.836946734432528e-05, | |
| "loss": 1.1975, | |
| "num_input_tokens_seen": 126703336, | |
| "step": 232500 | |
| }, | |
| { | |
| "epoch": 2.331118937089803, | |
| "grad_norm": 6.013172149658203, | |
| "learning_rate": 3.8344455338562514e-05, | |
| "loss": 1.1785, | |
| "num_input_tokens_seen": 126981120, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 2.3361213382423562, | |
| "grad_norm": 5.227244853973389, | |
| "learning_rate": 3.831944333279975e-05, | |
| "loss": 1.1934, | |
| "num_input_tokens_seen": 127248672, | |
| "step": 233500 | |
| }, | |
| { | |
| "epoch": 2.3411237393949094, | |
| "grad_norm": 5.995646953582764, | |
| "learning_rate": 3.829443132703698e-05, | |
| "loss": 1.198, | |
| "num_input_tokens_seen": 127525032, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 2.346126140547463, | |
| "grad_norm": 8.163732528686523, | |
| "learning_rate": 3.826941932127421e-05, | |
| "loss": 1.1743, | |
| "num_input_tokens_seen": 127793384, | |
| "step": 234500 | |
| }, | |
| { | |
| "epoch": 2.351128541700016, | |
| "grad_norm": 5.394166946411133, | |
| "learning_rate": 3.824440731551145e-05, | |
| "loss": 1.1726, | |
| "num_input_tokens_seen": 128065120, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 2.356130942852569, | |
| "grad_norm": 5.673594951629639, | |
| "learning_rate": 3.821939530974868e-05, | |
| "loss": 1.1959, | |
| "num_input_tokens_seen": 128334976, | |
| "step": 235500 | |
| }, | |
| { | |
| "epoch": 2.3611333440051223, | |
| "grad_norm": 5.715531826019287, | |
| "learning_rate": 3.8194383303985917e-05, | |
| "loss": 1.1936, | |
| "num_input_tokens_seen": 128610504, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 2.366135745157676, | |
| "grad_norm": 5.725061416625977, | |
| "learning_rate": 3.816937129822315e-05, | |
| "loss": 1.1771, | |
| "num_input_tokens_seen": 128881800, | |
| "step": 236500 | |
| }, | |
| { | |
| "epoch": 2.371138146310229, | |
| "grad_norm": 4.505105972290039, | |
| "learning_rate": 3.8144359292460386e-05, | |
| "loss": 1.1826, | |
| "num_input_tokens_seen": 129157576, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 2.376140547462782, | |
| "grad_norm": 5.860077857971191, | |
| "learning_rate": 3.8119347286697615e-05, | |
| "loss": 1.1925, | |
| "num_input_tokens_seen": 129432392, | |
| "step": 237500 | |
| }, | |
| { | |
| "epoch": 2.3811429486153353, | |
| "grad_norm": 6.7791337966918945, | |
| "learning_rate": 3.809433528093485e-05, | |
| "loss": 1.1746, | |
| "num_input_tokens_seen": 129700968, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 2.3861453497678884, | |
| "grad_norm": 5.708649635314941, | |
| "learning_rate": 3.8069323275172085e-05, | |
| "loss": 1.1793, | |
| "num_input_tokens_seen": 129977384, | |
| "step": 238500 | |
| }, | |
| { | |
| "epoch": 2.391147750920442, | |
| "grad_norm": 5.659774303436279, | |
| "learning_rate": 3.804431126940932e-05, | |
| "loss": 1.1797, | |
| "num_input_tokens_seen": 130248672, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 2.396150152072995, | |
| "grad_norm": 6.859200477600098, | |
| "learning_rate": 3.8019299263646555e-05, | |
| "loss": 1.1853, | |
| "num_input_tokens_seen": 130522208, | |
| "step": 239500 | |
| }, | |
| { | |
| "epoch": 2.401152553225548, | |
| "grad_norm": 6.860942840576172, | |
| "learning_rate": 3.799428725788379e-05, | |
| "loss": 1.1922, | |
| "num_input_tokens_seen": 130799088, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 2.4061549543781013, | |
| "grad_norm": 6.199068069458008, | |
| "learning_rate": 3.7969275252121025e-05, | |
| "loss": 1.1825, | |
| "num_input_tokens_seen": 131067984, | |
| "step": 240500 | |
| }, | |
| { | |
| "epoch": 2.411157355530655, | |
| "grad_norm": 5.724475383758545, | |
| "learning_rate": 3.794426324635825e-05, | |
| "loss": 1.168, | |
| "num_input_tokens_seen": 131340552, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 2.416159756683208, | |
| "grad_norm": 5.187953472137451, | |
| "learning_rate": 3.791925124059549e-05, | |
| "loss": 1.1875, | |
| "num_input_tokens_seen": 131613968, | |
| "step": 241500 | |
| }, | |
| { | |
| "epoch": 2.421162157835761, | |
| "grad_norm": 6.069790363311768, | |
| "learning_rate": 3.789423923483272e-05, | |
| "loss": 1.1866, | |
| "num_input_tokens_seen": 131880736, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 2.4261645589883143, | |
| "grad_norm": 6.761556148529053, | |
| "learning_rate": 3.786922722906995e-05, | |
| "loss": 1.205, | |
| "num_input_tokens_seen": 132150456, | |
| "step": 242500 | |
| }, | |
| { | |
| "epoch": 2.4311669601408674, | |
| "grad_norm": 5.816013336181641, | |
| "learning_rate": 3.7844215223307186e-05, | |
| "loss": 1.1938, | |
| "num_input_tokens_seen": 132424832, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 2.436169361293421, | |
| "grad_norm": 6.447406768798828, | |
| "learning_rate": 3.781920321754443e-05, | |
| "loss": 1.1792, | |
| "num_input_tokens_seen": 132691704, | |
| "step": 243500 | |
| }, | |
| { | |
| "epoch": 2.441171762445974, | |
| "grad_norm": 6.802369117736816, | |
| "learning_rate": 3.7794191211781656e-05, | |
| "loss": 1.1891, | |
| "num_input_tokens_seen": 132962376, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 2.4461741635985272, | |
| "grad_norm": 5.149132251739502, | |
| "learning_rate": 3.776917920601889e-05, | |
| "loss": 1.1691, | |
| "num_input_tokens_seen": 133236272, | |
| "step": 244500 | |
| }, | |
| { | |
| "epoch": 2.4511765647510804, | |
| "grad_norm": 6.554666996002197, | |
| "learning_rate": 3.7744167200256126e-05, | |
| "loss": 1.1958, | |
| "num_input_tokens_seen": 133505504, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 2.456178965903634, | |
| "grad_norm": 5.13792610168457, | |
| "learning_rate": 3.771915519449336e-05, | |
| "loss": 1.1771, | |
| "num_input_tokens_seen": 133785904, | |
| "step": 245500 | |
| }, | |
| { | |
| "epoch": 2.461181367056187, | |
| "grad_norm": 4.5011491775512695, | |
| "learning_rate": 3.769414318873059e-05, | |
| "loss": 1.192, | |
| "num_input_tokens_seen": 134055360, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 2.46618376820874, | |
| "grad_norm": 7.41070556640625, | |
| "learning_rate": 3.7669131182967824e-05, | |
| "loss": 1.1682, | |
| "num_input_tokens_seen": 134321528, | |
| "step": 246500 | |
| }, | |
| { | |
| "epoch": 2.4711861693612933, | |
| "grad_norm": 7.749119281768799, | |
| "learning_rate": 3.764411917720506e-05, | |
| "loss": 1.188, | |
| "num_input_tokens_seen": 134595208, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 2.4761885705138464, | |
| "grad_norm": 5.476714134216309, | |
| "learning_rate": 3.7619107171442294e-05, | |
| "loss": 1.1668, | |
| "num_input_tokens_seen": 134869136, | |
| "step": 247500 | |
| }, | |
| { | |
| "epoch": 2.4811909716664, | |
| "grad_norm": 5.9990010261535645, | |
| "learning_rate": 3.759409516567953e-05, | |
| "loss": 1.182, | |
| "num_input_tokens_seen": 135144112, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 2.486193372818953, | |
| "grad_norm": 5.635094165802002, | |
| "learning_rate": 3.7569083159916764e-05, | |
| "loss": 1.1861, | |
| "num_input_tokens_seen": 135413984, | |
| "step": 248500 | |
| }, | |
| { | |
| "epoch": 2.4911957739715063, | |
| "grad_norm": 5.974431991577148, | |
| "learning_rate": 3.7544071154154e-05, | |
| "loss": 1.1927, | |
| "num_input_tokens_seen": 135697672, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 2.4961981751240594, | |
| "grad_norm": 6.688498497009277, | |
| "learning_rate": 3.751905914839123e-05, | |
| "loss": 1.2016, | |
| "num_input_tokens_seen": 135975272, | |
| "step": 249500 | |
| }, | |
| { | |
| "epoch": 2.501200576276613, | |
| "grad_norm": 8.589900970458984, | |
| "learning_rate": 3.749404714262846e-05, | |
| "loss": 1.1711, | |
| "num_input_tokens_seen": 136241376, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 2.506202977429166, | |
| "grad_norm": 6.064274311065674, | |
| "learning_rate": 3.74690351368657e-05, | |
| "loss": 1.2027, | |
| "num_input_tokens_seen": 136510824, | |
| "step": 250500 | |
| }, | |
| { | |
| "epoch": 2.511205378581719, | |
| "grad_norm": 5.36790657043457, | |
| "learning_rate": 3.744402313110293e-05, | |
| "loss": 1.189, | |
| "num_input_tokens_seen": 136777568, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 2.5162077797342723, | |
| "grad_norm": 5.9187703132629395, | |
| "learning_rate": 3.741901112534017e-05, | |
| "loss": 1.1894, | |
| "num_input_tokens_seen": 137049696, | |
| "step": 251500 | |
| }, | |
| { | |
| "epoch": 2.5212101808868255, | |
| "grad_norm": 5.2425007820129395, | |
| "learning_rate": 3.73939991195774e-05, | |
| "loss": 1.2087, | |
| "num_input_tokens_seen": 137319688, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 2.526212582039379, | |
| "grad_norm": 6.622330188751221, | |
| "learning_rate": 3.736898711381464e-05, | |
| "loss": 1.172, | |
| "num_input_tokens_seen": 137592360, | |
| "step": 252500 | |
| }, | |
| { | |
| "epoch": 2.531214983191932, | |
| "grad_norm": 5.9546122550964355, | |
| "learning_rate": 3.7343975108051865e-05, | |
| "loss": 1.1784, | |
| "num_input_tokens_seen": 137869696, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 2.5362173843444853, | |
| "grad_norm": 6.10466194152832, | |
| "learning_rate": 3.73189631022891e-05, | |
| "loss": 1.1806, | |
| "num_input_tokens_seen": 138146440, | |
| "step": 253500 | |
| }, | |
| { | |
| "epoch": 2.5412197854970384, | |
| "grad_norm": 7.046773433685303, | |
| "learning_rate": 3.7293951096526335e-05, | |
| "loss": 1.187, | |
| "num_input_tokens_seen": 138432672, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 2.546222186649592, | |
| "grad_norm": 5.8726115226745605, | |
| "learning_rate": 3.726893909076356e-05, | |
| "loss": 1.1769, | |
| "num_input_tokens_seen": 138704056, | |
| "step": 254500 | |
| }, | |
| { | |
| "epoch": 2.551224587802145, | |
| "grad_norm": 6.145564079284668, | |
| "learning_rate": 3.7243927085000805e-05, | |
| "loss": 1.1783, | |
| "num_input_tokens_seen": 138972048, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 2.5562269889546982, | |
| "grad_norm": 8.949604988098145, | |
| "learning_rate": 3.721891507923804e-05, | |
| "loss": 1.1928, | |
| "num_input_tokens_seen": 139249808, | |
| "step": 255500 | |
| }, | |
| { | |
| "epoch": 2.5612293901072514, | |
| "grad_norm": 6.0869975090026855, | |
| "learning_rate": 3.7193903073475275e-05, | |
| "loss": 1.2032, | |
| "num_input_tokens_seen": 139528824, | |
| "step": 256000 | |
| }, | |
| { | |
| "epoch": 2.5662317912598045, | |
| "grad_norm": 6.634551048278809, | |
| "learning_rate": 3.71688910677125e-05, | |
| "loss": 1.1977, | |
| "num_input_tokens_seen": 139798352, | |
| "step": 256500 | |
| }, | |
| { | |
| "epoch": 2.571234192412358, | |
| "grad_norm": 5.805966377258301, | |
| "learning_rate": 3.714387906194974e-05, | |
| "loss": 1.1725, | |
| "num_input_tokens_seen": 140071304, | |
| "step": 257000 | |
| }, | |
| { | |
| "epoch": 2.576236593564911, | |
| "grad_norm": 5.509829998016357, | |
| "learning_rate": 3.711886705618697e-05, | |
| "loss": 1.1794, | |
| "num_input_tokens_seen": 140351576, | |
| "step": 257500 | |
| }, | |
| { | |
| "epoch": 2.5812389947174643, | |
| "grad_norm": 7.246334552764893, | |
| "learning_rate": 3.70938550504242e-05, | |
| "loss": 1.1638, | |
| "num_input_tokens_seen": 140629520, | |
| "step": 258000 | |
| }, | |
| { | |
| "epoch": 2.5862413958700174, | |
| "grad_norm": 5.683703899383545, | |
| "learning_rate": 3.7068843044661436e-05, | |
| "loss": 1.1772, | |
| "num_input_tokens_seen": 140907480, | |
| "step": 258500 | |
| }, | |
| { | |
| "epoch": 2.591243797022571, | |
| "grad_norm": 5.520617485046387, | |
| "learning_rate": 3.704383103889867e-05, | |
| "loss": 1.1874, | |
| "num_input_tokens_seen": 141174448, | |
| "step": 259000 | |
| }, | |
| { | |
| "epoch": 2.596246198175124, | |
| "grad_norm": 6.609923839569092, | |
| "learning_rate": 3.701881903313591e-05, | |
| "loss": 1.1954, | |
| "num_input_tokens_seen": 141451848, | |
| "step": 259500 | |
| }, | |
| { | |
| "epoch": 2.6012485993276773, | |
| "grad_norm": 5.208652973175049, | |
| "learning_rate": 3.699380702737314e-05, | |
| "loss": 1.1777, | |
| "num_input_tokens_seen": 141719928, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 2.6062510004802304, | |
| "grad_norm": 6.525882720947266, | |
| "learning_rate": 3.6968795021610376e-05, | |
| "loss": 1.1725, | |
| "num_input_tokens_seen": 141993992, | |
| "step": 260500 | |
| }, | |
| { | |
| "epoch": 2.6112534016327835, | |
| "grad_norm": 6.694952011108398, | |
| "learning_rate": 3.694378301584761e-05, | |
| "loss": 1.1764, | |
| "num_input_tokens_seen": 142262512, | |
| "step": 261000 | |
| }, | |
| { | |
| "epoch": 2.616255802785337, | |
| "grad_norm": 6.036692142486572, | |
| "learning_rate": 3.691877101008484e-05, | |
| "loss": 1.1863, | |
| "num_input_tokens_seen": 142535016, | |
| "step": 261500 | |
| }, | |
| { | |
| "epoch": 2.62125820393789, | |
| "grad_norm": 4.5128021240234375, | |
| "learning_rate": 3.6893759004322074e-05, | |
| "loss": 1.1914, | |
| "num_input_tokens_seen": 142817040, | |
| "step": 262000 | |
| }, | |
| { | |
| "epoch": 2.6262606050904433, | |
| "grad_norm": 6.445744037628174, | |
| "learning_rate": 3.686874699855931e-05, | |
| "loss": 1.1938, | |
| "num_input_tokens_seen": 143093520, | |
| "step": 262500 | |
| }, | |
| { | |
| "epoch": 2.631263006242997, | |
| "grad_norm": 5.805507183074951, | |
| "learning_rate": 3.6843734992796544e-05, | |
| "loss": 1.176, | |
| "num_input_tokens_seen": 143361184, | |
| "step": 263000 | |
| }, | |
| { | |
| "epoch": 2.63626540739555, | |
| "grad_norm": 7.369002819061279, | |
| "learning_rate": 3.681872298703378e-05, | |
| "loss": 1.1737, | |
| "num_input_tokens_seen": 143633104, | |
| "step": 263500 | |
| }, | |
| { | |
| "epoch": 2.641267808548103, | |
| "grad_norm": 5.3200459480285645, | |
| "learning_rate": 3.6793710981271014e-05, | |
| "loss": 1.1853, | |
| "num_input_tokens_seen": 143903304, | |
| "step": 264000 | |
| }, | |
| { | |
| "epoch": 2.6462702097006563, | |
| "grad_norm": 4.868594169616699, | |
| "learning_rate": 3.676869897550825e-05, | |
| "loss": 1.1924, | |
| "num_input_tokens_seen": 144176568, | |
| "step": 264500 | |
| }, | |
| { | |
| "epoch": 2.6512726108532094, | |
| "grad_norm": 6.198353290557861, | |
| "learning_rate": 3.674368696974548e-05, | |
| "loss": 1.1854, | |
| "num_input_tokens_seen": 144457912, | |
| "step": 265000 | |
| }, | |
| { | |
| "epoch": 2.6562750120057625, | |
| "grad_norm": 5.720507621765137, | |
| "learning_rate": 3.671867496398271e-05, | |
| "loss": 1.1922, | |
| "num_input_tokens_seen": 144733560, | |
| "step": 265500 | |
| }, | |
| { | |
| "epoch": 2.661277413158316, | |
| "grad_norm": 6.092404365539551, | |
| "learning_rate": 3.669366295821995e-05, | |
| "loss": 1.1784, | |
| "num_input_tokens_seen": 145006896, | |
| "step": 266000 | |
| }, | |
| { | |
| "epoch": 2.6662798143108692, | |
| "grad_norm": 5.7721266746521, | |
| "learning_rate": 3.666865095245718e-05, | |
| "loss": 1.1682, | |
| "num_input_tokens_seen": 145276408, | |
| "step": 266500 | |
| }, | |
| { | |
| "epoch": 2.6712822154634224, | |
| "grad_norm": 5.34429407119751, | |
| "learning_rate": 3.664363894669442e-05, | |
| "loss": 1.2014, | |
| "num_input_tokens_seen": 145549104, | |
| "step": 267000 | |
| }, | |
| { | |
| "epoch": 2.676284616615976, | |
| "grad_norm": 5.627655982971191, | |
| "learning_rate": 3.661862694093165e-05, | |
| "loss": 1.1873, | |
| "num_input_tokens_seen": 145813696, | |
| "step": 267500 | |
| }, | |
| { | |
| "epoch": 2.681287017768529, | |
| "grad_norm": 5.520989894866943, | |
| "learning_rate": 3.659361493516889e-05, | |
| "loss": 1.1801, | |
| "num_input_tokens_seen": 146081824, | |
| "step": 268000 | |
| }, | |
| { | |
| "epoch": 2.686289418921082, | |
| "grad_norm": 5.194046497344971, | |
| "learning_rate": 3.6568602929406115e-05, | |
| "loss": 1.2043, | |
| "num_input_tokens_seen": 146359992, | |
| "step": 268500 | |
| }, | |
| { | |
| "epoch": 2.6912918200736353, | |
| "grad_norm": 7.6289753913879395, | |
| "learning_rate": 3.654359092364335e-05, | |
| "loss": 1.1667, | |
| "num_input_tokens_seen": 146635688, | |
| "step": 269000 | |
| }, | |
| { | |
| "epoch": 2.6962942212261884, | |
| "grad_norm": 6.4248504638671875, | |
| "learning_rate": 3.6518578917880585e-05, | |
| "loss": 1.182, | |
| "num_input_tokens_seen": 146907280, | |
| "step": 269500 | |
| }, | |
| { | |
| "epoch": 2.7012966223787416, | |
| "grad_norm": 7.369548797607422, | |
| "learning_rate": 3.649356691211781e-05, | |
| "loss": 1.1863, | |
| "num_input_tokens_seen": 147180456, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 2.706299023531295, | |
| "grad_norm": 7.325328826904297, | |
| "learning_rate": 3.646855490635505e-05, | |
| "loss": 1.1731, | |
| "num_input_tokens_seen": 147447936, | |
| "step": 270500 | |
| }, | |
| { | |
| "epoch": 2.7113014246838483, | |
| "grad_norm": 6.618239879608154, | |
| "learning_rate": 3.644354290059229e-05, | |
| "loss": 1.1898, | |
| "num_input_tokens_seen": 147714576, | |
| "step": 271000 | |
| }, | |
| { | |
| "epoch": 2.7163038258364014, | |
| "grad_norm": 6.6161932945251465, | |
| "learning_rate": 3.6418530894829525e-05, | |
| "loss": 1.1757, | |
| "num_input_tokens_seen": 147982616, | |
| "step": 271500 | |
| }, | |
| { | |
| "epoch": 2.721306226988955, | |
| "grad_norm": 4.964172840118408, | |
| "learning_rate": 3.639351888906675e-05, | |
| "loss": 1.1822, | |
| "num_input_tokens_seen": 148261928, | |
| "step": 272000 | |
| }, | |
| { | |
| "epoch": 2.726308628141508, | |
| "grad_norm": 5.542762756347656, | |
| "learning_rate": 3.636850688330399e-05, | |
| "loss": 1.1979, | |
| "num_input_tokens_seen": 148537656, | |
| "step": 272500 | |
| }, | |
| { | |
| "epoch": 2.731311029294061, | |
| "grad_norm": 6.122353553771973, | |
| "learning_rate": 3.634349487754122e-05, | |
| "loss": 1.1837, | |
| "num_input_tokens_seen": 148805656, | |
| "step": 273000 | |
| }, | |
| { | |
| "epoch": 2.7363134304466143, | |
| "grad_norm": 5.522734642028809, | |
| "learning_rate": 3.631848287177845e-05, | |
| "loss": 1.1755, | |
| "num_input_tokens_seen": 149071656, | |
| "step": 273500 | |
| }, | |
| { | |
| "epoch": 2.7413158315991675, | |
| "grad_norm": 7.560063362121582, | |
| "learning_rate": 3.6293470866015686e-05, | |
| "loss": 1.1801, | |
| "num_input_tokens_seen": 149339936, | |
| "step": 274000 | |
| }, | |
| { | |
| "epoch": 2.7463182327517206, | |
| "grad_norm": 5.46027135848999, | |
| "learning_rate": 3.626845886025292e-05, | |
| "loss": 1.1734, | |
| "num_input_tokens_seen": 149616152, | |
| "step": 274500 | |
| }, | |
| { | |
| "epoch": 2.751320633904274, | |
| "grad_norm": 5.810853004455566, | |
| "learning_rate": 3.624344685449016e-05, | |
| "loss": 1.1806, | |
| "num_input_tokens_seen": 149886976, | |
| "step": 275000 | |
| }, | |
| { | |
| "epoch": 2.7563230350568273, | |
| "grad_norm": 5.957060813903809, | |
| "learning_rate": 3.621843484872739e-05, | |
| "loss": 1.1777, | |
| "num_input_tokens_seen": 150161384, | |
| "step": 275500 | |
| }, | |
| { | |
| "epoch": 2.7613254362093804, | |
| "grad_norm": 8.7448148727417, | |
| "learning_rate": 3.6193422842964626e-05, | |
| "loss": 1.1535, | |
| "num_input_tokens_seen": 150426192, | |
| "step": 276000 | |
| }, | |
| { | |
| "epoch": 2.766327837361934, | |
| "grad_norm": 6.24728536605835, | |
| "learning_rate": 3.616841083720186e-05, | |
| "loss": 1.1741, | |
| "num_input_tokens_seen": 150694480, | |
| "step": 276500 | |
| }, | |
| { | |
| "epoch": 2.771330238514487, | |
| "grad_norm": 8.271539688110352, | |
| "learning_rate": 3.614339883143909e-05, | |
| "loss": 1.1853, | |
| "num_input_tokens_seen": 150965896, | |
| "step": 277000 | |
| }, | |
| { | |
| "epoch": 2.7763326396670402, | |
| "grad_norm": 6.075042247772217, | |
| "learning_rate": 3.6118386825676324e-05, | |
| "loss": 1.1729, | |
| "num_input_tokens_seen": 151239800, | |
| "step": 277500 | |
| }, | |
| { | |
| "epoch": 2.7813350408195934, | |
| "grad_norm": 7.93595552444458, | |
| "learning_rate": 3.609337481991356e-05, | |
| "loss": 1.185, | |
| "num_input_tokens_seen": 151512560, | |
| "step": 278000 | |
| }, | |
| { | |
| "epoch": 2.7863374419721465, | |
| "grad_norm": 7.406468868255615, | |
| "learning_rate": 3.6068362814150794e-05, | |
| "loss": 1.1908, | |
| "num_input_tokens_seen": 151789264, | |
| "step": 278500 | |
| }, | |
| { | |
| "epoch": 2.7913398431246996, | |
| "grad_norm": 6.6226325035095215, | |
| "learning_rate": 3.604335080838803e-05, | |
| "loss": 1.1696, | |
| "num_input_tokens_seen": 152057600, | |
| "step": 279000 | |
| }, | |
| { | |
| "epoch": 2.796342244277253, | |
| "grad_norm": 5.142138481140137, | |
| "learning_rate": 3.6018338802625264e-05, | |
| "loss": 1.1694, | |
| "num_input_tokens_seen": 152328800, | |
| "step": 279500 | |
| }, | |
| { | |
| "epoch": 2.8013446454298063, | |
| "grad_norm": 6.834789752960205, | |
| "learning_rate": 3.59933267968625e-05, | |
| "loss": 1.1798, | |
| "num_input_tokens_seen": 152610624, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 2.8063470465823595, | |
| "grad_norm": 5.720213890075684, | |
| "learning_rate": 3.596831479109973e-05, | |
| "loss": 1.1781, | |
| "num_input_tokens_seen": 152876576, | |
| "step": 280500 | |
| }, | |
| { | |
| "epoch": 2.811349447734913, | |
| "grad_norm": 6.060703277587891, | |
| "learning_rate": 3.594330278533696e-05, | |
| "loss": 1.1781, | |
| "num_input_tokens_seen": 153146728, | |
| "step": 281000 | |
| }, | |
| { | |
| "epoch": 2.816351848887466, | |
| "grad_norm": 7.374409198760986, | |
| "learning_rate": 3.59182907795742e-05, | |
| "loss": 1.1925, | |
| "num_input_tokens_seen": 153416872, | |
| "step": 281500 | |
| }, | |
| { | |
| "epoch": 2.8213542500400193, | |
| "grad_norm": 6.183439254760742, | |
| "learning_rate": 3.5893278773811425e-05, | |
| "loss": 1.1588, | |
| "num_input_tokens_seen": 153688560, | |
| "step": 282000 | |
| }, | |
| { | |
| "epoch": 2.8263566511925724, | |
| "grad_norm": 7.167964935302734, | |
| "learning_rate": 3.586826676804867e-05, | |
| "loss": 1.1799, | |
| "num_input_tokens_seen": 153968432, | |
| "step": 282500 | |
| }, | |
| { | |
| "epoch": 2.8313590523451255, | |
| "grad_norm": 5.514324188232422, | |
| "learning_rate": 3.58432547622859e-05, | |
| "loss": 1.1695, | |
| "num_input_tokens_seen": 154241312, | |
| "step": 283000 | |
| }, | |
| { | |
| "epoch": 2.8363614534976787, | |
| "grad_norm": 4.6626667976379395, | |
| "learning_rate": 3.581824275652314e-05, | |
| "loss": 1.1876, | |
| "num_input_tokens_seen": 154513432, | |
| "step": 283500 | |
| }, | |
| { | |
| "epoch": 2.8413638546502322, | |
| "grad_norm": 5.130783557891846, | |
| "learning_rate": 3.5793230750760365e-05, | |
| "loss": 1.1806, | |
| "num_input_tokens_seen": 154791160, | |
| "step": 284000 | |
| }, | |
| { | |
| "epoch": 2.8463662558027853, | |
| "grad_norm": 6.905600547790527, | |
| "learning_rate": 3.57682187449976e-05, | |
| "loss": 1.1774, | |
| "num_input_tokens_seen": 155060824, | |
| "step": 284500 | |
| }, | |
| { | |
| "epoch": 2.8513686569553385, | |
| "grad_norm": 5.459284782409668, | |
| "learning_rate": 3.5743206739234835e-05, | |
| "loss": 1.17, | |
| "num_input_tokens_seen": 155335544, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 2.856371058107892, | |
| "grad_norm": 6.299667835235596, | |
| "learning_rate": 3.571819473347206e-05, | |
| "loss": 1.1751, | |
| "num_input_tokens_seen": 155613112, | |
| "step": 285500 | |
| }, | |
| { | |
| "epoch": 2.861373459260445, | |
| "grad_norm": 5.297176361083984, | |
| "learning_rate": 3.56931827277093e-05, | |
| "loss": 1.173, | |
| "num_input_tokens_seen": 155888552, | |
| "step": 286000 | |
| }, | |
| { | |
| "epoch": 2.8663758604129983, | |
| "grad_norm": 7.074682712554932, | |
| "learning_rate": 3.566817072194654e-05, | |
| "loss": 1.1753, | |
| "num_input_tokens_seen": 156163384, | |
| "step": 286500 | |
| }, | |
| { | |
| "epoch": 2.8713782615655514, | |
| "grad_norm": 7.402191638946533, | |
| "learning_rate": 3.5643158716183775e-05, | |
| "loss": 1.1763, | |
| "num_input_tokens_seen": 156433192, | |
| "step": 287000 | |
| }, | |
| { | |
| "epoch": 2.8763806627181046, | |
| "grad_norm": 6.5372419357299805, | |
| "learning_rate": 3.5618146710421e-05, | |
| "loss": 1.1931, | |
| "num_input_tokens_seen": 156704752, | |
| "step": 287500 | |
| }, | |
| { | |
| "epoch": 2.8813830638706577, | |
| "grad_norm": 6.030176162719727, | |
| "learning_rate": 3.559313470465824e-05, | |
| "loss": 1.1808, | |
| "num_input_tokens_seen": 156979192, | |
| "step": 288000 | |
| }, | |
| { | |
| "epoch": 2.8863854650232112, | |
| "grad_norm": 5.74777364730835, | |
| "learning_rate": 3.556812269889547e-05, | |
| "loss": 1.1693, | |
| "num_input_tokens_seen": 157251448, | |
| "step": 288500 | |
| }, | |
| { | |
| "epoch": 2.8913878661757644, | |
| "grad_norm": 5.995535373687744, | |
| "learning_rate": 3.55431106931327e-05, | |
| "loss": 1.1637, | |
| "num_input_tokens_seen": 157518624, | |
| "step": 289000 | |
| }, | |
| { | |
| "epoch": 2.8963902673283175, | |
| "grad_norm": 7.268390655517578, | |
| "learning_rate": 3.5518098687369936e-05, | |
| "loss": 1.171, | |
| "num_input_tokens_seen": 157789104, | |
| "step": 289500 | |
| }, | |
| { | |
| "epoch": 2.901392668480871, | |
| "grad_norm": 6.150352478027344, | |
| "learning_rate": 3.549308668160717e-05, | |
| "loss": 1.1607, | |
| "num_input_tokens_seen": 158064360, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 2.906395069633424, | |
| "grad_norm": 9.338305473327637, | |
| "learning_rate": 3.5468074675844406e-05, | |
| "loss": 1.1777, | |
| "num_input_tokens_seen": 158337456, | |
| "step": 290500 | |
| }, | |
| { | |
| "epoch": 2.9113974707859773, | |
| "grad_norm": 7.0623674392700195, | |
| "learning_rate": 3.544306267008164e-05, | |
| "loss": 1.1673, | |
| "num_input_tokens_seen": 158610232, | |
| "step": 291000 | |
| }, | |
| { | |
| "epoch": 2.9163998719385305, | |
| "grad_norm": 6.665122985839844, | |
| "learning_rate": 3.5418050664318876e-05, | |
| "loss": 1.1764, | |
| "num_input_tokens_seen": 158883800, | |
| "step": 291500 | |
| }, | |
| { | |
| "epoch": 2.9214022730910836, | |
| "grad_norm": 5.489156723022461, | |
| "learning_rate": 3.539303865855611e-05, | |
| "loss": 1.1674, | |
| "num_input_tokens_seen": 159156920, | |
| "step": 292000 | |
| }, | |
| { | |
| "epoch": 2.9264046742436367, | |
| "grad_norm": 4.9325456619262695, | |
| "learning_rate": 3.536802665279334e-05, | |
| "loss": 1.1724, | |
| "num_input_tokens_seen": 159422696, | |
| "step": 292500 | |
| }, | |
| { | |
| "epoch": 2.9314070753961903, | |
| "grad_norm": 4.590809345245361, | |
| "learning_rate": 3.5343014647030574e-05, | |
| "loss": 1.1691, | |
| "num_input_tokens_seen": 159694472, | |
| "step": 293000 | |
| }, | |
| { | |
| "epoch": 2.9364094765487434, | |
| "grad_norm": 5.634531497955322, | |
| "learning_rate": 3.531800264126781e-05, | |
| "loss": 1.1875, | |
| "num_input_tokens_seen": 159966504, | |
| "step": 293500 | |
| }, | |
| { | |
| "epoch": 2.9414118777012965, | |
| "grad_norm": 7.636883735656738, | |
| "learning_rate": 3.5292990635505044e-05, | |
| "loss": 1.1771, | |
| "num_input_tokens_seen": 160236568, | |
| "step": 294000 | |
| }, | |
| { | |
| "epoch": 2.94641427885385, | |
| "grad_norm": 4.785983562469482, | |
| "learning_rate": 3.526797862974228e-05, | |
| "loss": 1.1746, | |
| "num_input_tokens_seen": 160505800, | |
| "step": 294500 | |
| }, | |
| { | |
| "epoch": 2.9514166800064032, | |
| "grad_norm": 5.1736931800842285, | |
| "learning_rate": 3.5242966623979514e-05, | |
| "loss": 1.1753, | |
| "num_input_tokens_seen": 160785240, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 2.9564190811589564, | |
| "grad_norm": 6.308248519897461, | |
| "learning_rate": 3.521795461821675e-05, | |
| "loss": 1.1851, | |
| "num_input_tokens_seen": 161049888, | |
| "step": 295500 | |
| }, | |
| { | |
| "epoch": 2.9614214823115095, | |
| "grad_norm": 6.6797404289245605, | |
| "learning_rate": 3.519294261245398e-05, | |
| "loss": 1.1486, | |
| "num_input_tokens_seen": 161316952, | |
| "step": 296000 | |
| }, | |
| { | |
| "epoch": 2.9664238834640626, | |
| "grad_norm": 5.875812530517578, | |
| "learning_rate": 3.516793060669121e-05, | |
| "loss": 1.1689, | |
| "num_input_tokens_seen": 161584440, | |
| "step": 296500 | |
| }, | |
| { | |
| "epoch": 2.9714262846166157, | |
| "grad_norm": 6.539891719818115, | |
| "learning_rate": 3.514291860092845e-05, | |
| "loss": 1.1567, | |
| "num_input_tokens_seen": 161861384, | |
| "step": 297000 | |
| }, | |
| { | |
| "epoch": 2.9764286857691693, | |
| "grad_norm": 4.355959415435791, | |
| "learning_rate": 3.511790659516568e-05, | |
| "loss": 1.1654, | |
| "num_input_tokens_seen": 162134976, | |
| "step": 297500 | |
| }, | |
| { | |
| "epoch": 2.9814310869217224, | |
| "grad_norm": 8.101115226745605, | |
| "learning_rate": 3.509289458940292e-05, | |
| "loss": 1.1749, | |
| "num_input_tokens_seen": 162408584, | |
| "step": 298000 | |
| }, | |
| { | |
| "epoch": 2.9864334880742756, | |
| "grad_norm": 6.168905735015869, | |
| "learning_rate": 3.506788258364015e-05, | |
| "loss": 1.1823, | |
| "num_input_tokens_seen": 162688056, | |
| "step": 298500 | |
| }, | |
| { | |
| "epoch": 2.991435889226829, | |
| "grad_norm": 6.624521255493164, | |
| "learning_rate": 3.504287057787739e-05, | |
| "loss": 1.1709, | |
| "num_input_tokens_seen": 162956872, | |
| "step": 299000 | |
| }, | |
| { | |
| "epoch": 2.9964382903793823, | |
| "grad_norm": 6.812922954559326, | |
| "learning_rate": 3.5017858572114615e-05, | |
| "loss": 1.1728, | |
| "num_input_tokens_seen": 163231632, | |
| "step": 299500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.1232455968856812, | |
| "eval_runtime": 186.7896, | |
| "eval_samples_per_second": 1070.215, | |
| "eval_steps_per_second": 133.782, | |
| "num_input_tokens_seen": 163424808, | |
| "step": 299856 | |
| }, | |
| { | |
| "epoch": 3.0014406915319354, | |
| "grad_norm": 4.925895690917969, | |
| "learning_rate": 3.499284656635185e-05, | |
| "loss": 1.1557, | |
| "num_input_tokens_seen": 163507488, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 3.0064430926844885, | |
| "grad_norm": 4.663059234619141, | |
| "learning_rate": 3.4967834560589085e-05, | |
| "loss": 1.0655, | |
| "num_input_tokens_seen": 163782720, | |
| "step": 300500 | |
| }, | |
| { | |
| "epoch": 3.0114454938370416, | |
| "grad_norm": 7.381974220275879, | |
| "learning_rate": 3.494282255482632e-05, | |
| "loss": 1.0891, | |
| "num_input_tokens_seen": 164062072, | |
| "step": 301000 | |
| }, | |
| { | |
| "epoch": 3.016447894989595, | |
| "grad_norm": 6.4466094970703125, | |
| "learning_rate": 3.491781054906355e-05, | |
| "loss": 1.074, | |
| "num_input_tokens_seen": 164336176, | |
| "step": 301500 | |
| }, | |
| { | |
| "epoch": 3.0214502961421483, | |
| "grad_norm": 5.126181602478027, | |
| "learning_rate": 3.489279854330078e-05, | |
| "loss": 1.0666, | |
| "num_input_tokens_seen": 164610040, | |
| "step": 302000 | |
| }, | |
| { | |
| "epoch": 3.0264526972947015, | |
| "grad_norm": 5.322078227996826, | |
| "learning_rate": 3.4867786537538025e-05, | |
| "loss": 1.0837, | |
| "num_input_tokens_seen": 164887400, | |
| "step": 302500 | |
| }, | |
| { | |
| "epoch": 3.0314550984472546, | |
| "grad_norm": 5.671963691711426, | |
| "learning_rate": 3.484277453177525e-05, | |
| "loss": 1.0829, | |
| "num_input_tokens_seen": 165162256, | |
| "step": 303000 | |
| }, | |
| { | |
| "epoch": 3.0364574995998077, | |
| "grad_norm": 4.6445441246032715, | |
| "learning_rate": 3.481776252601249e-05, | |
| "loss": 1.0771, | |
| "num_input_tokens_seen": 165430680, | |
| "step": 303500 | |
| }, | |
| { | |
| "epoch": 3.0414599007523613, | |
| "grad_norm": 5.632525444030762, | |
| "learning_rate": 3.479275052024972e-05, | |
| "loss": 1.0893, | |
| "num_input_tokens_seen": 165706048, | |
| "step": 304000 | |
| }, | |
| { | |
| "epoch": 3.0464623019049144, | |
| "grad_norm": 4.770864963531494, | |
| "learning_rate": 3.476773851448695e-05, | |
| "loss": 1.0772, | |
| "num_input_tokens_seen": 165979496, | |
| "step": 304500 | |
| }, | |
| { | |
| "epoch": 3.0514647030574675, | |
| "grad_norm": 7.340290546417236, | |
| "learning_rate": 3.4742726508724186e-05, | |
| "loss": 1.0851, | |
| "num_input_tokens_seen": 166256376, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 3.0564671042100207, | |
| "grad_norm": 5.2338080406188965, | |
| "learning_rate": 3.471771450296142e-05, | |
| "loss": 1.0949, | |
| "num_input_tokens_seen": 166529712, | |
| "step": 305500 | |
| }, | |
| { | |
| "epoch": 3.0614695053625742, | |
| "grad_norm": 5.540538311004639, | |
| "learning_rate": 3.4692702497198656e-05, | |
| "loss": 1.0929, | |
| "num_input_tokens_seen": 166795520, | |
| "step": 306000 | |
| }, | |
| { | |
| "epoch": 3.0664719065151274, | |
| "grad_norm": 6.512203693389893, | |
| "learning_rate": 3.466769049143589e-05, | |
| "loss": 1.075, | |
| "num_input_tokens_seen": 167066728, | |
| "step": 306500 | |
| }, | |
| { | |
| "epoch": 3.0714743076676805, | |
| "grad_norm": 5.783512592315674, | |
| "learning_rate": 3.4642678485673126e-05, | |
| "loss": 1.0749, | |
| "num_input_tokens_seen": 167335800, | |
| "step": 307000 | |
| }, | |
| { | |
| "epoch": 3.0764767088202336, | |
| "grad_norm": 5.550832271575928, | |
| "learning_rate": 3.461766647991036e-05, | |
| "loss": 1.0886, | |
| "num_input_tokens_seen": 167610656, | |
| "step": 307500 | |
| }, | |
| { | |
| "epoch": 3.0814791099727867, | |
| "grad_norm": 5.394260883331299, | |
| "learning_rate": 3.459265447414759e-05, | |
| "loss": 1.0906, | |
| "num_input_tokens_seen": 167882480, | |
| "step": 308000 | |
| }, | |
| { | |
| "epoch": 3.0864815111253403, | |
| "grad_norm": 5.690032005310059, | |
| "learning_rate": 3.4567642468384824e-05, | |
| "loss": 1.0851, | |
| "num_input_tokens_seen": 168156832, | |
| "step": 308500 | |
| }, | |
| { | |
| "epoch": 3.0914839122778934, | |
| "grad_norm": 6.620737552642822, | |
| "learning_rate": 3.454263046262206e-05, | |
| "loss": 1.0931, | |
| "num_input_tokens_seen": 168435520, | |
| "step": 309000 | |
| }, | |
| { | |
| "epoch": 3.0964863134304466, | |
| "grad_norm": 6.105669021606445, | |
| "learning_rate": 3.4517618456859294e-05, | |
| "loss": 1.0755, | |
| "num_input_tokens_seen": 168708304, | |
| "step": 309500 | |
| }, | |
| { | |
| "epoch": 3.1014887145829997, | |
| "grad_norm": 6.636053562164307, | |
| "learning_rate": 3.449260645109653e-05, | |
| "loss": 1.08, | |
| "num_input_tokens_seen": 168969008, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 3.1064911157355533, | |
| "grad_norm": 7.361510753631592, | |
| "learning_rate": 3.4467594445333764e-05, | |
| "loss": 1.0981, | |
| "num_input_tokens_seen": 169241920, | |
| "step": 310500 | |
| }, | |
| { | |
| "epoch": 3.1114935168881064, | |
| "grad_norm": 4.566134929656982, | |
| "learning_rate": 3.4442582439571e-05, | |
| "loss": 1.0713, | |
| "num_input_tokens_seen": 169520520, | |
| "step": 311000 | |
| }, | |
| { | |
| "epoch": 3.1164959180406595, | |
| "grad_norm": 5.323643207550049, | |
| "learning_rate": 3.441757043380823e-05, | |
| "loss": 1.0884, | |
| "num_input_tokens_seen": 169794728, | |
| "step": 311500 | |
| }, | |
| { | |
| "epoch": 3.1214983191932126, | |
| "grad_norm": 5.005212306976318, | |
| "learning_rate": 3.439255842804546e-05, | |
| "loss": 1.0817, | |
| "num_input_tokens_seen": 170074952, | |
| "step": 312000 | |
| }, | |
| { | |
| "epoch": 3.1265007203457658, | |
| "grad_norm": 6.78676176071167, | |
| "learning_rate": 3.43675464222827e-05, | |
| "loss": 1.1067, | |
| "num_input_tokens_seen": 170350784, | |
| "step": 312500 | |
| }, | |
| { | |
| "epoch": 3.1315031214983193, | |
| "grad_norm": 5.532153129577637, | |
| "learning_rate": 3.434253441651993e-05, | |
| "loss": 1.0729, | |
| "num_input_tokens_seen": 170621008, | |
| "step": 313000 | |
| }, | |
| { | |
| "epoch": 3.1365055226508725, | |
| "grad_norm": 6.041494369506836, | |
| "learning_rate": 3.431752241075716e-05, | |
| "loss": 1.0667, | |
| "num_input_tokens_seen": 170896544, | |
| "step": 313500 | |
| }, | |
| { | |
| "epoch": 3.1415079238034256, | |
| "grad_norm": 5.707986831665039, | |
| "learning_rate": 3.42925104049944e-05, | |
| "loss": 1.0803, | |
| "num_input_tokens_seen": 171167464, | |
| "step": 314000 | |
| }, | |
| { | |
| "epoch": 3.1465103249559787, | |
| "grad_norm": 6.608933925628662, | |
| "learning_rate": 3.426749839923164e-05, | |
| "loss": 1.0944, | |
| "num_input_tokens_seen": 171437824, | |
| "step": 314500 | |
| }, | |
| { | |
| "epoch": 3.1515127261085323, | |
| "grad_norm": 4.988198280334473, | |
| "learning_rate": 3.4242486393468865e-05, | |
| "loss": 1.0928, | |
| "num_input_tokens_seen": 171713920, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 3.1565151272610854, | |
| "grad_norm": 5.763394832611084, | |
| "learning_rate": 3.42174743877061e-05, | |
| "loss": 1.0655, | |
| "num_input_tokens_seen": 171991720, | |
| "step": 315500 | |
| }, | |
| { | |
| "epoch": 3.1615175284136385, | |
| "grad_norm": 6.287621974945068, | |
| "learning_rate": 3.4192462381943335e-05, | |
| "loss": 1.1027, | |
| "num_input_tokens_seen": 172269040, | |
| "step": 316000 | |
| }, | |
| { | |
| "epoch": 3.1665199295661917, | |
| "grad_norm": 7.083132266998291, | |
| "learning_rate": 3.416745037618057e-05, | |
| "loss": 1.097, | |
| "num_input_tokens_seen": 172556304, | |
| "step": 316500 | |
| }, | |
| { | |
| "epoch": 3.1715223307187452, | |
| "grad_norm": 5.856710433959961, | |
| "learning_rate": 3.41424383704178e-05, | |
| "loss": 1.0848, | |
| "num_input_tokens_seen": 172825392, | |
| "step": 317000 | |
| }, | |
| { | |
| "epoch": 3.1765247318712984, | |
| "grad_norm": 5.9765849113464355, | |
| "learning_rate": 3.4117426364655033e-05, | |
| "loss": 1.0959, | |
| "num_input_tokens_seen": 173095280, | |
| "step": 317500 | |
| }, | |
| { | |
| "epoch": 3.1815271330238515, | |
| "grad_norm": 7.099453926086426, | |
| "learning_rate": 3.409241435889227e-05, | |
| "loss": 1.0853, | |
| "num_input_tokens_seen": 173365904, | |
| "step": 318000 | |
| }, | |
| { | |
| "epoch": 3.1865295341764046, | |
| "grad_norm": 6.180022239685059, | |
| "learning_rate": 3.40674023531295e-05, | |
| "loss": 1.0783, | |
| "num_input_tokens_seen": 173632760, | |
| "step": 318500 | |
| }, | |
| { | |
| "epoch": 3.1915319353289577, | |
| "grad_norm": 5.463505744934082, | |
| "learning_rate": 3.404239034736674e-05, | |
| "loss": 1.0838, | |
| "num_input_tokens_seen": 173903984, | |
| "step": 319000 | |
| }, | |
| { | |
| "epoch": 3.1965343364815113, | |
| "grad_norm": 5.173684120178223, | |
| "learning_rate": 3.401737834160397e-05, | |
| "loss": 1.1002, | |
| "num_input_tokens_seen": 174179216, | |
| "step": 319500 | |
| }, | |
| { | |
| "epoch": 3.2015367376340644, | |
| "grad_norm": 7.594663619995117, | |
| "learning_rate": 3.399236633584121e-05, | |
| "loss": 1.096, | |
| "num_input_tokens_seen": 174445040, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 3.2065391387866176, | |
| "grad_norm": 6.0014262199401855, | |
| "learning_rate": 3.3967354330078436e-05, | |
| "loss": 1.0874, | |
| "num_input_tokens_seen": 174714360, | |
| "step": 320500 | |
| }, | |
| { | |
| "epoch": 3.2115415399391707, | |
| "grad_norm": 6.118896961212158, | |
| "learning_rate": 3.394234232431567e-05, | |
| "loss": 1.0911, | |
| "num_input_tokens_seen": 174982632, | |
| "step": 321000 | |
| }, | |
| { | |
| "epoch": 3.2165439410917243, | |
| "grad_norm": 6.8333587646484375, | |
| "learning_rate": 3.3917330318552906e-05, | |
| "loss": 1.1197, | |
| "num_input_tokens_seen": 175255784, | |
| "step": 321500 | |
| }, | |
| { | |
| "epoch": 3.2215463422442774, | |
| "grad_norm": 4.892594337463379, | |
| "learning_rate": 3.389231831279014e-05, | |
| "loss": 1.1041, | |
| "num_input_tokens_seen": 175526640, | |
| "step": 322000 | |
| }, | |
| { | |
| "epoch": 3.2265487433968305, | |
| "grad_norm": 5.051529884338379, | |
| "learning_rate": 3.3867306307027376e-05, | |
| "loss": 1.102, | |
| "num_input_tokens_seen": 175794768, | |
| "step": 322500 | |
| }, | |
| { | |
| "epoch": 3.2315511445493836, | |
| "grad_norm": 5.638453960418701, | |
| "learning_rate": 3.384229430126461e-05, | |
| "loss": 1.1116, | |
| "num_input_tokens_seen": 176065112, | |
| "step": 323000 | |
| }, | |
| { | |
| "epoch": 3.2365535457019368, | |
| "grad_norm": 7.20506477355957, | |
| "learning_rate": 3.381728229550184e-05, | |
| "loss": 1.0918, | |
| "num_input_tokens_seen": 176336008, | |
| "step": 323500 | |
| }, | |
| { | |
| "epoch": 3.2415559468544903, | |
| "grad_norm": 7.046761512756348, | |
| "learning_rate": 3.3792270289739074e-05, | |
| "loss": 1.0734, | |
| "num_input_tokens_seen": 176605936, | |
| "step": 324000 | |
| }, | |
| { | |
| "epoch": 3.2465583480070435, | |
| "grad_norm": 6.106048107147217, | |
| "learning_rate": 3.376725828397631e-05, | |
| "loss": 1.0953, | |
| "num_input_tokens_seen": 176880432, | |
| "step": 324500 | |
| }, | |
| { | |
| "epoch": 3.2515607491595966, | |
| "grad_norm": 6.578117847442627, | |
| "learning_rate": 3.3742246278213544e-05, | |
| "loss": 1.0973, | |
| "num_input_tokens_seen": 177149064, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 3.2565631503121497, | |
| "grad_norm": 5.515709400177002, | |
| "learning_rate": 3.371723427245078e-05, | |
| "loss": 1.1044, | |
| "num_input_tokens_seen": 177427928, | |
| "step": 325500 | |
| }, | |
| { | |
| "epoch": 3.2615655514647033, | |
| "grad_norm": 6.71830940246582, | |
| "learning_rate": 3.3692222266688014e-05, | |
| "loss": 1.0983, | |
| "num_input_tokens_seen": 177700128, | |
| "step": 326000 | |
| }, | |
| { | |
| "epoch": 3.2665679526172564, | |
| "grad_norm": 6.004988670349121, | |
| "learning_rate": 3.366721026092525e-05, | |
| "loss": 1.0844, | |
| "num_input_tokens_seen": 177979224, | |
| "step": 326500 | |
| }, | |
| { | |
| "epoch": 3.2715703537698095, | |
| "grad_norm": 6.418676376342773, | |
| "learning_rate": 3.364219825516248e-05, | |
| "loss": 1.0996, | |
| "num_input_tokens_seen": 178254504, | |
| "step": 327000 | |
| }, | |
| { | |
| "epoch": 3.2765727549223627, | |
| "grad_norm": 6.826735973358154, | |
| "learning_rate": 3.361718624939971e-05, | |
| "loss": 1.086, | |
| "num_input_tokens_seen": 178527880, | |
| "step": 327500 | |
| }, | |
| { | |
| "epoch": 3.281575156074916, | |
| "grad_norm": 7.035877704620361, | |
| "learning_rate": 3.359217424363695e-05, | |
| "loss": 1.0919, | |
| "num_input_tokens_seen": 178801256, | |
| "step": 328000 | |
| }, | |
| { | |
| "epoch": 3.2865775572274694, | |
| "grad_norm": 7.336743354797363, | |
| "learning_rate": 3.356716223787418e-05, | |
| "loss": 1.1121, | |
| "num_input_tokens_seen": 179072160, | |
| "step": 328500 | |
| }, | |
| { | |
| "epoch": 3.2915799583800225, | |
| "grad_norm": 5.2435383796691895, | |
| "learning_rate": 3.354215023211141e-05, | |
| "loss": 1.0915, | |
| "num_input_tokens_seen": 179344440, | |
| "step": 329000 | |
| }, | |
| { | |
| "epoch": 3.2965823595325756, | |
| "grad_norm": 7.368856906890869, | |
| "learning_rate": 3.3517138226348646e-05, | |
| "loss": 1.0991, | |
| "num_input_tokens_seen": 179619064, | |
| "step": 329500 | |
| }, | |
| { | |
| "epoch": 3.3015847606851287, | |
| "grad_norm": 6.245655059814453, | |
| "learning_rate": 3.349212622058589e-05, | |
| "loss": 1.0883, | |
| "num_input_tokens_seen": 179888680, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 3.3065871618376823, | |
| "grad_norm": 6.055501461029053, | |
| "learning_rate": 3.3467114214823116e-05, | |
| "loss": 1.0854, | |
| "num_input_tokens_seen": 180162488, | |
| "step": 330500 | |
| }, | |
| { | |
| "epoch": 3.3115895629902354, | |
| "grad_norm": 5.36578893661499, | |
| "learning_rate": 3.344210220906035e-05, | |
| "loss": 1.0829, | |
| "num_input_tokens_seen": 180435488, | |
| "step": 331000 | |
| }, | |
| { | |
| "epoch": 3.3165919641427886, | |
| "grad_norm": 4.865072727203369, | |
| "learning_rate": 3.3417090203297585e-05, | |
| "loss": 1.0914, | |
| "num_input_tokens_seen": 180708088, | |
| "step": 331500 | |
| }, | |
| { | |
| "epoch": 3.3215943652953417, | |
| "grad_norm": 5.984726428985596, | |
| "learning_rate": 3.339207819753482e-05, | |
| "loss": 1.0988, | |
| "num_input_tokens_seen": 180972032, | |
| "step": 332000 | |
| }, | |
| { | |
| "epoch": 3.326596766447895, | |
| "grad_norm": 6.17361307144165, | |
| "learning_rate": 3.336706619177205e-05, | |
| "loss": 1.1045, | |
| "num_input_tokens_seen": 181243824, | |
| "step": 332500 | |
| }, | |
| { | |
| "epoch": 3.3315991676004484, | |
| "grad_norm": 5.614140510559082, | |
| "learning_rate": 3.3342054186009284e-05, | |
| "loss": 1.1017, | |
| "num_input_tokens_seen": 181516200, | |
| "step": 333000 | |
| }, | |
| { | |
| "epoch": 3.3366015687530015, | |
| "grad_norm": 6.182852268218994, | |
| "learning_rate": 3.331704218024652e-05, | |
| "loss": 1.104, | |
| "num_input_tokens_seen": 181789432, | |
| "step": 333500 | |
| }, | |
| { | |
| "epoch": 3.3416039699055546, | |
| "grad_norm": 6.281063079833984, | |
| "learning_rate": 3.3292030174483754e-05, | |
| "loss": 1.1114, | |
| "num_input_tokens_seen": 182061728, | |
| "step": 334000 | |
| }, | |
| { | |
| "epoch": 3.3466063710581078, | |
| "grad_norm": 5.531891822814941, | |
| "learning_rate": 3.326701816872099e-05, | |
| "loss": 1.0908, | |
| "num_input_tokens_seen": 182340072, | |
| "step": 334500 | |
| }, | |
| { | |
| "epoch": 3.3516087722106613, | |
| "grad_norm": 5.755847930908203, | |
| "learning_rate": 3.3242006162958223e-05, | |
| "loss": 1.0978, | |
| "num_input_tokens_seen": 182603520, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 3.3566111733632145, | |
| "grad_norm": 5.261629104614258, | |
| "learning_rate": 3.321699415719546e-05, | |
| "loss": 1.089, | |
| "num_input_tokens_seen": 182871456, | |
| "step": 335500 | |
| }, | |
| { | |
| "epoch": 3.3616135745157676, | |
| "grad_norm": 9.492514610290527, | |
| "learning_rate": 3.319198215143269e-05, | |
| "loss": 1.0943, | |
| "num_input_tokens_seen": 183145224, | |
| "step": 336000 | |
| }, | |
| { | |
| "epoch": 3.3666159756683207, | |
| "grad_norm": 5.316561222076416, | |
| "learning_rate": 3.316697014566992e-05, | |
| "loss": 1.1153, | |
| "num_input_tokens_seen": 183418328, | |
| "step": 336500 | |
| }, | |
| { | |
| "epoch": 3.371618376820874, | |
| "grad_norm": 4.869199275970459, | |
| "learning_rate": 3.3141958139907157e-05, | |
| "loss": 1.0922, | |
| "num_input_tokens_seen": 183701552, | |
| "step": 337000 | |
| }, | |
| { | |
| "epoch": 3.3766207779734274, | |
| "grad_norm": 5.928160667419434, | |
| "learning_rate": 3.311694613414439e-05, | |
| "loss": 1.1058, | |
| "num_input_tokens_seen": 183971600, | |
| "step": 337500 | |
| }, | |
| { | |
| "epoch": 3.3816231791259805, | |
| "grad_norm": 5.425112724304199, | |
| "learning_rate": 3.3091934128381626e-05, | |
| "loss": 1.0863, | |
| "num_input_tokens_seen": 184239416, | |
| "step": 338000 | |
| }, | |
| { | |
| "epoch": 3.3866255802785337, | |
| "grad_norm": 5.094555854797363, | |
| "learning_rate": 3.306692212261886e-05, | |
| "loss": 1.0826, | |
| "num_input_tokens_seen": 184503600, | |
| "step": 338500 | |
| }, | |
| { | |
| "epoch": 3.391627981431087, | |
| "grad_norm": 6.472997665405273, | |
| "learning_rate": 3.3041910116856096e-05, | |
| "loss": 1.0987, | |
| "num_input_tokens_seen": 184777584, | |
| "step": 339000 | |
| }, | |
| { | |
| "epoch": 3.3966303825836404, | |
| "grad_norm": 5.41008996963501, | |
| "learning_rate": 3.3016898111093325e-05, | |
| "loss": 1.1025, | |
| "num_input_tokens_seen": 185048208, | |
| "step": 339500 | |
| }, | |
| { | |
| "epoch": 3.4016327837361935, | |
| "grad_norm": 6.235612869262695, | |
| "learning_rate": 3.299188610533056e-05, | |
| "loss": 1.1097, | |
| "num_input_tokens_seen": 185317376, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 3.4066351848887466, | |
| "grad_norm": 5.876267910003662, | |
| "learning_rate": 3.2966874099567795e-05, | |
| "loss": 1.111, | |
| "num_input_tokens_seen": 185588416, | |
| "step": 340500 | |
| }, | |
| { | |
| "epoch": 3.4116375860412997, | |
| "grad_norm": 4.517580032348633, | |
| "learning_rate": 3.294186209380502e-05, | |
| "loss": 1.0877, | |
| "num_input_tokens_seen": 185855440, | |
| "step": 341000 | |
| }, | |
| { | |
| "epoch": 3.416639987193853, | |
| "grad_norm": 7.28811502456665, | |
| "learning_rate": 3.2916850088042264e-05, | |
| "loss": 1.1021, | |
| "num_input_tokens_seen": 186125800, | |
| "step": 341500 | |
| }, | |
| { | |
| "epoch": 3.4216423883464064, | |
| "grad_norm": 7.394123077392578, | |
| "learning_rate": 3.28918380822795e-05, | |
| "loss": 1.1103, | |
| "num_input_tokens_seen": 186390984, | |
| "step": 342000 | |
| }, | |
| { | |
| "epoch": 3.4266447894989596, | |
| "grad_norm": 6.393476963043213, | |
| "learning_rate": 3.286682607651673e-05, | |
| "loss": 1.0946, | |
| "num_input_tokens_seen": 186659016, | |
| "step": 342500 | |
| }, | |
| { | |
| "epoch": 3.4316471906515127, | |
| "grad_norm": 5.5101470947265625, | |
| "learning_rate": 3.284181407075396e-05, | |
| "loss": 1.1031, | |
| "num_input_tokens_seen": 186933800, | |
| "step": 343000 | |
| }, | |
| { | |
| "epoch": 3.436649591804066, | |
| "grad_norm": 5.820064067840576, | |
| "learning_rate": 3.28168020649912e-05, | |
| "loss": 1.0992, | |
| "num_input_tokens_seen": 187203688, | |
| "step": 343500 | |
| }, | |
| { | |
| "epoch": 3.4416519929566194, | |
| "grad_norm": 4.500607013702393, | |
| "learning_rate": 3.279179005922843e-05, | |
| "loss": 1.1083, | |
| "num_input_tokens_seen": 187477344, | |
| "step": 344000 | |
| }, | |
| { | |
| "epoch": 3.4466543941091725, | |
| "grad_norm": 6.536877632141113, | |
| "learning_rate": 3.276677805346566e-05, | |
| "loss": 1.0988, | |
| "num_input_tokens_seen": 187753544, | |
| "step": 344500 | |
| }, | |
| { | |
| "epoch": 3.4516567952617256, | |
| "grad_norm": 6.723674774169922, | |
| "learning_rate": 3.2741766047702896e-05, | |
| "loss": 1.0814, | |
| "num_input_tokens_seen": 188027648, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 3.4566591964142788, | |
| "grad_norm": 5.175849437713623, | |
| "learning_rate": 3.271675404194014e-05, | |
| "loss": 1.1108, | |
| "num_input_tokens_seen": 188298976, | |
| "step": 345500 | |
| }, | |
| { | |
| "epoch": 3.461661597566832, | |
| "grad_norm": 4.006369590759277, | |
| "learning_rate": 3.2691742036177366e-05, | |
| "loss": 1.102, | |
| "num_input_tokens_seen": 188573952, | |
| "step": 346000 | |
| }, | |
| { | |
| "epoch": 3.4666639987193855, | |
| "grad_norm": 5.444148063659668, | |
| "learning_rate": 3.26667300304146e-05, | |
| "loss": 1.0963, | |
| "num_input_tokens_seen": 188843992, | |
| "step": 346500 | |
| }, | |
| { | |
| "epoch": 3.4716663998719386, | |
| "grad_norm": 6.093343257904053, | |
| "learning_rate": 3.2641718024651836e-05, | |
| "loss": 1.1117, | |
| "num_input_tokens_seen": 189117128, | |
| "step": 347000 | |
| }, | |
| { | |
| "epoch": 3.4766688010244917, | |
| "grad_norm": 5.752835750579834, | |
| "learning_rate": 3.261670601888907e-05, | |
| "loss": 1.0973, | |
| "num_input_tokens_seen": 189389104, | |
| "step": 347500 | |
| }, | |
| { | |
| "epoch": 3.481671202177045, | |
| "grad_norm": 4.975690841674805, | |
| "learning_rate": 3.25916940131263e-05, | |
| "loss": 1.1074, | |
| "num_input_tokens_seen": 189665040, | |
| "step": 348000 | |
| }, | |
| { | |
| "epoch": 3.4866736033295984, | |
| "grad_norm": 5.228826999664307, | |
| "learning_rate": 3.2566682007363534e-05, | |
| "loss": 1.0942, | |
| "num_input_tokens_seen": 189939112, | |
| "step": 348500 | |
| }, | |
| { | |
| "epoch": 3.4916760044821515, | |
| "grad_norm": 5.240488052368164, | |
| "learning_rate": 3.254167000160077e-05, | |
| "loss": 1.1023, | |
| "num_input_tokens_seen": 190214888, | |
| "step": 349000 | |
| }, | |
| { | |
| "epoch": 3.4966784056347047, | |
| "grad_norm": 6.247119903564453, | |
| "learning_rate": 3.2516657995838004e-05, | |
| "loss": 1.1001, | |
| "num_input_tokens_seen": 190486416, | |
| "step": 349500 | |
| }, | |
| { | |
| "epoch": 3.501680806787258, | |
| "grad_norm": 7.789793968200684, | |
| "learning_rate": 3.249164599007524e-05, | |
| "loss": 1.1066, | |
| "num_input_tokens_seen": 190761576, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 3.506683207939811, | |
| "grad_norm": 4.448274612426758, | |
| "learning_rate": 3.2466633984312474e-05, | |
| "loss": 1.1009, | |
| "num_input_tokens_seen": 191031280, | |
| "step": 350500 | |
| }, | |
| { | |
| "epoch": 3.5116856090923645, | |
| "grad_norm": 7.334349632263184, | |
| "learning_rate": 3.244162197854971e-05, | |
| "loss": 1.1059, | |
| "num_input_tokens_seen": 191300640, | |
| "step": 351000 | |
| }, | |
| { | |
| "epoch": 3.5166880102449176, | |
| "grad_norm": 6.003718852996826, | |
| "learning_rate": 3.241660997278694e-05, | |
| "loss": 1.097, | |
| "num_input_tokens_seen": 191573504, | |
| "step": 351500 | |
| }, | |
| { | |
| "epoch": 3.5216904113974707, | |
| "grad_norm": 5.930721759796143, | |
| "learning_rate": 3.239159796702417e-05, | |
| "loss": 1.0897, | |
| "num_input_tokens_seen": 191844912, | |
| "step": 352000 | |
| }, | |
| { | |
| "epoch": 3.526692812550024, | |
| "grad_norm": 4.852160453796387, | |
| "learning_rate": 3.236658596126141e-05, | |
| "loss": 1.0989, | |
| "num_input_tokens_seen": 192115928, | |
| "step": 352500 | |
| }, | |
| { | |
| "epoch": 3.5316952137025774, | |
| "grad_norm": 5.043008327484131, | |
| "learning_rate": 3.234157395549864e-05, | |
| "loss": 1.0835, | |
| "num_input_tokens_seen": 192381304, | |
| "step": 353000 | |
| }, | |
| { | |
| "epoch": 3.5366976148551306, | |
| "grad_norm": 5.529479503631592, | |
| "learning_rate": 3.231656194973588e-05, | |
| "loss": 1.1085, | |
| "num_input_tokens_seen": 192651760, | |
| "step": 353500 | |
| }, | |
| { | |
| "epoch": 3.5417000160076837, | |
| "grad_norm": 6.701032638549805, | |
| "learning_rate": 3.229154994397311e-05, | |
| "loss": 1.1058, | |
| "num_input_tokens_seen": 192924280, | |
| "step": 354000 | |
| }, | |
| { | |
| "epoch": 3.546702417160237, | |
| "grad_norm": 6.587806224822998, | |
| "learning_rate": 3.2266537938210347e-05, | |
| "loss": 1.0952, | |
| "num_input_tokens_seen": 193194504, | |
| "step": 354500 | |
| }, | |
| { | |
| "epoch": 3.55170481831279, | |
| "grad_norm": 5.651816368103027, | |
| "learning_rate": 3.2241525932447575e-05, | |
| "loss": 1.0987, | |
| "num_input_tokens_seen": 193457896, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 3.5567072194653435, | |
| "grad_norm": 4.911685943603516, | |
| "learning_rate": 3.221651392668481e-05, | |
| "loss": 1.0894, | |
| "num_input_tokens_seen": 193726256, | |
| "step": 355500 | |
| }, | |
| { | |
| "epoch": 3.5617096206178966, | |
| "grad_norm": 5.760750770568848, | |
| "learning_rate": 3.2191501920922045e-05, | |
| "loss": 1.1061, | |
| "num_input_tokens_seen": 194000904, | |
| "step": 356000 | |
| }, | |
| { | |
| "epoch": 3.5667120217704498, | |
| "grad_norm": 5.3068647384643555, | |
| "learning_rate": 3.216648991515927e-05, | |
| "loss": 1.0917, | |
| "num_input_tokens_seen": 194271728, | |
| "step": 356500 | |
| }, | |
| { | |
| "epoch": 3.571714422923003, | |
| "grad_norm": 5.526483535766602, | |
| "learning_rate": 3.2141477909396515e-05, | |
| "loss": 1.0853, | |
| "num_input_tokens_seen": 194541832, | |
| "step": 357000 | |
| }, | |
| { | |
| "epoch": 3.5767168240755565, | |
| "grad_norm": 6.068410396575928, | |
| "learning_rate": 3.211646590363375e-05, | |
| "loss": 1.1037, | |
| "num_input_tokens_seen": 194815496, | |
| "step": 357500 | |
| }, | |
| { | |
| "epoch": 3.5817192252281096, | |
| "grad_norm": 5.573991775512695, | |
| "learning_rate": 3.2091453897870985e-05, | |
| "loss": 1.104, | |
| "num_input_tokens_seen": 195090856, | |
| "step": 358000 | |
| }, | |
| { | |
| "epoch": 3.5867216263806627, | |
| "grad_norm": 7.24959135055542, | |
| "learning_rate": 3.206644189210821e-05, | |
| "loss": 1.1011, | |
| "num_input_tokens_seen": 195370496, | |
| "step": 358500 | |
| }, | |
| { | |
| "epoch": 3.591724027533216, | |
| "grad_norm": 5.9966535568237305, | |
| "learning_rate": 3.204142988634545e-05, | |
| "loss": 1.1042, | |
| "num_input_tokens_seen": 195642920, | |
| "step": 359000 | |
| }, | |
| { | |
| "epoch": 3.596726428685769, | |
| "grad_norm": 10.24399185180664, | |
| "learning_rate": 3.201641788058268e-05, | |
| "loss": 1.1068, | |
| "num_input_tokens_seen": 195918104, | |
| "step": 359500 | |
| }, | |
| { | |
| "epoch": 3.6017288298383225, | |
| "grad_norm": 6.0826215744018555, | |
| "learning_rate": 3.199140587481991e-05, | |
| "loss": 1.0953, | |
| "num_input_tokens_seen": 196193816, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 3.6067312309908757, | |
| "grad_norm": 5.735098838806152, | |
| "learning_rate": 3.1966393869057146e-05, | |
| "loss": 1.0956, | |
| "num_input_tokens_seen": 196461344, | |
| "step": 360500 | |
| }, | |
| { | |
| "epoch": 3.611733632143429, | |
| "grad_norm": 4.604750156402588, | |
| "learning_rate": 3.194138186329438e-05, | |
| "loss": 1.0863, | |
| "num_input_tokens_seen": 196732248, | |
| "step": 361000 | |
| }, | |
| { | |
| "epoch": 3.616736033295982, | |
| "grad_norm": 5.826147079467773, | |
| "learning_rate": 3.191636985753162e-05, | |
| "loss": 1.1043, | |
| "num_input_tokens_seen": 197008704, | |
| "step": 361500 | |
| }, | |
| { | |
| "epoch": 3.6217384344485355, | |
| "grad_norm": 6.071508884429932, | |
| "learning_rate": 3.189135785176885e-05, | |
| "loss": 1.1086, | |
| "num_input_tokens_seen": 197287472, | |
| "step": 362000 | |
| }, | |
| { | |
| "epoch": 3.6267408356010886, | |
| "grad_norm": 7.109647750854492, | |
| "learning_rate": 3.1866345846006086e-05, | |
| "loss": 1.1049, | |
| "num_input_tokens_seen": 197561376, | |
| "step": 362500 | |
| }, | |
| { | |
| "epoch": 3.6317432367536417, | |
| "grad_norm": 5.95808219909668, | |
| "learning_rate": 3.184133384024332e-05, | |
| "loss": 1.1102, | |
| "num_input_tokens_seen": 197833112, | |
| "step": 363000 | |
| }, | |
| { | |
| "epoch": 3.636745637906195, | |
| "grad_norm": 5.6464080810546875, | |
| "learning_rate": 3.181632183448055e-05, | |
| "loss": 1.1011, | |
| "num_input_tokens_seen": 198108448, | |
| "step": 363500 | |
| }, | |
| { | |
| "epoch": 3.641748039058748, | |
| "grad_norm": 6.354126453399658, | |
| "learning_rate": 3.1791309828717784e-05, | |
| "loss": 1.1115, | |
| "num_input_tokens_seen": 198384984, | |
| "step": 364000 | |
| }, | |
| { | |
| "epoch": 3.6467504402113016, | |
| "grad_norm": 4.0459394454956055, | |
| "learning_rate": 3.176629782295502e-05, | |
| "loss": 1.0864, | |
| "num_input_tokens_seen": 198658952, | |
| "step": 364500 | |
| }, | |
| { | |
| "epoch": 3.6517528413638547, | |
| "grad_norm": 5.361639022827148, | |
| "learning_rate": 3.1741285817192254e-05, | |
| "loss": 1.0827, | |
| "num_input_tokens_seen": 198924888, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 3.656755242516408, | |
| "grad_norm": 5.508306503295898, | |
| "learning_rate": 3.171627381142949e-05, | |
| "loss": 1.1204, | |
| "num_input_tokens_seen": 199201528, | |
| "step": 365500 | |
| }, | |
| { | |
| "epoch": 3.661757643668961, | |
| "grad_norm": 5.771850109100342, | |
| "learning_rate": 3.1691261805666724e-05, | |
| "loss": 1.0936, | |
| "num_input_tokens_seen": 199477568, | |
| "step": 366000 | |
| }, | |
| { | |
| "epoch": 3.6667600448215145, | |
| "grad_norm": 5.311666011810303, | |
| "learning_rate": 3.166624979990396e-05, | |
| "loss": 1.0837, | |
| "num_input_tokens_seen": 199742528, | |
| "step": 366500 | |
| }, | |
| { | |
| "epoch": 3.6717624459740676, | |
| "grad_norm": 6.869203090667725, | |
| "learning_rate": 3.164123779414119e-05, | |
| "loss": 1.0877, | |
| "num_input_tokens_seen": 200016352, | |
| "step": 367000 | |
| }, | |
| { | |
| "epoch": 3.6767648471266208, | |
| "grad_norm": 5.720645427703857, | |
| "learning_rate": 3.161622578837842e-05, | |
| "loss": 1.1157, | |
| "num_input_tokens_seen": 200288848, | |
| "step": 367500 | |
| }, | |
| { | |
| "epoch": 3.681767248279174, | |
| "grad_norm": 4.348053455352783, | |
| "learning_rate": 3.159121378261566e-05, | |
| "loss": 1.1081, | |
| "num_input_tokens_seen": 200560176, | |
| "step": 368000 | |
| }, | |
| { | |
| "epoch": 3.686769649431727, | |
| "grad_norm": 10.115488052368164, | |
| "learning_rate": 3.1566201776852885e-05, | |
| "loss": 1.0972, | |
| "num_input_tokens_seen": 200829688, | |
| "step": 368500 | |
| }, | |
| { | |
| "epoch": 3.6917720505842806, | |
| "grad_norm": 5.798775672912598, | |
| "learning_rate": 3.154118977109013e-05, | |
| "loss": 1.0972, | |
| "num_input_tokens_seen": 201096760, | |
| "step": 369000 | |
| }, | |
| { | |
| "epoch": 3.6967744517368337, | |
| "grad_norm": 6.090835094451904, | |
| "learning_rate": 3.151617776532736e-05, | |
| "loss": 1.0971, | |
| "num_input_tokens_seen": 201367440, | |
| "step": 369500 | |
| }, | |
| { | |
| "epoch": 3.701776852889387, | |
| "grad_norm": 5.695186138153076, | |
| "learning_rate": 3.14911657595646e-05, | |
| "loss": 1.0839, | |
| "num_input_tokens_seen": 201639960, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 3.70677925404194, | |
| "grad_norm": 7.136424541473389, | |
| "learning_rate": 3.1466153753801825e-05, | |
| "loss": 1.1157, | |
| "num_input_tokens_seen": 201913680, | |
| "step": 370500 | |
| }, | |
| { | |
| "epoch": 3.7117816551944935, | |
| "grad_norm": 5.564599514007568, | |
| "learning_rate": 3.144114174803906e-05, | |
| "loss": 1.0987, | |
| "num_input_tokens_seen": 202193184, | |
| "step": 371000 | |
| }, | |
| { | |
| "epoch": 3.7167840563470467, | |
| "grad_norm": 5.429393291473389, | |
| "learning_rate": 3.1416129742276295e-05, | |
| "loss": 1.0872, | |
| "num_input_tokens_seen": 202465104, | |
| "step": 371500 | |
| }, | |
| { | |
| "epoch": 3.7217864574996, | |
| "grad_norm": 6.241130828857422, | |
| "learning_rate": 3.139111773651352e-05, | |
| "loss": 1.1101, | |
| "num_input_tokens_seen": 202739128, | |
| "step": 372000 | |
| }, | |
| { | |
| "epoch": 3.726788858652153, | |
| "grad_norm": 5.023561954498291, | |
| "learning_rate": 3.136610573075076e-05, | |
| "loss": 1.1091, | |
| "num_input_tokens_seen": 203013680, | |
| "step": 372500 | |
| }, | |
| { | |
| "epoch": 3.731791259804706, | |
| "grad_norm": 4.592106342315674, | |
| "learning_rate": 3.1341093724988e-05, | |
| "loss": 1.105, | |
| "num_input_tokens_seen": 203285192, | |
| "step": 373000 | |
| }, | |
| { | |
| "epoch": 3.7367936609572596, | |
| "grad_norm": 4.939518928527832, | |
| "learning_rate": 3.1316081719225235e-05, | |
| "loss": 1.1075, | |
| "num_input_tokens_seen": 203559176, | |
| "step": 373500 | |
| }, | |
| { | |
| "epoch": 3.7417960621098127, | |
| "grad_norm": 5.232937812805176, | |
| "learning_rate": 3.129106971346246e-05, | |
| "loss": 1.1105, | |
| "num_input_tokens_seen": 203835152, | |
| "step": 374000 | |
| }, | |
| { | |
| "epoch": 3.746798463262366, | |
| "grad_norm": 4.963284492492676, | |
| "learning_rate": 3.12660577076997e-05, | |
| "loss": 1.0907, | |
| "num_input_tokens_seen": 204105752, | |
| "step": 374500 | |
| }, | |
| { | |
| "epoch": 3.751800864414919, | |
| "grad_norm": 5.728975296020508, | |
| "learning_rate": 3.124104570193693e-05, | |
| "loss": 1.1002, | |
| "num_input_tokens_seen": 204373136, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 3.7568032655674726, | |
| "grad_norm": 6.109611511230469, | |
| "learning_rate": 3.121603369617416e-05, | |
| "loss": 1.108, | |
| "num_input_tokens_seen": 204638480, | |
| "step": 375500 | |
| }, | |
| { | |
| "epoch": 3.7618056667200257, | |
| "grad_norm": 5.837881088256836, | |
| "learning_rate": 3.1191021690411396e-05, | |
| "loss": 1.1266, | |
| "num_input_tokens_seen": 204909880, | |
| "step": 376000 | |
| }, | |
| { | |
| "epoch": 3.766808067872579, | |
| "grad_norm": 6.2475666999816895, | |
| "learning_rate": 3.116600968464863e-05, | |
| "loss": 1.088, | |
| "num_input_tokens_seen": 205188256, | |
| "step": 376500 | |
| }, | |
| { | |
| "epoch": 3.771810469025132, | |
| "grad_norm": 5.80530309677124, | |
| "learning_rate": 3.1140997678885866e-05, | |
| "loss": 1.0914, | |
| "num_input_tokens_seen": 205462952, | |
| "step": 377000 | |
| }, | |
| { | |
| "epoch": 3.776812870177685, | |
| "grad_norm": 8.078316688537598, | |
| "learning_rate": 3.11159856731231e-05, | |
| "loss": 1.0968, | |
| "num_input_tokens_seen": 205733776, | |
| "step": 377500 | |
| }, | |
| { | |
| "epoch": 3.7818152713302386, | |
| "grad_norm": 6.782426834106445, | |
| "learning_rate": 3.1090973667360336e-05, | |
| "loss": 1.0869, | |
| "num_input_tokens_seen": 206004512, | |
| "step": 378000 | |
| }, | |
| { | |
| "epoch": 3.7868176724827918, | |
| "grad_norm": 5.787932395935059, | |
| "learning_rate": 3.106596166159757e-05, | |
| "loss": 1.1081, | |
| "num_input_tokens_seen": 206278760, | |
| "step": 378500 | |
| }, | |
| { | |
| "epoch": 3.791820073635345, | |
| "grad_norm": 6.141157150268555, | |
| "learning_rate": 3.10409496558348e-05, | |
| "loss": 1.1042, | |
| "num_input_tokens_seen": 206552664, | |
| "step": 379000 | |
| }, | |
| { | |
| "epoch": 3.796822474787898, | |
| "grad_norm": 5.748921871185303, | |
| "learning_rate": 3.1015937650072034e-05, | |
| "loss": 1.1058, | |
| "num_input_tokens_seen": 206824976, | |
| "step": 379500 | |
| }, | |
| { | |
| "epoch": 3.8018248759404516, | |
| "grad_norm": 5.540569305419922, | |
| "learning_rate": 3.099092564430927e-05, | |
| "loss": 1.102, | |
| "num_input_tokens_seen": 207100016, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 3.8068272770930047, | |
| "grad_norm": 6.440171718597412, | |
| "learning_rate": 3.0965913638546504e-05, | |
| "loss": 1.1179, | |
| "num_input_tokens_seen": 207370128, | |
| "step": 380500 | |
| }, | |
| { | |
| "epoch": 3.811829678245558, | |
| "grad_norm": 4.424386024475098, | |
| "learning_rate": 3.094090163278374e-05, | |
| "loss": 1.0881, | |
| "num_input_tokens_seen": 207637240, | |
| "step": 381000 | |
| }, | |
| { | |
| "epoch": 3.816832079398111, | |
| "grad_norm": 5.059506416320801, | |
| "learning_rate": 3.0915889627020974e-05, | |
| "loss": 1.0892, | |
| "num_input_tokens_seen": 207914904, | |
| "step": 381500 | |
| }, | |
| { | |
| "epoch": 3.821834480550664, | |
| "grad_norm": 5.5119805335998535, | |
| "learning_rate": 3.089087762125821e-05, | |
| "loss": 1.089, | |
| "num_input_tokens_seen": 208195200, | |
| "step": 382000 | |
| }, | |
| { | |
| "epoch": 3.8268368817032177, | |
| "grad_norm": 5.340829372406006, | |
| "learning_rate": 3.086586561549544e-05, | |
| "loss": 1.0989, | |
| "num_input_tokens_seen": 208469512, | |
| "step": 382500 | |
| }, | |
| { | |
| "epoch": 3.831839282855771, | |
| "grad_norm": 5.793147087097168, | |
| "learning_rate": 3.084085360973267e-05, | |
| "loss": 1.1051, | |
| "num_input_tokens_seen": 208740512, | |
| "step": 383000 | |
| }, | |
| { | |
| "epoch": 3.836841684008324, | |
| "grad_norm": 4.490692138671875, | |
| "learning_rate": 3.081584160396991e-05, | |
| "loss": 1.1077, | |
| "num_input_tokens_seen": 209018232, | |
| "step": 383500 | |
| }, | |
| { | |
| "epoch": 3.8418440851608775, | |
| "grad_norm": 6.107596397399902, | |
| "learning_rate": 3.0790829598207135e-05, | |
| "loss": 1.0995, | |
| "num_input_tokens_seen": 209290016, | |
| "step": 384000 | |
| }, | |
| { | |
| "epoch": 3.8468464863134306, | |
| "grad_norm": 7.825516223907471, | |
| "learning_rate": 3.076581759244438e-05, | |
| "loss": 1.0869, | |
| "num_input_tokens_seen": 209556592, | |
| "step": 384500 | |
| }, | |
| { | |
| "epoch": 3.8518488874659838, | |
| "grad_norm": 4.849490165710449, | |
| "learning_rate": 3.074080558668161e-05, | |
| "loss": 1.1221, | |
| "num_input_tokens_seen": 209832880, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 3.856851288618537, | |
| "grad_norm": 6.529792308807373, | |
| "learning_rate": 3.071579358091885e-05, | |
| "loss": 1.1062, | |
| "num_input_tokens_seen": 210114184, | |
| "step": 385500 | |
| }, | |
| { | |
| "epoch": 3.86185368977109, | |
| "grad_norm": 6.837585926055908, | |
| "learning_rate": 3.0690781575156075e-05, | |
| "loss": 1.0878, | |
| "num_input_tokens_seen": 210380480, | |
| "step": 386000 | |
| }, | |
| { | |
| "epoch": 3.866856090923643, | |
| "grad_norm": 6.309233665466309, | |
| "learning_rate": 3.066576956939331e-05, | |
| "loss": 1.1116, | |
| "num_input_tokens_seen": 210653688, | |
| "step": 386500 | |
| }, | |
| { | |
| "epoch": 3.8718584920761967, | |
| "grad_norm": 6.287944316864014, | |
| "learning_rate": 3.0640757563630545e-05, | |
| "loss": 1.1021, | |
| "num_input_tokens_seen": 210927232, | |
| "step": 387000 | |
| }, | |
| { | |
| "epoch": 3.87686089322875, | |
| "grad_norm": 5.488702774047852, | |
| "learning_rate": 3.061574555786777e-05, | |
| "loss": 1.1043, | |
| "num_input_tokens_seen": 211197296, | |
| "step": 387500 | |
| }, | |
| { | |
| "epoch": 3.881863294381303, | |
| "grad_norm": 8.246638298034668, | |
| "learning_rate": 3.059073355210501e-05, | |
| "loss": 1.0917, | |
| "num_input_tokens_seen": 211469200, | |
| "step": 388000 | |
| }, | |
| { | |
| "epoch": 3.8868656955338565, | |
| "grad_norm": 6.3921332359313965, | |
| "learning_rate": 3.056572154634224e-05, | |
| "loss": 1.1079, | |
| "num_input_tokens_seen": 211736248, | |
| "step": 388500 | |
| }, | |
| { | |
| "epoch": 3.8918680966864097, | |
| "grad_norm": 5.241750717163086, | |
| "learning_rate": 3.0540709540579485e-05, | |
| "loss": 1.0928, | |
| "num_input_tokens_seen": 212005560, | |
| "step": 389000 | |
| }, | |
| { | |
| "epoch": 3.8968704978389628, | |
| "grad_norm": 5.063024997711182, | |
| "learning_rate": 3.0515697534816713e-05, | |
| "loss": 1.1103, | |
| "num_input_tokens_seen": 212271040, | |
| "step": 389500 | |
| }, | |
| { | |
| "epoch": 3.901872898991516, | |
| "grad_norm": 5.5935139656066895, | |
| "learning_rate": 3.0490685529053948e-05, | |
| "loss": 1.1169, | |
| "num_input_tokens_seen": 212544560, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 3.906875300144069, | |
| "grad_norm": 5.44050931930542, | |
| "learning_rate": 3.0465673523291183e-05, | |
| "loss": 1.0862, | |
| "num_input_tokens_seen": 212819160, | |
| "step": 390500 | |
| }, | |
| { | |
| "epoch": 3.911877701296622, | |
| "grad_norm": 5.747745990753174, | |
| "learning_rate": 3.044066151752841e-05, | |
| "loss": 1.0898, | |
| "num_input_tokens_seen": 213087032, | |
| "step": 391000 | |
| }, | |
| { | |
| "epoch": 3.9168801024491757, | |
| "grad_norm": 5.6474995613098145, | |
| "learning_rate": 3.041564951176565e-05, | |
| "loss": 1.1183, | |
| "num_input_tokens_seen": 213366232, | |
| "step": 391500 | |
| }, | |
| { | |
| "epoch": 3.921882503601729, | |
| "grad_norm": 5.1681928634643555, | |
| "learning_rate": 3.0390637506002884e-05, | |
| "loss": 1.1001, | |
| "num_input_tokens_seen": 213633560, | |
| "step": 392000 | |
| }, | |
| { | |
| "epoch": 3.926884904754282, | |
| "grad_norm": 7.847573280334473, | |
| "learning_rate": 3.036562550024012e-05, | |
| "loss": 1.0939, | |
| "num_input_tokens_seen": 213908816, | |
| "step": 392500 | |
| }, | |
| { | |
| "epoch": 3.9318873059068356, | |
| "grad_norm": 7.0550713539123535, | |
| "learning_rate": 3.0340613494477348e-05, | |
| "loss": 1.1101, | |
| "num_input_tokens_seen": 214186464, | |
| "step": 393000 | |
| }, | |
| { | |
| "epoch": 3.9368897070593887, | |
| "grad_norm": 5.558708667755127, | |
| "learning_rate": 3.0315601488714586e-05, | |
| "loss": 1.1038, | |
| "num_input_tokens_seen": 214455448, | |
| "step": 393500 | |
| }, | |
| { | |
| "epoch": 3.941892108211942, | |
| "grad_norm": 7.920301914215088, | |
| "learning_rate": 3.029058948295182e-05, | |
| "loss": 1.1085, | |
| "num_input_tokens_seen": 214732032, | |
| "step": 394000 | |
| }, | |
| { | |
| "epoch": 3.946894509364495, | |
| "grad_norm": 6.4054789543151855, | |
| "learning_rate": 3.026557747718905e-05, | |
| "loss": 1.1035, | |
| "num_input_tokens_seen": 215009992, | |
| "step": 394500 | |
| }, | |
| { | |
| "epoch": 3.951896910517048, | |
| "grad_norm": 5.385251045227051, | |
| "learning_rate": 3.0240565471426284e-05, | |
| "loss": 1.1004, | |
| "num_input_tokens_seen": 215281032, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 3.956899311669601, | |
| "grad_norm": 6.670193672180176, | |
| "learning_rate": 3.0215553465663523e-05, | |
| "loss": 1.0895, | |
| "num_input_tokens_seen": 215547536, | |
| "step": 395500 | |
| }, | |
| { | |
| "epoch": 3.9619017128221548, | |
| "grad_norm": 9.283798217773438, | |
| "learning_rate": 3.0190541459900757e-05, | |
| "loss": 1.0853, | |
| "num_input_tokens_seen": 215818168, | |
| "step": 396000 | |
| }, | |
| { | |
| "epoch": 3.966904113974708, | |
| "grad_norm": 5.494171142578125, | |
| "learning_rate": 3.0165529454137986e-05, | |
| "loss": 1.1118, | |
| "num_input_tokens_seen": 216097808, | |
| "step": 396500 | |
| }, | |
| { | |
| "epoch": 3.971906515127261, | |
| "grad_norm": 9.865717887878418, | |
| "learning_rate": 3.014051744837522e-05, | |
| "loss": 1.1092, | |
| "num_input_tokens_seen": 216372792, | |
| "step": 397000 | |
| }, | |
| { | |
| "epoch": 3.9769089162798146, | |
| "grad_norm": 7.068398952484131, | |
| "learning_rate": 3.0115505442612456e-05, | |
| "loss": 1.0978, | |
| "num_input_tokens_seen": 216645048, | |
| "step": 397500 | |
| }, | |
| { | |
| "epoch": 3.9819113174323677, | |
| "grad_norm": 7.0897626876831055, | |
| "learning_rate": 3.0090493436849687e-05, | |
| "loss": 1.0978, | |
| "num_input_tokens_seen": 216922104, | |
| "step": 398000 | |
| }, | |
| { | |
| "epoch": 3.986913718584921, | |
| "grad_norm": 6.884424686431885, | |
| "learning_rate": 3.0065481431086922e-05, | |
| "loss": 1.1057, | |
| "num_input_tokens_seen": 217197472, | |
| "step": 398500 | |
| }, | |
| { | |
| "epoch": 3.991916119737474, | |
| "grad_norm": 8.55648136138916, | |
| "learning_rate": 3.0040469425324157e-05, | |
| "loss": 1.0986, | |
| "num_input_tokens_seen": 217464560, | |
| "step": 399000 | |
| }, | |
| { | |
| "epoch": 3.996918520890027, | |
| "grad_norm": 6.080700874328613, | |
| "learning_rate": 3.0015457419561392e-05, | |
| "loss": 1.1001, | |
| "num_input_tokens_seen": 217738936, | |
| "step": 399500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.0870640277862549, | |
| "eval_runtime": 187.0155, | |
| "eval_samples_per_second": 1068.922, | |
| "eval_steps_per_second": 133.62, | |
| "num_input_tokens_seen": 217911400, | |
| "step": 399808 | |
| }, | |
| { | |
| "epoch": 4.00192092204258, | |
| "grad_norm": 5.729778289794922, | |
| "learning_rate": 2.9990445413798624e-05, | |
| "loss": 1.041, | |
| "num_input_tokens_seen": 218020144, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 4.006923323195133, | |
| "grad_norm": 5.8337225914001465, | |
| "learning_rate": 2.996543340803586e-05, | |
| "loss": 1.0154, | |
| "num_input_tokens_seen": 218293352, | |
| "step": 400500 | |
| }, | |
| { | |
| "epoch": 4.0119257243476865, | |
| "grad_norm": 6.142926216125488, | |
| "learning_rate": 2.9940421402273094e-05, | |
| "loss": 1.0043, | |
| "num_input_tokens_seen": 218559112, | |
| "step": 401000 | |
| }, | |
| { | |
| "epoch": 4.0169281255002405, | |
| "grad_norm": 4.911243915557861, | |
| "learning_rate": 2.9915409396510325e-05, | |
| "loss": 1.0061, | |
| "num_input_tokens_seen": 218839224, | |
| "step": 401500 | |
| }, | |
| { | |
| "epoch": 4.021930526652794, | |
| "grad_norm": 5.466070175170898, | |
| "learning_rate": 2.989039739074756e-05, | |
| "loss": 0.9953, | |
| "num_input_tokens_seen": 219111776, | |
| "step": 402000 | |
| }, | |
| { | |
| "epoch": 4.026932927805347, | |
| "grad_norm": 6.471262454986572, | |
| "learning_rate": 2.9865385384984795e-05, | |
| "loss": 0.9955, | |
| "num_input_tokens_seen": 219383912, | |
| "step": 402500 | |
| }, | |
| { | |
| "epoch": 4.0319353289579, | |
| "grad_norm": 7.179049491882324, | |
| "learning_rate": 2.9840373379222027e-05, | |
| "loss": 1.0101, | |
| "num_input_tokens_seen": 219647992, | |
| "step": 403000 | |
| }, | |
| { | |
| "epoch": 4.036937730110453, | |
| "grad_norm": 5.031703948974609, | |
| "learning_rate": 2.981536137345926e-05, | |
| "loss": 1.0021, | |
| "num_input_tokens_seen": 219913768, | |
| "step": 403500 | |
| }, | |
| { | |
| "epoch": 4.041940131263006, | |
| "grad_norm": 4.3193840980529785, | |
| "learning_rate": 2.9790349367696497e-05, | |
| "loss": 1.0078, | |
| "num_input_tokens_seen": 220190376, | |
| "step": 404000 | |
| }, | |
| { | |
| "epoch": 4.046942532415559, | |
| "grad_norm": 5.400819778442383, | |
| "learning_rate": 2.976533736193373e-05, | |
| "loss": 0.9949, | |
| "num_input_tokens_seen": 220460224, | |
| "step": 404500 | |
| }, | |
| { | |
| "epoch": 4.051944933568112, | |
| "grad_norm": 6.279000759124756, | |
| "learning_rate": 2.9740325356170963e-05, | |
| "loss": 1.0322, | |
| "num_input_tokens_seen": 220736472, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 4.0569473347206655, | |
| "grad_norm": 7.3011627197265625, | |
| "learning_rate": 2.9715313350408198e-05, | |
| "loss": 1.0112, | |
| "num_input_tokens_seen": 221004608, | |
| "step": 405500 | |
| }, | |
| { | |
| "epoch": 4.0619497358732195, | |
| "grad_norm": 6.007471561431885, | |
| "learning_rate": 2.9690301344645433e-05, | |
| "loss": 1.0231, | |
| "num_input_tokens_seen": 221278264, | |
| "step": 406000 | |
| }, | |
| { | |
| "epoch": 4.066952137025773, | |
| "grad_norm": 5.404012203216553, | |
| "learning_rate": 2.966528933888266e-05, | |
| "loss": 1.0205, | |
| "num_input_tokens_seen": 221555688, | |
| "step": 406500 | |
| }, | |
| { | |
| "epoch": 4.071954538178326, | |
| "grad_norm": 4.693950653076172, | |
| "learning_rate": 2.9640277333119896e-05, | |
| "loss": 1.021, | |
| "num_input_tokens_seen": 221832040, | |
| "step": 407000 | |
| }, | |
| { | |
| "epoch": 4.076956939330879, | |
| "grad_norm": 5.678884029388428, | |
| "learning_rate": 2.9615265327357135e-05, | |
| "loss": 1.0268, | |
| "num_input_tokens_seen": 222112352, | |
| "step": 407500 | |
| }, | |
| { | |
| "epoch": 4.081959340483432, | |
| "grad_norm": 5.514533042907715, | |
| "learning_rate": 2.959025332159437e-05, | |
| "loss": 1.0236, | |
| "num_input_tokens_seen": 222383544, | |
| "step": 408000 | |
| }, | |
| { | |
| "epoch": 4.086961741635985, | |
| "grad_norm": 6.353760719299316, | |
| "learning_rate": 2.9565241315831598e-05, | |
| "loss": 1.0076, | |
| "num_input_tokens_seen": 222653528, | |
| "step": 408500 | |
| }, | |
| { | |
| "epoch": 4.091964142788538, | |
| "grad_norm": 5.7514519691467285, | |
| "learning_rate": 2.9540229310068833e-05, | |
| "loss": 1.0175, | |
| "num_input_tokens_seen": 222929176, | |
| "step": 409000 | |
| }, | |
| { | |
| "epoch": 4.096966543941091, | |
| "grad_norm": 5.185674667358398, | |
| "learning_rate": 2.951521730430607e-05, | |
| "loss": 1.023, | |
| "num_input_tokens_seen": 223205600, | |
| "step": 409500 | |
| }, | |
| { | |
| "epoch": 4.1019689450936445, | |
| "grad_norm": 6.269286632537842, | |
| "learning_rate": 2.94902052985433e-05, | |
| "loss": 1.0177, | |
| "num_input_tokens_seen": 223471424, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 4.1069713462461985, | |
| "grad_norm": 5.551058292388916, | |
| "learning_rate": 2.9465193292780534e-05, | |
| "loss": 1.014, | |
| "num_input_tokens_seen": 223744904, | |
| "step": 410500 | |
| }, | |
| { | |
| "epoch": 4.111973747398752, | |
| "grad_norm": 7.259944438934326, | |
| "learning_rate": 2.944018128701777e-05, | |
| "loss": 1.0176, | |
| "num_input_tokens_seen": 224017960, | |
| "step": 411000 | |
| }, | |
| { | |
| "epoch": 4.116976148551305, | |
| "grad_norm": 6.2288498878479, | |
| "learning_rate": 2.9415169281255008e-05, | |
| "loss": 1.0208, | |
| "num_input_tokens_seen": 224288328, | |
| "step": 411500 | |
| }, | |
| { | |
| "epoch": 4.121978549703858, | |
| "grad_norm": 4.875370502471924, | |
| "learning_rate": 2.9390157275492236e-05, | |
| "loss": 1.0218, | |
| "num_input_tokens_seen": 224564744, | |
| "step": 412000 | |
| }, | |
| { | |
| "epoch": 4.126980950856411, | |
| "grad_norm": 5.8250603675842285, | |
| "learning_rate": 2.936514526972947e-05, | |
| "loss": 1.0176, | |
| "num_input_tokens_seen": 224833216, | |
| "step": 412500 | |
| }, | |
| { | |
| "epoch": 4.131983352008964, | |
| "grad_norm": 4.689972877502441, | |
| "learning_rate": 2.9340133263966706e-05, | |
| "loss": 1.0031, | |
| "num_input_tokens_seen": 225109008, | |
| "step": 413000 | |
| }, | |
| { | |
| "epoch": 4.136985753161517, | |
| "grad_norm": 6.370342254638672, | |
| "learning_rate": 2.9315121258203937e-05, | |
| "loss": 1.0235, | |
| "num_input_tokens_seen": 225386144, | |
| "step": 413500 | |
| }, | |
| { | |
| "epoch": 4.14198815431407, | |
| "grad_norm": 5.214616298675537, | |
| "learning_rate": 2.9290109252441172e-05, | |
| "loss": 1.0147, | |
| "num_input_tokens_seen": 225665576, | |
| "step": 414000 | |
| }, | |
| { | |
| "epoch": 4.146990555466624, | |
| "grad_norm": 5.056887626647949, | |
| "learning_rate": 2.9265097246678407e-05, | |
| "loss": 1.0134, | |
| "num_input_tokens_seen": 225936744, | |
| "step": 414500 | |
| }, | |
| { | |
| "epoch": 4.1519929566191776, | |
| "grad_norm": 7.385371685028076, | |
| "learning_rate": 2.9240085240915642e-05, | |
| "loss": 1.0133, | |
| "num_input_tokens_seen": 226206672, | |
| "step": 415000 | |
| }, | |
| { | |
| "epoch": 4.156995357771731, | |
| "grad_norm": 6.09354829788208, | |
| "learning_rate": 2.9215073235152874e-05, | |
| "loss": 1.0257, | |
| "num_input_tokens_seen": 226483208, | |
| "step": 415500 | |
| }, | |
| { | |
| "epoch": 4.161997758924284, | |
| "grad_norm": 6.554540634155273, | |
| "learning_rate": 2.919006122939011e-05, | |
| "loss": 1.004, | |
| "num_input_tokens_seen": 226756440, | |
| "step": 416000 | |
| }, | |
| { | |
| "epoch": 4.167000160076837, | |
| "grad_norm": 6.016900539398193, | |
| "learning_rate": 2.9165049223627344e-05, | |
| "loss": 1.0373, | |
| "num_input_tokens_seen": 227035824, | |
| "step": 416500 | |
| }, | |
| { | |
| "epoch": 4.17200256122939, | |
| "grad_norm": 5.212109565734863, | |
| "learning_rate": 2.9140037217864575e-05, | |
| "loss": 1.0168, | |
| "num_input_tokens_seen": 227309792, | |
| "step": 417000 | |
| }, | |
| { | |
| "epoch": 4.177004962381943, | |
| "grad_norm": 5.641068935394287, | |
| "learning_rate": 2.911502521210181e-05, | |
| "loss": 1.0184, | |
| "num_input_tokens_seen": 227578216, | |
| "step": 417500 | |
| }, | |
| { | |
| "epoch": 4.182007363534496, | |
| "grad_norm": 4.603857040405273, | |
| "learning_rate": 2.9090013206339045e-05, | |
| "loss": 1.0092, | |
| "num_input_tokens_seen": 227844640, | |
| "step": 418000 | |
| }, | |
| { | |
| "epoch": 4.187009764687049, | |
| "grad_norm": 7.76889705657959, | |
| "learning_rate": 2.906500120057628e-05, | |
| "loss": 1.0324, | |
| "num_input_tokens_seen": 228114632, | |
| "step": 418500 | |
| }, | |
| { | |
| "epoch": 4.1920121658396035, | |
| "grad_norm": 5.698912143707275, | |
| "learning_rate": 2.9039989194813512e-05, | |
| "loss": 1.0159, | |
| "num_input_tokens_seen": 228392896, | |
| "step": 419000 | |
| }, | |
| { | |
| "epoch": 4.197014566992157, | |
| "grad_norm": 4.45599365234375, | |
| "learning_rate": 2.9014977189050747e-05, | |
| "loss": 1.0243, | |
| "num_input_tokens_seen": 228657144, | |
| "step": 419500 | |
| }, | |
| { | |
| "epoch": 4.20201696814471, | |
| "grad_norm": 4.775566577911377, | |
| "learning_rate": 2.8989965183287982e-05, | |
| "loss": 1.0249, | |
| "num_input_tokens_seen": 228930912, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 4.207019369297263, | |
| "grad_norm": 4.6044511795043945, | |
| "learning_rate": 2.896495317752521e-05, | |
| "loss": 1.0171, | |
| "num_input_tokens_seen": 229190776, | |
| "step": 420500 | |
| }, | |
| { | |
| "epoch": 4.212021770449816, | |
| "grad_norm": 5.821028709411621, | |
| "learning_rate": 2.8939941171762448e-05, | |
| "loss": 1.0197, | |
| "num_input_tokens_seen": 229464464, | |
| "step": 421000 | |
| }, | |
| { | |
| "epoch": 4.217024171602369, | |
| "grad_norm": 6.407191753387451, | |
| "learning_rate": 2.8914929165999683e-05, | |
| "loss": 1.0193, | |
| "num_input_tokens_seen": 229735080, | |
| "step": 421500 | |
| }, | |
| { | |
| "epoch": 4.222026572754922, | |
| "grad_norm": 6.624352931976318, | |
| "learning_rate": 2.8889917160236918e-05, | |
| "loss": 1.0388, | |
| "num_input_tokens_seen": 230010112, | |
| "step": 422000 | |
| }, | |
| { | |
| "epoch": 4.227028973907475, | |
| "grad_norm": 5.672749042510986, | |
| "learning_rate": 2.8864905154474146e-05, | |
| "loss": 1.0094, | |
| "num_input_tokens_seen": 230273040, | |
| "step": 422500 | |
| }, | |
| { | |
| "epoch": 4.2320313750600285, | |
| "grad_norm": 4.765455722808838, | |
| "learning_rate": 2.8839893148711385e-05, | |
| "loss": 1.038, | |
| "num_input_tokens_seen": 230549568, | |
| "step": 423000 | |
| }, | |
| { | |
| "epoch": 4.2370337762125825, | |
| "grad_norm": 5.471391677856445, | |
| "learning_rate": 2.881488114294862e-05, | |
| "loss": 1.0137, | |
| "num_input_tokens_seen": 230815880, | |
| "step": 423500 | |
| }, | |
| { | |
| "epoch": 4.242036177365136, | |
| "grad_norm": 5.090280532836914, | |
| "learning_rate": 2.8789869137185848e-05, | |
| "loss": 1.0254, | |
| "num_input_tokens_seen": 231087416, | |
| "step": 424000 | |
| }, | |
| { | |
| "epoch": 4.247038578517689, | |
| "grad_norm": 5.823254585266113, | |
| "learning_rate": 2.8764857131423083e-05, | |
| "loss": 1.0369, | |
| "num_input_tokens_seen": 231351728, | |
| "step": 424500 | |
| }, | |
| { | |
| "epoch": 4.252040979670242, | |
| "grad_norm": 5.406543731689453, | |
| "learning_rate": 2.873984512566032e-05, | |
| "loss": 1.0233, | |
| "num_input_tokens_seen": 231626944, | |
| "step": 425000 | |
| }, | |
| { | |
| "epoch": 4.257043380822795, | |
| "grad_norm": 6.112472057342529, | |
| "learning_rate": 2.871483311989755e-05, | |
| "loss": 1.046, | |
| "num_input_tokens_seen": 231901904, | |
| "step": 425500 | |
| }, | |
| { | |
| "epoch": 4.262045781975348, | |
| "grad_norm": 5.495764255523682, | |
| "learning_rate": 2.8689821114134784e-05, | |
| "loss": 1.0127, | |
| "num_input_tokens_seen": 232177576, | |
| "step": 426000 | |
| }, | |
| { | |
| "epoch": 4.267048183127901, | |
| "grad_norm": 5.970737934112549, | |
| "learning_rate": 2.866480910837202e-05, | |
| "loss": 1.0266, | |
| "num_input_tokens_seen": 232448808, | |
| "step": 426500 | |
| }, | |
| { | |
| "epoch": 4.272050584280454, | |
| "grad_norm": 6.965437889099121, | |
| "learning_rate": 2.8639797102609254e-05, | |
| "loss": 1.0311, | |
| "num_input_tokens_seen": 232724960, | |
| "step": 427000 | |
| }, | |
| { | |
| "epoch": 4.2770529854330075, | |
| "grad_norm": 6.662547588348389, | |
| "learning_rate": 2.8614785096846486e-05, | |
| "loss": 1.0216, | |
| "num_input_tokens_seen": 232999800, | |
| "step": 427500 | |
| }, | |
| { | |
| "epoch": 4.2820553865855615, | |
| "grad_norm": 4.90582275390625, | |
| "learning_rate": 2.858977309108372e-05, | |
| "loss": 1.036, | |
| "num_input_tokens_seen": 233278352, | |
| "step": 428000 | |
| }, | |
| { | |
| "epoch": 4.287057787738115, | |
| "grad_norm": 5.090430736541748, | |
| "learning_rate": 2.8564761085320956e-05, | |
| "loss": 1.0253, | |
| "num_input_tokens_seen": 233542648, | |
| "step": 428500 | |
| }, | |
| { | |
| "epoch": 4.292060188890668, | |
| "grad_norm": 6.307216167449951, | |
| "learning_rate": 2.8539749079558187e-05, | |
| "loss": 1.0367, | |
| "num_input_tokens_seen": 233821008, | |
| "step": 429000 | |
| }, | |
| { | |
| "epoch": 4.297062590043221, | |
| "grad_norm": 5.634079933166504, | |
| "learning_rate": 2.8514737073795422e-05, | |
| "loss": 1.0248, | |
| "num_input_tokens_seen": 234099000, | |
| "step": 429500 | |
| }, | |
| { | |
| "epoch": 4.302064991195774, | |
| "grad_norm": 6.014862060546875, | |
| "learning_rate": 2.8489725068032657e-05, | |
| "loss": 1.0231, | |
| "num_input_tokens_seen": 234375176, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 4.307067392348327, | |
| "grad_norm": 5.199640274047852, | |
| "learning_rate": 2.8464713062269892e-05, | |
| "loss": 1.0366, | |
| "num_input_tokens_seen": 234650128, | |
| "step": 430500 | |
| }, | |
| { | |
| "epoch": 4.31206979350088, | |
| "grad_norm": 4.8902692794799805, | |
| "learning_rate": 2.8439701056507124e-05, | |
| "loss": 1.0394, | |
| "num_input_tokens_seen": 234918712, | |
| "step": 431000 | |
| }, | |
| { | |
| "epoch": 4.317072194653433, | |
| "grad_norm": 4.592429161071777, | |
| "learning_rate": 2.841468905074436e-05, | |
| "loss": 1.029, | |
| "num_input_tokens_seen": 235196600, | |
| "step": 431500 | |
| }, | |
| { | |
| "epoch": 4.3220745958059865, | |
| "grad_norm": 5.6518144607543945, | |
| "learning_rate": 2.8389677044981594e-05, | |
| "loss": 1.0217, | |
| "num_input_tokens_seen": 235465976, | |
| "step": 432000 | |
| }, | |
| { | |
| "epoch": 4.3270769969585405, | |
| "grad_norm": 5.183743000030518, | |
| "learning_rate": 2.8364665039218825e-05, | |
| "loss": 1.0262, | |
| "num_input_tokens_seen": 235745992, | |
| "step": 432500 | |
| }, | |
| { | |
| "epoch": 4.332079398111094, | |
| "grad_norm": 4.891019821166992, | |
| "learning_rate": 2.833965303345606e-05, | |
| "loss": 1.0363, | |
| "num_input_tokens_seen": 236018376, | |
| "step": 433000 | |
| }, | |
| { | |
| "epoch": 4.337081799263647, | |
| "grad_norm": 4.2536725997924805, | |
| "learning_rate": 2.8314641027693295e-05, | |
| "loss": 1.022, | |
| "num_input_tokens_seen": 236289296, | |
| "step": 433500 | |
| }, | |
| { | |
| "epoch": 4.3420842004162, | |
| "grad_norm": 6.686141014099121, | |
| "learning_rate": 2.828962902193053e-05, | |
| "loss": 1.0393, | |
| "num_input_tokens_seen": 236575672, | |
| "step": 434000 | |
| }, | |
| { | |
| "epoch": 4.347086601568753, | |
| "grad_norm": 4.4611945152282715, | |
| "learning_rate": 2.8264617016167762e-05, | |
| "loss": 1.0419, | |
| "num_input_tokens_seen": 236849624, | |
| "step": 434500 | |
| }, | |
| { | |
| "epoch": 4.352089002721306, | |
| "grad_norm": 4.447482585906982, | |
| "learning_rate": 2.8239605010404997e-05, | |
| "loss": 1.0337, | |
| "num_input_tokens_seen": 237115712, | |
| "step": 435000 | |
| }, | |
| { | |
| "epoch": 4.357091403873859, | |
| "grad_norm": 5.549137115478516, | |
| "learning_rate": 2.8214593004642232e-05, | |
| "loss": 1.0161, | |
| "num_input_tokens_seen": 237386704, | |
| "step": 435500 | |
| }, | |
| { | |
| "epoch": 4.362093805026412, | |
| "grad_norm": 6.824407577514648, | |
| "learning_rate": 2.818958099887946e-05, | |
| "loss": 1.025, | |
| "num_input_tokens_seen": 237662672, | |
| "step": 436000 | |
| }, | |
| { | |
| "epoch": 4.3670962061789655, | |
| "grad_norm": 5.618262767791748, | |
| "learning_rate": 2.8164568993116695e-05, | |
| "loss": 1.0222, | |
| "num_input_tokens_seen": 237934856, | |
| "step": 436500 | |
| }, | |
| { | |
| "epoch": 4.37209860733152, | |
| "grad_norm": 5.112995624542236, | |
| "learning_rate": 2.8139556987353933e-05, | |
| "loss": 1.0119, | |
| "num_input_tokens_seen": 238206992, | |
| "step": 437000 | |
| }, | |
| { | |
| "epoch": 4.377101008484073, | |
| "grad_norm": 5.395593166351318, | |
| "learning_rate": 2.811454498159117e-05, | |
| "loss": 1.0337, | |
| "num_input_tokens_seen": 238477792, | |
| "step": 437500 | |
| }, | |
| { | |
| "epoch": 4.382103409636626, | |
| "grad_norm": 8.581912994384766, | |
| "learning_rate": 2.8089532975828397e-05, | |
| "loss": 1.0447, | |
| "num_input_tokens_seen": 238754960, | |
| "step": 438000 | |
| }, | |
| { | |
| "epoch": 4.387105810789179, | |
| "grad_norm": 5.694709777832031, | |
| "learning_rate": 2.806452097006563e-05, | |
| "loss": 1.0404, | |
| "num_input_tokens_seen": 239027008, | |
| "step": 438500 | |
| }, | |
| { | |
| "epoch": 4.392108211941732, | |
| "grad_norm": 6.605731010437012, | |
| "learning_rate": 2.803950896430287e-05, | |
| "loss": 1.0412, | |
| "num_input_tokens_seen": 239292592, | |
| "step": 439000 | |
| }, | |
| { | |
| "epoch": 4.397110613094285, | |
| "grad_norm": 5.162715911865234, | |
| "learning_rate": 2.8014496958540098e-05, | |
| "loss": 1.0266, | |
| "num_input_tokens_seen": 239564344, | |
| "step": 439500 | |
| }, | |
| { | |
| "epoch": 4.402113014246838, | |
| "grad_norm": 8.414751052856445, | |
| "learning_rate": 2.7989484952777333e-05, | |
| "loss": 1.0461, | |
| "num_input_tokens_seen": 239839984, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 4.4071154153993914, | |
| "grad_norm": 5.043530464172363, | |
| "learning_rate": 2.7964472947014568e-05, | |
| "loss": 1.0312, | |
| "num_input_tokens_seen": 240116944, | |
| "step": 440500 | |
| }, | |
| { | |
| "epoch": 4.412117816551945, | |
| "grad_norm": 6.508191108703613, | |
| "learning_rate": 2.7939460941251806e-05, | |
| "loss": 1.0355, | |
| "num_input_tokens_seen": 240386144, | |
| "step": 441000 | |
| }, | |
| { | |
| "epoch": 4.417120217704499, | |
| "grad_norm": 4.704832077026367, | |
| "learning_rate": 2.7914448935489035e-05, | |
| "loss": 1.0252, | |
| "num_input_tokens_seen": 240655192, | |
| "step": 441500 | |
| }, | |
| { | |
| "epoch": 4.422122618857052, | |
| "grad_norm": 6.601123332977295, | |
| "learning_rate": 2.788943692972627e-05, | |
| "loss": 1.0564, | |
| "num_input_tokens_seen": 240931640, | |
| "step": 442000 | |
| }, | |
| { | |
| "epoch": 4.427125020009605, | |
| "grad_norm": 5.828186988830566, | |
| "learning_rate": 2.7864424923963504e-05, | |
| "loss": 1.0359, | |
| "num_input_tokens_seen": 241199768, | |
| "step": 442500 | |
| }, | |
| { | |
| "epoch": 4.432127421162158, | |
| "grad_norm": 4.463243007659912, | |
| "learning_rate": 2.7839412918200736e-05, | |
| "loss": 1.03, | |
| "num_input_tokens_seen": 241474320, | |
| "step": 443000 | |
| }, | |
| { | |
| "epoch": 4.437129822314711, | |
| "grad_norm": 5.028249263763428, | |
| "learning_rate": 2.781440091243797e-05, | |
| "loss": 1.0403, | |
| "num_input_tokens_seen": 241750640, | |
| "step": 443500 | |
| }, | |
| { | |
| "epoch": 4.442132223467264, | |
| "grad_norm": 4.5420684814453125, | |
| "learning_rate": 2.7789388906675206e-05, | |
| "loss": 1.0171, | |
| "num_input_tokens_seen": 242018848, | |
| "step": 444000 | |
| }, | |
| { | |
| "epoch": 4.447134624619817, | |
| "grad_norm": 7.803140640258789, | |
| "learning_rate": 2.7764376900912438e-05, | |
| "loss": 1.0297, | |
| "num_input_tokens_seen": 242284168, | |
| "step": 444500 | |
| }, | |
| { | |
| "epoch": 4.4521370257723705, | |
| "grad_norm": 5.844732761383057, | |
| "learning_rate": 2.7739364895149673e-05, | |
| "loss": 1.0348, | |
| "num_input_tokens_seen": 242553128, | |
| "step": 445000 | |
| }, | |
| { | |
| "epoch": 4.457139426924924, | |
| "grad_norm": 5.830750942230225, | |
| "learning_rate": 2.7714352889386908e-05, | |
| "loss": 1.0429, | |
| "num_input_tokens_seen": 242827088, | |
| "step": 445500 | |
| }, | |
| { | |
| "epoch": 4.462141828077478, | |
| "grad_norm": 4.908278942108154, | |
| "learning_rate": 2.7689340883624143e-05, | |
| "loss": 1.028, | |
| "num_input_tokens_seen": 243093120, | |
| "step": 446000 | |
| }, | |
| { | |
| "epoch": 4.467144229230031, | |
| "grad_norm": 5.725689888000488, | |
| "learning_rate": 2.7664328877861374e-05, | |
| "loss": 1.023, | |
| "num_input_tokens_seen": 243364816, | |
| "step": 446500 | |
| }, | |
| { | |
| "epoch": 4.472146630382584, | |
| "grad_norm": 5.354498386383057, | |
| "learning_rate": 2.763931687209861e-05, | |
| "loss": 1.0433, | |
| "num_input_tokens_seen": 243641016, | |
| "step": 447000 | |
| }, | |
| { | |
| "epoch": 4.477149031535137, | |
| "grad_norm": 6.727901458740234, | |
| "learning_rate": 2.7614304866335844e-05, | |
| "loss": 1.0142, | |
| "num_input_tokens_seen": 243912976, | |
| "step": 447500 | |
| }, | |
| { | |
| "epoch": 4.48215143268769, | |
| "grad_norm": 5.042398452758789, | |
| "learning_rate": 2.7589292860573072e-05, | |
| "loss": 1.0301, | |
| "num_input_tokens_seen": 244181688, | |
| "step": 448000 | |
| }, | |
| { | |
| "epoch": 4.487153833840243, | |
| "grad_norm": 6.022967338562012, | |
| "learning_rate": 2.756428085481031e-05, | |
| "loss": 1.0242, | |
| "num_input_tokens_seen": 244448704, | |
| "step": 448500 | |
| }, | |
| { | |
| "epoch": 4.492156234992796, | |
| "grad_norm": 5.077592849731445, | |
| "learning_rate": 2.7539268849047546e-05, | |
| "loss": 1.0373, | |
| "num_input_tokens_seen": 244722392, | |
| "step": 449000 | |
| }, | |
| { | |
| "epoch": 4.4971586361453495, | |
| "grad_norm": 5.527291774749756, | |
| "learning_rate": 2.751425684328478e-05, | |
| "loss": 1.0183, | |
| "num_input_tokens_seen": 244995952, | |
| "step": 449500 | |
| }, | |
| { | |
| "epoch": 4.502161037297903, | |
| "grad_norm": 5.025604248046875, | |
| "learning_rate": 2.748924483752201e-05, | |
| "loss": 1.0164, | |
| "num_input_tokens_seen": 245272304, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 4.507163438450457, | |
| "grad_norm": 5.344061374664307, | |
| "learning_rate": 2.7464232831759247e-05, | |
| "loss": 1.0373, | |
| "num_input_tokens_seen": 245546016, | |
| "step": 450500 | |
| }, | |
| { | |
| "epoch": 4.51216583960301, | |
| "grad_norm": 4.6710524559021, | |
| "learning_rate": 2.7439220825996482e-05, | |
| "loss": 1.0194, | |
| "num_input_tokens_seen": 245823488, | |
| "step": 451000 | |
| }, | |
| { | |
| "epoch": 4.517168240755563, | |
| "grad_norm": 5.240355014801025, | |
| "learning_rate": 2.741420882023371e-05, | |
| "loss": 1.0277, | |
| "num_input_tokens_seen": 246097728, | |
| "step": 451500 | |
| }, | |
| { | |
| "epoch": 4.522170641908116, | |
| "grad_norm": 5.007404327392578, | |
| "learning_rate": 2.7389196814470945e-05, | |
| "loss": 1.0217, | |
| "num_input_tokens_seen": 246368736, | |
| "step": 452000 | |
| }, | |
| { | |
| "epoch": 4.527173043060669, | |
| "grad_norm": 5.346477508544922, | |
| "learning_rate": 2.7364184808708184e-05, | |
| "loss": 1.0283, | |
| "num_input_tokens_seen": 246637968, | |
| "step": 452500 | |
| }, | |
| { | |
| "epoch": 4.532175444213222, | |
| "grad_norm": 5.416214466094971, | |
| "learning_rate": 2.733917280294542e-05, | |
| "loss": 1.0448, | |
| "num_input_tokens_seen": 246918648, | |
| "step": 453000 | |
| }, | |
| { | |
| "epoch": 4.537177845365775, | |
| "grad_norm": 7.101502418518066, | |
| "learning_rate": 2.7314160797182647e-05, | |
| "loss": 1.0469, | |
| "num_input_tokens_seen": 247181624, | |
| "step": 453500 | |
| }, | |
| { | |
| "epoch": 4.5421802465183285, | |
| "grad_norm": 5.758215427398682, | |
| "learning_rate": 2.728914879141988e-05, | |
| "loss": 1.0337, | |
| "num_input_tokens_seen": 247456808, | |
| "step": 454000 | |
| }, | |
| { | |
| "epoch": 4.547182647670882, | |
| "grad_norm": 7.215245246887207, | |
| "learning_rate": 2.726413678565712e-05, | |
| "loss": 1.0265, | |
| "num_input_tokens_seen": 247730456, | |
| "step": 454500 | |
| }, | |
| { | |
| "epoch": 4.552185048823436, | |
| "grad_norm": 5.474388122558594, | |
| "learning_rate": 2.7239124779894348e-05, | |
| "loss": 1.0319, | |
| "num_input_tokens_seen": 248006488, | |
| "step": 455000 | |
| }, | |
| { | |
| "epoch": 4.557187449975989, | |
| "grad_norm": 4.975455284118652, | |
| "learning_rate": 2.7214112774131583e-05, | |
| "loss": 1.029, | |
| "num_input_tokens_seen": 248283776, | |
| "step": 455500 | |
| }, | |
| { | |
| "epoch": 4.562189851128542, | |
| "grad_norm": 5.586923599243164, | |
| "learning_rate": 2.7189100768368818e-05, | |
| "loss": 1.0314, | |
| "num_input_tokens_seen": 248564784, | |
| "step": 456000 | |
| }, | |
| { | |
| "epoch": 4.567192252281095, | |
| "grad_norm": 7.202296257019043, | |
| "learning_rate": 2.7164088762606053e-05, | |
| "loss": 1.0436, | |
| "num_input_tokens_seen": 248837928, | |
| "step": 456500 | |
| }, | |
| { | |
| "epoch": 4.572194653433648, | |
| "grad_norm": 6.214195728302002, | |
| "learning_rate": 2.7139076756843285e-05, | |
| "loss": 1.0253, | |
| "num_input_tokens_seen": 249109248, | |
| "step": 457000 | |
| }, | |
| { | |
| "epoch": 4.577197054586201, | |
| "grad_norm": 6.16148567199707, | |
| "learning_rate": 2.711406475108052e-05, | |
| "loss": 1.0385, | |
| "num_input_tokens_seen": 249384792, | |
| "step": 457500 | |
| }, | |
| { | |
| "epoch": 4.582199455738754, | |
| "grad_norm": 5.71275520324707, | |
| "learning_rate": 2.7089052745317755e-05, | |
| "loss": 1.0432, | |
| "num_input_tokens_seen": 249651928, | |
| "step": 458000 | |
| }, | |
| { | |
| "epoch": 4.5872018568913075, | |
| "grad_norm": 4.817130088806152, | |
| "learning_rate": 2.7064040739554986e-05, | |
| "loss": 1.02, | |
| "num_input_tokens_seen": 249917104, | |
| "step": 458500 | |
| }, | |
| { | |
| "epoch": 4.592204258043861, | |
| "grad_norm": 5.333267688751221, | |
| "learning_rate": 2.703902873379222e-05, | |
| "loss": 1.0214, | |
| "num_input_tokens_seen": 250188040, | |
| "step": 459000 | |
| }, | |
| { | |
| "epoch": 4.597206659196415, | |
| "grad_norm": 5.287978172302246, | |
| "learning_rate": 2.7014016728029456e-05, | |
| "loss": 1.0401, | |
| "num_input_tokens_seen": 250468056, | |
| "step": 459500 | |
| }, | |
| { | |
| "epoch": 4.602209060348968, | |
| "grad_norm": 4.713915824890137, | |
| "learning_rate": 2.698900472226669e-05, | |
| "loss": 1.032, | |
| "num_input_tokens_seen": 250737232, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 4.607211461501521, | |
| "grad_norm": 6.2646965980529785, | |
| "learning_rate": 2.6963992716503923e-05, | |
| "loss": 1.0289, | |
| "num_input_tokens_seen": 251009944, | |
| "step": 460500 | |
| }, | |
| { | |
| "epoch": 4.612213862654074, | |
| "grad_norm": 6.391628742218018, | |
| "learning_rate": 2.6938980710741158e-05, | |
| "loss": 1.037, | |
| "num_input_tokens_seen": 251276984, | |
| "step": 461000 | |
| }, | |
| { | |
| "epoch": 4.617216263806627, | |
| "grad_norm": 6.245530128479004, | |
| "learning_rate": 2.6913968704978393e-05, | |
| "loss": 1.0321, | |
| "num_input_tokens_seen": 251544248, | |
| "step": 461500 | |
| }, | |
| { | |
| "epoch": 4.62221866495918, | |
| "grad_norm": 5.505767345428467, | |
| "learning_rate": 2.6888956699215624e-05, | |
| "loss": 1.023, | |
| "num_input_tokens_seen": 251820376, | |
| "step": 462000 | |
| }, | |
| { | |
| "epoch": 4.6272210661117334, | |
| "grad_norm": 5.286034107208252, | |
| "learning_rate": 2.686394469345286e-05, | |
| "loss": 1.029, | |
| "num_input_tokens_seen": 252089664, | |
| "step": 462500 | |
| }, | |
| { | |
| "epoch": 4.632223467264287, | |
| "grad_norm": 5.050361156463623, | |
| "learning_rate": 2.6838932687690094e-05, | |
| "loss": 1.0215, | |
| "num_input_tokens_seen": 252354304, | |
| "step": 463000 | |
| }, | |
| { | |
| "epoch": 4.63722586841684, | |
| "grad_norm": 4.83864164352417, | |
| "learning_rate": 2.6813920681927322e-05, | |
| "loss": 1.0292, | |
| "num_input_tokens_seen": 252625992, | |
| "step": 463500 | |
| }, | |
| { | |
| "epoch": 4.642228269569394, | |
| "grad_norm": 4.267606735229492, | |
| "learning_rate": 2.678890867616456e-05, | |
| "loss": 1.0356, | |
| "num_input_tokens_seen": 252900240, | |
| "step": 464000 | |
| }, | |
| { | |
| "epoch": 4.647230670721947, | |
| "grad_norm": 5.304383754730225, | |
| "learning_rate": 2.6763896670401796e-05, | |
| "loss": 1.0182, | |
| "num_input_tokens_seen": 253175392, | |
| "step": 464500 | |
| }, | |
| { | |
| "epoch": 4.6522330718745, | |
| "grad_norm": 7.107183933258057, | |
| "learning_rate": 2.673888466463903e-05, | |
| "loss": 1.0311, | |
| "num_input_tokens_seen": 253449480, | |
| "step": 465000 | |
| }, | |
| { | |
| "epoch": 4.657235473027053, | |
| "grad_norm": 7.010105133056641, | |
| "learning_rate": 2.671387265887626e-05, | |
| "loss": 1.0213, | |
| "num_input_tokens_seen": 253717600, | |
| "step": 465500 | |
| }, | |
| { | |
| "epoch": 4.662237874179606, | |
| "grad_norm": 5.442753791809082, | |
| "learning_rate": 2.6688860653113494e-05, | |
| "loss": 1.0624, | |
| "num_input_tokens_seen": 253996064, | |
| "step": 466000 | |
| }, | |
| { | |
| "epoch": 4.667240275332159, | |
| "grad_norm": 5.711010932922363, | |
| "learning_rate": 2.6663848647350732e-05, | |
| "loss": 1.0382, | |
| "num_input_tokens_seen": 254277992, | |
| "step": 466500 | |
| }, | |
| { | |
| "epoch": 4.6722426764847125, | |
| "grad_norm": 5.396849632263184, | |
| "learning_rate": 2.663883664158796e-05, | |
| "loss": 1.0358, | |
| "num_input_tokens_seen": 254544856, | |
| "step": 467000 | |
| }, | |
| { | |
| "epoch": 4.677245077637266, | |
| "grad_norm": 7.533030033111572, | |
| "learning_rate": 2.6613824635825195e-05, | |
| "loss": 1.0344, | |
| "num_input_tokens_seen": 254815184, | |
| "step": 467500 | |
| }, | |
| { | |
| "epoch": 4.682247478789819, | |
| "grad_norm": 6.03594446182251, | |
| "learning_rate": 2.658881263006243e-05, | |
| "loss": 1.0234, | |
| "num_input_tokens_seen": 255087048, | |
| "step": 468000 | |
| }, | |
| { | |
| "epoch": 4.687249879942373, | |
| "grad_norm": 6.070241928100586, | |
| "learning_rate": 2.656380062429967e-05, | |
| "loss": 1.0226, | |
| "num_input_tokens_seen": 255357680, | |
| "step": 468500 | |
| }, | |
| { | |
| "epoch": 4.692252281094926, | |
| "grad_norm": 6.334639549255371, | |
| "learning_rate": 2.6538788618536897e-05, | |
| "loss": 1.0262, | |
| "num_input_tokens_seen": 255624976, | |
| "step": 469000 | |
| }, | |
| { | |
| "epoch": 4.697254682247479, | |
| "grad_norm": 6.096264839172363, | |
| "learning_rate": 2.6513776612774132e-05, | |
| "loss": 1.0208, | |
| "num_input_tokens_seen": 255898192, | |
| "step": 469500 | |
| }, | |
| { | |
| "epoch": 4.702257083400032, | |
| "grad_norm": 6.717766761779785, | |
| "learning_rate": 2.6488764607011367e-05, | |
| "loss": 1.0395, | |
| "num_input_tokens_seen": 256174976, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 4.707259484552585, | |
| "grad_norm": 5.3120527267456055, | |
| "learning_rate": 2.64637526012486e-05, | |
| "loss": 1.0346, | |
| "num_input_tokens_seen": 256445504, | |
| "step": 470500 | |
| }, | |
| { | |
| "epoch": 4.712261885705138, | |
| "grad_norm": 5.765807151794434, | |
| "learning_rate": 2.6438740595485833e-05, | |
| "loss": 1.0221, | |
| "num_input_tokens_seen": 256728192, | |
| "step": 471000 | |
| }, | |
| { | |
| "epoch": 4.7172642868576915, | |
| "grad_norm": 5.730865478515625, | |
| "learning_rate": 2.6413728589723068e-05, | |
| "loss": 1.0305, | |
| "num_input_tokens_seen": 256998424, | |
| "step": 471500 | |
| }, | |
| { | |
| "epoch": 4.722266688010245, | |
| "grad_norm": 7.514993190765381, | |
| "learning_rate": 2.6388716583960303e-05, | |
| "loss": 1.0316, | |
| "num_input_tokens_seen": 257266816, | |
| "step": 472000 | |
| }, | |
| { | |
| "epoch": 4.727269089162798, | |
| "grad_norm": 4.826568603515625, | |
| "learning_rate": 2.6363704578197535e-05, | |
| "loss": 1.0336, | |
| "num_input_tokens_seen": 257528416, | |
| "step": 472500 | |
| }, | |
| { | |
| "epoch": 4.732271490315352, | |
| "grad_norm": 5.88137674331665, | |
| "learning_rate": 2.633869257243477e-05, | |
| "loss": 1.0379, | |
| "num_input_tokens_seen": 257803064, | |
| "step": 473000 | |
| }, | |
| { | |
| "epoch": 4.737273891467905, | |
| "grad_norm": 5.539977073669434, | |
| "learning_rate": 2.6313680566672005e-05, | |
| "loss": 1.0487, | |
| "num_input_tokens_seen": 258074752, | |
| "step": 473500 | |
| }, | |
| { | |
| "epoch": 4.742276292620458, | |
| "grad_norm": 4.8047871589660645, | |
| "learning_rate": 2.6288668560909236e-05, | |
| "loss": 1.0437, | |
| "num_input_tokens_seen": 258356488, | |
| "step": 474000 | |
| }, | |
| { | |
| "epoch": 4.747278693773011, | |
| "grad_norm": 5.240783214569092, | |
| "learning_rate": 2.626365655514647e-05, | |
| "loss": 1.0349, | |
| "num_input_tokens_seen": 258631128, | |
| "step": 474500 | |
| }, | |
| { | |
| "epoch": 4.752281094925564, | |
| "grad_norm": 5.152280807495117, | |
| "learning_rate": 2.6238644549383706e-05, | |
| "loss": 1.0395, | |
| "num_input_tokens_seen": 258902464, | |
| "step": 475000 | |
| }, | |
| { | |
| "epoch": 4.757283496078117, | |
| "grad_norm": 4.534987449645996, | |
| "learning_rate": 2.621363254362094e-05, | |
| "loss": 1.0417, | |
| "num_input_tokens_seen": 259167448, | |
| "step": 475500 | |
| }, | |
| { | |
| "epoch": 4.7622858972306705, | |
| "grad_norm": 5.842191219329834, | |
| "learning_rate": 2.6188620537858173e-05, | |
| "loss": 1.0251, | |
| "num_input_tokens_seen": 259437192, | |
| "step": 476000 | |
| }, | |
| { | |
| "epoch": 4.767288298383224, | |
| "grad_norm": 6.532055377960205, | |
| "learning_rate": 2.6163608532095408e-05, | |
| "loss": 1.0235, | |
| "num_input_tokens_seen": 259710112, | |
| "step": 476500 | |
| }, | |
| { | |
| "epoch": 4.772290699535777, | |
| "grad_norm": 4.665198802947998, | |
| "learning_rate": 2.6138596526332643e-05, | |
| "loss": 1.0329, | |
| "num_input_tokens_seen": 259983048, | |
| "step": 477000 | |
| }, | |
| { | |
| "epoch": 4.777293100688331, | |
| "grad_norm": 5.536545276641846, | |
| "learning_rate": 2.611358452056987e-05, | |
| "loss": 1.0144, | |
| "num_input_tokens_seen": 260257608, | |
| "step": 477500 | |
| }, | |
| { | |
| "epoch": 4.782295501840884, | |
| "grad_norm": 7.246273994445801, | |
| "learning_rate": 2.608857251480711e-05, | |
| "loss": 1.0353, | |
| "num_input_tokens_seen": 260533312, | |
| "step": 478000 | |
| }, | |
| { | |
| "epoch": 4.787297902993437, | |
| "grad_norm": 5.359396934509277, | |
| "learning_rate": 2.6063560509044344e-05, | |
| "loss": 1.0359, | |
| "num_input_tokens_seen": 260808264, | |
| "step": 478500 | |
| }, | |
| { | |
| "epoch": 4.79230030414599, | |
| "grad_norm": 5.461490154266357, | |
| "learning_rate": 2.603854850328158e-05, | |
| "loss": 1.0342, | |
| "num_input_tokens_seen": 261080680, | |
| "step": 479000 | |
| }, | |
| { | |
| "epoch": 4.797302705298543, | |
| "grad_norm": 6.074306488037109, | |
| "learning_rate": 2.6013536497518807e-05, | |
| "loss": 1.0386, | |
| "num_input_tokens_seen": 261352616, | |
| "step": 479500 | |
| }, | |
| { | |
| "epoch": 4.802305106451096, | |
| "grad_norm": 4.465676307678223, | |
| "learning_rate": 2.5988524491756046e-05, | |
| "loss": 1.0287, | |
| "num_input_tokens_seen": 261635744, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 4.8073075076036496, | |
| "grad_norm": 5.1833953857421875, | |
| "learning_rate": 2.596351248599328e-05, | |
| "loss": 1.0237, | |
| "num_input_tokens_seen": 261911376, | |
| "step": 480500 | |
| }, | |
| { | |
| "epoch": 4.812309908756203, | |
| "grad_norm": 7.636727809906006, | |
| "learning_rate": 2.593850048023051e-05, | |
| "loss": 1.0376, | |
| "num_input_tokens_seen": 262186256, | |
| "step": 481000 | |
| }, | |
| { | |
| "epoch": 4.817312309908756, | |
| "grad_norm": 5.770178318023682, | |
| "learning_rate": 2.5913488474467744e-05, | |
| "loss": 1.0221, | |
| "num_input_tokens_seen": 262461448, | |
| "step": 481500 | |
| }, | |
| { | |
| "epoch": 4.82231471106131, | |
| "grad_norm": 7.173573970794678, | |
| "learning_rate": 2.5888476468704982e-05, | |
| "loss": 1.0206, | |
| "num_input_tokens_seen": 262734672, | |
| "step": 482000 | |
| }, | |
| { | |
| "epoch": 4.827317112213863, | |
| "grad_norm": 5.029593467712402, | |
| "learning_rate": 2.5863464462942217e-05, | |
| "loss": 1.0507, | |
| "num_input_tokens_seen": 263009408, | |
| "step": 482500 | |
| }, | |
| { | |
| "epoch": 4.832319513366416, | |
| "grad_norm": 6.359258651733398, | |
| "learning_rate": 2.5838452457179445e-05, | |
| "loss": 1.0275, | |
| "num_input_tokens_seen": 263280584, | |
| "step": 483000 | |
| }, | |
| { | |
| "epoch": 4.837321914518969, | |
| "grad_norm": 5.677992820739746, | |
| "learning_rate": 2.581344045141668e-05, | |
| "loss": 1.0195, | |
| "num_input_tokens_seen": 263545768, | |
| "step": 483500 | |
| }, | |
| { | |
| "epoch": 4.842324315671522, | |
| "grad_norm": 4.935763835906982, | |
| "learning_rate": 2.578842844565392e-05, | |
| "loss": 1.0311, | |
| "num_input_tokens_seen": 263822840, | |
| "step": 484000 | |
| }, | |
| { | |
| "epoch": 4.8473267168240755, | |
| "grad_norm": 5.072977542877197, | |
| "learning_rate": 2.5763416439891147e-05, | |
| "loss": 1.0246, | |
| "num_input_tokens_seen": 264100088, | |
| "step": 484500 | |
| }, | |
| { | |
| "epoch": 4.852329117976629, | |
| "grad_norm": 6.382875442504883, | |
| "learning_rate": 2.5738404434128382e-05, | |
| "loss": 1.0211, | |
| "num_input_tokens_seen": 264373424, | |
| "step": 485000 | |
| }, | |
| { | |
| "epoch": 4.857331519129182, | |
| "grad_norm": 5.98667049407959, | |
| "learning_rate": 2.5713392428365617e-05, | |
| "loss": 1.0457, | |
| "num_input_tokens_seen": 264652616, | |
| "step": 485500 | |
| }, | |
| { | |
| "epoch": 4.862333920281735, | |
| "grad_norm": 5.859986782073975, | |
| "learning_rate": 2.568838042260285e-05, | |
| "loss": 1.0291, | |
| "num_input_tokens_seen": 264919240, | |
| "step": 486000 | |
| }, | |
| { | |
| "epoch": 4.867336321434289, | |
| "grad_norm": 5.3083271980285645, | |
| "learning_rate": 2.5663368416840083e-05, | |
| "loss": 1.024, | |
| "num_input_tokens_seen": 265186880, | |
| "step": 486500 | |
| }, | |
| { | |
| "epoch": 4.872338722586842, | |
| "grad_norm": 5.720509052276611, | |
| "learning_rate": 2.563835641107732e-05, | |
| "loss": 1.0329, | |
| "num_input_tokens_seen": 265456584, | |
| "step": 487000 | |
| }, | |
| { | |
| "epoch": 4.877341123739395, | |
| "grad_norm": 5.909444332122803, | |
| "learning_rate": 2.5613344405314553e-05, | |
| "loss": 1.0205, | |
| "num_input_tokens_seen": 265720768, | |
| "step": 487500 | |
| }, | |
| { | |
| "epoch": 4.882343524891948, | |
| "grad_norm": 4.779830455780029, | |
| "learning_rate": 2.5588332399551785e-05, | |
| "loss": 1.0226, | |
| "num_input_tokens_seen": 265991224, | |
| "step": 488000 | |
| }, | |
| { | |
| "epoch": 4.887345926044501, | |
| "grad_norm": 5.503864765167236, | |
| "learning_rate": 2.556332039378902e-05, | |
| "loss": 1.0299, | |
| "num_input_tokens_seen": 266260752, | |
| "step": 488500 | |
| }, | |
| { | |
| "epoch": 4.8923483271970545, | |
| "grad_norm": 6.2289581298828125, | |
| "learning_rate": 2.5538308388026255e-05, | |
| "loss": 1.0165, | |
| "num_input_tokens_seen": 266538032, | |
| "step": 489000 | |
| }, | |
| { | |
| "epoch": 4.897350728349608, | |
| "grad_norm": 6.163370132446289, | |
| "learning_rate": 2.5513296382263486e-05, | |
| "loss": 1.0273, | |
| "num_input_tokens_seen": 266812688, | |
| "step": 489500 | |
| }, | |
| { | |
| "epoch": 4.902353129502161, | |
| "grad_norm": 5.308876991271973, | |
| "learning_rate": 2.548828437650072e-05, | |
| "loss": 1.0302, | |
| "num_input_tokens_seen": 267083632, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 4.907355530654714, | |
| "grad_norm": 6.824766635894775, | |
| "learning_rate": 2.5463272370737956e-05, | |
| "loss": 1.0287, | |
| "num_input_tokens_seen": 267354032, | |
| "step": 490500 | |
| }, | |
| { | |
| "epoch": 4.912357931807268, | |
| "grad_norm": 5.9447102546691895, | |
| "learning_rate": 2.543826036497519e-05, | |
| "loss": 1.0552, | |
| "num_input_tokens_seen": 267626432, | |
| "step": 491000 | |
| }, | |
| { | |
| "epoch": 4.917360332959821, | |
| "grad_norm": 5.845020771026611, | |
| "learning_rate": 2.5413248359212423e-05, | |
| "loss": 1.0136, | |
| "num_input_tokens_seen": 267900520, | |
| "step": 491500 | |
| }, | |
| { | |
| "epoch": 4.922362734112374, | |
| "grad_norm": 5.4116082191467285, | |
| "learning_rate": 2.5388236353449658e-05, | |
| "loss": 1.0163, | |
| "num_input_tokens_seen": 268165832, | |
| "step": 492000 | |
| }, | |
| { | |
| "epoch": 4.927365135264927, | |
| "grad_norm": 7.0753326416015625, | |
| "learning_rate": 2.5363224347686893e-05, | |
| "loss": 1.0393, | |
| "num_input_tokens_seen": 268441848, | |
| "step": 492500 | |
| }, | |
| { | |
| "epoch": 4.93236753641748, | |
| "grad_norm": 7.350298881530762, | |
| "learning_rate": 2.533821234192412e-05, | |
| "loss": 1.0463, | |
| "num_input_tokens_seen": 268716696, | |
| "step": 493000 | |
| }, | |
| { | |
| "epoch": 4.9373699375700335, | |
| "grad_norm": 5.284552574157715, | |
| "learning_rate": 2.531320033616136e-05, | |
| "loss": 1.0311, | |
| "num_input_tokens_seen": 268988968, | |
| "step": 493500 | |
| }, | |
| { | |
| "epoch": 4.942372338722587, | |
| "grad_norm": 6.068382740020752, | |
| "learning_rate": 2.5288188330398594e-05, | |
| "loss": 1.0379, | |
| "num_input_tokens_seen": 269264776, | |
| "step": 494000 | |
| }, | |
| { | |
| "epoch": 4.94737473987514, | |
| "grad_norm": 6.806668281555176, | |
| "learning_rate": 2.526317632463583e-05, | |
| "loss": 1.0403, | |
| "num_input_tokens_seen": 269535320, | |
| "step": 494500 | |
| }, | |
| { | |
| "epoch": 4.952377141027693, | |
| "grad_norm": 5.127531051635742, | |
| "learning_rate": 2.5238164318873058e-05, | |
| "loss": 1.0269, | |
| "num_input_tokens_seen": 269805928, | |
| "step": 495000 | |
| }, | |
| { | |
| "epoch": 4.957379542180247, | |
| "grad_norm": 4.772179126739502, | |
| "learning_rate": 2.5213152313110293e-05, | |
| "loss": 1.0251, | |
| "num_input_tokens_seen": 270085064, | |
| "step": 495500 | |
| }, | |
| { | |
| "epoch": 4.9623819433328, | |
| "grad_norm": 7.208611011505127, | |
| "learning_rate": 2.518814030734753e-05, | |
| "loss": 1.0134, | |
| "num_input_tokens_seen": 270359424, | |
| "step": 496000 | |
| }, | |
| { | |
| "epoch": 4.967384344485353, | |
| "grad_norm": 5.730184555053711, | |
| "learning_rate": 2.516312830158476e-05, | |
| "loss": 1.0226, | |
| "num_input_tokens_seen": 270634264, | |
| "step": 496500 | |
| }, | |
| { | |
| "epoch": 4.972386745637906, | |
| "grad_norm": 5.047354698181152, | |
| "learning_rate": 2.5138116295821994e-05, | |
| "loss": 1.034, | |
| "num_input_tokens_seen": 270902440, | |
| "step": 497000 | |
| }, | |
| { | |
| "epoch": 4.977389146790459, | |
| "grad_norm": 4.870574951171875, | |
| "learning_rate": 2.511310429005923e-05, | |
| "loss": 1.0434, | |
| "num_input_tokens_seen": 271179104, | |
| "step": 497500 | |
| }, | |
| { | |
| "epoch": 4.9823915479430125, | |
| "grad_norm": 5.616664409637451, | |
| "learning_rate": 2.5088092284296467e-05, | |
| "loss": 1.0236, | |
| "num_input_tokens_seen": 271451512, | |
| "step": 498000 | |
| }, | |
| { | |
| "epoch": 4.987393949095566, | |
| "grad_norm": 4.676699638366699, | |
| "learning_rate": 2.5063080278533696e-05, | |
| "loss": 1.0464, | |
| "num_input_tokens_seen": 271724472, | |
| "step": 498500 | |
| }, | |
| { | |
| "epoch": 4.992396350248119, | |
| "grad_norm": 5.634840965270996, | |
| "learning_rate": 2.503806827277093e-05, | |
| "loss": 1.0291, | |
| "num_input_tokens_seen": 271995512, | |
| "step": 499000 | |
| }, | |
| { | |
| "epoch": 4.997398751400672, | |
| "grad_norm": 6.081726551055908, | |
| "learning_rate": 2.5013056267008166e-05, | |
| "loss": 1.0243, | |
| "num_input_tokens_seen": 272263560, | |
| "step": 499500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.0584163665771484, | |
| "eval_runtime": 192.7527, | |
| "eval_samples_per_second": 1037.106, | |
| "eval_steps_per_second": 129.643, | |
| "num_input_tokens_seen": 272407288, | |
| "step": 499760 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "num_input_tokens_seen": 272407288, | |
| "step": 499760, | |
| "total_flos": 7.214188795055309e+16, | |
| "train_loss": 0.0, | |
| "train_runtime": 0.0544, | |
| "train_samples_per_second": 73478382.327, | |
| "train_steps_per_second": 9184797.791, | |
| "train_tokens_per_second": 5003948262.574 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 499760, | |
| "num_input_tokens_seen": 272407288, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.214188795055309e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |