{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 37460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13347570742124934, "grad_norm": 3.1176345348358154, "learning_rate": 4.933395621996797e-05, "loss": 1.6549, "step": 500 }, { "epoch": 0.2669514148424987, "grad_norm": 2.8823812007904053, "learning_rate": 4.866657768286172e-05, "loss": 1.3869, "step": 1000 }, { "epoch": 0.400427122263748, "grad_norm": 3.436349630355835, "learning_rate": 4.7999199145755475e-05, "loss": 1.31, "step": 1500 }, { "epoch": 0.5339028296849974, "grad_norm": 2.7581417560577393, "learning_rate": 4.733182060864923e-05, "loss": 1.2632, "step": 2000 }, { "epoch": 0.6673785371062466, "grad_norm": 3.0282974243164062, "learning_rate": 4.666444207154298e-05, "loss": 1.2197, "step": 2500 }, { "epoch": 0.800854244527496, "grad_norm": 2.7351794242858887, "learning_rate": 4.599706353443674e-05, "loss": 1.1942, "step": 3000 }, { "epoch": 0.9343299519487454, "grad_norm": 2.228116035461426, "learning_rate": 4.532968499733049e-05, "loss": 1.1273, "step": 3500 }, { "epoch": 1.0678056593699947, "grad_norm": 3.064039707183838, "learning_rate": 4.466230646022424e-05, "loss": 1.0261, "step": 4000 }, { "epoch": 1.201281366791244, "grad_norm": 2.9547019004821777, "learning_rate": 4.3994927923117995e-05, "loss": 0.9262, "step": 4500 }, { "epoch": 1.3347570742124932, "grad_norm": 2.8099584579467773, "learning_rate": 4.332754938601175e-05, "loss": 0.9193, "step": 5000 }, { "epoch": 1.4682327816337426, "grad_norm": 2.4523322582244873, "learning_rate": 4.26601708489055e-05, "loss": 0.939, "step": 5500 }, { "epoch": 1.601708489054992, "grad_norm": 2.4100770950317383, "learning_rate": 4.199279231179926e-05, "loss": 0.9084, "step": 6000 }, { "epoch": 1.7351841964762413, "grad_norm": 2.4628851413726807, "learning_rate": 4.1325413774693004e-05, "loss": 0.9303, "step": 6500 }, { "epoch": 1.8686599038974907, "grad_norm": 2.522763967514038, "learning_rate": 4.0658035237586763e-05, "loss": 0.8988, "step": 7000 }, { "epoch": 2.00213561131874, "grad_norm": 1.8565527200698853, "learning_rate": 3.9990656700480516e-05, "loss": 0.9408, "step": 7500 }, { "epoch": 2.1356113187399894, "grad_norm": 3.084707498550415, "learning_rate": 3.932327816337427e-05, "loss": 0.7548, "step": 8000 }, { "epoch": 2.269087026161239, "grad_norm": 1.836754322052002, "learning_rate": 3.865589962626802e-05, "loss": 0.7447, "step": 8500 }, { "epoch": 2.402562733582488, "grad_norm": 2.452847719192505, "learning_rate": 3.798852108916178e-05, "loss": 0.7293, "step": 9000 }, { "epoch": 2.536038441003737, "grad_norm": 3.363875150680542, "learning_rate": 3.7321142552055525e-05, "loss": 0.7437, "step": 9500 }, { "epoch": 2.6695141484249865, "grad_norm": 2.087559700012207, "learning_rate": 3.6653764014949284e-05, "loss": 0.7217, "step": 10000 }, { "epoch": 2.802989855846236, "grad_norm": 2.1554408073425293, "learning_rate": 3.598638547784303e-05, "loss": 0.7459, "step": 10500 }, { "epoch": 2.936465563267485, "grad_norm": 1.8922470808029175, "learning_rate": 3.531900694073679e-05, "loss": 0.7586, "step": 11000 }, { "epoch": 3.0699412706887346, "grad_norm": 2.0937387943267822, "learning_rate": 3.465162840363054e-05, "loss": 0.6552, "step": 11500 }, { "epoch": 3.203416978109984, "grad_norm": 2.954742908477783, "learning_rate": 3.398424986652429e-05, "loss": 0.6012, "step": 12000 }, { "epoch": 3.3368926855312333, "grad_norm": 2.8236591815948486, "learning_rate": 3.3316871329418045e-05, "loss": 0.607, "step": 12500 }, { "epoch": 3.4703683929524827, "grad_norm": 2.4959943294525146, "learning_rate": 3.2649492792311804e-05, "loss": 0.6181, "step": 13000 }, { "epoch": 3.603844100373732, "grad_norm": 3.4247825145721436, "learning_rate": 3.198211425520555e-05, "loss": 0.6029, "step": 13500 }, { "epoch": 3.7373198077949814, "grad_norm": 2.6698663234710693, "learning_rate": 3.131473571809931e-05, "loss": 0.6032, "step": 14000 }, { "epoch": 3.8707955152162308, "grad_norm": 1.9999539852142334, "learning_rate": 3.064735718099306e-05, "loss": 0.6241, "step": 14500 }, { "epoch": 4.00427122263748, "grad_norm": 2.449364185333252, "learning_rate": 2.9979978643886814e-05, "loss": 0.6183, "step": 15000 }, { "epoch": 4.1377469300587295, "grad_norm": 4.328239917755127, "learning_rate": 2.931260010678057e-05, "loss": 0.4832, "step": 15500 }, { "epoch": 4.271222637479979, "grad_norm": 1.9375187158584595, "learning_rate": 2.8645221569674318e-05, "loss": 0.4963, "step": 16000 }, { "epoch": 4.404698344901228, "grad_norm": 2.9625084400177, "learning_rate": 2.7977843032568074e-05, "loss": 0.5162, "step": 16500 }, { "epoch": 4.538174052322478, "grad_norm": 3.514462947845459, "learning_rate": 2.731046449546183e-05, "loss": 0.4913, "step": 17000 }, { "epoch": 4.671649759743727, "grad_norm": 2.035665988922119, "learning_rate": 2.664308595835558e-05, "loss": 0.5062, "step": 17500 }, { "epoch": 4.805125467164976, "grad_norm": 3.025426149368286, "learning_rate": 2.5975707421249334e-05, "loss": 0.5167, "step": 18000 }, { "epoch": 4.938601174586225, "grad_norm": 3.0361483097076416, "learning_rate": 2.530832888414309e-05, "loss": 0.4961, "step": 18500 }, { "epoch": 5.072076882007474, "grad_norm": 2.769930124282837, "learning_rate": 2.464095034703684e-05, "loss": 0.4582, "step": 19000 }, { "epoch": 5.205552589428724, "grad_norm": 2.209397077560425, "learning_rate": 2.3973571809930594e-05, "loss": 0.3926, "step": 19500 }, { "epoch": 5.339028296849973, "grad_norm": 2.680094003677368, "learning_rate": 2.3306193272824347e-05, "loss": 0.4187, "step": 20000 }, { "epoch": 5.472504004271222, "grad_norm": 2.425154447555542, "learning_rate": 2.26388147357181e-05, "loss": 0.4182, "step": 20500 }, { "epoch": 5.605979711692472, "grad_norm": 4.719789505004883, "learning_rate": 2.197143619861185e-05, "loss": 0.4257, "step": 21000 }, { "epoch": 5.739455419113721, "grad_norm": 1.3763984441757202, "learning_rate": 2.1304057661505607e-05, "loss": 0.4222, "step": 21500 }, { "epoch": 5.87293112653497, "grad_norm": 2.8332626819610596, "learning_rate": 2.063667912439936e-05, "loss": 0.4247, "step": 22000 }, { "epoch": 6.00640683395622, "grad_norm": 1.990326166152954, "learning_rate": 1.996930058729311e-05, "loss": 0.4149, "step": 22500 }, { "epoch": 6.139882541377469, "grad_norm": 1.9501370191574097, "learning_rate": 1.9301922050186867e-05, "loss": 0.3451, "step": 23000 }, { "epoch": 6.2733582487987185, "grad_norm": 2.518463611602783, "learning_rate": 1.863454351308062e-05, "loss": 0.3338, "step": 23500 }, { "epoch": 6.406833956219968, "grad_norm": 2.567760705947876, "learning_rate": 1.7967164975974375e-05, "loss": 0.3537, "step": 24000 }, { "epoch": 6.540309663641217, "grad_norm": 3.4953105449676514, "learning_rate": 1.7299786438868128e-05, "loss": 0.3463, "step": 24500 }, { "epoch": 6.673785371062467, "grad_norm": 4.280869007110596, "learning_rate": 1.663240790176188e-05, "loss": 0.3794, "step": 25000 }, { "epoch": 6.807261078483716, "grad_norm": 1.3708416223526, "learning_rate": 1.5965029364655636e-05, "loss": 0.3595, "step": 25500 }, { "epoch": 6.940736785904965, "grad_norm": 1.9402921199798584, "learning_rate": 1.5297650827549388e-05, "loss": 0.3567, "step": 26000 }, { "epoch": 7.074212493326215, "grad_norm": 3.4459097385406494, "learning_rate": 1.463027229044314e-05, "loss": 0.3271, "step": 26500 }, { "epoch": 7.207688200747464, "grad_norm": 3.0598597526550293, "learning_rate": 1.3962893753336892e-05, "loss": 0.3053, "step": 27000 }, { "epoch": 7.3411639081687134, "grad_norm": 1.887160062789917, "learning_rate": 1.3295515216230648e-05, "loss": 0.2941, "step": 27500 }, { "epoch": 7.474639615589963, "grad_norm": 2.143535852432251, "learning_rate": 1.26281366791244e-05, "loss": 0.2981, "step": 28000 }, { "epoch": 7.608115323011212, "grad_norm": 4.371819019317627, "learning_rate": 1.1960758142018154e-05, "loss": 0.2986, "step": 28500 }, { "epoch": 7.7415910304324616, "grad_norm": 3.378239631652832, "learning_rate": 1.1293379604911907e-05, "loss": 0.305, "step": 29000 }, { "epoch": 7.875066737853711, "grad_norm": 2.8271336555480957, "learning_rate": 1.062600106780566e-05, "loss": 0.3193, "step": 29500 }, { "epoch": 8.00854244527496, "grad_norm": 2.825468063354492, "learning_rate": 9.958622530699413e-06, "loss": 0.3004, "step": 30000 }, { "epoch": 8.14201815269621, "grad_norm": 2.4525914192199707, "learning_rate": 9.291243993593167e-06, "loss": 0.2669, "step": 30500 }, { "epoch": 8.275493860117459, "grad_norm": 4.695706844329834, "learning_rate": 8.62386545648692e-06, "loss": 0.2737, "step": 31000 }, { "epoch": 8.408969567538708, "grad_norm": 3.336611270904541, "learning_rate": 7.956486919380673e-06, "loss": 0.2697, "step": 31500 }, { "epoch": 8.542445274959958, "grad_norm": 3.6071436405181885, "learning_rate": 7.289108382274426e-06, "loss": 0.2644, "step": 32000 }, { "epoch": 8.675920982381207, "grad_norm": 3.1964738368988037, "learning_rate": 6.62172984516818e-06, "loss": 0.2669, "step": 32500 }, { "epoch": 8.809396689802456, "grad_norm": 2.115790843963623, "learning_rate": 5.9543513080619334e-06, "loss": 0.2638, "step": 33000 }, { "epoch": 8.942872397223706, "grad_norm": 1.7221794128417969, "learning_rate": 5.286972770955687e-06, "loss": 0.282, "step": 33500 }, { "epoch": 9.076348104644955, "grad_norm": 2.4076969623565674, "learning_rate": 4.61959423384944e-06, "loss": 0.2601, "step": 34000 }, { "epoch": 9.209823812066205, "grad_norm": 1.874233603477478, "learning_rate": 3.952215696743193e-06, "loss": 0.2474, "step": 34500 }, { "epoch": 9.343299519487454, "grad_norm": 2.8450875282287598, "learning_rate": 3.2848371596369464e-06, "loss": 0.2405, "step": 35000 }, { "epoch": 9.476775226908703, "grad_norm": 2.4307425022125244, "learning_rate": 2.6174586225306996e-06, "loss": 0.244, "step": 35500 }, { "epoch": 9.610250934329953, "grad_norm": 1.2817405462265015, "learning_rate": 1.9500800854244527e-06, "loss": 0.2457, "step": 36000 }, { "epoch": 9.743726641751202, "grad_norm": 2.613475799560547, "learning_rate": 1.282701548318206e-06, "loss": 0.2446, "step": 36500 }, { "epoch": 9.877202349172451, "grad_norm": 1.7706152200698853, "learning_rate": 6.153230112119594e-07, "loss": 0.2464, "step": 37000 }, { "epoch": 10.0, "step": 37460, "total_flos": 1.5022851621224448e+16, "train_loss": 0.5662414055868578, "train_runtime": 9241.9444, "train_samples_per_second": 16.213, "train_steps_per_second": 4.053 } ], "logging_steps": 500, "max_steps": 37460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5022851621224448e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }