{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 37460,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13347570742124934,
      "grad_norm": 3.1733596324920654,
      "learning_rate": 4.933395621996797e-05,
      "loss": 1.4561,
      "step": 500
    },
    {
      "epoch": 0.2669514148424987,
      "grad_norm": 2.9853200912475586,
      "learning_rate": 4.866657768286172e-05,
      "loss": 1.2371,
      "step": 1000
    },
    {
      "epoch": 0.400427122263748,
      "grad_norm": 2.6459007263183594,
      "learning_rate": 4.7999199145755475e-05,
      "loss": 1.1388,
      "step": 1500
    },
    {
      "epoch": 0.5339028296849974,
      "grad_norm": 2.6050009727478027,
      "learning_rate": 4.733182060864923e-05,
      "loss": 1.1141,
      "step": 2000
    },
    {
      "epoch": 0.6673785371062466,
      "grad_norm": 2.1738457679748535,
      "learning_rate": 4.666444207154298e-05,
      "loss": 1.086,
      "step": 2500
    },
    {
      "epoch": 0.800854244527496,
      "grad_norm": 2.692978620529175,
      "learning_rate": 4.599706353443674e-05,
      "loss": 1.0676,
      "step": 3000
    },
    {
      "epoch": 0.9343299519487454,
      "grad_norm": 2.072795867919922,
      "learning_rate": 4.532968499733049e-05,
      "loss": 0.9895,
      "step": 3500
    },
    {
      "epoch": 1.0678056593699947,
      "grad_norm": 2.18379282951355,
      "learning_rate": 4.466230646022424e-05,
      "loss": 0.9155,
      "step": 4000
    },
    {
      "epoch": 1.201281366791244,
      "grad_norm": 2.4531776905059814,
      "learning_rate": 4.3994927923117995e-05,
      "loss": 0.8136,
      "step": 4500
    },
    {
      "epoch": 1.3347570742124932,
      "grad_norm": 2.8264808654785156,
      "learning_rate": 4.332754938601175e-05,
      "loss": 0.8129,
      "step": 5000
    },
    {
      "epoch": 1.4682327816337426,
      "grad_norm": 1.746168851852417,
      "learning_rate": 4.26601708489055e-05,
      "loss": 0.8581,
      "step": 5500
    },
    {
      "epoch": 1.601708489054992,
      "grad_norm": 2.722280740737915,
      "learning_rate": 4.199279231179926e-05,
      "loss": 0.7905,
      "step": 6000
    },
    {
      "epoch": 1.7351841964762413,
      "grad_norm": 2.4873287677764893,
      "learning_rate": 4.1325413774693004e-05,
      "loss": 0.8327,
      "step": 6500
    },
    {
      "epoch": 1.8686599038974907,
      "grad_norm": 2.6175665855407715,
      "learning_rate": 4.0658035237586763e-05,
      "loss": 0.8191,
      "step": 7000
    },
    {
      "epoch": 2.00213561131874,
      "grad_norm": 1.2910939455032349,
      "learning_rate": 3.9990656700480516e-05,
      "loss": 0.8359,
      "step": 7500
    },
    {
      "epoch": 2.1356113187399894,
      "grad_norm": 2.5565571784973145,
      "learning_rate": 3.932327816337427e-05,
      "loss": 0.6788,
      "step": 8000
    },
    {
      "epoch": 2.269087026161239,
      "grad_norm": 2.173668146133423,
      "learning_rate": 3.865589962626802e-05,
      "loss": 0.6669,
      "step": 8500
    },
    {
      "epoch": 2.402562733582488,
      "grad_norm": 1.9133882522583008,
      "learning_rate": 3.798852108916178e-05,
      "loss": 0.6696,
      "step": 9000
    },
    {
      "epoch": 2.536038441003737,
      "grad_norm": 1.3470282554626465,
      "learning_rate": 3.7321142552055525e-05,
      "loss": 0.6803,
      "step": 9500
    },
    {
      "epoch": 2.6695141484249865,
      "grad_norm": 2.3730781078338623,
      "learning_rate": 3.6653764014949284e-05,
      "loss": 0.6684,
      "step": 10000
    },
    {
      "epoch": 2.802989855846236,
      "grad_norm": 2.106994390487671,
      "learning_rate": 3.598638547784303e-05,
      "loss": 0.6717,
      "step": 10500
    },
    {
      "epoch": 2.936465563267485,
      "grad_norm": 1.7302494049072266,
      "learning_rate": 3.531900694073679e-05,
      "loss": 0.676,
      "step": 11000
    },
    {
      "epoch": 3.0699412706887346,
      "grad_norm": 1.477286458015442,
      "learning_rate": 3.465162840363054e-05,
      "loss": 0.586,
      "step": 11500
    },
    {
      "epoch": 3.203416978109984,
      "grad_norm": 1.818613052368164,
      "learning_rate": 3.398424986652429e-05,
      "loss": 0.5467,
      "step": 12000
    },
    {
      "epoch": 3.3368926855312333,
      "grad_norm": 1.6314208507537842,
      "learning_rate": 3.3316871329418045e-05,
      "loss": 0.556,
      "step": 12500
    },
    {
      "epoch": 3.4703683929524827,
      "grad_norm": 2.8924617767333984,
      "learning_rate": 3.2649492792311804e-05,
      "loss": 0.5567,
      "step": 13000
    },
    {
      "epoch": 3.603844100373732,
      "grad_norm": 2.6945688724517822,
      "learning_rate": 3.198211425520555e-05,
      "loss": 0.5568,
      "step": 13500
    },
    {
      "epoch": 3.7373198077949814,
      "grad_norm": 2.092221736907959,
      "learning_rate": 3.131473571809931e-05,
      "loss": 0.5567,
      "step": 14000
    },
    {
      "epoch": 3.8707955152162308,
      "grad_norm": 1.6795735359191895,
      "learning_rate": 3.064735718099306e-05,
      "loss": 0.5764,
      "step": 14500
    },
    {
      "epoch": 4.00427122263748,
      "grad_norm": 2.4606454372406006,
      "learning_rate": 2.9979978643886814e-05,
      "loss": 0.5716,
      "step": 15000
    },
    {
      "epoch": 4.1377469300587295,
      "grad_norm": 4.759591102600098,
      "learning_rate": 2.931260010678057e-05,
      "loss": 0.4671,
      "step": 15500
    },
    {
      "epoch": 4.271222637479979,
      "grad_norm": 1.4791502952575684,
      "learning_rate": 2.8645221569674318e-05,
      "loss": 0.4719,
      "step": 16000
    },
    {
      "epoch": 4.404698344901228,
      "grad_norm": 1.2884821891784668,
      "learning_rate": 2.7977843032568074e-05,
      "loss": 0.465,
      "step": 16500
    },
    {
      "epoch": 4.538174052322478,
      "grad_norm": 3.4914660453796387,
      "learning_rate": 2.731046449546183e-05,
      "loss": 0.4572,
      "step": 17000
    },
    {
      "epoch": 4.671649759743727,
      "grad_norm": 1.9152294397354126,
      "learning_rate": 2.664308595835558e-05,
      "loss": 0.4685,
      "step": 17500
    },
    {
      "epoch": 4.805125467164976,
      "grad_norm": 2.193741798400879,
      "learning_rate": 2.5975707421249334e-05,
      "loss": 0.4751,
      "step": 18000
    },
    {
      "epoch": 4.938601174586225,
      "grad_norm": 1.8435180187225342,
      "learning_rate": 2.530832888414309e-05,
      "loss": 0.4671,
      "step": 18500
    },
    {
      "epoch": 5.072076882007474,
      "grad_norm": 2.184936761856079,
      "learning_rate": 2.464095034703684e-05,
      "loss": 0.4342,
      "step": 19000
    },
    {
      "epoch": 5.205552589428724,
      "grad_norm": 1.9590941667556763,
      "learning_rate": 2.3973571809930594e-05,
      "loss": 0.3865,
      "step": 19500
    },
    {
      "epoch": 5.339028296849973,
      "grad_norm": 2.4979445934295654,
      "learning_rate": 2.3306193272824347e-05,
      "loss": 0.3983,
      "step": 20000
    },
    {
      "epoch": 5.472504004271222,
      "grad_norm": 2.703019142150879,
      "learning_rate": 2.26388147357181e-05,
      "loss": 0.3951,
      "step": 20500
    },
    {
      "epoch": 5.605979711692472,
      "grad_norm": 2.801893711090088,
      "learning_rate": 2.197143619861185e-05,
      "loss": 0.3996,
      "step": 21000
    },
    {
      "epoch": 5.739455419113721,
      "grad_norm": 1.1886452436447144,
      "learning_rate": 2.1304057661505607e-05,
      "loss": 0.4042,
      "step": 21500
    },
    {
      "epoch": 5.87293112653497,
      "grad_norm": 2.024179458618164,
      "learning_rate": 2.063667912439936e-05,
      "loss": 0.4081,
      "step": 22000
    },
    {
      "epoch": 6.00640683395622,
      "grad_norm": 1.3413245677947998,
      "learning_rate": 1.996930058729311e-05,
      "loss": 0.3917,
      "step": 22500
    },
    {
      "epoch": 6.139882541377469,
      "grad_norm": 1.4281545877456665,
      "learning_rate": 1.9301922050186867e-05,
      "loss": 0.34,
      "step": 23000
    },
    {
      "epoch": 6.2733582487987185,
      "grad_norm": 2.500412702560425,
      "learning_rate": 1.863454351308062e-05,
      "loss": 0.3327,
      "step": 23500
    },
    {
      "epoch": 6.406833956219968,
      "grad_norm": 2.666672945022583,
      "learning_rate": 1.7967164975974375e-05,
      "loss": 0.3399,
      "step": 24000
    },
    {
      "epoch": 6.540309663641217,
      "grad_norm": 2.352971076965332,
      "learning_rate": 1.7299786438868128e-05,
      "loss": 0.331,
      "step": 24500
    },
    {
      "epoch": 6.673785371062467,
      "grad_norm": 3.2518150806427,
      "learning_rate": 1.663240790176188e-05,
      "loss": 0.3576,
      "step": 25000
    },
    {
      "epoch": 6.807261078483716,
      "grad_norm": 1.363599181175232,
      "learning_rate": 1.5965029364655636e-05,
      "loss": 0.3572,
      "step": 25500
    },
    {
      "epoch": 6.940736785904965,
      "grad_norm": 2.377072811126709,
      "learning_rate": 1.5297650827549388e-05,
      "loss": 0.3538,
      "step": 26000
    },
    {
      "epoch": 7.074212493326215,
      "grad_norm": 2.4149889945983887,
      "learning_rate": 1.463027229044314e-05,
      "loss": 0.32,
      "step": 26500
    },
    {
      "epoch": 7.207688200747464,
      "grad_norm": 2.2141239643096924,
      "learning_rate": 1.3962893753336892e-05,
      "loss": 0.2894,
      "step": 27000
    },
    {
      "epoch": 7.3411639081687134,
      "grad_norm": 1.6364160776138306,
      "learning_rate": 1.3295515216230648e-05,
      "loss": 0.2914,
      "step": 27500
    },
    {
      "epoch": 7.474639615589963,
      "grad_norm": 2.363274097442627,
      "learning_rate": 1.26281366791244e-05,
      "loss": 0.2963,
      "step": 28000
    },
    {
      "epoch": 7.608115323011212,
      "grad_norm": 3.750777006149292,
      "learning_rate": 1.1960758142018154e-05,
      "loss": 0.299,
      "step": 28500
    },
    {
      "epoch": 7.7415910304324616,
      "grad_norm": 3.3002822399139404,
      "learning_rate": 1.1293379604911907e-05,
      "loss": 0.2956,
      "step": 29000
    },
    {
      "epoch": 7.875066737853711,
      "grad_norm": 3.4351253509521484,
      "learning_rate": 1.062600106780566e-05,
      "loss": 0.3213,
      "step": 29500
    },
    {
      "epoch": 8.00854244527496,
      "grad_norm": 2.195547103881836,
      "learning_rate": 9.958622530699413e-06,
      "loss": 0.2988,
      "step": 30000
    },
    {
      "epoch": 8.14201815269621,
      "grad_norm": 2.4596967697143555,
      "learning_rate": 9.291243993593167e-06,
      "loss": 0.2702,
      "step": 30500
    },
    {
      "epoch": 8.275493860117459,
      "grad_norm": 3.5012736320495605,
      "learning_rate": 8.62386545648692e-06,
      "loss": 0.2609,
      "step": 31000
    },
    {
      "epoch": 8.408969567538708,
      "grad_norm": 1.7438409328460693,
      "learning_rate": 7.956486919380673e-06,
      "loss": 0.2605,
      "step": 31500
    },
    {
      "epoch": 8.542445274959958,
      "grad_norm": 2.4345345497131348,
      "learning_rate": 7.289108382274426e-06,
      "loss": 0.2611,
      "step": 32000
    },
    {
      "epoch": 8.675920982381207,
      "grad_norm": 3.1137731075286865,
      "learning_rate": 6.62172984516818e-06,
      "loss": 0.2688,
      "step": 32500
    },
    {
      "epoch": 8.809396689802456,
      "grad_norm": 1.2621135711669922,
      "learning_rate": 5.9543513080619334e-06,
      "loss": 0.2732,
      "step": 33000
    },
    {
      "epoch": 8.942872397223706,
      "grad_norm": 1.6983531713485718,
      "learning_rate": 5.286972770955687e-06,
      "loss": 0.2805,
      "step": 33500
    },
    {
      "epoch": 9.076348104644955,
      "grad_norm": 1.4236127138137817,
      "learning_rate": 4.61959423384944e-06,
      "loss": 0.2668,
      "step": 34000
    },
    {
      "epoch": 9.209823812066205,
      "grad_norm": 2.3351385593414307,
      "learning_rate": 3.952215696743193e-06,
      "loss": 0.2437,
      "step": 34500
    },
    {
      "epoch": 9.343299519487454,
      "grad_norm": 2.2198843955993652,
      "learning_rate": 3.2848371596369464e-06,
      "loss": 0.2459,
      "step": 35000
    },
    {
      "epoch": 9.476775226908703,
      "grad_norm": 2.087069272994995,
      "learning_rate": 2.6174586225306996e-06,
      "loss": 0.2415,
      "step": 35500
    },
    {
      "epoch": 9.610250934329953,
      "grad_norm": 1.8616083860397339,
      "learning_rate": 1.9500800854244527e-06,
      "loss": 0.2453,
      "step": 36000
    },
    {
      "epoch": 9.743726641751202,
      "grad_norm": 1.3939635753631592,
      "learning_rate": 1.282701548318206e-06,
      "loss": 0.2369,
      "step": 36500
    },
    {
      "epoch": 9.877202349172451,
      "grad_norm": 2.0491538047790527,
      "learning_rate": 6.153230112119594e-07,
      "loss": 0.2489,
      "step": 37000
    },
    {
      "epoch": 10.0,
      "step": 37460,
      "total_flos": 1.2944564446789632e+16,
      "train_loss": 0.5224577584498334,
      "train_runtime": 9349.5182,
      "train_samples_per_second": 16.026,
      "train_steps_per_second": 4.007
    }
  ],
  "logging_steps": 500,
  "max_steps": 37460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2944564446789632e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}