| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 78, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.038461538461538464, | |
| "grad_norm": 7.130930423736572, | |
| "learning_rate": 1.25e-06, | |
| "loss": 1.0526, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.07692307692307693, | |
| "grad_norm": 7.099226951599121, | |
| "learning_rate": 2.5e-06, | |
| "loss": 1.0528, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.11538461538461539, | |
| "grad_norm": 6.989954471588135, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 1.0596, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.15384615384615385, | |
| "grad_norm": 6.319800853729248, | |
| "learning_rate": 5e-06, | |
| "loss": 1.0272, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.19230769230769232, | |
| "grad_norm": 4.596342086791992, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.9795, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.23076923076923078, | |
| "grad_norm": 2.7473602294921875, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.94, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.2692307692307692, | |
| "grad_norm": 4.620419502258301, | |
| "learning_rate": 8.750000000000001e-06, | |
| "loss": 0.9473, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 4.805872917175293, | |
| "learning_rate": 1e-05, | |
| "loss": 0.9536, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.34615384615384615, | |
| "grad_norm": 4.337175369262695, | |
| "learning_rate": 9.994965332706574e-06, | |
| "loss": 0.8806, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "grad_norm": 4.0317535400390625, | |
| "learning_rate": 9.979871469976197e-06, | |
| "loss": 0.8694, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.4230769230769231, | |
| "grad_norm": 3.0269405841827393, | |
| "learning_rate": 9.954748808839675e-06, | |
| "loss": 0.826, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "grad_norm": 1.9066985845565796, | |
| "learning_rate": 9.91964794299315e-06, | |
| "loss": 0.7938, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.9084997177124023, | |
| "learning_rate": 9.874639560909118e-06, | |
| "loss": 0.7915, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.5384615384615384, | |
| "grad_norm": 1.7610135078430176, | |
| "learning_rate": 9.819814303479268e-06, | |
| "loss": 0.7749, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.5769230769230769, | |
| "grad_norm": 1.4025635719299316, | |
| "learning_rate": 9.755282581475769e-06, | |
| "loss": 0.7574, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 1.5101988315582275, | |
| "learning_rate": 9.681174353198687e-06, | |
| "loss": 0.7527, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.6538461538461539, | |
| "grad_norm": 1.289642572402954, | |
| "learning_rate": 9.597638862757255e-06, | |
| "loss": 0.7199, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.6923076923076923, | |
| "grad_norm": 1.2755658626556396, | |
| "learning_rate": 9.504844339512096e-06, | |
| "loss": 0.715, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.7307692307692307, | |
| "grad_norm": 1.1986231803894043, | |
| "learning_rate": 9.40297765928369e-06, | |
| "loss": 0.6988, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 1.021087408065796, | |
| "learning_rate": 9.292243968009332e-06, | |
| "loss": 0.7013, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.8076923076923077, | |
| "grad_norm": 0.9690332412719727, | |
| "learning_rate": 9.172866268606514e-06, | |
| "loss": 0.691, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.8461538461538461, | |
| "grad_norm": 1.188096046447754, | |
| "learning_rate": 9.045084971874738e-06, | |
| "loss": 0.6967, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.8846153846153846, | |
| "grad_norm": 0.9607952833175659, | |
| "learning_rate": 8.90915741234015e-06, | |
| "loss": 0.6837, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 0.9638901352882385, | |
| "learning_rate": 8.765357330018056e-06, | |
| "loss": 0.6789, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.9615384615384616, | |
| "grad_norm": 1.052692174911499, | |
| "learning_rate": 8.613974319136959e-06, | |
| "loss": 0.6895, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.6750084757804871, | |
| "learning_rate": 8.455313244934324e-06, | |
| "loss": 0.6846, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.0384615384615385, | |
| "grad_norm": 0.9357823729515076, | |
| "learning_rate": 8.289693629698564e-06, | |
| "loss": 0.6554, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.0769230769230769, | |
| "grad_norm": 1.2013366222381592, | |
| "learning_rate": 8.117449009293668e-06, | |
| "loss": 0.6586, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.1153846153846154, | |
| "grad_norm": 0.8335021138191223, | |
| "learning_rate": 7.938926261462366e-06, | |
| "loss": 0.6569, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.1538461538461537, | |
| "grad_norm": 0.703048050403595, | |
| "learning_rate": 7.754484907260513e-06, | |
| "loss": 0.65, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.1923076923076923, | |
| "grad_norm": 0.9962571263313293, | |
| "learning_rate": 7.564496387029532e-06, | |
| "loss": 0.6448, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.2307692307692308, | |
| "grad_norm": 0.810019314289093, | |
| "learning_rate": 7.369343312364994e-06, | |
| "loss": 0.6354, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.2692307692307692, | |
| "grad_norm": 0.6070151925086975, | |
| "learning_rate": 7.169418695587791e-06, | |
| "loss": 0.6459, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 0.6172223091125488, | |
| "learning_rate": 6.965125158269619e-06, | |
| "loss": 0.6281, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.3461538461538463, | |
| "grad_norm": 0.671054482460022, | |
| "learning_rate": 6.7568741204067145e-06, | |
| "loss": 0.6327, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.3846153846153846, | |
| "grad_norm": 0.5767695307731628, | |
| "learning_rate": 6.545084971874738e-06, | |
| "loss": 0.6331, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.4230769230769231, | |
| "grad_norm": 0.6138707399368286, | |
| "learning_rate": 6.330184227833376e-06, | |
| "loss": 0.6027, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.4615384615384617, | |
| "grad_norm": 0.5649499297142029, | |
| "learning_rate": 6.112604669781572e-06, | |
| "loss": 0.6184, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.638535737991333, | |
| "learning_rate": 5.892784473993184e-06, | |
| "loss": 0.6289, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.5384615384615383, | |
| "grad_norm": 0.6712307333946228, | |
| "learning_rate": 5.671166329088278e-06, | |
| "loss": 0.613, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.5769230769230769, | |
| "grad_norm": 0.6186216473579407, | |
| "learning_rate": 5.448196544517168e-06, | |
| "loss": 0.6141, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 0.4753836691379547, | |
| "learning_rate": 5.224324151752575e-06, | |
| "loss": 0.6155, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.6538461538461537, | |
| "grad_norm": 0.5167324542999268, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6169, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 1.6923076923076923, | |
| "grad_norm": 0.5835758447647095, | |
| "learning_rate": 4.775675848247427e-06, | |
| "loss": 0.6216, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.7307692307692308, | |
| "grad_norm": 0.564730167388916, | |
| "learning_rate": 4.551803455482833e-06, | |
| "loss": 0.611, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.7692307692307692, | |
| "grad_norm": 0.5620990991592407, | |
| "learning_rate": 4.3288336709117246e-06, | |
| "loss": 0.6072, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.8076923076923077, | |
| "grad_norm": 0.47912776470184326, | |
| "learning_rate": 4.107215526006818e-06, | |
| "loss": 0.5998, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.8461538461538463, | |
| "grad_norm": 0.4562658369541168, | |
| "learning_rate": 3.887395330218429e-06, | |
| "loss": 0.6289, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.8846153846153846, | |
| "grad_norm": 0.5224369764328003, | |
| "learning_rate": 3.669815772166625e-06, | |
| "loss": 0.6196, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.5150539875030518, | |
| "learning_rate": 3.4549150281252635e-06, | |
| "loss": 0.6126, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.9615384615384617, | |
| "grad_norm": 0.4940248727798462, | |
| "learning_rate": 3.2431258795932863e-06, | |
| "loss": 0.5986, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.4971993565559387, | |
| "learning_rate": 3.0348748417303826e-06, | |
| "loss": 0.6136, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 2.0384615384615383, | |
| "grad_norm": 0.5150362253189087, | |
| "learning_rate": 2.83058130441221e-06, | |
| "loss": 0.5742, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 2.076923076923077, | |
| "grad_norm": 0.45559242367744446, | |
| "learning_rate": 2.6306566876350072e-06, | |
| "loss": 0.594, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 2.1153846153846154, | |
| "grad_norm": 0.4556572437286377, | |
| "learning_rate": 2.43550361297047e-06, | |
| "loss": 0.5913, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.1538461538461537, | |
| "grad_norm": 0.504047155380249, | |
| "learning_rate": 2.245515092739488e-06, | |
| "loss": 0.5769, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.1923076923076925, | |
| "grad_norm": 0.5272749662399292, | |
| "learning_rate": 2.061073738537635e-06, | |
| "loss": 0.5771, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 2.230769230769231, | |
| "grad_norm": 0.4603540599346161, | |
| "learning_rate": 1.8825509907063328e-06, | |
| "loss": 0.5864, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 2.269230769230769, | |
| "grad_norm": 0.3963864743709564, | |
| "learning_rate": 1.7103063703014372e-06, | |
| "loss": 0.6019, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 0.43673136830329895, | |
| "learning_rate": 1.544686755065677e-06, | |
| "loss": 0.5635, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.3461538461538463, | |
| "grad_norm": 0.4128812551498413, | |
| "learning_rate": 1.3860256808630429e-06, | |
| "loss": 0.5717, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 2.3846153846153846, | |
| "grad_norm": 0.4299347698688507, | |
| "learning_rate": 1.234642669981946e-06, | |
| "loss": 0.5763, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 2.423076923076923, | |
| "grad_norm": 0.4996536374092102, | |
| "learning_rate": 1.0908425876598512e-06, | |
| "loss": 0.5778, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 2.4615384615384617, | |
| "grad_norm": 0.48068103194236755, | |
| "learning_rate": 9.549150281252633e-07, | |
| "loss": 0.5667, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.49868765473365784, | |
| "learning_rate": 8.271337313934869e-07, | |
| "loss": 0.5996, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.5384615384615383, | |
| "grad_norm": 0.3790745139122009, | |
| "learning_rate": 7.077560319906696e-07, | |
| "loss": 0.5987, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 2.5769230769230766, | |
| "grad_norm": 0.3854691982269287, | |
| "learning_rate": 5.9702234071631e-07, | |
| "loss": 0.5749, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 2.6153846153846154, | |
| "grad_norm": 0.41013067960739136, | |
| "learning_rate": 4.951556604879049e-07, | |
| "loss": 0.5616, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 2.6538461538461537, | |
| "grad_norm": 0.40195363759994507, | |
| "learning_rate": 4.0236113724274716e-07, | |
| "loss": 0.5571, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 2.6923076923076925, | |
| "grad_norm": 0.3825042247772217, | |
| "learning_rate": 3.18825646801314e-07, | |
| "loss": 0.5872, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.730769230769231, | |
| "grad_norm": 0.3998315632343292, | |
| "learning_rate": 2.447174185242324e-07, | |
| "loss": 0.5517, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 2.769230769230769, | |
| "grad_norm": 0.4044024348258972, | |
| "learning_rate": 1.801856965207338e-07, | |
| "loss": 0.577, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 2.8076923076923075, | |
| "grad_norm": 0.40848174691200256, | |
| "learning_rate": 1.253604390908819e-07, | |
| "loss": 0.5735, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 2.8461538461538463, | |
| "grad_norm": 0.3801068365573883, | |
| "learning_rate": 8.035205700685167e-08, | |
| "loss": 0.5874, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 2.8846153846153846, | |
| "grad_norm": 0.43877533078193665, | |
| "learning_rate": 4.52511911603265e-08, | |
| "loss": 0.564, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.9230769230769234, | |
| "grad_norm": 0.3853648602962494, | |
| "learning_rate": 2.012853002380466e-08, | |
| "loss": 0.5748, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 2.9615384615384617, | |
| "grad_norm": 0.3701406717300415, | |
| "learning_rate": 5.034667293427053e-09, | |
| "loss": 0.5707, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.3850381672382355, | |
| "learning_rate": 0.0, | |
| "loss": 0.5867, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 78, | |
| "total_flos": 123848585641984.0, | |
| "train_loss": 0.6756999851801456, | |
| "train_runtime": 1096.7751, | |
| "train_samples_per_second": 6.805, | |
| "train_steps_per_second": 0.071 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 78, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 123848585641984.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |