{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 6250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 24.225746154785156,
      "learning_rate": 1e-05,
      "loss": 10.557,
      "step": 100
    },
    {
      "epoch": 0.16,
      "grad_norm": 24.75214385986328,
      "learning_rate": 2e-05,
      "loss": 9.3749,
      "step": 200
    },
    {
      "epoch": 0.24,
      "grad_norm": 21.598390579223633,
      "learning_rate": 3e-05,
      "loss": 8.3939,
      "step": 300
    },
    {
      "epoch": 0.32,
      "grad_norm": 12.049657821655273,
      "learning_rate": 4e-05,
      "loss": 4.1354,
      "step": 400
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.240299224853516,
      "learning_rate": 5e-05,
      "loss": 0.9051,
      "step": 500
    },
    {
      "epoch": 0.48,
      "grad_norm": 6.658221244812012,
      "learning_rate": 4.91304347826087e-05,
      "loss": 0.737,
      "step": 600
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.5896286964416504,
      "learning_rate": 4.8260869565217394e-05,
      "loss": 0.6799,
      "step": 700
    },
    {
      "epoch": 0.64,
      "grad_norm": 6.020174026489258,
      "learning_rate": 4.739130434782609e-05,
      "loss": 0.6532,
      "step": 800
    },
    {
      "epoch": 0.72,
      "grad_norm": 3.4233405590057373,
      "learning_rate": 4.6521739130434785e-05,
      "loss": 0.6223,
      "step": 900
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.2131102085113525,
      "learning_rate": 4.565217391304348e-05,
      "loss": 0.5963,
      "step": 1000
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.3392815589904785,
      "learning_rate": 4.478260869565218e-05,
      "loss": 0.6014,
      "step": 1100
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.9733258485794067,
      "learning_rate": 4.391304347826087e-05,
      "loss": 0.5765,
      "step": 1200
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.700756788253784,
      "learning_rate": 4.304347826086957e-05,
      "loss": 0.554,
      "step": 1300
    },
    {
      "epoch": 1.12,
      "grad_norm": 5.853904724121094,
      "learning_rate": 4.2173913043478264e-05,
      "loss": 0.5718,
      "step": 1400
    },
    {
      "epoch": 1.2,
      "grad_norm": 4.578104496002197,
      "learning_rate": 4.130434782608696e-05,
      "loss": 0.5471,
      "step": 1500
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.733497977256775,
      "learning_rate": 4.0434782608695655e-05,
      "loss": 0.5577,
      "step": 1600
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 2.4556689262390137,
      "learning_rate": 3.956521739130435e-05,
      "loss": 0.5367,
      "step": 1700
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4540470838546753,
      "learning_rate": 3.869565217391305e-05,
      "loss": 0.5277,
      "step": 1800
    },
    {
      "epoch": 1.52,
      "grad_norm": 2.833214521408081,
      "learning_rate": 3.7826086956521736e-05,
      "loss": 0.5237,
      "step": 1900
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5153348445892334,
      "learning_rate": 3.695652173913043e-05,
      "loss": 0.5183,
      "step": 2000
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 1.163548231124878,
      "learning_rate": 3.6086956521739134e-05,
      "loss": 0.4998,
      "step": 2100
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.9056810736656189,
      "learning_rate": 3.521739130434783e-05,
      "loss": 0.5205,
      "step": 2200
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 2.93011736869812,
      "learning_rate": 3.4347826086956526e-05,
      "loss": 0.5067,
      "step": 2300
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.668942928314209,
      "learning_rate": 3.347826086956522e-05,
      "loss": 0.5001,
      "step": 2400
    },
    {
      "epoch": 2.0,
      "grad_norm": 8.79544448852539,
      "learning_rate": 3.260869565217392e-05,
      "loss": 0.4944,
      "step": 2500
    },
    {
      "epoch": 2.08,
      "grad_norm": 2.793776035308838,
      "learning_rate": 3.173913043478261e-05,
      "loss": 0.462,
      "step": 2600
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.993105173110962,
      "learning_rate": 3.086956521739131e-05,
      "loss": 0.4893,
      "step": 2700
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.373683214187622,
      "learning_rate": 3e-05,
      "loss": 0.4884,
      "step": 2800
    },
    {
      "epoch": 2.32,
      "grad_norm": 2.5384674072265625,
      "learning_rate": 2.9130434782608696e-05,
      "loss": 0.4839,
      "step": 2900
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.294622778892517,
      "learning_rate": 2.826086956521739e-05,
      "loss": 0.4824,
      "step": 3000
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.2775664329528809,
      "learning_rate": 2.7391304347826085e-05,
      "loss": 0.4675,
      "step": 3100
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.2705273628234863,
      "learning_rate": 2.6521739130434787e-05,
      "loss": 0.4785,
      "step": 3200
    },
    {
      "epoch": 2.64,
      "grad_norm": 3.2393271923065186,
      "learning_rate": 2.5652173913043483e-05,
      "loss": 0.4728,
      "step": 3300
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 1.82649827003479,
      "learning_rate": 2.4782608695652175e-05,
      "loss": 0.474,
      "step": 3400
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.2423534393310547,
      "learning_rate": 2.391304347826087e-05,
      "loss": 0.4651,
      "step": 3500
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.142115831375122,
      "learning_rate": 2.3043478260869567e-05,
      "loss": 0.4687,
      "step": 3600
    },
    {
      "epoch": 2.96,
      "grad_norm": 2.1958296298980713,
      "learning_rate": 2.2173913043478262e-05,
      "loss": 0.456,
      "step": 3700
    },
    {
      "epoch": 3.04,
      "grad_norm": 4.805281162261963,
      "learning_rate": 2.1304347826086958e-05,
      "loss": 0.4439,
      "step": 3800
    },
    {
      "epoch": 3.12,
      "grad_norm": 23.097047805786133,
      "learning_rate": 2.0434782608695654e-05,
      "loss": 0.4393,
      "step": 3900
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.215237617492676,
      "learning_rate": 1.956521739130435e-05,
      "loss": 0.4443,
      "step": 4000
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 1.1977019309997559,
      "learning_rate": 1.8695652173913045e-05,
      "loss": 0.4333,
      "step": 4100
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.6901699304580688,
      "learning_rate": 1.782608695652174e-05,
      "loss": 0.4678,
      "step": 4200
    },
    {
      "epoch": 3.44,
      "grad_norm": 2.7421112060546875,
      "learning_rate": 1.6956521739130433e-05,
      "loss": 0.4529,
      "step": 4300
    },
    {
      "epoch": 3.52,
      "grad_norm": 1.5662778615951538,
      "learning_rate": 1.608695652173913e-05,
      "loss": 0.4399,
      "step": 4400
    },
    {
      "epoch": 3.6,
      "grad_norm": 2.1081831455230713,
      "learning_rate": 1.5217391304347828e-05,
      "loss": 0.4531,
      "step": 4500
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.103431224822998,
      "learning_rate": 1.4347826086956522e-05,
      "loss": 0.4401,
      "step": 4600
    },
    {
      "epoch": 3.76,
      "grad_norm": 1.3851810693740845,
      "learning_rate": 1.3478260869565218e-05,
      "loss": 0.4401,
      "step": 4700
    },
    {
      "epoch": 3.84,
      "grad_norm": 1.0307343006134033,
      "learning_rate": 1.2608695652173914e-05,
      "loss": 0.4424,
      "step": 4800
    },
    {
      "epoch": 3.92,
      "grad_norm": 1.1726175546646118,
      "learning_rate": 1.173913043478261e-05,
      "loss": 0.4459,
      "step": 4900
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.542671799659729,
      "learning_rate": 1.0869565217391305e-05,
      "loss": 0.4389,
      "step": 5000
    },
    {
      "epoch": 4.08,
      "grad_norm": 1.4532389640808105,
      "learning_rate": 1e-05,
      "loss": 0.4361,
      "step": 5100
    },
    {
      "epoch": 4.16,
      "grad_norm": 3.246967077255249,
      "learning_rate": 9.130434782608697e-06,
      "loss": 0.4452,
      "step": 5200
    },
    {
      "epoch": 4.24,
      "grad_norm": 0.9646230936050415,
      "learning_rate": 8.26086956521739e-06,
      "loss": 0.4239,
      "step": 5300
    },
    {
      "epoch": 4.32,
      "grad_norm": 1.250227451324463,
      "learning_rate": 7.391304347826088e-06,
      "loss": 0.4218,
      "step": 5400
    },
    {
      "epoch": 4.4,
      "grad_norm": 1.2563761472702026,
      "learning_rate": 6.521739130434783e-06,
      "loss": 0.431,
      "step": 5500
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.9907436370849609,
      "learning_rate": 5.652173913043479e-06,
      "loss": 0.4297,
      "step": 5600
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 1.0406742095947266,
      "learning_rate": 4.782608695652174e-06,
      "loss": 0.4403,
      "step": 5700
    },
    {
      "epoch": 4.64,
      "grad_norm": 1.2739229202270508,
      "learning_rate": 3.91304347826087e-06,
      "loss": 0.438,
      "step": 5800
    },
    {
      "epoch": 4.72,
      "grad_norm": 1.569171667098999,
      "learning_rate": 3.0434782608695654e-06,
      "loss": 0.4199,
      "step": 5900
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.1578859090805054,
      "learning_rate": 2.173913043478261e-06,
      "loss": 0.4202,
      "step": 6000
    },
    {
      "epoch": 4.88,
      "grad_norm": 1.2546783685684204,
      "learning_rate": 1.3043478260869564e-06,
      "loss": 0.4451,
      "step": 6100
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.965720534324646,
      "learning_rate": 4.347826086956522e-07,
      "loss": 0.4295,
      "step": 6200
    }
  ],
  "logging_steps": 100,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.80591525888e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}