{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 269, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0186219739292365, "grad_norm": 1.397485375404358, "learning_rate": 1.7647058823529412e-06, "loss": 1.3045, "step": 5 }, { "epoch": 0.037243947858473, "grad_norm": 0.9164593815803528, "learning_rate": 3.970588235294118e-06, "loss": 1.3238, "step": 10 }, { "epoch": 0.055865921787709494, "grad_norm": 0.7285172343254089, "learning_rate": 6.176470588235294e-06, "loss": 1.3402, "step": 15 }, { "epoch": 0.074487895716946, "grad_norm": 0.7771405577659607, "learning_rate": 8.382352941176472e-06, "loss": 1.2444, "step": 20 }, { "epoch": 0.0931098696461825, "grad_norm": 0.590904951095581, "learning_rate": 1.0588235294117648e-05, "loss": 1.2908, "step": 25 }, { "epoch": 0.11173184357541899, "grad_norm": 0.68792724609375, "learning_rate": 1.2794117647058824e-05, "loss": 1.2953, "step": 30 }, { "epoch": 0.1303538175046555, "grad_norm": 0.6625350117683411, "learning_rate": 1.5e-05, "loss": 1.192, "step": 35 }, { "epoch": 0.148975791433892, "grad_norm": 0.5564359426498413, "learning_rate": 1.7205882352941175e-05, "loss": 1.1406, "step": 40 }, { "epoch": 0.16759776536312848, "grad_norm": 0.4591367542743683, "learning_rate": 1.9411764705882355e-05, "loss": 1.181, "step": 45 }, { "epoch": 0.186219739292365, "grad_norm": 0.5679731369018555, "learning_rate": 2.161764705882353e-05, "loss": 1.1935, "step": 50 }, { "epoch": 0.2048417132216015, "grad_norm": 0.4359203577041626, "learning_rate": 2.3823529411764704e-05, "loss": 1.1527, "step": 55 }, { "epoch": 0.22346368715083798, "grad_norm": 0.49340277910232544, "learning_rate": 2.6029411764705883e-05, "loss": 1.1635, "step": 60 }, { "epoch": 0.24208566108007448, "grad_norm": 0.4740353226661682, "learning_rate": 2.823529411764706e-05, "loss": 1.1883, "step": 65 }, { "epoch": 0.260707635009311, "grad_norm": 0.6028391122817993, "learning_rate": 2.9999954608033783e-05, "loss": 1.1328, "step": 70 }, { "epoch": 0.27932960893854747, "grad_norm": 0.5162932872772217, "learning_rate": 2.9998365918062082e-05, "loss": 1.0934, "step": 75 }, { "epoch": 0.297951582867784, "grad_norm": 0.5382624864578247, "learning_rate": 2.9994507904496206e-05, "loss": 1.1019, "step": 80 }, { "epoch": 0.3165735567970205, "grad_norm": 0.5565810799598694, "learning_rate": 2.998838115107183e-05, "loss": 1.0906, "step": 85 }, { "epoch": 0.33519553072625696, "grad_norm": 0.5212501287460327, "learning_rate": 2.997998658479568e-05, "loss": 1.0724, "step": 90 }, { "epoch": 0.3538175046554935, "grad_norm": 0.5489919185638428, "learning_rate": 2.9969325475805274e-05, "loss": 1.0658, "step": 95 }, { "epoch": 0.37243947858473, "grad_norm": 0.6241554021835327, "learning_rate": 2.995639943717676e-05, "loss": 1.059, "step": 100 }, { "epoch": 0.39106145251396646, "grad_norm": 0.563462495803833, "learning_rate": 2.9941210424680813e-05, "loss": 1.057, "step": 105 }, { "epoch": 0.409683426443203, "grad_norm": 0.5747496485710144, "learning_rate": 2.9923760736486766e-05, "loss": 1.0564, "step": 110 }, { "epoch": 0.42830540037243947, "grad_norm": 0.5738973021507263, "learning_rate": 2.9904053012814848e-05, "loss": 1.0316, "step": 115 }, { "epoch": 0.44692737430167595, "grad_norm": 0.6159545183181763, "learning_rate": 2.988209023553672e-05, "loss": 0.995, "step": 120 }, { "epoch": 0.4655493482309125, "grad_norm": 0.6447931528091431, "learning_rate": 2.9857875727724304e-05, "loss": 0.9692, "step": 125 }, { "epoch": 0.48417132216014896, "grad_norm": 0.7060043215751648, "learning_rate": 2.9831413153146988e-05, "loss": 0.9569, "step": 130 }, { "epoch": 0.5027932960893855, "grad_norm": 0.7400388717651367, "learning_rate": 2.9802706515717272e-05, "loss": 0.9378, "step": 135 }, { "epoch": 0.521415270018622, "grad_norm": 0.7127872705459595, "learning_rate": 2.9771760158884972e-05, "loss": 0.8923, "step": 140 }, { "epoch": 0.5400372439478585, "grad_norm": 0.8123269081115723, "learning_rate": 2.9738578764980025e-05, "loss": 0.9285, "step": 145 }, { "epoch": 0.5586592178770949, "grad_norm": 0.8044330477714539, "learning_rate": 2.9703167354504027e-05, "loss": 0.885, "step": 150 }, { "epoch": 0.5772811918063314, "grad_norm": 0.7073454856872559, "learning_rate": 2.966553128537062e-05, "loss": 0.9022, "step": 155 }, { "epoch": 0.595903165735568, "grad_norm": 0.725287914276123, "learning_rate": 2.9625676252094797e-05, "loss": 0.8863, "step": 160 }, { "epoch": 0.6145251396648045, "grad_norm": 0.843565046787262, "learning_rate": 2.9583608284931317e-05, "loss": 0.928, "step": 165 }, { "epoch": 0.633147113594041, "grad_norm": 0.8186341524124146, "learning_rate": 2.953933374896227e-05, "loss": 0.8967, "step": 170 }, { "epoch": 0.6517690875232774, "grad_norm": 0.8052655458450317, "learning_rate": 2.949285934313405e-05, "loss": 0.8766, "step": 175 }, { "epoch": 0.6703910614525139, "grad_norm": 1.142082691192627, "learning_rate": 2.9444192099243733e-05, "loss": 0.8402, "step": 180 }, { "epoch": 0.6890130353817505, "grad_norm": 0.9169373512268066, "learning_rate": 2.939333938087515e-05, "loss": 0.8426, "step": 185 }, { "epoch": 0.707635009310987, "grad_norm": 0.8522142171859741, "learning_rate": 2.9340308882284747e-05, "loss": 0.8288, "step": 190 }, { "epoch": 0.7262569832402235, "grad_norm": 0.8226320147514343, "learning_rate": 2.92851086272374e-05, "loss": 0.8252, "step": 195 }, { "epoch": 0.74487895716946, "grad_norm": 0.840149462223053, "learning_rate": 2.9227746967792392e-05, "loss": 0.7888, "step": 200 }, { "epoch": 0.7635009310986964, "grad_norm": 1.0344595909118652, "learning_rate": 2.916823258303968e-05, "loss": 0.7889, "step": 205 }, { "epoch": 0.7821229050279329, "grad_norm": 0.9850447177886963, "learning_rate": 2.9106574477786748e-05, "loss": 0.7634, "step": 210 }, { "epoch": 0.8007448789571695, "grad_norm": 0.9077547192573547, "learning_rate": 2.9042781981196095e-05, "loss": 0.7372, "step": 215 }, { "epoch": 0.819366852886406, "grad_norm": 0.8616479635238647, "learning_rate": 2.897686474537373e-05, "loss": 0.7238, "step": 220 }, { "epoch": 0.8379888268156425, "grad_norm": 0.9395949244499207, "learning_rate": 2.890883274390872e-05, "loss": 0.6952, "step": 225 }, { "epoch": 0.8566108007448789, "grad_norm": 0.9580796957015991, "learning_rate": 2.8838696270364183e-05, "loss": 0.6983, "step": 230 }, { "epoch": 0.8752327746741154, "grad_norm": 0.8633882403373718, "learning_rate": 2.8766465936719785e-05, "loss": 0.7479, "step": 235 }, { "epoch": 0.8938547486033519, "grad_norm": 1.0677725076675415, "learning_rate": 2.869215267176612e-05, "loss": 0.7132, "step": 240 }, { "epoch": 0.9124767225325885, "grad_norm": 1.0318955183029175, "learning_rate": 2.8615767719451125e-05, "loss": 0.6744, "step": 245 }, { "epoch": 0.931098696461825, "grad_norm": 0.8933286070823669, "learning_rate": 2.8537322637178816e-05, "loss": 0.705, "step": 250 }, { "epoch": 0.9497206703910615, "grad_norm": 0.8829292058944702, "learning_rate": 2.8456829294060608e-05, "loss": 0.7091, "step": 255 }, { "epoch": 0.9683426443202979, "grad_norm": 1.1101434230804443, "learning_rate": 2.837429986911944e-05, "loss": 0.6532, "step": 260 }, { "epoch": 0.9869646182495344, "grad_norm": 1.070557713508606, "learning_rate": 2.828974684944707e-05, "loss": 0.6493, "step": 265 } ], "logging_steps": 5, "max_steps": 1345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.85525873404543e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }