{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.979591836734694, "eval_steps": 500, "global_step": 438, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034013605442176874, "grad_norm": 28.75, "learning_rate": 1.2405214087609793e-05, "loss": 3.4643, "step": 5 }, { "epoch": 0.06802721088435375, "grad_norm": 38.25, "learning_rate": 2.791173169712203e-05, "loss": 4.2336, "step": 10 }, { "epoch": 0.10204081632653061, "grad_norm": 32.0, "learning_rate": 4.341824930663428e-05, "loss": 2.0587, "step": 15 }, { "epoch": 0.1360544217687075, "grad_norm": 0.84765625, "learning_rate": 5.8924766916146515e-05, "loss": 1.0773, "step": 20 }, { "epoch": 0.17006802721088435, "grad_norm": 42.75, "learning_rate": 7.443128452565876e-05, "loss": 0.6339, "step": 25 }, { "epoch": 0.20408163265306123, "grad_norm": 0.1826171875, "learning_rate": 8.9937802135171e-05, "loss": 0.3862, "step": 30 }, { "epoch": 0.23809523809523808, "grad_norm": 72.5, "learning_rate": 0.00010544431974468325, "loss": 2.4785, "step": 35 }, { "epoch": 0.272108843537415, "grad_norm": 35.75, "learning_rate": 0.0001085248274824069, "loss": 1.4148, "step": 40 }, { "epoch": 0.30612244897959184, "grad_norm": 74.5, "learning_rate": 0.0001084403787497468, "loss": 0.8793, "step": 45 }, { "epoch": 0.3401360544217687, "grad_norm": 35.5, "learning_rate": 0.00010829110362886751, "loss": 0.4985, "step": 50 }, { "epoch": 0.3741496598639456, "grad_norm": 2.328125, "learning_rate": 0.00010807722553920126, "loss": 0.382, "step": 55 }, { "epoch": 0.40816326530612246, "grad_norm": 17.5, "learning_rate": 0.00010779906459116616, "loss": 0.6253, "step": 60 }, { "epoch": 0.4421768707482993, "grad_norm": 16.375, "learning_rate": 0.00010745703710705828, "loss": 0.3236, "step": 65 }, { "epoch": 0.47619047619047616, "grad_norm": 55.75, "learning_rate": 0.00010705165499794393, "loss": 0.3981, "step": 70 }, { "epoch": 0.5102040816326531, "grad_norm": 25.625, "learning_rate": 0.00010658352499748452, "loss": 0.6889, "step": 75 }, { "epoch": 0.54421768707483, "grad_norm": 235.0, "learning_rate": 0.00010605334775384088, "loss": 1.3471, "step": 80 }, { "epoch": 0.5782312925170068, "grad_norm": 6.34375, "learning_rate": 0.00010546191678101621, "loss": 0.7067, "step": 85 }, { "epoch": 0.6122448979591837, "grad_norm": 33.0, "learning_rate": 0.00010481011727120708, "loss": 0.2755, "step": 90 }, { "epoch": 0.6462585034013606, "grad_norm": 29.875, "learning_rate": 0.00010409892476994003, "loss": 0.376, "step": 95 }, { "epoch": 0.6802721088435374, "grad_norm": 20.5, "learning_rate": 0.0001033294037159768, "loss": 0.5199, "step": 100 }, { "epoch": 0.7142857142857143, "grad_norm": 9.0625, "learning_rate": 0.00010250270584817341, "loss": 0.4744, "step": 105 }, { "epoch": 0.7482993197278912, "grad_norm": 49.25, "learning_rate": 0.0001016200684816775, "loss": 0.3467, "step": 110 }, { "epoch": 0.782312925170068, "grad_norm": 54.75, "learning_rate": 0.00010068281265604425, "loss": 0.4323, "step": 115 }, { "epoch": 0.8163265306122449, "grad_norm": 25.75, "learning_rate": 9.969234115804185e-05, "loss": 0.5117, "step": 120 }, { "epoch": 0.8503401360544217, "grad_norm": 27.375, "learning_rate": 9.865013642210685e-05, "loss": 0.3461, "step": 125 }, { "epoch": 0.8843537414965986, "grad_norm": 27.0, "learning_rate": 9.755775831159075e-05, "loss": 0.3838, "step": 130 }, { "epoch": 0.9183673469387755, "grad_norm": 22.0, "learning_rate": 9.641684178411933e-05, "loss": 0.3031, "step": 135 }, { "epoch": 0.9523809523809523, "grad_norm": 15.25, "learning_rate": 9.522909444455842e-05, "loss": 0.4872, "step": 140 }, { "epoch": 0.9863945578231292, "grad_norm": 8.375, "learning_rate": 9.399629398924927e-05, "loss": 0.2626, "step": 145 }, { "epoch": 0.9931972789115646, "eval_loss": 0.21089103817939758, "eval_runtime": 1.2585, "eval_samples_per_second": 20.66, "eval_steps_per_second": 20.66, "step": 146 }, { "epoch": 1.0204081632653061, "grad_norm": 23.75, "learning_rate": 9.272028554533782e-05, "loss": 0.1617, "step": 150 }, { "epoch": 1.054421768707483, "grad_norm": 3.421875, "learning_rate": 9.140297890918105e-05, "loss": 0.2647, "step": 155 }, { "epoch": 1.08843537414966, "grad_norm": 7.5625, "learning_rate": 9.004634568796285e-05, "loss": 0.4115, "step": 160 }, { "epoch": 1.1224489795918366, "grad_norm": 14.1875, "learning_rate": 8.865241634879804e-05, "loss": 0.2492, "step": 165 }, { "epoch": 1.1564625850340136, "grad_norm": 9.1875, "learning_rate": 8.722327717974095e-05, "loss": 0.3327, "step": 170 }, { "epoch": 1.1904761904761905, "grad_norm": 9.4375, "learning_rate": 8.57610671672471e-05, "loss": 0.4109, "step": 175 }, { "epoch": 1.2244897959183674, "grad_norm": 20.25, "learning_rate": 8.426797479476129e-05, "loss": 0.4426, "step": 180 }, { "epoch": 1.2585034013605443, "grad_norm": 66.0, "learning_rate": 8.27462347672239e-05, "loss": 0.3577, "step": 185 }, { "epoch": 1.2925170068027212, "grad_norm": 4.75, "learning_rate": 8.119812466639757e-05, "loss": 0.201, "step": 190 }, { "epoch": 1.3265306122448979, "grad_norm": 5.84375, "learning_rate": 7.962596154202029e-05, "loss": 0.1453, "step": 195 }, { "epoch": 1.3605442176870748, "grad_norm": 8.4375, "learning_rate": 7.80320984438872e-05, "loss": 0.072, "step": 200 }, { "epoch": 1.3945578231292517, "grad_norm": 10.75, "learning_rate": 7.641892090005088e-05, "loss": 0.0673, "step": 205 }, { "epoch": 1.4285714285714286, "grad_norm": 20.5, "learning_rate": 7.478884334641178e-05, "loss": 0.2137, "step": 210 }, { "epoch": 1.4625850340136055, "grad_norm": 13.875, "learning_rate": 7.314430551304253e-05, "loss": 0.1786, "step": 215 }, { "epoch": 1.4965986394557822, "grad_norm": 7.65625, "learning_rate": 7.148776877265426e-05, "loss": 0.0821, "step": 220 }, { "epoch": 1.5306122448979593, "grad_norm": 16.25, "learning_rate": 6.982171245667071e-05, "loss": 0.1904, "step": 225 }, { "epoch": 1.564625850340136, "grad_norm": 4.46875, "learning_rate": 6.81486301444235e-05, "loss": 0.0449, "step": 230 }, { "epoch": 1.598639455782313, "grad_norm": 14.5625, "learning_rate": 6.64710259310227e-05, "loss": 0.1493, "step": 235 }, { "epoch": 1.6326530612244898, "grad_norm": 25.75, "learning_rate": 6.479141067948843e-05, "loss": 0.1877, "step": 240 }, { "epoch": 1.6666666666666665, "grad_norm": 12.75, "learning_rate": 6.311229826275292e-05, "loss": 0.1844, "step": 245 }, { "epoch": 1.7006802721088436, "grad_norm": 13.5, "learning_rate": 6.143620180115768e-05, "loss": 0.2923, "step": 250 }, { "epoch": 1.7346938775510203, "grad_norm": 2.15625, "learning_rate": 5.9765629901077215e-05, "loss": 0.059, "step": 255 }, { "epoch": 1.7687074829931972, "grad_norm": 0.26953125, "learning_rate": 5.8103082900298425e-05, "loss": 0.0561, "step": 260 }, { "epoch": 1.8027210884353742, "grad_norm": 8.875, "learning_rate": 5.645104912577601e-05, "loss": 0.1103, "step": 265 }, { "epoch": 1.836734693877551, "grad_norm": 8.4375, "learning_rate": 5.481200116936402e-05, "loss": 0.1003, "step": 270 }, { "epoch": 1.870748299319728, "grad_norm": 6.0, "learning_rate": 5.31883921870983e-05, "loss": 0.0788, "step": 275 }, { "epoch": 1.9047619047619047, "grad_norm": 1.6171875, "learning_rate": 5.158265222756847e-05, "loss": 0.0819, "step": 280 }, { "epoch": 1.9387755102040818, "grad_norm": 14.1875, "learning_rate": 4.999718459487458e-05, "loss": 0.0352, "step": 285 }, { "epoch": 1.9727891156462585, "grad_norm": 0.60546875, "learning_rate": 4.843436225161211e-05, "loss": 0.1354, "step": 290 }, { "epoch": 1.9863945578231292, "eval_loss": 0.17950406670570374, "eval_runtime": 1.156, "eval_samples_per_second": 22.491, "eval_steps_per_second": 22.491, "step": 292 }, { "epoch": 2.006802721088435, "grad_norm": 1.0703125, "learning_rate": 4.689652426726917e-05, "loss": 0.0795, "step": 295 }, { "epoch": 2.0408163265306123, "grad_norm": 0.0264892578125, "learning_rate": 4.5385972317351206e-05, "loss": 0.1034, "step": 300 }, { "epoch": 2.074829931972789, "grad_norm": 0.060791015625, "learning_rate": 4.3904967238473124e-05, "loss": 0.0467, "step": 305 }, { "epoch": 2.108843537414966, "grad_norm": 4.5625, "learning_rate": 4.2455725644574884e-05, "loss": 0.0525, "step": 310 }, { "epoch": 2.142857142857143, "grad_norm": 0.08984375, "learning_rate": 4.1040416609324844e-05, "loss": 0.0122, "step": 315 }, { "epoch": 2.17687074829932, "grad_norm": 17.25, "learning_rate": 3.966115841967671e-05, "loss": 0.1311, "step": 320 }, { "epoch": 2.2108843537414966, "grad_norm": 3.390625, "learning_rate": 3.832001540543833e-05, "loss": 0.1173, "step": 325 }, { "epoch": 2.2448979591836733, "grad_norm": 4.59375, "learning_rate": 3.701899484959829e-05, "loss": 0.0816, "step": 330 }, { "epoch": 2.2789115646258504, "grad_norm": 0.06591796875, "learning_rate": 3.5760043984034015e-05, "loss": 0.0235, "step": 335 }, { "epoch": 2.312925170068027, "grad_norm": 12.375, "learning_rate": 3.454504707509821e-05, "loss": 0.0807, "step": 340 }, { "epoch": 2.3469387755102042, "grad_norm": 0.016357421875, "learning_rate": 3.337582260344549e-05, "loss": 0.0603, "step": 345 }, { "epoch": 2.380952380952381, "grad_norm": 0.06884765625, "learning_rate": 3.225412054232022e-05, "loss": 0.0107, "step": 350 }, { "epoch": 2.4149659863945576, "grad_norm": 0.07080078125, "learning_rate": 3.118161973837903e-05, "loss": 0.003, "step": 355 }, { "epoch": 2.4489795918367347, "grad_norm": 0.55078125, "learning_rate": 3.0159925398968314e-05, "loss": 0.0248, "step": 360 }, { "epoch": 2.4829931972789114, "grad_norm": 1.015625, "learning_rate": 2.9190566689617188e-05, "loss": 0.0651, "step": 365 }, { "epoch": 2.5170068027210886, "grad_norm": 0.0218505859375, "learning_rate": 2.8274994445342093e-05, "loss": 0.0021, "step": 370 }, { "epoch": 2.5510204081632653, "grad_norm": 0.037353515625, "learning_rate": 2.741457899918822e-05, "loss": 0.0111, "step": 375 }, { "epoch": 2.5850340136054424, "grad_norm": 54.75, "learning_rate": 2.6610608131257937e-05, "loss": 0.0358, "step": 380 }, { "epoch": 2.619047619047619, "grad_norm": 0.078125, "learning_rate": 2.5864285141295854e-05, "loss": 0.0004, "step": 385 }, { "epoch": 2.6530612244897958, "grad_norm": 0.005767822265625, "learning_rate": 2.517672704771522e-05, "loss": 0.0003, "step": 390 }, { "epoch": 2.687074829931973, "grad_norm": 2.015625, "learning_rate": 2.4548962915761334e-05, "loss": 0.0006, "step": 395 }, { "epoch": 2.7210884353741496, "grad_norm": 13.875, "learning_rate": 2.3981932317313933e-05, "loss": 0.0117, "step": 400 }, { "epoch": 2.7551020408163263, "grad_norm": 0.2216796875, "learning_rate": 2.347648392463406e-05, "loss": 0.0074, "step": 405 }, { "epoch": 2.7891156462585034, "grad_norm": 11.4375, "learning_rate": 2.303337424015989e-05, "loss": 0.0291, "step": 410 }, { "epoch": 2.8231292517006805, "grad_norm": 18.0, "learning_rate": 2.2653266464252818e-05, "loss": 0.0245, "step": 415 }, { "epoch": 2.857142857142857, "grad_norm": 7.25, "learning_rate": 2.2336729502588305e-05, "loss": 0.0082, "step": 420 }, { "epoch": 2.891156462585034, "grad_norm": 0.09716796875, "learning_rate": 2.2084237114677194e-05, "loss": 0.0224, "step": 425 }, { "epoch": 2.925170068027211, "grad_norm": 0.012451171875, "learning_rate": 2.18961672047919e-05, "loss": 0.0002, "step": 430 }, { "epoch": 2.9591836734693877, "grad_norm": 0.0341796875, "learning_rate": 2.1772801256358705e-05, "loss": 0.0299, "step": 435 }, { "epoch": 2.979591836734694, "eval_loss": 0.00994242262095213, "eval_runtime": 1.1603, "eval_samples_per_second": 22.408, "eval_steps_per_second": 22.408, "step": 438 } ], "logging_steps": 5, "max_steps": 441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.330567101743104e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }