| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.979591836734694, |
| "eval_steps": 500, |
| "global_step": 438, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.034013605442176874, |
| "grad_norm": 28.75, |
| "learning_rate": 1.2405214087609793e-05, |
| "loss": 3.4643, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.06802721088435375, |
| "grad_norm": 38.25, |
| "learning_rate": 2.791173169712203e-05, |
| "loss": 4.2336, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10204081632653061, |
| "grad_norm": 32.0, |
| "learning_rate": 4.341824930663428e-05, |
| "loss": 2.0587, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.1360544217687075, |
| "grad_norm": 0.84765625, |
| "learning_rate": 5.8924766916146515e-05, |
| "loss": 1.0773, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.17006802721088435, |
| "grad_norm": 42.75, |
| "learning_rate": 7.443128452565876e-05, |
| "loss": 0.6339, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.20408163265306123, |
| "grad_norm": 0.1826171875, |
| "learning_rate": 8.9937802135171e-05, |
| "loss": 0.3862, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 72.5, |
| "learning_rate": 0.00010544431974468325, |
| "loss": 2.4785, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.272108843537415, |
| "grad_norm": 35.75, |
| "learning_rate": 0.0001085248274824069, |
| "loss": 1.4148, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.30612244897959184, |
| "grad_norm": 74.5, |
| "learning_rate": 0.0001084403787497468, |
| "loss": 0.8793, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.3401360544217687, |
| "grad_norm": 35.5, |
| "learning_rate": 0.00010829110362886751, |
| "loss": 0.4985, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.3741496598639456, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.00010807722553920126, |
| "loss": 0.382, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "grad_norm": 17.5, |
| "learning_rate": 0.00010779906459116616, |
| "loss": 0.6253, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4421768707482993, |
| "grad_norm": 16.375, |
| "learning_rate": 0.00010745703710705828, |
| "loss": 0.3236, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 55.75, |
| "learning_rate": 0.00010705165499794393, |
| "loss": 0.3981, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5102040816326531, |
| "grad_norm": 25.625, |
| "learning_rate": 0.00010658352499748452, |
| "loss": 0.6889, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.54421768707483, |
| "grad_norm": 235.0, |
| "learning_rate": 0.00010605334775384088, |
| "loss": 1.3471, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5782312925170068, |
| "grad_norm": 6.34375, |
| "learning_rate": 0.00010546191678101621, |
| "loss": 0.7067, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6122448979591837, |
| "grad_norm": 33.0, |
| "learning_rate": 0.00010481011727120708, |
| "loss": 0.2755, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6462585034013606, |
| "grad_norm": 29.875, |
| "learning_rate": 0.00010409892476994003, |
| "loss": 0.376, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.6802721088435374, |
| "grad_norm": 20.5, |
| "learning_rate": 0.0001033294037159768, |
| "loss": 0.5199, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 9.0625, |
| "learning_rate": 0.00010250270584817341, |
| "loss": 0.4744, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.7482993197278912, |
| "grad_norm": 49.25, |
| "learning_rate": 0.0001016200684816775, |
| "loss": 0.3467, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.782312925170068, |
| "grad_norm": 54.75, |
| "learning_rate": 0.00010068281265604425, |
| "loss": 0.4323, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "grad_norm": 25.75, |
| "learning_rate": 9.969234115804185e-05, |
| "loss": 0.5117, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.8503401360544217, |
| "grad_norm": 27.375, |
| "learning_rate": 9.865013642210685e-05, |
| "loss": 0.3461, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.8843537414965986, |
| "grad_norm": 27.0, |
| "learning_rate": 9.755775831159075e-05, |
| "loss": 0.3838, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.9183673469387755, |
| "grad_norm": 22.0, |
| "learning_rate": 9.641684178411933e-05, |
| "loss": 0.3031, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 15.25, |
| "learning_rate": 9.522909444455842e-05, |
| "loss": 0.4872, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.9863945578231292, |
| "grad_norm": 8.375, |
| "learning_rate": 9.399629398924927e-05, |
| "loss": 0.2626, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.9931972789115646, |
| "eval_loss": 0.21089103817939758, |
| "eval_runtime": 1.2585, |
| "eval_samples_per_second": 20.66, |
| "eval_steps_per_second": 20.66, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "grad_norm": 23.75, |
| "learning_rate": 9.272028554533782e-05, |
| "loss": 0.1617, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.054421768707483, |
| "grad_norm": 3.421875, |
| "learning_rate": 9.140297890918105e-05, |
| "loss": 0.2647, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.08843537414966, |
| "grad_norm": 7.5625, |
| "learning_rate": 9.004634568796285e-05, |
| "loss": 0.4115, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.1224489795918366, |
| "grad_norm": 14.1875, |
| "learning_rate": 8.865241634879804e-05, |
| "loss": 0.2492, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.1564625850340136, |
| "grad_norm": 9.1875, |
| "learning_rate": 8.722327717974095e-05, |
| "loss": 0.3327, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 9.4375, |
| "learning_rate": 8.57610671672471e-05, |
| "loss": 0.4109, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.2244897959183674, |
| "grad_norm": 20.25, |
| "learning_rate": 8.426797479476129e-05, |
| "loss": 0.4426, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.2585034013605443, |
| "grad_norm": 66.0, |
| "learning_rate": 8.27462347672239e-05, |
| "loss": 0.3577, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.2925170068027212, |
| "grad_norm": 4.75, |
| "learning_rate": 8.119812466639757e-05, |
| "loss": 0.201, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.3265306122448979, |
| "grad_norm": 5.84375, |
| "learning_rate": 7.962596154202029e-05, |
| "loss": 0.1453, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.3605442176870748, |
| "grad_norm": 8.4375, |
| "learning_rate": 7.80320984438872e-05, |
| "loss": 0.072, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.3945578231292517, |
| "grad_norm": 10.75, |
| "learning_rate": 7.641892090005088e-05, |
| "loss": 0.0673, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 20.5, |
| "learning_rate": 7.478884334641178e-05, |
| "loss": 0.2137, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.4625850340136055, |
| "grad_norm": 13.875, |
| "learning_rate": 7.314430551304253e-05, |
| "loss": 0.1786, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.4965986394557822, |
| "grad_norm": 7.65625, |
| "learning_rate": 7.148776877265426e-05, |
| "loss": 0.0821, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.5306122448979593, |
| "grad_norm": 16.25, |
| "learning_rate": 6.982171245667071e-05, |
| "loss": 0.1904, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.564625850340136, |
| "grad_norm": 4.46875, |
| "learning_rate": 6.81486301444235e-05, |
| "loss": 0.0449, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.598639455782313, |
| "grad_norm": 14.5625, |
| "learning_rate": 6.64710259310227e-05, |
| "loss": 0.1493, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.6326530612244898, |
| "grad_norm": 25.75, |
| "learning_rate": 6.479141067948843e-05, |
| "loss": 0.1877, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 12.75, |
| "learning_rate": 6.311229826275292e-05, |
| "loss": 0.1844, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.7006802721088436, |
| "grad_norm": 13.5, |
| "learning_rate": 6.143620180115768e-05, |
| "loss": 0.2923, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.7346938775510203, |
| "grad_norm": 2.15625, |
| "learning_rate": 5.9765629901077215e-05, |
| "loss": 0.059, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.7687074829931972, |
| "grad_norm": 0.26953125, |
| "learning_rate": 5.8103082900298425e-05, |
| "loss": 0.0561, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.8027210884353742, |
| "grad_norm": 8.875, |
| "learning_rate": 5.645104912577601e-05, |
| "loss": 0.1103, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.836734693877551, |
| "grad_norm": 8.4375, |
| "learning_rate": 5.481200116936402e-05, |
| "loss": 0.1003, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.870748299319728, |
| "grad_norm": 6.0, |
| "learning_rate": 5.31883921870983e-05, |
| "loss": 0.0788, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 1.6171875, |
| "learning_rate": 5.158265222756847e-05, |
| "loss": 0.0819, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.9387755102040818, |
| "grad_norm": 14.1875, |
| "learning_rate": 4.999718459487458e-05, |
| "loss": 0.0352, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.9727891156462585, |
| "grad_norm": 0.60546875, |
| "learning_rate": 4.843436225161211e-05, |
| "loss": 0.1354, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.9863945578231292, |
| "eval_loss": 0.17950406670570374, |
| "eval_runtime": 1.156, |
| "eval_samples_per_second": 22.491, |
| "eval_steps_per_second": 22.491, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.006802721088435, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.689652426726917e-05, |
| "loss": 0.0795, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 4.5385972317351206e-05, |
| "loss": 0.1034, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.074829931972789, |
| "grad_norm": 0.060791015625, |
| "learning_rate": 4.3904967238473124e-05, |
| "loss": 0.0467, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.108843537414966, |
| "grad_norm": 4.5625, |
| "learning_rate": 4.2455725644574884e-05, |
| "loss": 0.0525, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.08984375, |
| "learning_rate": 4.1040416609324844e-05, |
| "loss": 0.0122, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.17687074829932, |
| "grad_norm": 17.25, |
| "learning_rate": 3.966115841967671e-05, |
| "loss": 0.1311, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.2108843537414966, |
| "grad_norm": 3.390625, |
| "learning_rate": 3.832001540543833e-05, |
| "loss": 0.1173, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.2448979591836733, |
| "grad_norm": 4.59375, |
| "learning_rate": 3.701899484959829e-05, |
| "loss": 0.0816, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.2789115646258504, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 3.5760043984034015e-05, |
| "loss": 0.0235, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.312925170068027, |
| "grad_norm": 12.375, |
| "learning_rate": 3.454504707509821e-05, |
| "loss": 0.0807, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.3469387755102042, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 3.337582260344549e-05, |
| "loss": 0.0603, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 3.225412054232022e-05, |
| "loss": 0.0107, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.4149659863945576, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 3.118161973837903e-05, |
| "loss": 0.003, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.4489795918367347, |
| "grad_norm": 0.55078125, |
| "learning_rate": 3.0159925398968314e-05, |
| "loss": 0.0248, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.4829931972789114, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.9190566689617188e-05, |
| "loss": 0.0651, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.5170068027210886, |
| "grad_norm": 0.0218505859375, |
| "learning_rate": 2.8274994445342093e-05, |
| "loss": 0.0021, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.5510204081632653, |
| "grad_norm": 0.037353515625, |
| "learning_rate": 2.741457899918822e-05, |
| "loss": 0.0111, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.5850340136054424, |
| "grad_norm": 54.75, |
| "learning_rate": 2.6610608131257937e-05, |
| "loss": 0.0358, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 0.078125, |
| "learning_rate": 2.5864285141295854e-05, |
| "loss": 0.0004, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.6530612244897958, |
| "grad_norm": 0.005767822265625, |
| "learning_rate": 2.517672704771522e-05, |
| "loss": 0.0003, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.687074829931973, |
| "grad_norm": 2.015625, |
| "learning_rate": 2.4548962915761334e-05, |
| "loss": 0.0006, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.7210884353741496, |
| "grad_norm": 13.875, |
| "learning_rate": 2.3981932317313933e-05, |
| "loss": 0.0117, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.7551020408163263, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 2.347648392463406e-05, |
| "loss": 0.0074, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.7891156462585034, |
| "grad_norm": 11.4375, |
| "learning_rate": 2.303337424015989e-05, |
| "loss": 0.0291, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.8231292517006805, |
| "grad_norm": 18.0, |
| "learning_rate": 2.2653266464252818e-05, |
| "loss": 0.0245, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 7.25, |
| "learning_rate": 2.2336729502588305e-05, |
| "loss": 0.0082, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.891156462585034, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 2.2084237114677194e-05, |
| "loss": 0.0224, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.925170068027211, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 2.18961672047919e-05, |
| "loss": 0.0002, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.9591836734693877, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 2.1772801256358705e-05, |
| "loss": 0.0299, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.979591836734694, |
| "eval_loss": 0.00994242262095213, |
| "eval_runtime": 1.1603, |
| "eval_samples_per_second": 22.408, |
| "eval_steps_per_second": 22.408, |
| "step": 438 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 441, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.330567101743104e+16, |
| "train_batch_size": 3, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|