{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9857231533209188, "eval_steps": 30, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024829298572315334, "grad_norm": 5.036550998687744, "learning_rate": 2.2222222222222223e-05, "loss": 2.4126, "step": 10 }, { "epoch": 0.04965859714463067, "grad_norm": 0.6553554534912109, "learning_rate": 4.691358024691358e-05, "loss": 0.8516, "step": 20 }, { "epoch": 0.074487895716946, "grad_norm": 0.7089270353317261, "learning_rate": 7.160493827160494e-05, "loss": 0.5451, "step": 30 }, { "epoch": 0.074487895716946, "eval_loss": 0.46651971340179443, "eval_runtime": 40.7643, "eval_samples_per_second": 4.17, "eval_steps_per_second": 2.085, "step": 30 }, { "epoch": 0.09931719428926133, "grad_norm": 0.5849066376686096, "learning_rate": 9.62962962962963e-05, "loss": 0.3903, "step": 40 }, { "epoch": 0.12414649286157665, "grad_norm": 0.6945697069168091, "learning_rate": 0.00012098765432098766, "loss": 0.3218, "step": 50 }, { "epoch": 0.148975791433892, "grad_norm": 0.43555283546447754, "learning_rate": 0.00014567901234567902, "loss": 0.2579, "step": 60 }, { "epoch": 0.148975791433892, "eval_loss": 0.2548002004623413, "eval_runtime": 40.1887, "eval_samples_per_second": 4.23, "eval_steps_per_second": 2.115, "step": 60 }, { "epoch": 0.17380509000620734, "grad_norm": 0.4026840627193451, "learning_rate": 0.00017037037037037037, "loss": 0.23, "step": 70 }, { "epoch": 0.19863438857852267, "grad_norm": 0.635771632194519, "learning_rate": 0.00019506172839506175, "loss": 0.2108, "step": 80 }, { "epoch": 0.22346368715083798, "grad_norm": 0.43689030408859253, "learning_rate": 0.0001999399199592735, "loss": 0.1878, "step": 90 }, { "epoch": 0.22346368715083798, "eval_loss": 0.186412051320076, "eval_runtime": 40.2208, "eval_samples_per_second": 4.227, "eval_steps_per_second": 2.113, "step": 90 }, { "epoch": 0.2482929857231533, "grad_norm": 0.3414112627506256, "learning_rate": 0.00019969596851644327, "loss": 0.178, "step": 100 }, { "epoch": 0.27312228429546864, "grad_norm": 0.47279804944992065, "learning_rate": 0.00019926484830975113, "loss": 0.1595, "step": 110 }, { "epoch": 0.297951582867784, "grad_norm": 0.2666519582271576, "learning_rate": 0.0001986473687223383, "loss": 0.159, "step": 120 }, { "epoch": 0.297951582867784, "eval_loss": 0.152946338057518, "eval_runtime": 40.3051, "eval_samples_per_second": 4.218, "eval_steps_per_second": 2.109, "step": 120 }, { "epoch": 0.3227808814400993, "grad_norm": 0.2780194580554962, "learning_rate": 0.00019784468900761095, "loss": 0.1491, "step": 130 }, { "epoch": 0.34761018001241467, "grad_norm": 0.5531139969825745, "learning_rate": 0.0001968583161128631, "loss": 0.1523, "step": 140 }, { "epoch": 0.37243947858473, "grad_norm": 0.3274007737636566, "learning_rate": 0.00019569010185014062, "loss": 0.1447, "step": 150 }, { "epoch": 0.37243947858473, "eval_loss": 0.1445446014404297, "eval_runtime": 40.255, "eval_samples_per_second": 4.223, "eval_steps_per_second": 2.112, "step": 150 }, { "epoch": 0.39726877715704534, "grad_norm": 0.2487361580133438, "learning_rate": 0.00019434223941965738, "loss": 0.1509, "step": 160 }, { "epoch": 0.42209807572936064, "grad_norm": 0.5522840023040771, "learning_rate": 0.00019281725929229127, "loss": 0.1433, "step": 170 }, { "epoch": 0.44692737430167595, "grad_norm": 0.1760244369506836, "learning_rate": 0.00019111802445888936, "loss": 0.1434, "step": 180 }, { "epoch": 0.44692737430167595, "eval_loss": 0.13944680988788605, "eval_runtime": 40.2246, "eval_samples_per_second": 4.226, "eval_steps_per_second": 2.113, "step": 180 }, { "epoch": 0.4717566728739913, "grad_norm": 0.2646051347255707, "learning_rate": 0.00018924772505530174, "loss": 0.1366, "step": 190 }, { "epoch": 0.4965859714463066, "grad_norm": 0.3032621741294861, "learning_rate": 0.000187209872373235, "loss": 0.1359, "step": 200 }, { "epoch": 0.521415270018622, "grad_norm": 0.5465778112411499, "learning_rate": 0.00018500829226816853, "loss": 0.143, "step": 210 }, { "epoch": 0.521415270018622, "eval_loss": 0.13419194519519806, "eval_runtime": 40.1269, "eval_samples_per_second": 4.237, "eval_steps_per_second": 2.118, "step": 210 }, { "epoch": 0.5462445685909373, "grad_norm": 0.1879195123910904, "learning_rate": 0.0001826471179767111, "loss": 0.1364, "step": 220 }, { "epoch": 0.5710738671632526, "grad_norm": 0.19969278573989868, "learning_rate": 0.0001801307823568806, "loss": 0.1407, "step": 230 }, { "epoch": 0.595903165735568, "grad_norm": 0.15893523395061493, "learning_rate": 0.00017746400956587653, "loss": 0.1397, "step": 240 }, { "epoch": 0.595903165735568, "eval_loss": 0.13155966997146606, "eval_runtime": 40.1893, "eval_samples_per_second": 4.23, "eval_steps_per_second": 2.115, "step": 240 }, { "epoch": 0.6207324643078833, "grad_norm": 0.1424490511417389, "learning_rate": 0.00017465180619096832, "loss": 0.1337, "step": 250 }, { "epoch": 0.6455617628801986, "grad_norm": 0.13029974699020386, "learning_rate": 0.00017169945185015106, "loss": 0.1376, "step": 260 }, { "epoch": 0.6703910614525139, "grad_norm": 0.13530579209327698, "learning_rate": 0.00016861248928021411, "loss": 0.129, "step": 270 }, { "epoch": 0.6703910614525139, "eval_loss": 0.12780845165252686, "eval_runtime": 40.2498, "eval_samples_per_second": 4.224, "eval_steps_per_second": 2.112, "step": 270 }, { "epoch": 0.6952203600248293, "grad_norm": 0.2645304501056671, "learning_rate": 0.00016539671393083215, "loss": 0.1246, "step": 280 }, { "epoch": 0.7200496585971446, "grad_norm": 0.15367014706134796, "learning_rate": 0.00016205816308421386, "loss": 0.1273, "step": 290 }, { "epoch": 0.74487895716946, "grad_norm": 0.2134842574596405, "learning_rate": 0.0001586031045207354, "loss": 0.1361, "step": 300 }, { "epoch": 0.74487895716946, "eval_loss": 0.1297762393951416, "eval_runtime": 40.2241, "eval_samples_per_second": 4.226, "eval_steps_per_second": 2.113, "step": 300 }, { "epoch": 0.7697082557417753, "grad_norm": 0.13907591998577118, "learning_rate": 0.00015503802475183773, "loss": 0.14, "step": 310 }, { "epoch": 0.7945375543140907, "grad_norm": 0.10886333137750626, "learning_rate": 0.00015136961684227904, "loss": 0.1351, "step": 320 }, { "epoch": 0.819366852886406, "grad_norm": 0.1071273609995842, "learning_rate": 0.00014760476784460514, "loss": 0.1288, "step": 330 }, { "epoch": 0.819366852886406, "eval_loss": 0.1265447735786438, "eval_runtime": 40.2437, "eval_samples_per_second": 4.224, "eval_steps_per_second": 2.112, "step": 330 }, { "epoch": 0.8441961514587213, "grad_norm": 0.13940832018852234, "learning_rate": 0.0001437505458694277, "loss": 0.1331, "step": 340 }, { "epoch": 0.8690254500310366, "grad_norm": 0.12029105424880981, "learning_rate": 0.00013981418681578546, "loss": 0.1297, "step": 350 }, { "epoch": 0.8938547486033519, "grad_norm": 0.09277268499135971, "learning_rate": 0.0001358030807864995, "loss": 0.1259, "step": 360 }, { "epoch": 0.8938547486033519, "eval_loss": 0.12379591166973114, "eval_runtime": 40.2294, "eval_samples_per_second": 4.226, "eval_steps_per_second": 2.113, "step": 360 }, { "epoch": 0.9186840471756673, "grad_norm": 0.172864630818367, "learning_rate": 0.00013172475821402748, "loss": 0.1301, "step": 370 }, { "epoch": 0.9435133457479826, "grad_norm": 0.10042418539524078, "learning_rate": 0.00012758687572286367, "loss": 0.1271, "step": 380 }, { "epoch": 0.9683426443202979, "grad_norm": 0.09972112625837326, "learning_rate": 0.00012339720175502642, "loss": 0.1352, "step": 390 }, { "epoch": 0.9683426443202979, "eval_loss": 0.12407374382019043, "eval_runtime": 40.2691, "eval_samples_per_second": 4.222, "eval_steps_per_second": 2.111, "step": 390 }, { "epoch": 0.9931719428926132, "grad_norm": 0.10841402411460876, "learning_rate": 0.0001191636019856198, "loss": 0.1254, "step": 400 }, { "epoch": 1.0173805090006207, "grad_norm": 0.14438092708587646, "learning_rate": 0.00011489402455585076, "loss": 0.1321, "step": 410 }, { "epoch": 1.042209807572936, "grad_norm": 0.10602834075689316, "learning_rate": 0.00011059648515122424, "loss": 0.1211, "step": 420 }, { "epoch": 1.042209807572936, "eval_loss": 0.12403523921966553, "eval_runtime": 40.5744, "eval_samples_per_second": 4.19, "eval_steps_per_second": 2.095, "step": 420 }, { "epoch": 1.0670391061452513, "grad_norm": 0.10185902565717697, "learning_rate": 0.00010627905195293135, "loss": 0.1237, "step": 430 }, { "epoch": 1.0918684047175666, "grad_norm": 0.09817427396774292, "learning_rate": 0.00010194983049068212, "loss": 0.1138, "step": 440 }, { "epoch": 1.1166977032898822, "grad_norm": 0.14020408689975739, "learning_rate": 9.76169484254204e-05, "loss": 0.118, "step": 450 }, { "epoch": 1.1166977032898822, "eval_loss": 0.12372539937496185, "eval_runtime": 40.514, "eval_samples_per_second": 4.196, "eval_steps_per_second": 2.098, "step": 450 }, { "epoch": 1.1415270018621975, "grad_norm": 0.09354697167873383, "learning_rate": 9.328854029048984e-05, "loss": 0.1241, "step": 460 }, { "epoch": 1.1663563004345128, "grad_norm": 0.10786397010087967, "learning_rate": 8.897273221989714e-05, "loss": 0.1254, "step": 470 }, { "epoch": 1.191185599006828, "grad_norm": 0.08708823472261429, "learning_rate": 8.467762669234495e-05, "loss": 0.1214, "step": 480 }, { "epoch": 1.191185599006828, "eval_loss": 0.12238769233226776, "eval_runtime": 40.5104, "eval_samples_per_second": 4.196, "eval_steps_per_second": 2.098, "step": 480 }, { "epoch": 1.2160148975791434, "grad_norm": 0.11436637490987778, "learning_rate": 8.041128731967444e-05, "loss": 0.1278, "step": 490 }, { "epoch": 1.2408441961514587, "grad_norm": 0.10331734269857407, "learning_rate": 7.61817237082768e-05, "loss": 0.1242, "step": 500 }, { "epoch": 1.265673494723774, "grad_norm": 0.09123562276363373, "learning_rate": 7.199687642189387e-05, "loss": 0.1191, "step": 510 }, { "epoch": 1.265673494723774, "eval_loss": 0.1218603253364563, "eval_runtime": 40.5038, "eval_samples_per_second": 4.197, "eval_steps_per_second": 2.099, "step": 510 }, { "epoch": 1.2905027932960893, "grad_norm": 0.08754425495862961, "learning_rate": 6.786460207403978e-05, "loss": 0.12, "step": 520 }, { "epoch": 1.3153320918684046, "grad_norm": 0.08568098396062851, "learning_rate": 6.379265857802969e-05, "loss": 0.1205, "step": 530 }, { "epoch": 1.34016139044072, "grad_norm": 0.09707140177488327, "learning_rate": 5.9788690582308404e-05, "loss": 0.1277, "step": 540 }, { "epoch": 1.34016139044072, "eval_loss": 0.1208547055721283, "eval_runtime": 40.5912, "eval_samples_per_second": 4.188, "eval_steps_per_second": 2.094, "step": 540 }, { "epoch": 1.3649906890130354, "grad_norm": 0.11941556632518768, "learning_rate": 5.586021511842136e-05, "loss": 0.1143, "step": 550 }, { "epoch": 1.3898199875853507, "grad_norm": 0.11442070454359055, "learning_rate": 5.201460748857369e-05, "loss": 0.1215, "step": 560 }, { "epoch": 1.414649286157666, "grad_norm": 0.09243914484977722, "learning_rate": 4.8259087419270756e-05, "loss": 0.124, "step": 570 }, { "epoch": 1.414649286157666, "eval_loss": 0.12113272398710251, "eval_runtime": 40.5079, "eval_samples_per_second": 4.197, "eval_steps_per_second": 2.098, "step": 570 }, { "epoch": 1.4394785847299814, "grad_norm": 0.09338078647851944, "learning_rate": 4.460070550703612e-05, "loss": 0.1248, "step": 580 }, { "epoch": 1.4643078833022967, "grad_norm": 0.0838402733206749, "learning_rate": 4.1046329981653086e-05, "loss": 0.1174, "step": 590 }, { "epoch": 1.489137181874612, "grad_norm": 0.09681010991334915, "learning_rate": 3.7602633811781166e-05, "loss": 0.1204, "step": 600 }, { "epoch": 1.489137181874612, "eval_loss": 0.12003795057535172, "eval_runtime": 40.5066, "eval_samples_per_second": 4.197, "eval_steps_per_second": 2.098, "step": 600 }, { "epoch": 1.5139664804469275, "grad_norm": 0.10349903255701065, "learning_rate": 3.4276082177154535e-05, "loss": 0.1254, "step": 610 }, { "epoch": 1.5387957790192428, "grad_norm": 0.07736501842737198, "learning_rate": 3.1072920330882647e-05, "loss": 0.1207, "step": 620 }, { "epoch": 1.563625077591558, "grad_norm": 0.09067176282405853, "learning_rate": 2.7999161874640022e-05, "loss": 0.1286, "step": 630 }, { "epoch": 1.563625077591558, "eval_loss": 0.11949945241212845, "eval_runtime": 40.5526, "eval_samples_per_second": 4.192, "eval_steps_per_second": 2.096, "step": 630 }, { "epoch": 1.5884543761638734, "grad_norm": 0.10042094439268112, "learning_rate": 2.506057746875753e-05, "loss": 0.1194, "step": 640 }, { "epoch": 1.6132836747361887, "grad_norm": 0.0972597673535347, "learning_rate": 2.226268399841055e-05, "loss": 0.1212, "step": 650 }, { "epoch": 1.638112973308504, "grad_norm": 0.09647821635007858, "learning_rate": 1.9610734216243522e-05, "loss": 0.1095, "step": 660 }, { "epoch": 1.638112973308504, "eval_loss": 0.11894174665212631, "eval_runtime": 40.5349, "eval_samples_per_second": 4.194, "eval_steps_per_second": 2.097, "step": 660 }, { "epoch": 1.6629422718808193, "grad_norm": 0.06773627549409866, "learning_rate": 1.710970688087561e-05, "loss": 0.1194, "step": 670 }, { "epoch": 1.6877715704531346, "grad_norm": 0.12803693115711212, "learning_rate": 1.4764297409801764e-05, "loss": 0.1175, "step": 680 }, { "epoch": 1.71260086902545, "grad_norm": 0.10929796099662781, "learning_rate": 1.2578909064236889e-05, "loss": 0.1222, "step": 690 }, { "epoch": 1.71260086902545, "eval_loss": 0.11896785348653793, "eval_runtime": 40.5323, "eval_samples_per_second": 4.194, "eval_steps_per_second": 2.097, "step": 690 }, { "epoch": 1.7374301675977653, "grad_norm": 0.06872426718473434, "learning_rate": 1.0557644682453039e-05, "loss": 0.1246, "step": 700 }, { "epoch": 1.7622594661700806, "grad_norm": 0.10362172871828079, "learning_rate": 8.70429897712921e-06, "loss": 0.1165, "step": 710 }, { "epoch": 1.7870887647423959, "grad_norm": 0.0892619714140892, "learning_rate": 7.022351411174866e-06, "loss": 0.1292, "step": 720 }, { "epoch": 1.7870887647423959, "eval_loss": 0.11873549222946167, "eval_runtime": 40.5842, "eval_samples_per_second": 4.189, "eval_steps_per_second": 2.094, "step": 720 }, { "epoch": 1.8119180633147114, "grad_norm": 0.10651155561208725, "learning_rate": 5.51495966540182e-06, "loss": 0.1182, "step": 730 }, { "epoch": 1.8367473618870267, "grad_norm": 0.09011874347925186, "learning_rate": 4.1849537103084925e-06, "loss": 0.1221, "step": 740 }, { "epoch": 1.861576660459342, "grad_norm": 0.09894613921642303, "learning_rate": 3.034830493105956e-06, "loss": 0.1149, "step": 750 }, { "epoch": 1.861576660459342, "eval_loss": 0.11866023391485214, "eval_runtime": 40.5331, "eval_samples_per_second": 4.194, "eval_steps_per_second": 2.097, "step": 750 }, { "epoch": 1.8864059590316573, "grad_norm": 0.10858767479658127, "learning_rate": 2.066749249960498e-06, "loss": 0.1212, "step": 760 }, { "epoch": 1.9112352576039728, "grad_norm": 0.07641536742448807, "learning_rate": 1.2825274522532792e-06, "loss": 0.1107, "step": 770 }, { "epoch": 1.9360645561762881, "grad_norm": 0.11793581396341324, "learning_rate": 6.836373944677954e-07, "loss": 0.1144, "step": 780 }, { "epoch": 1.9360645561762881, "eval_loss": 0.1186189278960228, "eval_runtime": 40.4999, "eval_samples_per_second": 4.198, "eval_steps_per_second": 2.099, "step": 780 }, { "epoch": 1.9608938547486034, "grad_norm": 0.1014862135052681, "learning_rate": 2.712034301107114e-07, "loss": 0.1264, "step": 790 }, { "epoch": 1.9857231533209188, "grad_norm": 0.09327876567840576, "learning_rate": 4.599986085573882e-08, "loss": 0.1182, "step": 800 } ], "logging_steps": 10, "max_steps": 806, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5263477755117363e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }