| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9857231533209188, | |
| "eval_steps": 30, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.024829298572315334, | |
| "grad_norm": 5.036550998687744, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 2.4126, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04965859714463067, | |
| "grad_norm": 0.6553554534912109, | |
| "learning_rate": 4.691358024691358e-05, | |
| "loss": 0.8516, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.074487895716946, | |
| "grad_norm": 0.7089270353317261, | |
| "learning_rate": 7.160493827160494e-05, | |
| "loss": 0.5451, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.074487895716946, | |
| "eval_loss": 0.46651971340179443, | |
| "eval_runtime": 40.7643, | |
| "eval_samples_per_second": 4.17, | |
| "eval_steps_per_second": 2.085, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09931719428926133, | |
| "grad_norm": 0.5849066376686096, | |
| "learning_rate": 9.62962962962963e-05, | |
| "loss": 0.3903, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12414649286157665, | |
| "grad_norm": 0.6945697069168091, | |
| "learning_rate": 0.00012098765432098766, | |
| "loss": 0.3218, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.148975791433892, | |
| "grad_norm": 0.43555283546447754, | |
| "learning_rate": 0.00014567901234567902, | |
| "loss": 0.2579, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.148975791433892, | |
| "eval_loss": 0.2548002004623413, | |
| "eval_runtime": 40.1887, | |
| "eval_samples_per_second": 4.23, | |
| "eval_steps_per_second": 2.115, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17380509000620734, | |
| "grad_norm": 0.4026840627193451, | |
| "learning_rate": 0.00017037037037037037, | |
| "loss": 0.23, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.19863438857852267, | |
| "grad_norm": 0.635771632194519, | |
| "learning_rate": 0.00019506172839506175, | |
| "loss": 0.2108, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.22346368715083798, | |
| "grad_norm": 0.43689030408859253, | |
| "learning_rate": 0.0001999399199592735, | |
| "loss": 0.1878, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22346368715083798, | |
| "eval_loss": 0.186412051320076, | |
| "eval_runtime": 40.2208, | |
| "eval_samples_per_second": 4.227, | |
| "eval_steps_per_second": 2.113, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2482929857231533, | |
| "grad_norm": 0.3414112627506256, | |
| "learning_rate": 0.00019969596851644327, | |
| "loss": 0.178, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.27312228429546864, | |
| "grad_norm": 0.47279804944992065, | |
| "learning_rate": 0.00019926484830975113, | |
| "loss": 0.1595, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.297951582867784, | |
| "grad_norm": 0.2666519582271576, | |
| "learning_rate": 0.0001986473687223383, | |
| "loss": 0.159, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.297951582867784, | |
| "eval_loss": 0.152946338057518, | |
| "eval_runtime": 40.3051, | |
| "eval_samples_per_second": 4.218, | |
| "eval_steps_per_second": 2.109, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3227808814400993, | |
| "grad_norm": 0.2780194580554962, | |
| "learning_rate": 0.00019784468900761095, | |
| "loss": 0.1491, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.34761018001241467, | |
| "grad_norm": 0.5531139969825745, | |
| "learning_rate": 0.0001968583161128631, | |
| "loss": 0.1523, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.37243947858473, | |
| "grad_norm": 0.3274007737636566, | |
| "learning_rate": 0.00019569010185014062, | |
| "loss": 0.1447, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.37243947858473, | |
| "eval_loss": 0.1445446014404297, | |
| "eval_runtime": 40.255, | |
| "eval_samples_per_second": 4.223, | |
| "eval_steps_per_second": 2.112, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.39726877715704534, | |
| "grad_norm": 0.2487361580133438, | |
| "learning_rate": 0.00019434223941965738, | |
| "loss": 0.1509, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.42209807572936064, | |
| "grad_norm": 0.5522840023040771, | |
| "learning_rate": 0.00019281725929229127, | |
| "loss": 0.1433, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.44692737430167595, | |
| "grad_norm": 0.1760244369506836, | |
| "learning_rate": 0.00019111802445888936, | |
| "loss": 0.1434, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.44692737430167595, | |
| "eval_loss": 0.13944680988788605, | |
| "eval_runtime": 40.2246, | |
| "eval_samples_per_second": 4.226, | |
| "eval_steps_per_second": 2.113, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4717566728739913, | |
| "grad_norm": 0.2646051347255707, | |
| "learning_rate": 0.00018924772505530174, | |
| "loss": 0.1366, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4965859714463066, | |
| "grad_norm": 0.3032621741294861, | |
| "learning_rate": 0.000187209872373235, | |
| "loss": 0.1359, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.521415270018622, | |
| "grad_norm": 0.5465778112411499, | |
| "learning_rate": 0.00018500829226816853, | |
| "loss": 0.143, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.521415270018622, | |
| "eval_loss": 0.13419194519519806, | |
| "eval_runtime": 40.1269, | |
| "eval_samples_per_second": 4.237, | |
| "eval_steps_per_second": 2.118, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5462445685909373, | |
| "grad_norm": 0.1879195123910904, | |
| "learning_rate": 0.0001826471179767111, | |
| "loss": 0.1364, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5710738671632526, | |
| "grad_norm": 0.19969278573989868, | |
| "learning_rate": 0.0001801307823568806, | |
| "loss": 0.1407, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.595903165735568, | |
| "grad_norm": 0.15893523395061493, | |
| "learning_rate": 0.00017746400956587653, | |
| "loss": 0.1397, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.595903165735568, | |
| "eval_loss": 0.13155966997146606, | |
| "eval_runtime": 40.1893, | |
| "eval_samples_per_second": 4.23, | |
| "eval_steps_per_second": 2.115, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6207324643078833, | |
| "grad_norm": 0.1424490511417389, | |
| "learning_rate": 0.00017465180619096832, | |
| "loss": 0.1337, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6455617628801986, | |
| "grad_norm": 0.13029974699020386, | |
| "learning_rate": 0.00017169945185015106, | |
| "loss": 0.1376, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6703910614525139, | |
| "grad_norm": 0.13530579209327698, | |
| "learning_rate": 0.00016861248928021411, | |
| "loss": 0.129, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6703910614525139, | |
| "eval_loss": 0.12780845165252686, | |
| "eval_runtime": 40.2498, | |
| "eval_samples_per_second": 4.224, | |
| "eval_steps_per_second": 2.112, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6952203600248293, | |
| "grad_norm": 0.2645304501056671, | |
| "learning_rate": 0.00016539671393083215, | |
| "loss": 0.1246, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7200496585971446, | |
| "grad_norm": 0.15367014706134796, | |
| "learning_rate": 0.00016205816308421386, | |
| "loss": 0.1273, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.74487895716946, | |
| "grad_norm": 0.2134842574596405, | |
| "learning_rate": 0.0001586031045207354, | |
| "loss": 0.1361, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.74487895716946, | |
| "eval_loss": 0.1297762393951416, | |
| "eval_runtime": 40.2241, | |
| "eval_samples_per_second": 4.226, | |
| "eval_steps_per_second": 2.113, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7697082557417753, | |
| "grad_norm": 0.13907591998577118, | |
| "learning_rate": 0.00015503802475183773, | |
| "loss": 0.14, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7945375543140907, | |
| "grad_norm": 0.10886333137750626, | |
| "learning_rate": 0.00015136961684227904, | |
| "loss": 0.1351, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.819366852886406, | |
| "grad_norm": 0.1071273609995842, | |
| "learning_rate": 0.00014760476784460514, | |
| "loss": 0.1288, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.819366852886406, | |
| "eval_loss": 0.1265447735786438, | |
| "eval_runtime": 40.2437, | |
| "eval_samples_per_second": 4.224, | |
| "eval_steps_per_second": 2.112, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8441961514587213, | |
| "grad_norm": 0.13940832018852234, | |
| "learning_rate": 0.0001437505458694277, | |
| "loss": 0.1331, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8690254500310366, | |
| "grad_norm": 0.12029105424880981, | |
| "learning_rate": 0.00013981418681578546, | |
| "loss": 0.1297, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8938547486033519, | |
| "grad_norm": 0.09277268499135971, | |
| "learning_rate": 0.0001358030807864995, | |
| "loss": 0.1259, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8938547486033519, | |
| "eval_loss": 0.12379591166973114, | |
| "eval_runtime": 40.2294, | |
| "eval_samples_per_second": 4.226, | |
| "eval_steps_per_second": 2.113, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9186840471756673, | |
| "grad_norm": 0.172864630818367, | |
| "learning_rate": 0.00013172475821402748, | |
| "loss": 0.1301, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9435133457479826, | |
| "grad_norm": 0.10042418539524078, | |
| "learning_rate": 0.00012758687572286367, | |
| "loss": 0.1271, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9683426443202979, | |
| "grad_norm": 0.09972112625837326, | |
| "learning_rate": 0.00012339720175502642, | |
| "loss": 0.1352, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9683426443202979, | |
| "eval_loss": 0.12407374382019043, | |
| "eval_runtime": 40.2691, | |
| "eval_samples_per_second": 4.222, | |
| "eval_steps_per_second": 2.111, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9931719428926132, | |
| "grad_norm": 0.10841402411460876, | |
| "learning_rate": 0.0001191636019856198, | |
| "loss": 0.1254, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0173805090006207, | |
| "grad_norm": 0.14438092708587646, | |
| "learning_rate": 0.00011489402455585076, | |
| "loss": 0.1321, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.042209807572936, | |
| "grad_norm": 0.10602834075689316, | |
| "learning_rate": 0.00011059648515122424, | |
| "loss": 0.1211, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.042209807572936, | |
| "eval_loss": 0.12403523921966553, | |
| "eval_runtime": 40.5744, | |
| "eval_samples_per_second": 4.19, | |
| "eval_steps_per_second": 2.095, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0670391061452513, | |
| "grad_norm": 0.10185902565717697, | |
| "learning_rate": 0.00010627905195293135, | |
| "loss": 0.1237, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0918684047175666, | |
| "grad_norm": 0.09817427396774292, | |
| "learning_rate": 0.00010194983049068212, | |
| "loss": 0.1138, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1166977032898822, | |
| "grad_norm": 0.14020408689975739, | |
| "learning_rate": 9.76169484254204e-05, | |
| "loss": 0.118, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1166977032898822, | |
| "eval_loss": 0.12372539937496185, | |
| "eval_runtime": 40.514, | |
| "eval_samples_per_second": 4.196, | |
| "eval_steps_per_second": 2.098, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1415270018621975, | |
| "grad_norm": 0.09354697167873383, | |
| "learning_rate": 9.328854029048984e-05, | |
| "loss": 0.1241, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1663563004345128, | |
| "grad_norm": 0.10786397010087967, | |
| "learning_rate": 8.897273221989714e-05, | |
| "loss": 0.1254, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.191185599006828, | |
| "grad_norm": 0.08708823472261429, | |
| "learning_rate": 8.467762669234495e-05, | |
| "loss": 0.1214, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.191185599006828, | |
| "eval_loss": 0.12238769233226776, | |
| "eval_runtime": 40.5104, | |
| "eval_samples_per_second": 4.196, | |
| "eval_steps_per_second": 2.098, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.2160148975791434, | |
| "grad_norm": 0.11436637490987778, | |
| "learning_rate": 8.041128731967444e-05, | |
| "loss": 0.1278, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2408441961514587, | |
| "grad_norm": 0.10331734269857407, | |
| "learning_rate": 7.61817237082768e-05, | |
| "loss": 0.1242, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.265673494723774, | |
| "grad_norm": 0.09123562276363373, | |
| "learning_rate": 7.199687642189387e-05, | |
| "loss": 0.1191, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.265673494723774, | |
| "eval_loss": 0.1218603253364563, | |
| "eval_runtime": 40.5038, | |
| "eval_samples_per_second": 4.197, | |
| "eval_steps_per_second": 2.099, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2905027932960893, | |
| "grad_norm": 0.08754425495862961, | |
| "learning_rate": 6.786460207403978e-05, | |
| "loss": 0.12, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3153320918684046, | |
| "grad_norm": 0.08568098396062851, | |
| "learning_rate": 6.379265857802969e-05, | |
| "loss": 0.1205, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.34016139044072, | |
| "grad_norm": 0.09707140177488327, | |
| "learning_rate": 5.9788690582308404e-05, | |
| "loss": 0.1277, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.34016139044072, | |
| "eval_loss": 0.1208547055721283, | |
| "eval_runtime": 40.5912, | |
| "eval_samples_per_second": 4.188, | |
| "eval_steps_per_second": 2.094, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3649906890130354, | |
| "grad_norm": 0.11941556632518768, | |
| "learning_rate": 5.586021511842136e-05, | |
| "loss": 0.1143, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3898199875853507, | |
| "grad_norm": 0.11442070454359055, | |
| "learning_rate": 5.201460748857369e-05, | |
| "loss": 0.1215, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.414649286157666, | |
| "grad_norm": 0.09243914484977722, | |
| "learning_rate": 4.8259087419270756e-05, | |
| "loss": 0.124, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.414649286157666, | |
| "eval_loss": 0.12113272398710251, | |
| "eval_runtime": 40.5079, | |
| "eval_samples_per_second": 4.197, | |
| "eval_steps_per_second": 2.098, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4394785847299814, | |
| "grad_norm": 0.09338078647851944, | |
| "learning_rate": 4.460070550703612e-05, | |
| "loss": 0.1248, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.4643078833022967, | |
| "grad_norm": 0.0838402733206749, | |
| "learning_rate": 4.1046329981653086e-05, | |
| "loss": 0.1174, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.489137181874612, | |
| "grad_norm": 0.09681010991334915, | |
| "learning_rate": 3.7602633811781166e-05, | |
| "loss": 0.1204, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.489137181874612, | |
| "eval_loss": 0.12003795057535172, | |
| "eval_runtime": 40.5066, | |
| "eval_samples_per_second": 4.197, | |
| "eval_steps_per_second": 2.098, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5139664804469275, | |
| "grad_norm": 0.10349903255701065, | |
| "learning_rate": 3.4276082177154535e-05, | |
| "loss": 0.1254, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.5387957790192428, | |
| "grad_norm": 0.07736501842737198, | |
| "learning_rate": 3.1072920330882647e-05, | |
| "loss": 0.1207, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.563625077591558, | |
| "grad_norm": 0.09067176282405853, | |
| "learning_rate": 2.7999161874640022e-05, | |
| "loss": 0.1286, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.563625077591558, | |
| "eval_loss": 0.11949945241212845, | |
| "eval_runtime": 40.5526, | |
| "eval_samples_per_second": 4.192, | |
| "eval_steps_per_second": 2.096, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.5884543761638734, | |
| "grad_norm": 0.10042094439268112, | |
| "learning_rate": 2.506057746875753e-05, | |
| "loss": 0.1194, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.6132836747361887, | |
| "grad_norm": 0.0972597673535347, | |
| "learning_rate": 2.226268399841055e-05, | |
| "loss": 0.1212, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.638112973308504, | |
| "grad_norm": 0.09647821635007858, | |
| "learning_rate": 1.9610734216243522e-05, | |
| "loss": 0.1095, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.638112973308504, | |
| "eval_loss": 0.11894174665212631, | |
| "eval_runtime": 40.5349, | |
| "eval_samples_per_second": 4.194, | |
| "eval_steps_per_second": 2.097, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.6629422718808193, | |
| "grad_norm": 0.06773627549409866, | |
| "learning_rate": 1.710970688087561e-05, | |
| "loss": 0.1194, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.6877715704531346, | |
| "grad_norm": 0.12803693115711212, | |
| "learning_rate": 1.4764297409801764e-05, | |
| "loss": 0.1175, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.71260086902545, | |
| "grad_norm": 0.10929796099662781, | |
| "learning_rate": 1.2578909064236889e-05, | |
| "loss": 0.1222, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.71260086902545, | |
| "eval_loss": 0.11896785348653793, | |
| "eval_runtime": 40.5323, | |
| "eval_samples_per_second": 4.194, | |
| "eval_steps_per_second": 2.097, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7374301675977653, | |
| "grad_norm": 0.06872426718473434, | |
| "learning_rate": 1.0557644682453039e-05, | |
| "loss": 0.1246, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.7622594661700806, | |
| "grad_norm": 0.10362172871828079, | |
| "learning_rate": 8.70429897712921e-06, | |
| "loss": 0.1165, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.7870887647423959, | |
| "grad_norm": 0.0892619714140892, | |
| "learning_rate": 7.022351411174866e-06, | |
| "loss": 0.1292, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.7870887647423959, | |
| "eval_loss": 0.11873549222946167, | |
| "eval_runtime": 40.5842, | |
| "eval_samples_per_second": 4.189, | |
| "eval_steps_per_second": 2.094, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.8119180633147114, | |
| "grad_norm": 0.10651155561208725, | |
| "learning_rate": 5.51495966540182e-06, | |
| "loss": 0.1182, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.8367473618870267, | |
| "grad_norm": 0.09011874347925186, | |
| "learning_rate": 4.1849537103084925e-06, | |
| "loss": 0.1221, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.861576660459342, | |
| "grad_norm": 0.09894613921642303, | |
| "learning_rate": 3.034830493105956e-06, | |
| "loss": 0.1149, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.861576660459342, | |
| "eval_loss": 0.11866023391485214, | |
| "eval_runtime": 40.5331, | |
| "eval_samples_per_second": 4.194, | |
| "eval_steps_per_second": 2.097, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8864059590316573, | |
| "grad_norm": 0.10858767479658127, | |
| "learning_rate": 2.066749249960498e-06, | |
| "loss": 0.1212, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.9112352576039728, | |
| "grad_norm": 0.07641536742448807, | |
| "learning_rate": 1.2825274522532792e-06, | |
| "loss": 0.1107, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.9360645561762881, | |
| "grad_norm": 0.11793581396341324, | |
| "learning_rate": 6.836373944677954e-07, | |
| "loss": 0.1144, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9360645561762881, | |
| "eval_loss": 0.1186189278960228, | |
| "eval_runtime": 40.4999, | |
| "eval_samples_per_second": 4.198, | |
| "eval_steps_per_second": 2.099, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9608938547486034, | |
| "grad_norm": 0.1014862135052681, | |
| "learning_rate": 2.712034301107114e-07, | |
| "loss": 0.1264, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.9857231533209188, | |
| "grad_norm": 0.09327876567840576, | |
| "learning_rate": 4.599986085573882e-08, | |
| "loss": 0.1182, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 806, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5263477755117363e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |