{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9860291834833903, "eval_steps": 30, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024837007140639553, "grad_norm": 5.0680694580078125, "learning_rate": 2.2222222222222223e-05, "loss": 2.4793, "step": 10 }, { "epoch": 0.04967401428127911, "grad_norm": 0.6718662977218628, "learning_rate": 4.691358024691358e-05, "loss": 0.8831, "step": 20 }, { "epoch": 0.07451102142191866, "grad_norm": 0.7071403861045837, "learning_rate": 7.160493827160494e-05, "loss": 0.5408, "step": 30 }, { "epoch": 0.07451102142191866, "eval_loss": 0.4726891815662384, "eval_runtime": 38.6732, "eval_samples_per_second": 4.396, "eval_steps_per_second": 4.396, "step": 30 }, { "epoch": 0.09934802856255821, "grad_norm": 0.5646480321884155, "learning_rate": 9.62962962962963e-05, "loss": 0.3935, "step": 40 }, { "epoch": 0.12418503570319776, "grad_norm": 0.5533119440078735, "learning_rate": 0.00012098765432098766, "loss": 0.3001, "step": 50 }, { "epoch": 0.14902204284383733, "grad_norm": 0.4143104553222656, "learning_rate": 0.00014567901234567902, "loss": 0.2543, "step": 60 }, { "epoch": 0.14902204284383733, "eval_loss": 0.2523055374622345, "eval_runtime": 38.0335, "eval_samples_per_second": 4.47, "eval_steps_per_second": 4.47, "step": 60 }, { "epoch": 0.17385904998447688, "grad_norm": 0.47011032700538635, "learning_rate": 0.00017037037037037037, "loss": 0.2306, "step": 70 }, { "epoch": 0.19869605712511643, "grad_norm": 0.6400521397590637, "learning_rate": 0.00019506172839506175, "loss": 0.2104, "step": 80 }, { "epoch": 0.22353306426575598, "grad_norm": 0.6687613129615784, "learning_rate": 0.0001999399199592735, "loss": 0.19, "step": 90 }, { "epoch": 0.22353306426575598, "eval_loss": 0.18850156664848328, "eval_runtime": 38.0935, "eval_samples_per_second": 4.463, "eval_steps_per_second": 4.463, "step": 90 }, { "epoch": 0.24837007140639553, "grad_norm": 0.24749410152435303, "learning_rate": 0.00019969596851644327, "loss": 0.1835, "step": 100 }, { "epoch": 0.2732070785470351, "grad_norm": 0.43560871481895447, "learning_rate": 0.00019926484830975113, "loss": 0.1577, "step": 110 }, { "epoch": 0.29804408568767465, "grad_norm": 0.6888434290885925, "learning_rate": 0.0001986473687223383, "loss": 0.1585, "step": 120 }, { "epoch": 0.29804408568767465, "eval_loss": 0.15052124857902527, "eval_runtime": 38.1444, "eval_samples_per_second": 4.457, "eval_steps_per_second": 4.457, "step": 120 }, { "epoch": 0.3228810928283142, "grad_norm": 0.2602541446685791, "learning_rate": 0.00019784468900761095, "loss": 0.1548, "step": 130 }, { "epoch": 0.34771809996895375, "grad_norm": 0.23804056644439697, "learning_rate": 0.0001968583161128631, "loss": 0.1581, "step": 140 }, { "epoch": 0.3725551071095933, "grad_norm": 0.20728255808353424, "learning_rate": 0.00019569010185014062, "loss": 0.1487, "step": 150 }, { "epoch": 0.3725551071095933, "eval_loss": 0.14161638915538788, "eval_runtime": 38.1143, "eval_samples_per_second": 4.46, "eval_steps_per_second": 4.46, "step": 150 }, { "epoch": 0.39739211425023285, "grad_norm": 0.2241806834936142, "learning_rate": 0.00019434223941965738, "loss": 0.1521, "step": 160 }, { "epoch": 0.4222291213908724, "grad_norm": 0.24058522284030914, "learning_rate": 0.00019281725929229127, "loss": 0.1438, "step": 170 }, { "epoch": 0.44706612853151195, "grad_norm": 0.17390510439872742, "learning_rate": 0.00019111802445888936, "loss": 0.1513, "step": 180 }, { "epoch": 0.44706612853151195, "eval_loss": 0.1333342343568802, "eval_runtime": 38.1525, "eval_samples_per_second": 4.456, "eval_steps_per_second": 4.456, "step": 180 }, { "epoch": 0.47190313567215153, "grad_norm": 0.1597527265548706, "learning_rate": 0.00018924772505530174, "loss": 0.1416, "step": 190 }, { "epoch": 0.49674014281279105, "grad_norm": 0.16071690618991852, "learning_rate": 0.000187209872373235, "loss": 0.1437, "step": 200 }, { "epoch": 0.5215771499534306, "grad_norm": 0.15209132432937622, "learning_rate": 0.00018500829226816853, "loss": 0.1421, "step": 210 }, { "epoch": 0.5215771499534306, "eval_loss": 0.13083834946155548, "eval_runtime": 38.1005, "eval_samples_per_second": 4.462, "eval_steps_per_second": 4.462, "step": 210 }, { "epoch": 0.5464141570940702, "grad_norm": 0.17985881865024567, "learning_rate": 0.0001826471179767111, "loss": 0.1395, "step": 220 }, { "epoch": 0.5712511642347097, "grad_norm": 0.1529396027326584, "learning_rate": 0.0001801307823568806, "loss": 0.1443, "step": 230 }, { "epoch": 0.5960881713753493, "grad_norm": 0.14357714354991913, "learning_rate": 0.00017746400956587653, "loss": 0.1478, "step": 240 }, { "epoch": 0.5960881713753493, "eval_loss": 0.12620185315608978, "eval_runtime": 38.1161, "eval_samples_per_second": 4.46, "eval_steps_per_second": 4.46, "step": 240 }, { "epoch": 0.6209251785159888, "grad_norm": 0.15918347239494324, "learning_rate": 0.00017465180619096832, "loss": 0.1338, "step": 250 }, { "epoch": 0.6457621856566284, "grad_norm": 0.25342991948127747, "learning_rate": 0.00017169945185015106, "loss": 0.1353, "step": 260 }, { "epoch": 0.670599192797268, "grad_norm": 0.1387968510389328, "learning_rate": 0.00016861248928021411, "loss": 0.1507, "step": 270 }, { "epoch": 0.670599192797268, "eval_loss": 0.12623676657676697, "eval_runtime": 38.5722, "eval_samples_per_second": 4.407, "eval_steps_per_second": 4.407, "step": 270 }, { "epoch": 0.6954361999379075, "grad_norm": 0.1433294266462326, "learning_rate": 0.00016539671393083215, "loss": 0.1342, "step": 280 }, { "epoch": 0.720273207078547, "grad_norm": 0.16263321042060852, "learning_rate": 0.00016205816308421386, "loss": 0.1361, "step": 290 }, { "epoch": 0.7451102142191866, "grad_norm": 0.15614983439445496, "learning_rate": 0.0001586031045207354, "loss": 0.1457, "step": 300 }, { "epoch": 0.7451102142191866, "eval_loss": 0.1330355405807495, "eval_runtime": 38.7441, "eval_samples_per_second": 4.388, "eval_steps_per_second": 4.388, "step": 300 }, { "epoch": 0.7699472213598262, "grad_norm": 0.13199648261070251, "learning_rate": 0.00015503802475183773, "loss": 0.1429, "step": 310 }, { "epoch": 0.7947842285004657, "grad_norm": 0.11272416263818741, "learning_rate": 0.00015136961684227904, "loss": 0.1402, "step": 320 }, { "epoch": 0.8196212356411052, "grad_norm": 0.19195938110351562, "learning_rate": 0.00014760476784460514, "loss": 0.144, "step": 330 }, { "epoch": 0.8196212356411052, "eval_loss": 0.12694701552391052, "eval_runtime": 38.5526, "eval_samples_per_second": 4.41, "eval_steps_per_second": 4.41, "step": 330 }, { "epoch": 0.8444582427817448, "grad_norm": 0.1164827048778534, "learning_rate": 0.0001437505458694277, "loss": 0.1404, "step": 340 }, { "epoch": 0.8692952499223844, "grad_norm": 0.15164071321487427, "learning_rate": 0.00013981418681578546, "loss": 0.1332, "step": 350 }, { "epoch": 0.8941322570630239, "grad_norm": 0.11416257917881012, "learning_rate": 0.0001358030807864995, "loss": 0.1333, "step": 360 }, { "epoch": 0.8941322570630239, "eval_loss": 0.12437459081411362, "eval_runtime": 38.4955, "eval_samples_per_second": 4.416, "eval_steps_per_second": 4.416, "step": 360 }, { "epoch": 0.9189692642036634, "grad_norm": 0.13110032677650452, "learning_rate": 0.00013172475821402748, "loss": 0.147, "step": 370 }, { "epoch": 0.9438062713443031, "grad_norm": 0.09652693569660187, "learning_rate": 0.00012758687572286367, "loss": 0.1333, "step": 380 }, { "epoch": 0.9686432784849426, "grad_norm": 0.09613075852394104, "learning_rate": 0.00012339720175502642, "loss": 0.1402, "step": 390 }, { "epoch": 0.9686432784849426, "eval_loss": 0.12314330041408539, "eval_runtime": 38.4882, "eval_samples_per_second": 4.417, "eval_steps_per_second": 4.417, "step": 390 }, { "epoch": 0.9934802856255821, "grad_norm": 0.10631036758422852, "learning_rate": 0.0001191636019856198, "loss": 0.1287, "step": 400 }, { "epoch": 1.0173859049984477, "grad_norm": 0.46618208289146423, "learning_rate": 0.00011489402455585076, "loss": 0.138, "step": 410 }, { "epoch": 1.0422229121390871, "grad_norm": 0.30142149329185486, "learning_rate": 0.00011059648515122424, "loss": 0.1365, "step": 420 }, { "epoch": 1.0422229121390871, "eval_loss": 0.12314148247241974, "eval_runtime": 38.5184, "eval_samples_per_second": 4.413, "eval_steps_per_second": 4.413, "step": 420 }, { "epoch": 1.0670599192797268, "grad_norm": 0.3135342597961426, "learning_rate": 0.00010627905195293135, "loss": 0.1328, "step": 430 }, { "epoch": 1.0918969264203664, "grad_norm": 0.0945952981710434, "learning_rate": 0.00010194983049068212, "loss": 0.134, "step": 440 }, { "epoch": 1.1167339335610058, "grad_norm": 0.1430775374174118, "learning_rate": 9.76169484254204e-05, "loss": 0.1201, "step": 450 }, { "epoch": 1.1167339335610058, "eval_loss": 0.1227191910147667, "eval_runtime": 38.5653, "eval_samples_per_second": 4.408, "eval_steps_per_second": 4.408, "step": 450 }, { "epoch": 1.1415709407016454, "grad_norm": 0.09269551187753677, "learning_rate": 9.328854029048984e-05, "loss": 0.1305, "step": 460 }, { "epoch": 1.166407947842285, "grad_norm": 0.10537844151258469, "learning_rate": 8.897273221989714e-05, "loss": 0.1315, "step": 470 }, { "epoch": 1.1912449549829245, "grad_norm": 0.08408211916685104, "learning_rate": 8.467762669234495e-05, "loss": 0.1294, "step": 480 }, { "epoch": 1.1912449549829245, "eval_loss": 0.12096220254898071, "eval_runtime": 38.4954, "eval_samples_per_second": 4.416, "eval_steps_per_second": 4.416, "step": 480 }, { "epoch": 1.2160819621235641, "grad_norm": 0.1160721555352211, "learning_rate": 8.041128731967444e-05, "loss": 0.1341, "step": 490 }, { "epoch": 1.2409189692642038, "grad_norm": 0.09623311460018158, "learning_rate": 7.61817237082768e-05, "loss": 0.1321, "step": 500 }, { "epoch": 1.2657559764048432, "grad_norm": 0.3000585436820984, "learning_rate": 7.199687642189387e-05, "loss": 0.1249, "step": 510 }, { "epoch": 1.2657559764048432, "eval_loss": 0.12054261565208435, "eval_runtime": 38.4372, "eval_samples_per_second": 4.423, "eval_steps_per_second": 4.423, "step": 510 }, { "epoch": 1.2905929835454828, "grad_norm": 0.0921676829457283, "learning_rate": 6.786460207403978e-05, "loss": 0.1294, "step": 520 }, { "epoch": 1.3154299906861224, "grad_norm": 0.08408155292272568, "learning_rate": 6.379265857802969e-05, "loss": 0.1318, "step": 530 }, { "epoch": 1.3402669978267618, "grad_norm": 0.19716006517410278, "learning_rate": 5.9788690582308404e-05, "loss": 0.1305, "step": 540 }, { "epoch": 1.3402669978267618, "eval_loss": 0.12028669565916061, "eval_runtime": 38.6144, "eval_samples_per_second": 4.402, "eval_steps_per_second": 4.402, "step": 540 }, { "epoch": 1.3651040049674015, "grad_norm": 0.14016658067703247, "learning_rate": 5.586021511842136e-05, "loss": 0.1271, "step": 550 }, { "epoch": 1.389941012108041, "grad_norm": 0.11117364466190338, "learning_rate": 5.201460748857369e-05, "loss": 0.1306, "step": 560 }, { "epoch": 1.4147780192486805, "grad_norm": 0.09563940018415451, "learning_rate": 4.8259087419270756e-05, "loss": 0.1302, "step": 570 }, { "epoch": 1.4147780192486805, "eval_loss": 0.12124165892601013, "eval_runtime": 38.671, "eval_samples_per_second": 4.396, "eval_steps_per_second": 4.396, "step": 570 }, { "epoch": 1.4396150263893202, "grad_norm": 0.08976716548204422, "learning_rate": 4.460070550703612e-05, "loss": 0.1332, "step": 580 }, { "epoch": 1.4644520335299596, "grad_norm": 0.0839507058262825, "learning_rate": 4.1046329981653086e-05, "loss": 0.1276, "step": 590 }, { "epoch": 1.4892890406705992, "grad_norm": 0.09810927510261536, "learning_rate": 3.7602633811781166e-05, "loss": 0.1298, "step": 600 }, { "epoch": 1.4892890406705992, "eval_loss": 0.11950553208589554, "eval_runtime": 38.7353, "eval_samples_per_second": 4.389, "eval_steps_per_second": 4.389, "step": 600 }, { "epoch": 1.5141260478112386, "grad_norm": 0.10148163884878159, "learning_rate": 3.4276082177154535e-05, "loss": 0.1303, "step": 610 }, { "epoch": 1.5389630549518785, "grad_norm": 0.10012848675251007, "learning_rate": 3.1072920330882647e-05, "loss": 0.128, "step": 620 }, { "epoch": 1.5638000620925179, "grad_norm": 0.09456542879343033, "learning_rate": 2.7999161874640022e-05, "loss": 0.1293, "step": 630 }, { "epoch": 1.5638000620925179, "eval_loss": 0.11886715888977051, "eval_runtime": 38.5823, "eval_samples_per_second": 4.406, "eval_steps_per_second": 4.406, "step": 630 }, { "epoch": 1.5886370692331573, "grad_norm": 0.09369179606437683, "learning_rate": 2.506057746875753e-05, "loss": 0.1299, "step": 640 }, { "epoch": 1.613474076373797, "grad_norm": 0.09706632047891617, "learning_rate": 2.226268399841055e-05, "loss": 0.1244, "step": 650 }, { "epoch": 1.6383110835144366, "grad_norm": 0.08659979701042175, "learning_rate": 1.9610734216243522e-05, "loss": 0.1166, "step": 660 }, { "epoch": 1.6383110835144366, "eval_loss": 0.11844287067651749, "eval_runtime": 38.4977, "eval_samples_per_second": 4.416, "eval_steps_per_second": 4.416, "step": 660 }, { "epoch": 1.663148090655076, "grad_norm": 0.0770510733127594, "learning_rate": 1.710970688087561e-05, "loss": 0.1284, "step": 670 }, { "epoch": 1.6879850977957156, "grad_norm": 0.10163529217243195, "learning_rate": 1.4764297409801764e-05, "loss": 0.1298, "step": 680 }, { "epoch": 1.7128221049363552, "grad_norm": 0.10533007979393005, "learning_rate": 1.2578909064236889e-05, "loss": 0.1282, "step": 690 }, { "epoch": 1.7128221049363552, "eval_loss": 0.11826732009649277, "eval_runtime": 38.6249, "eval_samples_per_second": 4.401, "eval_steps_per_second": 4.401, "step": 690 }, { "epoch": 1.7376591120769946, "grad_norm": 0.07278448343276978, "learning_rate": 1.0557644682453039e-05, "loss": 0.1292, "step": 700 }, { "epoch": 1.7624961192176343, "grad_norm": 0.09515662491321564, "learning_rate": 8.70429897712921e-06, "loss": 0.1236, "step": 710 }, { "epoch": 1.787333126358274, "grad_norm": 0.08752795308828354, "learning_rate": 7.022351411174866e-06, "loss": 0.1353, "step": 720 }, { "epoch": 1.787333126358274, "eval_loss": 0.11793459951877594, "eval_runtime": 38.65, "eval_samples_per_second": 4.398, "eval_steps_per_second": 4.398, "step": 720 }, { "epoch": 1.8121701334989133, "grad_norm": 0.10476204007863998, "learning_rate": 5.51495966540182e-06, "loss": 0.1279, "step": 730 }, { "epoch": 1.837007140639553, "grad_norm": 0.11006776243448257, "learning_rate": 4.1849537103084925e-06, "loss": 0.1286, "step": 740 }, { "epoch": 1.8618441477801926, "grad_norm": 0.0957954004406929, "learning_rate": 3.034830493105956e-06, "loss": 0.1292, "step": 750 }, { "epoch": 1.8618441477801926, "eval_loss": 0.11800022423267365, "eval_runtime": 38.6325, "eval_samples_per_second": 4.4, "eval_steps_per_second": 4.4, "step": 750 }, { "epoch": 1.886681154920832, "grad_norm": 0.09181220084428787, "learning_rate": 2.066749249960498e-06, "loss": 0.1249, "step": 760 }, { "epoch": 1.9115181620614716, "grad_norm": 0.08642429113388062, "learning_rate": 1.2825274522532792e-06, "loss": 0.1218, "step": 770 }, { "epoch": 1.9363551692021113, "grad_norm": 0.0925707072019577, "learning_rate": 6.836373944677954e-07, "loss": 0.1299, "step": 780 }, { "epoch": 1.9363551692021113, "eval_loss": 0.1179690733551979, "eval_runtime": 38.5796, "eval_samples_per_second": 4.406, "eval_steps_per_second": 4.406, "step": 780 }, { "epoch": 1.9611921763427507, "grad_norm": 0.09501737356185913, "learning_rate": 2.712034301107114e-07, "loss": 0.1289, "step": 790 }, { "epoch": 1.9860291834833903, "grad_norm": 0.09520602226257324, "learning_rate": 4.599986085573882e-08, "loss": 0.128, "step": 800 } ], "logging_steps": 10, "max_steps": 806, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2428317974302515e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }