{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9860291834833903,
  "eval_steps": 30,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024837007140639553,
      "grad_norm": 5.0680694580078125,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 2.4793,
      "step": 10
    },
    {
      "epoch": 0.04967401428127911,
      "grad_norm": 0.6718662977218628,
      "learning_rate": 4.691358024691358e-05,
      "loss": 0.8831,
      "step": 20
    },
    {
      "epoch": 0.07451102142191866,
      "grad_norm": 0.7071403861045837,
      "learning_rate": 7.160493827160494e-05,
      "loss": 0.5408,
      "step": 30
    },
    {
      "epoch": 0.07451102142191866,
      "eval_loss": 0.4726891815662384,
      "eval_runtime": 38.6732,
      "eval_samples_per_second": 4.396,
      "eval_steps_per_second": 4.396,
      "step": 30
    },
    {
      "epoch": 0.09934802856255821,
      "grad_norm": 0.5646480321884155,
      "learning_rate": 9.62962962962963e-05,
      "loss": 0.3935,
      "step": 40
    },
    {
      "epoch": 0.12418503570319776,
      "grad_norm": 0.5533119440078735,
      "learning_rate": 0.00012098765432098766,
      "loss": 0.3001,
      "step": 50
    },
    {
      "epoch": 0.14902204284383733,
      "grad_norm": 0.4143104553222656,
      "learning_rate": 0.00014567901234567902,
      "loss": 0.2543,
      "step": 60
    },
    {
      "epoch": 0.14902204284383733,
      "eval_loss": 0.2523055374622345,
      "eval_runtime": 38.0335,
      "eval_samples_per_second": 4.47,
      "eval_steps_per_second": 4.47,
      "step": 60
    },
    {
      "epoch": 0.17385904998447688,
      "grad_norm": 0.47011032700538635,
      "learning_rate": 0.00017037037037037037,
      "loss": 0.2306,
      "step": 70
    },
    {
      "epoch": 0.19869605712511643,
      "grad_norm": 0.6400521397590637,
      "learning_rate": 0.00019506172839506175,
      "loss": 0.2104,
      "step": 80
    },
    {
      "epoch": 0.22353306426575598,
      "grad_norm": 0.6687613129615784,
      "learning_rate": 0.0001999399199592735,
      "loss": 0.19,
      "step": 90
    },
    {
      "epoch": 0.22353306426575598,
      "eval_loss": 0.18850156664848328,
      "eval_runtime": 38.0935,
      "eval_samples_per_second": 4.463,
      "eval_steps_per_second": 4.463,
      "step": 90
    },
    {
      "epoch": 0.24837007140639553,
      "grad_norm": 0.24749410152435303,
      "learning_rate": 0.00019969596851644327,
      "loss": 0.1835,
      "step": 100
    },
    {
      "epoch": 0.2732070785470351,
      "grad_norm": 0.43560871481895447,
      "learning_rate": 0.00019926484830975113,
      "loss": 0.1577,
      "step": 110
    },
    {
      "epoch": 0.29804408568767465,
      "grad_norm": 0.6888434290885925,
      "learning_rate": 0.0001986473687223383,
      "loss": 0.1585,
      "step": 120
    },
    {
      "epoch": 0.29804408568767465,
      "eval_loss": 0.15052124857902527,
      "eval_runtime": 38.1444,
      "eval_samples_per_second": 4.457,
      "eval_steps_per_second": 4.457,
      "step": 120
    },
    {
      "epoch": 0.3228810928283142,
      "grad_norm": 0.2602541446685791,
      "learning_rate": 0.00019784468900761095,
      "loss": 0.1548,
      "step": 130
    },
    {
      "epoch": 0.34771809996895375,
      "grad_norm": 0.23804056644439697,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.1581,
      "step": 140
    },
    {
      "epoch": 0.3725551071095933,
      "grad_norm": 0.20728255808353424,
      "learning_rate": 0.00019569010185014062,
      "loss": 0.1487,
      "step": 150
    },
    {
      "epoch": 0.3725551071095933,
      "eval_loss": 0.14161638915538788,
      "eval_runtime": 38.1143,
      "eval_samples_per_second": 4.46,
      "eval_steps_per_second": 4.46,
      "step": 150
    },
    {
      "epoch": 0.39739211425023285,
      "grad_norm": 0.2241806834936142,
      "learning_rate": 0.00019434223941965738,
      "loss": 0.1521,
      "step": 160
    },
    {
      "epoch": 0.4222291213908724,
      "grad_norm": 0.24058522284030914,
      "learning_rate": 0.00019281725929229127,
      "loss": 0.1438,
      "step": 170
    },
    {
      "epoch": 0.44706612853151195,
      "grad_norm": 0.17390510439872742,
      "learning_rate": 0.00019111802445888936,
      "loss": 0.1513,
      "step": 180
    },
    {
      "epoch": 0.44706612853151195,
      "eval_loss": 0.1333342343568802,
      "eval_runtime": 38.1525,
      "eval_samples_per_second": 4.456,
      "eval_steps_per_second": 4.456,
      "step": 180
    },
    {
      "epoch": 0.47190313567215153,
      "grad_norm": 0.1597527265548706,
      "learning_rate": 0.00018924772505530174,
      "loss": 0.1416,
      "step": 190
    },
    {
      "epoch": 0.49674014281279105,
      "grad_norm": 0.16071690618991852,
      "learning_rate": 0.000187209872373235,
      "loss": 0.1437,
      "step": 200
    },
    {
      "epoch": 0.5215771499534306,
      "grad_norm": 0.15209132432937622,
      "learning_rate": 0.00018500829226816853,
      "loss": 0.1421,
      "step": 210
    },
    {
      "epoch": 0.5215771499534306,
      "eval_loss": 0.13083834946155548,
      "eval_runtime": 38.1005,
      "eval_samples_per_second": 4.462,
      "eval_steps_per_second": 4.462,
      "step": 210
    },
    {
      "epoch": 0.5464141570940702,
      "grad_norm": 0.17985881865024567,
      "learning_rate": 0.0001826471179767111,
      "loss": 0.1395,
      "step": 220
    },
    {
      "epoch": 0.5712511642347097,
      "grad_norm": 0.1529396027326584,
      "learning_rate": 0.0001801307823568806,
      "loss": 0.1443,
      "step": 230
    },
    {
      "epoch": 0.5960881713753493,
      "grad_norm": 0.14357714354991913,
      "learning_rate": 0.00017746400956587653,
      "loss": 0.1478,
      "step": 240
    },
    {
      "epoch": 0.5960881713753493,
      "eval_loss": 0.12620185315608978,
      "eval_runtime": 38.1161,
      "eval_samples_per_second": 4.46,
      "eval_steps_per_second": 4.46,
      "step": 240
    },
    {
      "epoch": 0.6209251785159888,
      "grad_norm": 0.15918347239494324,
      "learning_rate": 0.00017465180619096832,
      "loss": 0.1338,
      "step": 250
    },
    {
      "epoch": 0.6457621856566284,
      "grad_norm": 0.25342991948127747,
      "learning_rate": 0.00017169945185015106,
      "loss": 0.1353,
      "step": 260
    },
    {
      "epoch": 0.670599192797268,
      "grad_norm": 0.1387968510389328,
      "learning_rate": 0.00016861248928021411,
      "loss": 0.1507,
      "step": 270
    },
    {
      "epoch": 0.670599192797268,
      "eval_loss": 0.12623676657676697,
      "eval_runtime": 38.5722,
      "eval_samples_per_second": 4.407,
      "eval_steps_per_second": 4.407,
      "step": 270
    },
    {
      "epoch": 0.6954361999379075,
      "grad_norm": 0.1433294266462326,
      "learning_rate": 0.00016539671393083215,
      "loss": 0.1342,
      "step": 280
    },
    {
      "epoch": 0.720273207078547,
      "grad_norm": 0.16263321042060852,
      "learning_rate": 0.00016205816308421386,
      "loss": 0.1361,
      "step": 290
    },
    {
      "epoch": 0.7451102142191866,
      "grad_norm": 0.15614983439445496,
      "learning_rate": 0.0001586031045207354,
      "loss": 0.1457,
      "step": 300
    },
    {
      "epoch": 0.7451102142191866,
      "eval_loss": 0.1330355405807495,
      "eval_runtime": 38.7441,
      "eval_samples_per_second": 4.388,
      "eval_steps_per_second": 4.388,
      "step": 300
    },
    {
      "epoch": 0.7699472213598262,
      "grad_norm": 0.13199648261070251,
      "learning_rate": 0.00015503802475183773,
      "loss": 0.1429,
      "step": 310
    },
    {
      "epoch": 0.7947842285004657,
      "grad_norm": 0.11272416263818741,
      "learning_rate": 0.00015136961684227904,
      "loss": 0.1402,
      "step": 320
    },
    {
      "epoch": 0.8196212356411052,
      "grad_norm": 0.19195938110351562,
      "learning_rate": 0.00014760476784460514,
      "loss": 0.144,
      "step": 330
    },
    {
      "epoch": 0.8196212356411052,
      "eval_loss": 0.12694701552391052,
      "eval_runtime": 38.5526,
      "eval_samples_per_second": 4.41,
      "eval_steps_per_second": 4.41,
      "step": 330
    },
    {
      "epoch": 0.8444582427817448,
      "grad_norm": 0.1164827048778534,
      "learning_rate": 0.0001437505458694277,
      "loss": 0.1404,
      "step": 340
    },
    {
      "epoch": 0.8692952499223844,
      "grad_norm": 0.15164071321487427,
      "learning_rate": 0.00013981418681578546,
      "loss": 0.1332,
      "step": 350
    },
    {
      "epoch": 0.8941322570630239,
      "grad_norm": 0.11416257917881012,
      "learning_rate": 0.0001358030807864995,
      "loss": 0.1333,
      "step": 360
    },
    {
      "epoch": 0.8941322570630239,
      "eval_loss": 0.12437459081411362,
      "eval_runtime": 38.4955,
      "eval_samples_per_second": 4.416,
      "eval_steps_per_second": 4.416,
      "step": 360
    },
    {
      "epoch": 0.9189692642036634,
      "grad_norm": 0.13110032677650452,
      "learning_rate": 0.00013172475821402748,
      "loss": 0.147,
      "step": 370
    },
    {
      "epoch": 0.9438062713443031,
      "grad_norm": 0.09652693569660187,
      "learning_rate": 0.00012758687572286367,
      "loss": 0.1333,
      "step": 380
    },
    {
      "epoch": 0.9686432784849426,
      "grad_norm": 0.09613075852394104,
      "learning_rate": 0.00012339720175502642,
      "loss": 0.1402,
      "step": 390
    },
    {
      "epoch": 0.9686432784849426,
      "eval_loss": 0.12314330041408539,
      "eval_runtime": 38.4882,
      "eval_samples_per_second": 4.417,
      "eval_steps_per_second": 4.417,
      "step": 390
    },
    {
      "epoch": 0.9934802856255821,
      "grad_norm": 0.10631036758422852,
      "learning_rate": 0.0001191636019856198,
      "loss": 0.1287,
      "step": 400
    },
    {
      "epoch": 1.0173859049984477,
      "grad_norm": 0.46618208289146423,
      "learning_rate": 0.00011489402455585076,
      "loss": 0.138,
      "step": 410
    },
    {
      "epoch": 1.0422229121390871,
      "grad_norm": 0.30142149329185486,
      "learning_rate": 0.00011059648515122424,
      "loss": 0.1365,
      "step": 420
    },
    {
      "epoch": 1.0422229121390871,
      "eval_loss": 0.12314148247241974,
      "eval_runtime": 38.5184,
      "eval_samples_per_second": 4.413,
      "eval_steps_per_second": 4.413,
      "step": 420
    },
    {
      "epoch": 1.0670599192797268,
      "grad_norm": 0.3135342597961426,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.1328,
      "step": 430
    },
    {
      "epoch": 1.0918969264203664,
      "grad_norm": 0.0945952981710434,
      "learning_rate": 0.00010194983049068212,
      "loss": 0.134,
      "step": 440
    },
    {
      "epoch": 1.1167339335610058,
      "grad_norm": 0.1430775374174118,
      "learning_rate": 9.76169484254204e-05,
      "loss": 0.1201,
      "step": 450
    },
    {
      "epoch": 1.1167339335610058,
      "eval_loss": 0.1227191910147667,
      "eval_runtime": 38.5653,
      "eval_samples_per_second": 4.408,
      "eval_steps_per_second": 4.408,
      "step": 450
    },
    {
      "epoch": 1.1415709407016454,
      "grad_norm": 0.09269551187753677,
      "learning_rate": 9.328854029048984e-05,
      "loss": 0.1305,
      "step": 460
    },
    {
      "epoch": 1.166407947842285,
      "grad_norm": 0.10537844151258469,
      "learning_rate": 8.897273221989714e-05,
      "loss": 0.1315,
      "step": 470
    },
    {
      "epoch": 1.1912449549829245,
      "grad_norm": 0.08408211916685104,
      "learning_rate": 8.467762669234495e-05,
      "loss": 0.1294,
      "step": 480
    },
    {
      "epoch": 1.1912449549829245,
      "eval_loss": 0.12096220254898071,
      "eval_runtime": 38.4954,
      "eval_samples_per_second": 4.416,
      "eval_steps_per_second": 4.416,
      "step": 480
    },
    {
      "epoch": 1.2160819621235641,
      "grad_norm": 0.1160721555352211,
      "learning_rate": 8.041128731967444e-05,
      "loss": 0.1341,
      "step": 490
    },
    {
      "epoch": 1.2409189692642038,
      "grad_norm": 0.09623311460018158,
      "learning_rate": 7.61817237082768e-05,
      "loss": 0.1321,
      "step": 500
    },
    {
      "epoch": 1.2657559764048432,
      "grad_norm": 0.3000585436820984,
      "learning_rate": 7.199687642189387e-05,
      "loss": 0.1249,
      "step": 510
    },
    {
      "epoch": 1.2657559764048432,
      "eval_loss": 0.12054261565208435,
      "eval_runtime": 38.4372,
      "eval_samples_per_second": 4.423,
      "eval_steps_per_second": 4.423,
      "step": 510
    },
    {
      "epoch": 1.2905929835454828,
      "grad_norm": 0.0921676829457283,
      "learning_rate": 6.786460207403978e-05,
      "loss": 0.1294,
      "step": 520
    },
    {
      "epoch": 1.3154299906861224,
      "grad_norm": 0.08408155292272568,
      "learning_rate": 6.379265857802969e-05,
      "loss": 0.1318,
      "step": 530
    },
    {
      "epoch": 1.3402669978267618,
      "grad_norm": 0.19716006517410278,
      "learning_rate": 5.9788690582308404e-05,
      "loss": 0.1305,
      "step": 540
    },
    {
      "epoch": 1.3402669978267618,
      "eval_loss": 0.12028669565916061,
      "eval_runtime": 38.6144,
      "eval_samples_per_second": 4.402,
      "eval_steps_per_second": 4.402,
      "step": 540
    },
    {
      "epoch": 1.3651040049674015,
      "grad_norm": 0.14016658067703247,
      "learning_rate": 5.586021511842136e-05,
      "loss": 0.1271,
      "step": 550
    },
    {
      "epoch": 1.389941012108041,
      "grad_norm": 0.11117364466190338,
      "learning_rate": 5.201460748857369e-05,
      "loss": 0.1306,
      "step": 560
    },
    {
      "epoch": 1.4147780192486805,
      "grad_norm": 0.09563940018415451,
      "learning_rate": 4.8259087419270756e-05,
      "loss": 0.1302,
      "step": 570
    },
    {
      "epoch": 1.4147780192486805,
      "eval_loss": 0.12124165892601013,
      "eval_runtime": 38.671,
      "eval_samples_per_second": 4.396,
      "eval_steps_per_second": 4.396,
      "step": 570
    },
    {
      "epoch": 1.4396150263893202,
      "grad_norm": 0.08976716548204422,
      "learning_rate": 4.460070550703612e-05,
      "loss": 0.1332,
      "step": 580
    },
    {
      "epoch": 1.4644520335299596,
      "grad_norm": 0.0839507058262825,
      "learning_rate": 4.1046329981653086e-05,
      "loss": 0.1276,
      "step": 590
    },
    {
      "epoch": 1.4892890406705992,
      "grad_norm": 0.09810927510261536,
      "learning_rate": 3.7602633811781166e-05,
      "loss": 0.1298,
      "step": 600
    },
    {
      "epoch": 1.4892890406705992,
      "eval_loss": 0.11950553208589554,
      "eval_runtime": 38.7353,
      "eval_samples_per_second": 4.389,
      "eval_steps_per_second": 4.389,
      "step": 600
    },
    {
      "epoch": 1.5141260478112386,
      "grad_norm": 0.10148163884878159,
      "learning_rate": 3.4276082177154535e-05,
      "loss": 0.1303,
      "step": 610
    },
    {
      "epoch": 1.5389630549518785,
      "grad_norm": 0.10012848675251007,
      "learning_rate": 3.1072920330882647e-05,
      "loss": 0.128,
      "step": 620
    },
    {
      "epoch": 1.5638000620925179,
      "grad_norm": 0.09456542879343033,
      "learning_rate": 2.7999161874640022e-05,
      "loss": 0.1293,
      "step": 630
    },
    {
      "epoch": 1.5638000620925179,
      "eval_loss": 0.11886715888977051,
      "eval_runtime": 38.5823,
      "eval_samples_per_second": 4.406,
      "eval_steps_per_second": 4.406,
      "step": 630
    },
    {
      "epoch": 1.5886370692331573,
      "grad_norm": 0.09369179606437683,
      "learning_rate": 2.506057746875753e-05,
      "loss": 0.1299,
      "step": 640
    },
    {
      "epoch": 1.613474076373797,
      "grad_norm": 0.09706632047891617,
      "learning_rate": 2.226268399841055e-05,
      "loss": 0.1244,
      "step": 650
    },
    {
      "epoch": 1.6383110835144366,
      "grad_norm": 0.08659979701042175,
      "learning_rate": 1.9610734216243522e-05,
      "loss": 0.1166,
      "step": 660
    },
    {
      "epoch": 1.6383110835144366,
      "eval_loss": 0.11844287067651749,
      "eval_runtime": 38.4977,
      "eval_samples_per_second": 4.416,
      "eval_steps_per_second": 4.416,
      "step": 660
    },
    {
      "epoch": 1.663148090655076,
      "grad_norm": 0.0770510733127594,
      "learning_rate": 1.710970688087561e-05,
      "loss": 0.1284,
      "step": 670
    },
    {
      "epoch": 1.6879850977957156,
      "grad_norm": 0.10163529217243195,
      "learning_rate": 1.4764297409801764e-05,
      "loss": 0.1298,
      "step": 680
    },
    {
      "epoch": 1.7128221049363552,
      "grad_norm": 0.10533007979393005,
      "learning_rate": 1.2578909064236889e-05,
      "loss": 0.1282,
      "step": 690
    },
    {
      "epoch": 1.7128221049363552,
      "eval_loss": 0.11826732009649277,
      "eval_runtime": 38.6249,
      "eval_samples_per_second": 4.401,
      "eval_steps_per_second": 4.401,
      "step": 690
    },
    {
      "epoch": 1.7376591120769946,
      "grad_norm": 0.07278448343276978,
      "learning_rate": 1.0557644682453039e-05,
      "loss": 0.1292,
      "step": 700
    },
    {
      "epoch": 1.7624961192176343,
      "grad_norm": 0.09515662491321564,
      "learning_rate": 8.70429897712921e-06,
      "loss": 0.1236,
      "step": 710
    },
    {
      "epoch": 1.787333126358274,
      "grad_norm": 0.08752795308828354,
      "learning_rate": 7.022351411174866e-06,
      "loss": 0.1353,
      "step": 720
    },
    {
      "epoch": 1.787333126358274,
      "eval_loss": 0.11793459951877594,
      "eval_runtime": 38.65,
      "eval_samples_per_second": 4.398,
      "eval_steps_per_second": 4.398,
      "step": 720
    },
    {
      "epoch": 1.8121701334989133,
      "grad_norm": 0.10476204007863998,
      "learning_rate": 5.51495966540182e-06,
      "loss": 0.1279,
      "step": 730
    },
    {
      "epoch": 1.837007140639553,
      "grad_norm": 0.11006776243448257,
      "learning_rate": 4.1849537103084925e-06,
      "loss": 0.1286,
      "step": 740
    },
    {
      "epoch": 1.8618441477801926,
      "grad_norm": 0.0957954004406929,
      "learning_rate": 3.034830493105956e-06,
      "loss": 0.1292,
      "step": 750
    },
    {
      "epoch": 1.8618441477801926,
      "eval_loss": 0.11800022423267365,
      "eval_runtime": 38.6325,
      "eval_samples_per_second": 4.4,
      "eval_steps_per_second": 4.4,
      "step": 750
    },
    {
      "epoch": 1.886681154920832,
      "grad_norm": 0.09181220084428787,
      "learning_rate": 2.066749249960498e-06,
      "loss": 0.1249,
      "step": 760
    },
    {
      "epoch": 1.9115181620614716,
      "grad_norm": 0.08642429113388062,
      "learning_rate": 1.2825274522532792e-06,
      "loss": 0.1218,
      "step": 770
    },
    {
      "epoch": 1.9363551692021113,
      "grad_norm": 0.0925707072019577,
      "learning_rate": 6.836373944677954e-07,
      "loss": 0.1299,
      "step": 780
    },
    {
      "epoch": 1.9363551692021113,
      "eval_loss": 0.1179690733551979,
      "eval_runtime": 38.5796,
      "eval_samples_per_second": 4.406,
      "eval_steps_per_second": 4.406,
      "step": 780
    },
    {
      "epoch": 1.9611921763427507,
      "grad_norm": 0.09501737356185913,
      "learning_rate": 2.712034301107114e-07,
      "loss": 0.1289,
      "step": 790
    },
    {
      "epoch": 1.9860291834833903,
      "grad_norm": 0.09520602226257324,
      "learning_rate": 4.599986085573882e-08,
      "loss": 0.128,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 806,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2428317974302515e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}