| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9785780813411984, | |
| "eval_steps": 30, | |
| "global_step": 1200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.024837007140639553, | |
| "grad_norm": 8.359491348266602, | |
| "learning_rate": 1.487603305785124e-05, | |
| "loss": 2.6721, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04967401428127911, | |
| "grad_norm": 0.796862006187439, | |
| "learning_rate": 3.1404958677685955e-05, | |
| "loss": 1.0523, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07451102142191866, | |
| "grad_norm": 0.9021220803260803, | |
| "learning_rate": 4.793388429752066e-05, | |
| "loss": 0.6256, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07451102142191866, | |
| "eval_loss": 0.5602371096611023, | |
| "eval_runtime": 38.7856, | |
| "eval_samples_per_second": 4.383, | |
| "eval_steps_per_second": 4.383, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09934802856255821, | |
| "grad_norm": 0.5970816612243652, | |
| "learning_rate": 6.446280991735537e-05, | |
| "loss": 0.4598, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12418503570319776, | |
| "grad_norm": 0.6478993892669678, | |
| "learning_rate": 8.099173553719009e-05, | |
| "loss": 0.3539, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.14902204284383733, | |
| "grad_norm": 0.4981272518634796, | |
| "learning_rate": 9.75206611570248e-05, | |
| "loss": 0.2828, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14902204284383733, | |
| "eval_loss": 0.28084230422973633, | |
| "eval_runtime": 38.8471, | |
| "eval_samples_per_second": 4.376, | |
| "eval_steps_per_second": 4.376, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17385904998447688, | |
| "grad_norm": 0.5028337836265564, | |
| "learning_rate": 0.0001140495867768595, | |
| "loss": 0.2523, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.19869605712511643, | |
| "grad_norm": 0.7332940697669983, | |
| "learning_rate": 0.00013057851239669423, | |
| "loss": 0.2251, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.22353306426575598, | |
| "grad_norm": 0.43022558093070984, | |
| "learning_rate": 0.00014710743801652894, | |
| "loss": 0.1988, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22353306426575598, | |
| "eval_loss": 0.19827328622341156, | |
| "eval_runtime": 38.898, | |
| "eval_samples_per_second": 4.37, | |
| "eval_steps_per_second": 4.37, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.24837007140639553, | |
| "grad_norm": 0.33378636837005615, | |
| "learning_rate": 0.00016363636363636366, | |
| "loss": 0.1873, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2732070785470351, | |
| "grad_norm": 0.6605438590049744, | |
| "learning_rate": 0.00018016528925619835, | |
| "loss": 0.1663, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.29804408568767465, | |
| "grad_norm": 0.26731953024864197, | |
| "learning_rate": 0.0001966942148760331, | |
| "loss": 0.1704, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.29804408568767465, | |
| "eval_loss": 0.16433002054691315, | |
| "eval_runtime": 38.9658, | |
| "eval_samples_per_second": 4.363, | |
| "eval_steps_per_second": 4.363, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3228810928283142, | |
| "grad_norm": 0.33725589513778687, | |
| "learning_rate": 0.00019997332081116373, | |
| "loss": 0.166, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.34771809996895375, | |
| "grad_norm": 0.41751629114151, | |
| "learning_rate": 0.00019986496100395275, | |
| "loss": 0.1693, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3725551071095933, | |
| "grad_norm": 0.34873735904693604, | |
| "learning_rate": 0.000199673343399533, | |
| "loss": 0.1571, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3725551071095933, | |
| "eval_loss": 0.15006589889526367, | |
| "eval_runtime": 38.8567, | |
| "eval_samples_per_second": 4.375, | |
| "eval_steps_per_second": 4.375, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.39739211425023285, | |
| "grad_norm": 0.31080615520477295, | |
| "learning_rate": 0.00019939862775022893, | |
| "loss": 0.1608, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4222291213908724, | |
| "grad_norm": 0.18451546132564545, | |
| "learning_rate": 0.0001990410430875205, | |
| "loss": 0.1477, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.44706612853151195, | |
| "grad_norm": 0.21936577558517456, | |
| "learning_rate": 0.00019860088753109896, | |
| "loss": 0.1582, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.44706612853151195, | |
| "eval_loss": 0.14357255399227142, | |
| "eval_runtime": 38.8454, | |
| "eval_samples_per_second": 4.376, | |
| "eval_steps_per_second": 4.376, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.47190313567215153, | |
| "grad_norm": 0.16537490487098694, | |
| "learning_rate": 0.00019807852804032305, | |
| "loss": 0.1474, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.49674014281279105, | |
| "grad_norm": 0.21974419057369232, | |
| "learning_rate": 0.00019747440010828383, | |
| "loss": 0.1475, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5215771499534306, | |
| "grad_norm": 0.26473143696784973, | |
| "learning_rate": 0.00019678900739873226, | |
| "loss": 0.1443, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5215771499534306, | |
| "eval_loss": 0.13610073924064636, | |
| "eval_runtime": 38.8034, | |
| "eval_samples_per_second": 4.381, | |
| "eval_steps_per_second": 4.381, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5464141570940702, | |
| "grad_norm": 0.24532222747802734, | |
| "learning_rate": 0.000196022921326173, | |
| "loss": 0.1418, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5712511642347097, | |
| "grad_norm": 0.189656063914299, | |
| "learning_rate": 0.00019517678057947384, | |
| "loss": 0.1463, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5960881713753493, | |
| "grad_norm": 0.16695067286491394, | |
| "learning_rate": 0.00019425129058938832, | |
| "loss": 0.1518, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5960881713753493, | |
| "eval_loss": 0.12985987961292267, | |
| "eval_runtime": 38.9149, | |
| "eval_samples_per_second": 4.369, | |
| "eval_steps_per_second": 4.369, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6209251785159888, | |
| "grad_norm": 0.20477107167243958, | |
| "learning_rate": 0.00019324722294043558, | |
| "loss": 0.1365, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6457621856566284, | |
| "grad_norm": 0.14411672949790955, | |
| "learning_rate": 0.00019216541472762735, | |
| "loss": 0.1396, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.670599192797268, | |
| "grad_norm": 0.12490475922822952, | |
| "learning_rate": 0.0001910067678585786, | |
| "loss": 0.1462, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.670599192797268, | |
| "eval_loss": 0.13094556331634521, | |
| "eval_runtime": 38.8929, | |
| "eval_samples_per_second": 4.371, | |
| "eval_steps_per_second": 4.371, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6954361999379075, | |
| "grad_norm": 0.2038726955652237, | |
| "learning_rate": 0.0001897722483015838, | |
| "loss": 0.1331, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.720273207078547, | |
| "grad_norm": 0.1514689326286316, | |
| "learning_rate": 0.00018846288528028555, | |
| "loss": 0.138, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7451102142191866, | |
| "grad_norm": 0.22410112619400024, | |
| "learning_rate": 0.0001870797704156067, | |
| "loss": 0.1478, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7451102142191866, | |
| "eval_loss": 0.1277894824743271, | |
| "eval_runtime": 38.922, | |
| "eval_samples_per_second": 4.368, | |
| "eval_steps_per_second": 4.368, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7699472213598262, | |
| "grad_norm": 0.1832033395767212, | |
| "learning_rate": 0.00018562405681566216, | |
| "loss": 0.1431, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7947842285004657, | |
| "grad_norm": 0.11910755932331085, | |
| "learning_rate": 0.00018409695811440796, | |
| "loss": 0.1427, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8196212356411052, | |
| "grad_norm": 0.13471344113349915, | |
| "learning_rate": 0.00018249974745983023, | |
| "loss": 0.1471, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8196212356411052, | |
| "eval_loss": 0.12796619534492493, | |
| "eval_runtime": 38.9144, | |
| "eval_samples_per_second": 4.369, | |
| "eval_steps_per_second": 4.369, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8444582427817448, | |
| "grad_norm": 0.1409328728914261, | |
| "learning_rate": 0.00018083375645251684, | |
| "loss": 0.1413, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8692952499223844, | |
| "grad_norm": 0.26645776629447937, | |
| "learning_rate": 0.00017910037403549693, | |
| "loss": 0.1383, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8941322570630239, | |
| "grad_norm": 0.21028681099414825, | |
| "learning_rate": 0.0001773010453362737, | |
| "loss": 0.138, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8941322570630239, | |
| "eval_loss": 0.12485021352767944, | |
| "eval_runtime": 39.0821, | |
| "eval_samples_per_second": 4.35, | |
| "eval_steps_per_second": 4.35, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9189692642036634, | |
| "grad_norm": 0.13296136260032654, | |
| "learning_rate": 0.0001754372704620164, | |
| "loss": 0.1494, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9438062713443031, | |
| "grad_norm": 0.16724663972854614, | |
| "learning_rate": 0.00017351060324891502, | |
| "loss": 0.1391, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9686432784849426, | |
| "grad_norm": 0.11043282598257065, | |
| "learning_rate": 0.00017152264996674136, | |
| "loss": 0.1425, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9686432784849426, | |
| "eval_loss": 0.127302885055542, | |
| "eval_runtime": 38.9, | |
| "eval_samples_per_second": 4.37, | |
| "eval_steps_per_second": 4.37, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9934802856255821, | |
| "grad_norm": 0.28038299083709717, | |
| "learning_rate": 0.00016947506797969562, | |
| "loss": 0.1323, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0173859049984477, | |
| "grad_norm": 0.13846513628959656, | |
| "learning_rate": 0.00016736956436465573, | |
| "loss": 0.1375, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.0422229121390871, | |
| "grad_norm": 0.13630150258541107, | |
| "learning_rate": 0.00016520789448798087, | |
| "loss": 0.138, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0422229121390871, | |
| "eval_loss": 0.12523552775382996, | |
| "eval_runtime": 38.8684, | |
| "eval_samples_per_second": 4.374, | |
| "eval_steps_per_second": 4.374, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0670599192797268, | |
| "grad_norm": 0.6552147269248962, | |
| "learning_rate": 0.00016299186054205577, | |
| "loss": 0.1347, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0918969264203664, | |
| "grad_norm": 0.14625756442546844, | |
| "learning_rate": 0.00016072331004279614, | |
| "loss": 0.1388, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1167339335610058, | |
| "grad_norm": 0.29366788268089294, | |
| "learning_rate": 0.00015840413428936767, | |
| "loss": 0.1237, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1167339335610058, | |
| "eval_loss": 0.1281893253326416, | |
| "eval_runtime": 38.858, | |
| "eval_samples_per_second": 4.375, | |
| "eval_steps_per_second": 4.375, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1415709407016454, | |
| "grad_norm": 0.4872620105743408, | |
| "learning_rate": 0.00015603626678740263, | |
| "loss": 0.1364, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.166407947842285, | |
| "grad_norm": 0.14464133977890015, | |
| "learning_rate": 0.000153621681637029, | |
| "loss": 0.1354, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.1912449549829245, | |
| "grad_norm": 0.11566495150327682, | |
| "learning_rate": 0.00015116239188705556, | |
| "loss": 0.1337, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.1912449549829245, | |
| "eval_loss": 0.1254083812236786, | |
| "eval_runtime": 39.142, | |
| "eval_samples_per_second": 4.343, | |
| "eval_steps_per_second": 4.343, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.2160819621235641, | |
| "grad_norm": 0.14898011088371277, | |
| "learning_rate": 0.00014866044785668563, | |
| "loss": 0.1383, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2409189692642038, | |
| "grad_norm": 0.09731869399547577, | |
| "learning_rate": 0.00014611793542615803, | |
| "loss": 0.1353, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2657559764048432, | |
| "grad_norm": 0.088087297976017, | |
| "learning_rate": 0.00014353697429774084, | |
| "loss": 0.1277, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2657559764048432, | |
| "eval_loss": 0.1225721463561058, | |
| "eval_runtime": 38.987, | |
| "eval_samples_per_second": 4.36, | |
| "eval_steps_per_second": 4.36, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2905929835454828, | |
| "grad_norm": 0.0956592932343483, | |
| "learning_rate": 0.0001409197162285275, | |
| "loss": 0.1334, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3154299906861224, | |
| "grad_norm": 0.09264901280403137, | |
| "learning_rate": 0.000138268343236509, | |
| "loss": 0.1355, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.3402669978267618, | |
| "grad_norm": 0.0848374217748642, | |
| "learning_rate": 0.00013558506578141682, | |
| "loss": 0.1331, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3402669978267618, | |
| "eval_loss": 0.12285340577363968, | |
| "eval_runtime": 39.0259, | |
| "eval_samples_per_second": 4.356, | |
| "eval_steps_per_second": 4.356, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3651040049674015, | |
| "grad_norm": 0.13393981754779816, | |
| "learning_rate": 0.00013287212092185464, | |
| "loss": 0.1293, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.389941012108041, | |
| "grad_norm": 0.1015724316239357, | |
| "learning_rate": 0.00013013177045025374, | |
| "loss": 0.1336, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.4147780192486805, | |
| "grad_norm": 0.09620890021324158, | |
| "learning_rate": 0.0001273662990072083, | |
| "loss": 0.1327, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4147780192486805, | |
| "eval_loss": 0.12293345481157303, | |
| "eval_runtime": 39.0058, | |
| "eval_samples_per_second": 4.358, | |
| "eval_steps_per_second": 4.358, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4396150263893202, | |
| "grad_norm": 0.10039085894823074, | |
| "learning_rate": 0.00012457801217676182, | |
| "loss": 0.1358, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.4644520335299596, | |
| "grad_norm": 0.0853387713432312, | |
| "learning_rate": 0.00012176923456423284, | |
| "loss": 0.1294, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.4892890406705992, | |
| "grad_norm": 0.10647860914468765, | |
| "learning_rate": 0.00011894230785818284, | |
| "loss": 0.1344, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.4892890406705992, | |
| "eval_loss": 0.12142268568277359, | |
| "eval_runtime": 38.984, | |
| "eval_samples_per_second": 4.361, | |
| "eval_steps_per_second": 4.361, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5141260478112386, | |
| "grad_norm": 0.10750167816877365, | |
| "learning_rate": 0.00011609958887814129, | |
| "loss": 0.1328, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.5389630549518785, | |
| "grad_norm": 0.10130150616168976, | |
| "learning_rate": 0.00011324344760971671, | |
| "loss": 0.1305, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.5638000620925179, | |
| "grad_norm": 0.08659154921770096, | |
| "learning_rate": 0.00011037626522873019, | |
| "loss": 0.1329, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.5638000620925179, | |
| "eval_loss": 0.12055275589227676, | |
| "eval_runtime": 38.9483, | |
| "eval_samples_per_second": 4.365, | |
| "eval_steps_per_second": 4.365, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.5886370692331573, | |
| "grad_norm": 0.10068133473396301, | |
| "learning_rate": 0.00010750043211602045, | |
| "loss": 0.1332, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.613474076373797, | |
| "grad_norm": 0.10626350343227386, | |
| "learning_rate": 0.00010461834586457398, | |
| "loss": 0.1265, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.6383110835144366, | |
| "grad_norm": 0.08024556934833527, | |
| "learning_rate": 0.00010173240928064285, | |
| "loss": 0.1188, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.6383110835144366, | |
| "eval_loss": 0.12125765532255173, | |
| "eval_runtime": 39.0777, | |
| "eval_samples_per_second": 4.35, | |
| "eval_steps_per_second": 4.35, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.663148090655076, | |
| "grad_norm": 0.08063840121030807, | |
| "learning_rate": 9.884502838051595e-05, | |
| "loss": 0.1325, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.6879850977957156, | |
| "grad_norm": 0.0849422737956047, | |
| "learning_rate": 9.595861038461398e-05, | |
| "loss": 0.1322, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.7128221049363552, | |
| "grad_norm": 0.09707438945770264, | |
| "learning_rate": 9.307556171058085e-05, | |
| "loss": 0.1303, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7128221049363552, | |
| "eval_loss": 0.119967520236969, | |
| "eval_runtime": 38.9279, | |
| "eval_samples_per_second": 4.367, | |
| "eval_steps_per_second": 4.367, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7376591120769946, | |
| "grad_norm": 0.06874745339155197, | |
| "learning_rate": 9.019828596704394e-05, | |
| "loss": 0.1323, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.7624961192176343, | |
| "grad_norm": 0.09782899171113968, | |
| "learning_rate": 8.732918194971664e-05, | |
| "loss": 0.1269, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.787333126358274, | |
| "grad_norm": 0.08459767699241638, | |
| "learning_rate": 8.447064164151304e-05, | |
| "loss": 0.1383, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.787333126358274, | |
| "eval_loss": 0.11947113275527954, | |
| "eval_runtime": 39.0046, | |
| "eval_samples_per_second": 4.358, | |
| "eval_steps_per_second": 4.358, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.8121701334989133, | |
| "grad_norm": 0.10213370621204376, | |
| "learning_rate": 8.162504821834295e-05, | |
| "loss": 0.1306, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.837007140639553, | |
| "grad_norm": 0.08585705608129501, | |
| "learning_rate": 7.879477406224894e-05, | |
| "loss": 0.1316, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.8618441477801926, | |
| "grad_norm": 0.08754217624664307, | |
| "learning_rate": 7.598217878354237e-05, | |
| "loss": 0.1314, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8618441477801926, | |
| "eval_loss": 0.11994090676307678, | |
| "eval_runtime": 39.0279, | |
| "eval_samples_per_second": 4.356, | |
| "eval_steps_per_second": 4.356, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.886681154920832, | |
| "grad_norm": 0.09176009893417358, | |
| "learning_rate": 7.318960725358741e-05, | |
| "loss": 0.1272, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.9115181620614716, | |
| "grad_norm": 0.07954169064760208, | |
| "learning_rate": 7.041938764987297e-05, | |
| "loss": 0.1254, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.9363551692021113, | |
| "grad_norm": 0.10337759554386139, | |
| "learning_rate": 6.767382951500204e-05, | |
| "loss": 0.1324, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9363551692021113, | |
| "eval_loss": 0.12009570002555847, | |
| "eval_runtime": 39.0135, | |
| "eval_samples_per_second": 4.357, | |
| "eval_steps_per_second": 4.357, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9611921763427507, | |
| "grad_norm": 0.0923035591840744, | |
| "learning_rate": 6.495522183121741e-05, | |
| "loss": 0.1312, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.9860291834833903, | |
| "grad_norm": 0.08417502790689468, | |
| "learning_rate": 6.226583111206856e-05, | |
| "loss": 0.1308, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.009934802856256, | |
| "grad_norm": 0.0744447335600853, | |
| "learning_rate": 5.960789951281052e-05, | |
| "loss": 0.1226, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.009934802856256, | |
| "eval_loss": 0.11844488978385925, | |
| "eval_runtime": 39.1559, | |
| "eval_samples_per_second": 4.342, | |
| "eval_steps_per_second": 4.342, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.0347718099968954, | |
| "grad_norm": 0.07120949774980545, | |
| "learning_rate": 5.698364296111056e-05, | |
| "loss": 0.1307, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.059608817137535, | |
| "grad_norm": 0.10359616577625275, | |
| "learning_rate": 5.43952493096211e-05, | |
| "loss": 0.1235, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.0844458242781743, | |
| "grad_norm": 0.07456395030021667, | |
| "learning_rate": 5.184487651195825e-05, | |
| "loss": 0.1251, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.0844458242781743, | |
| "eval_loss": 0.11879772692918777, | |
| "eval_runtime": 39.037, | |
| "eval_samples_per_second": 4.355, | |
| "eval_steps_per_second": 4.355, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.109282831418814, | |
| "grad_norm": 0.08009591698646545, | |
| "learning_rate": 4.933465082360807e-05, | |
| "loss": 0.1349, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.1341198385594535, | |
| "grad_norm": 0.09120004624128342, | |
| "learning_rate": 4.686666502925908e-05, | |
| "loss": 0.1268, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.158956845700093, | |
| "grad_norm": 0.06066734343767166, | |
| "learning_rate": 4.444297669803981e-05, | |
| "loss": 0.1259, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.158956845700093, | |
| "eval_loss": 0.11877793818712234, | |
| "eval_runtime": 38.9985, | |
| "eval_samples_per_second": 4.359, | |
| "eval_steps_per_second": 4.359, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.183793852840733, | |
| "grad_norm": 0.09893694519996643, | |
| "learning_rate": 4.206560646811545e-05, | |
| "loss": 0.1295, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.208630859981372, | |
| "grad_norm": 0.0868426039814949, | |
| "learning_rate": 3.973653636207437e-05, | |
| "loss": 0.1193, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.2334678671220116, | |
| "grad_norm": 0.0975928083062172, | |
| "learning_rate": 3.745770813450824e-05, | |
| "loss": 0.1282, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.2334678671220116, | |
| "eval_loss": 0.1184425801038742, | |
| "eval_runtime": 38.8574, | |
| "eval_samples_per_second": 4.375, | |
| "eval_steps_per_second": 4.375, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.2583048742626515, | |
| "grad_norm": 0.07092616707086563, | |
| "learning_rate": 3.523102165316381e-05, | |
| "loss": 0.1212, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.283141881403291, | |
| "grad_norm": 0.0809750035405159, | |
| "learning_rate": 3.3058333315016065e-05, | |
| "loss": 0.1252, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.3079788885439303, | |
| "grad_norm": 0.10307040810585022, | |
| "learning_rate": 3.094145449858285e-05, | |
| "loss": 0.1263, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.3079788885439303, | |
| "eval_loss": 0.11812498420476913, | |
| "eval_runtime": 38.9098, | |
| "eval_samples_per_second": 4.369, | |
| "eval_steps_per_second": 4.369, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.33281589568457, | |
| "grad_norm": 0.087245412170887, | |
| "learning_rate": 2.8882150053771995e-05, | |
| "loss": 0.1261, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.3576529028252096, | |
| "grad_norm": 0.08954016864299774, | |
| "learning_rate": 2.688213683051892e-05, | |
| "loss": 0.13, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.382489909965849, | |
| "grad_norm": 0.08771616220474243, | |
| "learning_rate": 2.4943082247442585e-05, | |
| "loss": 0.1232, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.382489909965849, | |
| "eval_loss": 0.1186952143907547, | |
| "eval_runtime": 38.9954, | |
| "eval_samples_per_second": 4.359, | |
| "eval_steps_per_second": 4.359, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.407326917106489, | |
| "grad_norm": 0.09044978022575378, | |
| "learning_rate": 2.3066602901712108e-05, | |
| "loss": 0.1269, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.4321639242471282, | |
| "grad_norm": 0.07978509366512299, | |
| "learning_rate": 2.1254263221283654e-05, | |
| "loss": 0.1234, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.4570009313877677, | |
| "grad_norm": 0.09153130650520325, | |
| "learning_rate": 1.950757416063077e-05, | |
| "loss": 0.1219, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.4570009313877677, | |
| "eval_loss": 0.11810684204101562, | |
| "eval_runtime": 39.0556, | |
| "eval_samples_per_second": 4.353, | |
| "eval_steps_per_second": 4.353, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.4818379385284075, | |
| "grad_norm": 0.09865284711122513, | |
| "learning_rate": 1.7827991941056177e-05, | |
| "loss": 0.1196, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.506674945669047, | |
| "grad_norm": 0.08960665762424469, | |
| "learning_rate": 1.621691683663418e-05, | |
| "loss": 0.1279, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.5315119528096863, | |
| "grad_norm": 0.09712927043437958, | |
| "learning_rate": 1.4675692006797137e-05, | |
| "loss": 0.125, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.5315119528096863, | |
| "eval_loss": 0.11813132464885712, | |
| "eval_runtime": 38.9216, | |
| "eval_samples_per_second": 4.368, | |
| "eval_steps_per_second": 4.368, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.556348959950326, | |
| "grad_norm": 0.10061313211917877, | |
| "learning_rate": 1.3205602376538163e-05, | |
| "loss": 0.128, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.5811859670909656, | |
| "grad_norm": 0.07715890556573868, | |
| "learning_rate": 1.1807873565164506e-05, | |
| "loss": 0.1231, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.606022974231605, | |
| "grad_norm": 0.09734167903661728, | |
| "learning_rate": 1.0483670864493778e-05, | |
| "loss": 0.133, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.606022974231605, | |
| "eval_loss": 0.11806084215641022, | |
| "eval_runtime": 39.1009, | |
| "eval_samples_per_second": 4.348, | |
| "eval_steps_per_second": 4.348, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.630859981372245, | |
| "grad_norm": 0.07243157923221588, | |
| "learning_rate": 9.234098267345958e-06, | |
| "loss": 0.1278, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.6556969885128843, | |
| "grad_norm": 0.12202966213226318, | |
| "learning_rate": 8.060197547140347e-06, | |
| "loss": 0.1228, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.6805339956535237, | |
| "grad_norm": 0.07589470595121384, | |
| "learning_rate": 6.962947389365071e-06, | |
| "loss": 0.1225, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.6805339956535237, | |
| "eval_loss": 0.11801353096961975, | |
| "eval_runtime": 39.1137, | |
| "eval_samples_per_second": 4.346, | |
| "eval_steps_per_second": 4.346, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.7053710027941635, | |
| "grad_norm": 0.11445856839418411, | |
| "learning_rate": 5.943262575643238e-06, | |
| "loss": 0.135, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.730208009934803, | |
| "grad_norm": 0.09918007999658585, | |
| "learning_rate": 5.001993221076162e-06, | |
| "loss": 0.1312, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.7550450170754424, | |
| "grad_norm": 0.08956551551818848, | |
| "learning_rate": 4.139924065499035e-06, | |
| "loss": 0.1188, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.7550450170754424, | |
| "eval_loss": 0.11794496327638626, | |
| "eval_runtime": 39.0409, | |
| "eval_samples_per_second": 4.354, | |
| "eval_steps_per_second": 4.354, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.779882024216082, | |
| "grad_norm": 0.09554298222064972, | |
| "learning_rate": 3.3577738192404395e-06, | |
| "loss": 0.124, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.8047190313567216, | |
| "grad_norm": 0.09960418939590454, | |
| "learning_rate": 2.656194563930714e-06, | |
| "loss": 0.1295, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.829556038497361, | |
| "grad_norm": 0.11168912798166275, | |
| "learning_rate": 2.035771208859194e-06, | |
| "loss": 0.1243, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.829556038497361, | |
| "eval_loss": 0.11804591119289398, | |
| "eval_runtime": 39.2098, | |
| "eval_samples_per_second": 4.336, | |
| "eval_steps_per_second": 4.336, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.854393045638001, | |
| "grad_norm": 0.08320163935422897, | |
| "learning_rate": 1.49702100333291e-06, | |
| "loss": 0.1238, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.8792300527786403, | |
| "grad_norm": 0.11576654016971588, | |
| "learning_rate": 1.0403931054440374e-06, | |
| "loss": 0.1206, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.9040670599192797, | |
| "grad_norm": 0.09662230312824249, | |
| "learning_rate": 6.662682076050031e-07, | |
| "loss": 0.1205, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.9040670599192797, | |
| "eval_loss": 0.11792467534542084, | |
| "eval_runtime": 39.0045, | |
| "eval_samples_per_second": 4.358, | |
| "eval_steps_per_second": 4.358, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.928904067059919, | |
| "grad_norm": 0.09648039191961288, | |
| "learning_rate": 3.7495821916382344e-07, | |
| "loss": 0.129, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.953741074200559, | |
| "grad_norm": 0.08923009783029556, | |
| "learning_rate": 1.6670600636403687e-07, | |
| "loss": 0.1237, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.9785780813411984, | |
| "grad_norm": 0.08443022519350052, | |
| "learning_rate": 4.168518986628067e-08, | |
| "loss": 0.1196, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.9785780813411984, | |
| "eval_loss": 0.11797202378511429, | |
| "eval_runtime": 39.051, | |
| "eval_samples_per_second": 4.353, | |
| "eval_steps_per_second": 4.353, | |
| "step": 1200 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1209, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.8653749005815194e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |