{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994683678894205,
  "eval_steps": 500,
  "global_step": 470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002126528442317916,
      "grad_norm": 1.1132217598089287,
      "learning_rate": 4.255319148936171e-06,
      "loss": 1.475,
      "step": 1
    },
    {
      "epoch": 0.01063264221158958,
      "grad_norm": 1.0674803445954786,
      "learning_rate": 2.1276595744680852e-05,
      "loss": 1.3986,
      "step": 5
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 0.27714288812132803,
      "learning_rate": 4.2553191489361704e-05,
      "loss": 1.3198,
      "step": 10
    },
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 0.24323884897635753,
      "learning_rate": 6.382978723404256e-05,
      "loss": 1.3201,
      "step": 15
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 0.17616144014740132,
      "learning_rate": 8.510638297872341e-05,
      "loss": 1.2681,
      "step": 20
    },
    {
      "epoch": 0.0531632110579479,
      "grad_norm": 0.14425961839274853,
      "learning_rate": 0.00010638297872340425,
      "loss": 1.2294,
      "step": 25
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 0.09993563340523522,
      "learning_rate": 0.00012765957446808513,
      "loss": 1.2203,
      "step": 30
    },
    {
      "epoch": 0.07442849548112707,
      "grad_norm": 0.09436627728374515,
      "learning_rate": 0.00014893617021276596,
      "loss": 1.1787,
      "step": 35
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 0.08152779581665957,
      "learning_rate": 0.00017021276595744682,
      "loss": 1.213,
      "step": 40
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 0.08994938392428893,
      "learning_rate": 0.00019148936170212768,
      "loss": 1.1771,
      "step": 45
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 0.08889534543964475,
      "learning_rate": 0.0001999751793267259,
      "loss": 1.1937,
      "step": 50
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 0.0818192481558469,
      "learning_rate": 0.00019982354205260347,
      "loss": 1.1549,
      "step": 55
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 0.08592131915598437,
      "learning_rate": 0.0001995342655949951,
      "loss": 1.1452,
      "step": 60
    },
    {
      "epoch": 0.13822434875066453,
      "grad_norm": 0.08847958460762677,
      "learning_rate": 0.000199107748815478,
      "loss": 1.1381,
      "step": 65
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 0.09175483942286262,
      "learning_rate": 0.000198544579806,
      "loss": 1.1512,
      "step": 70
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 0.07308896698803534,
      "learning_rate": 0.00019784553507800349,
      "loss": 1.1245,
      "step": 75
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 0.07352573225963126,
      "learning_rate": 0.00019701157849175228,
      "loss": 1.1456,
      "step": 80
    },
    {
      "epoch": 0.18075491759702286,
      "grad_norm": 0.08124735026425489,
      "learning_rate": 0.00019604385992733715,
      "loss": 1.138,
      "step": 85
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.07150321514192684,
      "learning_rate": 0.0001949437136991925,
      "loss": 1.1375,
      "step": 90
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 0.07663110018396366,
      "learning_rate": 0.00019371265671631037,
      "loss": 1.123,
      "step": 95
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 0.08486863588170293,
      "learning_rate": 0.00019235238639068856,
      "loss": 1.1357,
      "step": 100
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 0.0762558400001174,
      "learning_rate": 0.00019086477829689685,
      "loss": 1.131,
      "step": 105
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 0.07452030437492205,
      "learning_rate": 0.00018925188358598813,
      "loss": 1.1442,
      "step": 110
    },
    {
      "epoch": 0.24455077086656035,
      "grad_norm": 0.07570723134200702,
      "learning_rate": 0.00018751592615732005,
      "loss": 1.1342,
      "step": 115
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 0.15468624584475818,
      "learning_rate": 0.00018565929959218758,
      "loss": 1.1158,
      "step": 120
    },
    {
      "epoch": 0.2658160552897395,
      "grad_norm": 0.08459758525414239,
      "learning_rate": 0.00018368456385349334,
      "loss": 1.1103,
      "step": 125
    },
    {
      "epoch": 0.27644869750132905,
      "grad_norm": 0.0699573756198723,
      "learning_rate": 0.00018159444175600703,
      "loss": 1.1375,
      "step": 130
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 0.09015868621107682,
      "learning_rate": 0.000179391815212081,
      "loss": 1.1351,
      "step": 135
    },
    {
      "epoch": 0.29771398192450826,
      "grad_norm": 0.07786434215665021,
      "learning_rate": 0.00017707972125799735,
      "loss": 1.1094,
      "step": 140
    },
    {
      "epoch": 0.3083466241360978,
      "grad_norm": 0.0722359719751335,
      "learning_rate": 0.0001746613478664271,
      "loss": 1.109,
      "step": 145
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 0.07007415812680974,
      "learning_rate": 0.00017214002955077393,
      "loss": 1.118,
      "step": 150
    },
    {
      "epoch": 0.32961190855927697,
      "grad_norm": 0.07271324414680513,
      "learning_rate": 0.00016951924276746425,
      "loss": 1.1093,
      "step": 155
    },
    {
      "epoch": 0.34024455077086657,
      "grad_norm": 0.07579503380547685,
      "learning_rate": 0.0001668026011225225,
      "loss": 1.132,
      "step": 160
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.0755893077221133,
      "learning_rate": 0.00016399385038904138,
      "loss": 1.1041,
      "step": 165
    },
    {
      "epoch": 0.3615098351940457,
      "grad_norm": 0.07521138011494473,
      "learning_rate": 0.00016109686334241655,
      "loss": 1.1106,
      "step": 170
    },
    {
      "epoch": 0.3721424774056353,
      "grad_norm": 0.07475367435754098,
      "learning_rate": 0.00015811563442046767,
      "loss": 1.0981,
      "step": 175
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 0.07569742075334658,
      "learning_rate": 0.00015505427421580808,
      "loss": 1.1027,
      "step": 180
    },
    {
      "epoch": 0.3934077618288145,
      "grad_norm": 0.06970712543989657,
      "learning_rate": 0.00015191700380805752,
      "loss": 1.1148,
      "step": 185
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 0.07988874051314146,
      "learning_rate": 0.00014870814894371245,
      "loss": 1.1113,
      "step": 190
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 0.07160684049436854,
      "learning_rate": 0.0001454321340716992,
      "loss": 1.0861,
      "step": 195
    },
    {
      "epoch": 0.4253056884635832,
      "grad_norm": 0.07005079598481417,
      "learning_rate": 0.0001420934762428335,
      "loss": 1.0971,
      "step": 200
    },
    {
      "epoch": 0.4359383306751728,
      "grad_norm": 0.07394470646142667,
      "learning_rate": 0.00013869677888159887,
      "loss": 1.1136,
      "step": 205
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 0.07523040228715787,
      "learning_rate": 0.00013524672543882996,
      "loss": 1.1196,
      "step": 210
    },
    {
      "epoch": 0.45720361509835195,
      "grad_norm": 0.0740459816152623,
      "learning_rate": 0.00013174807293405428,
      "loss": 1.122,
      "step": 215
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 0.07407785053371259,
      "learning_rate": 0.00012820564539639512,
      "loss": 1.1101,
      "step": 220
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 0.07407385022434756,
      "learning_rate": 0.0001246243272130804,
      "loss": 1.1093,
      "step": 225
    },
    {
      "epoch": 0.4891015417331207,
      "grad_norm": 0.0691999860793088,
      "learning_rate": 0.00012100905639472779,
      "loss": 1.0947,
      "step": 230
    },
    {
      "epoch": 0.49973418394471025,
      "grad_norm": 0.07023344891744904,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.1322,
      "step": 235
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 0.07064724436775818,
      "learning_rate": 0.00011369663609586854,
      "loss": 1.1188,
      "step": 240
    },
    {
      "epoch": 0.5209994683678895,
      "grad_norm": 0.07583291773979836,
      "learning_rate": 0.00011000956916240985,
      "loss": 1.0995,
      "step": 245
    },
    {
      "epoch": 0.531632110579479,
      "grad_norm": 0.07152892272662684,
      "learning_rate": 0.00010630870078594249,
      "loss": 1.1236,
      "step": 250
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 0.07101072737185006,
      "learning_rate": 0.0001025991338158651,
      "loss": 1.1311,
      "step": 255
    },
    {
      "epoch": 0.5528973950026581,
      "grad_norm": 0.0702113295280909,
      "learning_rate": 9.888598309541347e-05,
      "loss": 1.1021,
      "step": 260
    },
    {
      "epoch": 0.5635300372142478,
      "grad_norm": 0.07560382456183587,
      "learning_rate": 9.517436840918766e-05,
      "loss": 1.1193,
      "step": 265
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 0.07181091122638743,
      "learning_rate": 9.146940742386553e-05,
      "loss": 1.1171,
      "step": 270
    },
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 0.0719306314196102,
      "learning_rate": 8.777620863183657e-05,
      "loss": 1.0973,
      "step": 275
    },
    {
      "epoch": 0.5954279638490165,
      "grad_norm": 0.06768671936995115,
      "learning_rate": 8.409986430748545e-05,
      "loss": 1.0993,
      "step": 280
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.07166544718736949,
      "learning_rate": 8.044544348583755e-05,
      "loss": 1.1116,
      "step": 285
    },
    {
      "epoch": 0.6166932482721956,
      "grad_norm": 0.07026460560964139,
      "learning_rate": 7.681798497324716e-05,
      "loss": 1.1215,
      "step": 290
    },
    {
      "epoch": 0.6273258904837852,
      "grad_norm": 0.07493637213313761,
      "learning_rate": 7.322249039976608e-05,
      "loss": 1.0951,
      "step": 295
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 0.0735588585321075,
      "learning_rate": 6.966391732277143e-05,
      "loss": 1.1051,
      "step": 300
    },
    {
      "epoch": 0.6485911749069644,
      "grad_norm": 0.07103076233316474,
      "learning_rate": 6.614717239136246e-05,
      "loss": 1.1099,
      "step": 305
    },
    {
      "epoch": 0.6592238171185539,
      "grad_norm": 0.06929087442452384,
      "learning_rate": 6.267710458095053e-05,
      "loss": 1.0891,
      "step": 310
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 0.07048509973214657,
      "learning_rate": 5.9258498507371194e-05,
      "loss": 1.1308,
      "step": 315
    },
    {
      "epoch": 0.6804891015417331,
      "grad_norm": 0.06987687341789249,
      "learning_rate": 5.589606782973683e-05,
      "loss": 1.1288,
      "step": 320
    },
    {
      "epoch": 0.6911217437533227,
      "grad_norm": 0.0726818503990417,
      "learning_rate": 5.259444875112624e-05,
      "loss": 1.1166,
      "step": 325
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.07076508826669288,
      "learning_rate": 4.93581936260724e-05,
      "loss": 1.1094,
      "step": 330
    },
    {
      "epoch": 0.7123870281765019,
      "grad_norm": 0.06894707145543243,
      "learning_rate": 4.6191764683662744e-05,
      "loss": 1.097,
      "step": 335
    },
    {
      "epoch": 0.7230196703880915,
      "grad_norm": 0.07530097462452943,
      "learning_rate": 4.309952787490689e-05,
      "loss": 1.1017,
      "step": 340
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 0.06877541461151461,
      "learning_rate": 4.008574685285442e-05,
      "loss": 1.1034,
      "step": 345
    },
    {
      "epoch": 0.7442849548112705,
      "grad_norm": 0.07024805662579879,
      "learning_rate": 3.7154577093764334e-05,
      "loss": 1.0996,
      "step": 350
    },
    {
      "epoch": 0.7549175970228602,
      "grad_norm": 0.06979054766691796,
      "learning_rate": 3.4310060167430725e-05,
      "loss": 1.0993,
      "step": 355
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 0.07127452889238414,
      "learning_rate": 3.155611816456586e-05,
      "loss": 1.1108,
      "step": 360
    },
    {
      "epoch": 0.7761828814460393,
      "grad_norm": 0.06929411714148895,
      "learning_rate": 2.889654828892393e-05,
      "loss": 1.1014,
      "step": 365
    },
    {
      "epoch": 0.786815523657629,
      "grad_norm": 0.06986765320557396,
      "learning_rate": 2.6335017621622116e-05,
      "loss": 1.1178,
      "step": 370
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 0.14171152314730037,
      "learning_rate": 2.3875058064877807e-05,
      "loss": 1.1056,
      "step": 375
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.07523630706911914,
      "learning_rate": 2.1520061472133902e-05,
      "loss": 1.1196,
      "step": 380
    },
    {
      "epoch": 0.8187134502923976,
      "grad_norm": 0.06927631258661882,
      "learning_rate": 1.927327497128706e-05,
      "loss": 1.1165,
      "step": 385
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 0.07022924736143368,
      "learning_rate": 1.7137796487466797e-05,
      "loss": 1.102,
      "step": 390
    },
    {
      "epoch": 0.8399787347155768,
      "grad_norm": 0.06841887685146186,
      "learning_rate": 1.5116570471539293e-05,
      "loss": 1.1,
      "step": 395
    },
    {
      "epoch": 0.8506113769271664,
      "grad_norm": 0.0680433251960951,
      "learning_rate": 1.3212383840225329e-05,
      "loss": 1.1007,
      "step": 400
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 0.06849436910046028,
      "learning_rate": 1.1427862133430156e-05,
      "loss": 1.0919,
      "step": 405
    },
    {
      "epoch": 0.8718766613503456,
      "grad_norm": 0.06705533678821948,
      "learning_rate": 9.765465894083636e-06,
      "loss": 1.1018,
      "step": 410
    },
    {
      "epoch": 0.8825093035619351,
      "grad_norm": 0.06997290719332996,
      "learning_rate": 8.227487275482592e-06,
      "loss": 1.1141,
      "step": 415
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 0.06955661856691482,
      "learning_rate": 6.81604688081271e-06,
      "loss": 1.1179,
      "step": 420
    },
    {
      "epoch": 0.9037745879851143,
      "grad_norm": 0.06939654515531442,
      "learning_rate": 5.533090839208133e-06,
      "loss": 1.0859,
      "step": 425
    },
    {
      "epoch": 0.9144072301967039,
      "grad_norm": 0.06890766733706018,
      "learning_rate": 4.380388122380141e-06,
      "loss": 1.1041,
      "step": 430
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 0.06774543961855677,
      "learning_rate": 3.359528105515064e-06,
      "loss": 1.1054,
      "step": 435
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.06656651460274651,
      "learning_rate": 2.471918375804105e-06,
      "loss": 1.1018,
      "step": 440
    },
    {
      "epoch": 0.9463051568314726,
      "grad_norm": 0.06852233232210171,
      "learning_rate": 1.7187827916271382e-06,
      "loss": 1.1129,
      "step": 445
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 0.06708991081678066,
      "learning_rate": 1.1011597950663865e-06,
      "loss": 1.1046,
      "step": 450
    },
    {
      "epoch": 0.9675704412546517,
      "grad_norm": 0.06813589171568658,
      "learning_rate": 6.199009800765265e-07,
      "loss": 1.0986,
      "step": 455
    },
    {
      "epoch": 0.9782030834662414,
      "grad_norm": 0.0700679858970667,
      "learning_rate": 2.756699182858369e-07,
      "loss": 1.111,
      "step": 460
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 0.07404495795184537,
      "learning_rate": 6.894124404711599e-08,
      "loss": 1.1039,
      "step": 465
    },
    {
      "epoch": 0.9994683678894205,
      "grad_norm": 0.07042216599583477,
      "learning_rate": 0.0,
      "loss": 1.1192,
      "step": 470
    },
    {
      "epoch": 0.9994683678894205,
      "eval_runtime": 5.9798,
      "eval_samples_per_second": 1.672,
      "eval_steps_per_second": 0.502,
      "step": 470
    },
    {
      "epoch": 0.9994683678894205,
      "step": 470,
      "total_flos": 3.0618166709610086e+17,
      "train_loss": 1.1288761854171754,
      "train_runtime": 18761.7867,
      "train_samples_per_second": 3.207,
      "train_steps_per_second": 0.025
    }
  ],
  "logging_steps": 5,
  "max_steps": 470,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0618166709610086e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}