{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994683678894205, "eval_steps": 500, "global_step": 470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002126528442317916, "grad_norm": 1.1132217598089287, "learning_rate": 4.255319148936171e-06, "loss": 1.475, "step": 1 }, { "epoch": 0.01063264221158958, "grad_norm": 1.0674803445954786, "learning_rate": 2.1276595744680852e-05, "loss": 1.3986, "step": 5 }, { "epoch": 0.02126528442317916, "grad_norm": 0.27714288812132803, "learning_rate": 4.2553191489361704e-05, "loss": 1.3198, "step": 10 }, { "epoch": 0.03189792663476874, "grad_norm": 0.24323884897635753, "learning_rate": 6.382978723404256e-05, "loss": 1.3201, "step": 15 }, { "epoch": 0.04253056884635832, "grad_norm": 0.17616144014740132, "learning_rate": 8.510638297872341e-05, "loss": 1.2681, "step": 20 }, { "epoch": 0.0531632110579479, "grad_norm": 0.14425961839274853, "learning_rate": 0.00010638297872340425, "loss": 1.2294, "step": 25 }, { "epoch": 0.06379585326953748, "grad_norm": 0.09993563340523522, "learning_rate": 0.00012765957446808513, "loss": 1.2203, "step": 30 }, { "epoch": 0.07442849548112707, "grad_norm": 0.09436627728374515, "learning_rate": 0.00014893617021276596, "loss": 1.1787, "step": 35 }, { "epoch": 0.08506113769271664, "grad_norm": 0.08152779581665957, "learning_rate": 0.00017021276595744682, "loss": 1.213, "step": 40 }, { "epoch": 0.09569377990430622, "grad_norm": 0.08994938392428893, "learning_rate": 0.00019148936170212768, "loss": 1.1771, "step": 45 }, { "epoch": 0.1063264221158958, "grad_norm": 0.08889534543964475, "learning_rate": 0.0001999751793267259, "loss": 1.1937, "step": 50 }, { "epoch": 0.11695906432748537, "grad_norm": 0.0818192481558469, "learning_rate": 0.00019982354205260347, "loss": 1.1549, "step": 55 }, { "epoch": 0.12759170653907495, "grad_norm": 0.08592131915598437, "learning_rate": 0.0001995342655949951, "loss": 1.1452, "step": 60 }, { "epoch": 0.13822434875066453, "grad_norm": 0.08847958460762677, "learning_rate": 0.000199107748815478, "loss": 1.1381, "step": 65 }, { "epoch": 0.14885699096225413, "grad_norm": 0.09175483942286262, "learning_rate": 0.000198544579806, "loss": 1.1512, "step": 70 }, { "epoch": 0.1594896331738437, "grad_norm": 0.07308896698803534, "learning_rate": 0.00019784553507800349, "loss": 1.1245, "step": 75 }, { "epoch": 0.17012227538543329, "grad_norm": 0.07352573225963126, "learning_rate": 0.00019701157849175228, "loss": 1.1456, "step": 80 }, { "epoch": 0.18075491759702286, "grad_norm": 0.08124735026425489, "learning_rate": 0.00019604385992733715, "loss": 1.138, "step": 85 }, { "epoch": 0.19138755980861244, "grad_norm": 0.07150321514192684, "learning_rate": 0.0001949437136991925, "loss": 1.1375, "step": 90 }, { "epoch": 0.20202020202020202, "grad_norm": 0.07663110018396366, "learning_rate": 0.00019371265671631037, "loss": 1.123, "step": 95 }, { "epoch": 0.2126528442317916, "grad_norm": 0.08486863588170293, "learning_rate": 0.00019235238639068856, "loss": 1.1357, "step": 100 }, { "epoch": 0.22328548644338117, "grad_norm": 0.0762558400001174, "learning_rate": 0.00019086477829689685, "loss": 1.131, "step": 105 }, { "epoch": 0.23391812865497075, "grad_norm": 0.07452030437492205, "learning_rate": 0.00018925188358598813, "loss": 1.1442, "step": 110 }, { "epoch": 0.24455077086656035, "grad_norm": 0.07570723134200702, "learning_rate": 0.00018751592615732005, "loss": 1.1342, "step": 115 }, { "epoch": 0.2551834130781499, "grad_norm": 0.15468624584475818, "learning_rate": 0.00018565929959218758, "loss": 1.1158, "step": 120 }, { "epoch": 0.2658160552897395, "grad_norm": 0.08459758525414239, "learning_rate": 0.00018368456385349334, "loss": 1.1103, "step": 125 }, { "epoch": 0.27644869750132905, "grad_norm": 0.0699573756198723, "learning_rate": 0.00018159444175600703, "loss": 1.1375, "step": 130 }, { "epoch": 0.28708133971291866, "grad_norm": 0.09015868621107682, "learning_rate": 0.000179391815212081, "loss": 1.1351, "step": 135 }, { "epoch": 0.29771398192450826, "grad_norm": 0.07786434215665021, "learning_rate": 0.00017707972125799735, "loss": 1.1094, "step": 140 }, { "epoch": 0.3083466241360978, "grad_norm": 0.0722359719751335, "learning_rate": 0.0001746613478664271, "loss": 1.109, "step": 145 }, { "epoch": 0.3189792663476874, "grad_norm": 0.07007415812680974, "learning_rate": 0.00017214002955077393, "loss": 1.118, "step": 150 }, { "epoch": 0.32961190855927697, "grad_norm": 0.07271324414680513, "learning_rate": 0.00016951924276746425, "loss": 1.1093, "step": 155 }, { "epoch": 0.34024455077086657, "grad_norm": 0.07579503380547685, "learning_rate": 0.0001668026011225225, "loss": 1.132, "step": 160 }, { "epoch": 0.3508771929824561, "grad_norm": 0.0755893077221133, "learning_rate": 0.00016399385038904138, "loss": 1.1041, "step": 165 }, { "epoch": 0.3615098351940457, "grad_norm": 0.07521138011494473, "learning_rate": 0.00016109686334241655, "loss": 1.1106, "step": 170 }, { "epoch": 0.3721424774056353, "grad_norm": 0.07475367435754098, "learning_rate": 0.00015811563442046767, "loss": 1.0981, "step": 175 }, { "epoch": 0.3827751196172249, "grad_norm": 0.07569742075334658, "learning_rate": 0.00015505427421580808, "loss": 1.1027, "step": 180 }, { "epoch": 0.3934077618288145, "grad_norm": 0.06970712543989657, "learning_rate": 0.00015191700380805752, "loss": 1.1148, "step": 185 }, { "epoch": 0.40404040404040403, "grad_norm": 0.07988874051314146, "learning_rate": 0.00014870814894371245, "loss": 1.1113, "step": 190 }, { "epoch": 0.41467304625199364, "grad_norm": 0.07160684049436854, "learning_rate": 0.0001454321340716992, "loss": 1.0861, "step": 195 }, { "epoch": 0.4253056884635832, "grad_norm": 0.07005079598481417, "learning_rate": 0.0001420934762428335, "loss": 1.0971, "step": 200 }, { "epoch": 0.4359383306751728, "grad_norm": 0.07394470646142667, "learning_rate": 0.00013869677888159887, "loss": 1.1136, "step": 205 }, { "epoch": 0.44657097288676234, "grad_norm": 0.07523040228715787, "learning_rate": 0.00013524672543882996, "loss": 1.1196, "step": 210 }, { "epoch": 0.45720361509835195, "grad_norm": 0.0740459816152623, "learning_rate": 0.00013174807293405428, "loss": 1.122, "step": 215 }, { "epoch": 0.4678362573099415, "grad_norm": 0.07407785053371259, "learning_rate": 0.00012820564539639512, "loss": 1.1101, "step": 220 }, { "epoch": 0.4784688995215311, "grad_norm": 0.07407385022434756, "learning_rate": 0.0001246243272130804, "loss": 1.1093, "step": 225 }, { "epoch": 0.4891015417331207, "grad_norm": 0.0691999860793088, "learning_rate": 0.00012100905639472779, "loss": 1.0947, "step": 230 }, { "epoch": 0.49973418394471025, "grad_norm": 0.07023344891744904, "learning_rate": 0.00011736481776669306, "loss": 1.1322, "step": 235 }, { "epoch": 0.5103668261562998, "grad_norm": 0.07064724436775818, "learning_rate": 0.00011369663609586854, "loss": 1.1188, "step": 240 }, { "epoch": 0.5209994683678895, "grad_norm": 0.07583291773979836, "learning_rate": 0.00011000956916240985, "loss": 1.0995, "step": 245 }, { "epoch": 0.531632110579479, "grad_norm": 0.07152892272662684, "learning_rate": 0.00010630870078594249, "loss": 1.1236, "step": 250 }, { "epoch": 0.5422647527910686, "grad_norm": 0.07101072737185006, "learning_rate": 0.0001025991338158651, "loss": 1.1311, "step": 255 }, { "epoch": 0.5528973950026581, "grad_norm": 0.0702113295280909, "learning_rate": 9.888598309541347e-05, "loss": 1.1021, "step": 260 }, { "epoch": 0.5635300372142478, "grad_norm": 0.07560382456183587, "learning_rate": 9.517436840918766e-05, "loss": 1.1193, "step": 265 }, { "epoch": 0.5741626794258373, "grad_norm": 0.07181091122638743, "learning_rate": 9.146940742386553e-05, "loss": 1.1171, "step": 270 }, { "epoch": 0.5847953216374269, "grad_norm": 0.0719306314196102, "learning_rate": 8.777620863183657e-05, "loss": 1.0973, "step": 275 }, { "epoch": 0.5954279638490165, "grad_norm": 0.06768671936995115, "learning_rate": 8.409986430748545e-05, "loss": 1.0993, "step": 280 }, { "epoch": 0.6060606060606061, "grad_norm": 0.07166544718736949, "learning_rate": 8.044544348583755e-05, "loss": 1.1116, "step": 285 }, { "epoch": 0.6166932482721956, "grad_norm": 0.07026460560964139, "learning_rate": 7.681798497324716e-05, "loss": 1.1215, "step": 290 }, { "epoch": 0.6273258904837852, "grad_norm": 0.07493637213313761, "learning_rate": 7.322249039976608e-05, "loss": 1.0951, "step": 295 }, { "epoch": 0.6379585326953748, "grad_norm": 0.0735588585321075, "learning_rate": 6.966391732277143e-05, "loss": 1.1051, "step": 300 }, { "epoch": 0.6485911749069644, "grad_norm": 0.07103076233316474, "learning_rate": 6.614717239136246e-05, "loss": 1.1099, "step": 305 }, { "epoch": 0.6592238171185539, "grad_norm": 0.06929087442452384, "learning_rate": 6.267710458095053e-05, "loss": 1.0891, "step": 310 }, { "epoch": 0.6698564593301436, "grad_norm": 0.07048509973214657, "learning_rate": 5.9258498507371194e-05, "loss": 1.1308, "step": 315 }, { "epoch": 0.6804891015417331, "grad_norm": 0.06987687341789249, "learning_rate": 5.589606782973683e-05, "loss": 1.1288, "step": 320 }, { "epoch": 0.6911217437533227, "grad_norm": 0.0726818503990417, "learning_rate": 5.259444875112624e-05, "loss": 1.1166, "step": 325 }, { "epoch": 0.7017543859649122, "grad_norm": 0.07076508826669288, "learning_rate": 4.93581936260724e-05, "loss": 1.1094, "step": 330 }, { "epoch": 0.7123870281765019, "grad_norm": 0.06894707145543243, "learning_rate": 4.6191764683662744e-05, "loss": 1.097, "step": 335 }, { "epoch": 0.7230196703880915, "grad_norm": 0.07530097462452943, "learning_rate": 4.309952787490689e-05, "loss": 1.1017, "step": 340 }, { "epoch": 0.733652312599681, "grad_norm": 0.06877541461151461, "learning_rate": 4.008574685285442e-05, "loss": 1.1034, "step": 345 }, { "epoch": 0.7442849548112705, "grad_norm": 0.07024805662579879, "learning_rate": 3.7154577093764334e-05, "loss": 1.0996, "step": 350 }, { "epoch": 0.7549175970228602, "grad_norm": 0.06979054766691796, "learning_rate": 3.4310060167430725e-05, "loss": 1.0993, "step": 355 }, { "epoch": 0.7655502392344498, "grad_norm": 0.07127452889238414, "learning_rate": 3.155611816456586e-05, "loss": 1.1108, "step": 360 }, { "epoch": 0.7761828814460393, "grad_norm": 0.06929411714148895, "learning_rate": 2.889654828892393e-05, "loss": 1.1014, "step": 365 }, { "epoch": 0.786815523657629, "grad_norm": 0.06986765320557396, "learning_rate": 2.6335017621622116e-05, "loss": 1.1178, "step": 370 }, { "epoch": 0.7974481658692185, "grad_norm": 0.14171152314730037, "learning_rate": 2.3875058064877807e-05, "loss": 1.1056, "step": 375 }, { "epoch": 0.8080808080808081, "grad_norm": 0.07523630706911914, "learning_rate": 2.1520061472133902e-05, "loss": 1.1196, "step": 380 }, { "epoch": 0.8187134502923976, "grad_norm": 0.06927631258661882, "learning_rate": 1.927327497128706e-05, "loss": 1.1165, "step": 385 }, { "epoch": 0.8293460925039873, "grad_norm": 0.07022924736143368, "learning_rate": 1.7137796487466797e-05, "loss": 1.102, "step": 390 }, { "epoch": 0.8399787347155768, "grad_norm": 0.06841887685146186, "learning_rate": 1.5116570471539293e-05, "loss": 1.1, "step": 395 }, { "epoch": 0.8506113769271664, "grad_norm": 0.0680433251960951, "learning_rate": 1.3212383840225329e-05, "loss": 1.1007, "step": 400 }, { "epoch": 0.861244019138756, "grad_norm": 0.06849436910046028, "learning_rate": 1.1427862133430156e-05, "loss": 1.0919, "step": 405 }, { "epoch": 0.8718766613503456, "grad_norm": 0.06705533678821948, "learning_rate": 9.765465894083636e-06, "loss": 1.1018, "step": 410 }, { "epoch": 0.8825093035619351, "grad_norm": 0.06997290719332996, "learning_rate": 8.227487275482592e-06, "loss": 1.1141, "step": 415 }, { "epoch": 0.8931419457735247, "grad_norm": 0.06955661856691482, "learning_rate": 6.81604688081271e-06, "loss": 1.1179, "step": 420 }, { "epoch": 0.9037745879851143, "grad_norm": 0.06939654515531442, "learning_rate": 5.533090839208133e-06, "loss": 1.0859, "step": 425 }, { "epoch": 0.9144072301967039, "grad_norm": 0.06890766733706018, "learning_rate": 4.380388122380141e-06, "loss": 1.1041, "step": 430 }, { "epoch": 0.9250398724082934, "grad_norm": 0.06774543961855677, "learning_rate": 3.359528105515064e-06, "loss": 1.1054, "step": 435 }, { "epoch": 0.935672514619883, "grad_norm": 0.06656651460274651, "learning_rate": 2.471918375804105e-06, "loss": 1.1018, "step": 440 }, { "epoch": 0.9463051568314726, "grad_norm": 0.06852233232210171, "learning_rate": 1.7187827916271382e-06, "loss": 1.1129, "step": 445 }, { "epoch": 0.9569377990430622, "grad_norm": 0.06708991081678066, "learning_rate": 1.1011597950663865e-06, "loss": 1.1046, "step": 450 }, { "epoch": 0.9675704412546517, "grad_norm": 0.06813589171568658, "learning_rate": 6.199009800765265e-07, "loss": 1.0986, "step": 455 }, { "epoch": 0.9782030834662414, "grad_norm": 0.0700679858970667, "learning_rate": 2.756699182858369e-07, "loss": 1.111, "step": 460 }, { "epoch": 0.988835725677831, "grad_norm": 0.07404495795184537, "learning_rate": 6.894124404711599e-08, "loss": 1.1039, "step": 465 }, { "epoch": 0.9994683678894205, "grad_norm": 0.07042216599583477, "learning_rate": 0.0, "loss": 1.1192, "step": 470 }, { "epoch": 0.9994683678894205, "eval_runtime": 5.9798, "eval_samples_per_second": 1.672, "eval_steps_per_second": 0.502, "step": 470 }, { "epoch": 0.9994683678894205, "step": 470, "total_flos": 3.0618166709610086e+17, "train_loss": 1.1288761854171754, "train_runtime": 18761.7867, "train_samples_per_second": 3.207, "train_steps_per_second": 0.025 } ], "logging_steps": 5, "max_steps": 470, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0618166709610086e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }