{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9946777054997042, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02365464222353637, "grad_norm": 2.64785060100586, "learning_rate": 5e-06, "loss": 0.8881, "step": 10 }, { "epoch": 0.04730928444707274, "grad_norm": 4.492435661685993, "learning_rate": 5e-06, "loss": 0.7888, "step": 20 }, { "epoch": 0.0709639266706091, "grad_norm": 1.1116541886300364, "learning_rate": 5e-06, "loss": 0.7578, "step": 30 }, { "epoch": 0.09461856889414548, "grad_norm": 1.218928037085855, "learning_rate": 5e-06, "loss": 0.7402, "step": 40 }, { "epoch": 0.11827321111768184, "grad_norm": 1.020418802489519, "learning_rate": 5e-06, "loss": 0.7286, "step": 50 }, { "epoch": 0.1419278533412182, "grad_norm": 0.962265802957487, "learning_rate": 5e-06, "loss": 0.7167, "step": 60 }, { "epoch": 0.16558249556475457, "grad_norm": 0.7842406295728257, "learning_rate": 5e-06, "loss": 0.7122, "step": 70 }, { "epoch": 0.18923713778829096, "grad_norm": 0.8009401339234955, "learning_rate": 5e-06, "loss": 0.7019, "step": 80 }, { "epoch": 0.21289178001182732, "grad_norm": 0.7411954553666439, "learning_rate": 5e-06, "loss": 0.6865, "step": 90 }, { "epoch": 0.23654642223536368, "grad_norm": 0.6604551640687066, "learning_rate": 5e-06, "loss": 0.6854, "step": 100 }, { "epoch": 0.26020106445890007, "grad_norm": 0.5272790934190837, "learning_rate": 5e-06, "loss": 0.6904, "step": 110 }, { "epoch": 0.2838557066824364, "grad_norm": 0.5812760583772804, "learning_rate": 5e-06, "loss": 0.6776, "step": 120 }, { "epoch": 0.3075103489059728, "grad_norm": 0.5288883948662314, "learning_rate": 5e-06, "loss": 0.6738, "step": 130 }, { "epoch": 0.33116499112950915, "grad_norm": 0.63694474679749, "learning_rate": 5e-06, "loss": 0.6822, "step": 140 }, { "epoch": 0.35481963335304556, "grad_norm": 0.8869276352236491, "learning_rate": 5e-06, "loss": 0.6649, "step": 150 }, { "epoch": 0.3784742755765819, "grad_norm": 0.518289407793539, "learning_rate": 5e-06, "loss": 0.6696, "step": 160 }, { "epoch": 0.4021289178001183, "grad_norm": 0.5358648478939547, "learning_rate": 5e-06, "loss": 0.6671, "step": 170 }, { "epoch": 0.42578356002365464, "grad_norm": 0.797540982889909, "learning_rate": 5e-06, "loss": 0.6743, "step": 180 }, { "epoch": 0.449438202247191, "grad_norm": 0.4908769384958429, "learning_rate": 5e-06, "loss": 0.6704, "step": 190 }, { "epoch": 0.47309284447072736, "grad_norm": 0.5763974723432601, "learning_rate": 5e-06, "loss": 0.6692, "step": 200 }, { "epoch": 0.4967474866942638, "grad_norm": 0.9377426988511532, "learning_rate": 5e-06, "loss": 0.6696, "step": 210 }, { "epoch": 0.5204021289178001, "grad_norm": 0.576249879822413, "learning_rate": 5e-06, "loss": 0.6688, "step": 220 }, { "epoch": 0.5440567711413364, "grad_norm": 0.5017180552678443, "learning_rate": 5e-06, "loss": 0.6605, "step": 230 }, { "epoch": 0.5677114133648729, "grad_norm": 0.46598378368467036, "learning_rate": 5e-06, "loss": 0.6649, "step": 240 }, { "epoch": 0.5913660555884093, "grad_norm": 0.44951386010612127, "learning_rate": 5e-06, "loss": 0.6618, "step": 250 }, { "epoch": 0.6150206978119456, "grad_norm": 0.6107309306047823, "learning_rate": 5e-06, "loss": 0.6525, "step": 260 }, { "epoch": 0.638675340035482, "grad_norm": 0.6063385890447376, "learning_rate": 5e-06, "loss": 0.664, "step": 270 }, { "epoch": 0.6623299822590183, "grad_norm": 0.62161938246522, "learning_rate": 5e-06, "loss": 0.6658, "step": 280 }, { "epoch": 0.6859846244825547, "grad_norm": 0.48803716140240055, "learning_rate": 5e-06, "loss": 0.6599, "step": 290 }, { "epoch": 0.7096392667060911, "grad_norm": 0.5643745069932464, "learning_rate": 5e-06, "loss": 0.6536, "step": 300 }, { "epoch": 0.7332939089296274, "grad_norm": 0.45536574485759423, "learning_rate": 5e-06, "loss": 0.662, "step": 310 }, { "epoch": 0.7569485511531638, "grad_norm": 0.5246164914927769, "learning_rate": 5e-06, "loss": 0.656, "step": 320 }, { "epoch": 0.7806031933767001, "grad_norm": 0.5653234939153071, "learning_rate": 5e-06, "loss": 0.6541, "step": 330 }, { "epoch": 0.8042578356002366, "grad_norm": 0.553750057338791, "learning_rate": 5e-06, "loss": 0.6539, "step": 340 }, { "epoch": 0.8279124778237729, "grad_norm": 0.5823597490849275, "learning_rate": 5e-06, "loss": 0.6602, "step": 350 }, { "epoch": 0.8515671200473093, "grad_norm": 0.432948815077171, "learning_rate": 5e-06, "loss": 0.6537, "step": 360 }, { "epoch": 0.8752217622708457, "grad_norm": 0.44732587095055915, "learning_rate": 5e-06, "loss": 0.6571, "step": 370 }, { "epoch": 0.898876404494382, "grad_norm": 0.45966183440739067, "learning_rate": 5e-06, "loss": 0.6478, "step": 380 }, { "epoch": 0.9225310467179184, "grad_norm": 0.6457761210048674, "learning_rate": 5e-06, "loss": 0.6552, "step": 390 }, { "epoch": 0.9461856889414547, "grad_norm": 0.4724156783274873, "learning_rate": 5e-06, "loss": 0.6467, "step": 400 }, { "epoch": 0.9698403311649911, "grad_norm": 0.4559392408256043, "learning_rate": 5e-06, "loss": 0.6492, "step": 410 }, { "epoch": 0.9934949733885275, "grad_norm": 0.5566904074437511, "learning_rate": 5e-06, "loss": 0.6483, "step": 420 }, { "epoch": 0.9982259018332348, "eval_loss": 0.6504195332527161, "eval_runtime": 226.3163, "eval_samples_per_second": 50.323, "eval_steps_per_second": 0.393, "step": 422 }, { "epoch": 1.0171496156120639, "grad_norm": 0.5100430406130203, "learning_rate": 5e-06, "loss": 0.6374, "step": 430 }, { "epoch": 1.0408042578356003, "grad_norm": 0.6182142994254941, "learning_rate": 5e-06, "loss": 0.6056, "step": 440 }, { "epoch": 1.0644589000591367, "grad_norm": 0.612159821355247, "learning_rate": 5e-06, "loss": 0.6041, "step": 450 }, { "epoch": 1.0881135422826729, "grad_norm": 0.620568436985301, "learning_rate": 5e-06, "loss": 0.6125, "step": 460 }, { "epoch": 1.1117681845062093, "grad_norm": 0.48353189963783877, "learning_rate": 5e-06, "loss": 0.6114, "step": 470 }, { "epoch": 1.1354228267297457, "grad_norm": 0.553405903828158, "learning_rate": 5e-06, "loss": 0.6114, "step": 480 }, { "epoch": 1.1590774689532821, "grad_norm": 0.48890661916776557, "learning_rate": 5e-06, "loss": 0.6086, "step": 490 }, { "epoch": 1.1827321111768185, "grad_norm": 0.4626708227952605, "learning_rate": 5e-06, "loss": 0.6048, "step": 500 }, { "epoch": 1.2063867534003547, "grad_norm": 0.5562602420898135, "learning_rate": 5e-06, "loss": 0.6045, "step": 510 }, { "epoch": 1.2300413956238911, "grad_norm": 0.5583615267258394, "learning_rate": 5e-06, "loss": 0.6106, "step": 520 }, { "epoch": 1.2536960378474276, "grad_norm": 0.5120824375752575, "learning_rate": 5e-06, "loss": 0.6116, "step": 530 }, { "epoch": 1.277350680070964, "grad_norm": 0.5477302994052297, "learning_rate": 5e-06, "loss": 0.6119, "step": 540 }, { "epoch": 1.3010053222945004, "grad_norm": 0.6186813978530775, "learning_rate": 5e-06, "loss": 0.6102, "step": 550 }, { "epoch": 1.3246599645180366, "grad_norm": 0.4482998491087788, "learning_rate": 5e-06, "loss": 0.6082, "step": 560 }, { "epoch": 1.348314606741573, "grad_norm": 0.8072488882089637, "learning_rate": 5e-06, "loss": 0.6082, "step": 570 }, { "epoch": 1.3719692489651094, "grad_norm": 0.48728798126789036, "learning_rate": 5e-06, "loss": 0.6118, "step": 580 }, { "epoch": 1.3956238911886458, "grad_norm": 0.4366149357515769, "learning_rate": 5e-06, "loss": 0.6123, "step": 590 }, { "epoch": 1.4192785334121822, "grad_norm": 0.5669347135329578, "learning_rate": 5e-06, "loss": 0.5995, "step": 600 }, { "epoch": 1.4429331756357184, "grad_norm": 0.43980097172154653, "learning_rate": 5e-06, "loss": 0.6012, "step": 610 }, { "epoch": 1.4665878178592548, "grad_norm": 0.4546181925945345, "learning_rate": 5e-06, "loss": 0.6039, "step": 620 }, { "epoch": 1.4902424600827913, "grad_norm": 0.4439868785027596, "learning_rate": 5e-06, "loss": 0.6145, "step": 630 }, { "epoch": 1.5138971023063275, "grad_norm": 0.4840991507057293, "learning_rate": 5e-06, "loss": 0.6128, "step": 640 }, { "epoch": 1.537551744529864, "grad_norm": 0.5028857797195146, "learning_rate": 5e-06, "loss": 0.6037, "step": 650 }, { "epoch": 1.5612063867534003, "grad_norm": 0.6295444122174948, "learning_rate": 5e-06, "loss": 0.6081, "step": 660 }, { "epoch": 1.5848610289769367, "grad_norm": 0.5574140089154338, "learning_rate": 5e-06, "loss": 0.605, "step": 670 }, { "epoch": 1.6085156712004731, "grad_norm": 0.5278087358672146, "learning_rate": 5e-06, "loss": 0.609, "step": 680 }, { "epoch": 1.6321703134240093, "grad_norm": 0.5407251516890358, "learning_rate": 5e-06, "loss": 0.6023, "step": 690 }, { "epoch": 1.655824955647546, "grad_norm": 0.47669538747864815, "learning_rate": 5e-06, "loss": 0.6098, "step": 700 }, { "epoch": 1.6794795978710821, "grad_norm": 0.5117309364481941, "learning_rate": 5e-06, "loss": 0.6083, "step": 710 }, { "epoch": 1.7031342400946186, "grad_norm": 0.4622792038975577, "learning_rate": 5e-06, "loss": 0.6145, "step": 720 }, { "epoch": 1.726788882318155, "grad_norm": 0.476155429063029, "learning_rate": 5e-06, "loss": 0.6102, "step": 730 }, { "epoch": 1.7504435245416912, "grad_norm": 0.6072162079153387, "learning_rate": 5e-06, "loss": 0.6047, "step": 740 }, { "epoch": 1.7740981667652278, "grad_norm": 0.4580614369890106, "learning_rate": 5e-06, "loss": 0.6104, "step": 750 }, { "epoch": 1.797752808988764, "grad_norm": 0.48163931069349186, "learning_rate": 5e-06, "loss": 0.6152, "step": 760 }, { "epoch": 1.8214074512123004, "grad_norm": 0.44236726486886196, "learning_rate": 5e-06, "loss": 0.6159, "step": 770 }, { "epoch": 1.8450620934358368, "grad_norm": 0.4140807715487973, "learning_rate": 5e-06, "loss": 0.6077, "step": 780 }, { "epoch": 1.868716735659373, "grad_norm": 0.49794149670638754, "learning_rate": 5e-06, "loss": 0.6064, "step": 790 }, { "epoch": 1.8923713778829097, "grad_norm": 0.48140566757242875, "learning_rate": 5e-06, "loss": 0.613, "step": 800 }, { "epoch": 1.9160260201064458, "grad_norm": 0.4407179231154673, "learning_rate": 5e-06, "loss": 0.6126, "step": 810 }, { "epoch": 1.9396806623299823, "grad_norm": 0.41696082412030117, "learning_rate": 5e-06, "loss": 0.5994, "step": 820 }, { "epoch": 1.9633353045535187, "grad_norm": 0.47981289616269807, "learning_rate": 5e-06, "loss": 0.612, "step": 830 }, { "epoch": 1.9869899467770549, "grad_norm": 0.4484580880097136, "learning_rate": 5e-06, "loss": 0.5968, "step": 840 }, { "epoch": 1.9988172678888232, "eval_loss": 0.6400034427642822, "eval_runtime": 225.8288, "eval_samples_per_second": 50.432, "eval_steps_per_second": 0.394, "step": 845 }, { "epoch": 2.0106445890005915, "grad_norm": 0.6332180702757985, "learning_rate": 5e-06, "loss": 0.5995, "step": 850 }, { "epoch": 2.0342992312241277, "grad_norm": 0.5223111355986098, "learning_rate": 5e-06, "loss": 0.5669, "step": 860 }, { "epoch": 2.057953873447664, "grad_norm": 0.5560692373029835, "learning_rate": 5e-06, "loss": 0.5668, "step": 870 }, { "epoch": 2.0816085156712005, "grad_norm": 0.5874280616479215, "learning_rate": 5e-06, "loss": 0.559, "step": 880 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5381432115880647, "learning_rate": 5e-06, "loss": 0.5615, "step": 890 }, { "epoch": 2.1289178001182734, "grad_norm": 0.5340607102638687, "learning_rate": 5e-06, "loss": 0.567, "step": 900 }, { "epoch": 2.1525724423418096, "grad_norm": 0.4709298987555601, "learning_rate": 5e-06, "loss": 0.5616, "step": 910 }, { "epoch": 2.1762270845653457, "grad_norm": 0.47531624539694045, "learning_rate": 5e-06, "loss": 0.5634, "step": 920 }, { "epoch": 2.1998817267888824, "grad_norm": 0.5690095865272987, "learning_rate": 5e-06, "loss": 0.5627, "step": 930 }, { "epoch": 2.2235363690124186, "grad_norm": 0.6002806476085656, "learning_rate": 5e-06, "loss": 0.5643, "step": 940 }, { "epoch": 2.247191011235955, "grad_norm": 0.5868800123289198, "learning_rate": 5e-06, "loss": 0.5702, "step": 950 }, { "epoch": 2.2708456534594914, "grad_norm": 0.545743902564753, "learning_rate": 5e-06, "loss": 0.564, "step": 960 }, { "epoch": 2.2945002956830276, "grad_norm": 0.5069047371924982, "learning_rate": 5e-06, "loss": 0.5657, "step": 970 }, { "epoch": 2.3181549379065642, "grad_norm": 0.5832343780575133, "learning_rate": 5e-06, "loss": 0.5696, "step": 980 }, { "epoch": 2.3418095801301004, "grad_norm": 0.49163574506732044, "learning_rate": 5e-06, "loss": 0.5673, "step": 990 }, { "epoch": 2.365464222353637, "grad_norm": 0.5769534565432972, "learning_rate": 5e-06, "loss": 0.5674, "step": 1000 }, { "epoch": 2.3891188645771733, "grad_norm": 0.4748456336186354, "learning_rate": 5e-06, "loss": 0.5656, "step": 1010 }, { "epoch": 2.4127735068007095, "grad_norm": 0.5571397878943326, "learning_rate": 5e-06, "loss": 0.5632, "step": 1020 }, { "epoch": 2.436428149024246, "grad_norm": 0.4977538685424081, "learning_rate": 5e-06, "loss": 0.5662, "step": 1030 }, { "epoch": 2.4600827912477823, "grad_norm": 0.6212689514507728, "learning_rate": 5e-06, "loss": 0.5669, "step": 1040 }, { "epoch": 2.483737433471319, "grad_norm": 0.5004435631831575, "learning_rate": 5e-06, "loss": 0.5694, "step": 1050 }, { "epoch": 2.507392075694855, "grad_norm": 0.5290951851110267, "learning_rate": 5e-06, "loss": 0.5686, "step": 1060 }, { "epoch": 2.5310467179183913, "grad_norm": 0.48790551419012185, "learning_rate": 5e-06, "loss": 0.5709, "step": 1070 }, { "epoch": 2.554701360141928, "grad_norm": 0.5234534909018554, "learning_rate": 5e-06, "loss": 0.5718, "step": 1080 }, { "epoch": 2.578356002365464, "grad_norm": 0.4907774872609342, "learning_rate": 5e-06, "loss": 0.5661, "step": 1090 }, { "epoch": 2.6020106445890008, "grad_norm": 0.44774060132900434, "learning_rate": 5e-06, "loss": 0.5681, "step": 1100 }, { "epoch": 2.625665286812537, "grad_norm": 0.5280954839046109, "learning_rate": 5e-06, "loss": 0.566, "step": 1110 }, { "epoch": 2.649319929036073, "grad_norm": 0.6085042539902774, "learning_rate": 5e-06, "loss": 0.5773, "step": 1120 }, { "epoch": 2.67297457125961, "grad_norm": 0.5793916767265476, "learning_rate": 5e-06, "loss": 0.5646, "step": 1130 }, { "epoch": 2.696629213483146, "grad_norm": 0.452089081498202, "learning_rate": 5e-06, "loss": 0.5653, "step": 1140 }, { "epoch": 2.7202838557066826, "grad_norm": 0.548967392597449, "learning_rate": 5e-06, "loss": 0.5689, "step": 1150 }, { "epoch": 2.743938497930219, "grad_norm": 0.5154350449304891, "learning_rate": 5e-06, "loss": 0.5751, "step": 1160 }, { "epoch": 2.767593140153755, "grad_norm": 0.5408130206005414, "learning_rate": 5e-06, "loss": 0.5659, "step": 1170 }, { "epoch": 2.7912477823772917, "grad_norm": 0.448067927236881, "learning_rate": 5e-06, "loss": 0.5696, "step": 1180 }, { "epoch": 2.814902424600828, "grad_norm": 0.46851404269934493, "learning_rate": 5e-06, "loss": 0.5692, "step": 1190 }, { "epoch": 2.8385570668243645, "grad_norm": 0.4742832176992908, "learning_rate": 5e-06, "loss": 0.5676, "step": 1200 }, { "epoch": 2.8622117090479007, "grad_norm": 0.5257873035609077, "learning_rate": 5e-06, "loss": 0.569, "step": 1210 }, { "epoch": 2.885866351271437, "grad_norm": 0.4515773409971667, "learning_rate": 5e-06, "loss": 0.563, "step": 1220 }, { "epoch": 2.9095209934949735, "grad_norm": 0.4991731794016107, "learning_rate": 5e-06, "loss": 0.5695, "step": 1230 }, { "epoch": 2.9331756357185097, "grad_norm": 0.5065264471878053, "learning_rate": 5e-06, "loss": 0.5706, "step": 1240 }, { "epoch": 2.9568302779420463, "grad_norm": 0.44403389629974643, "learning_rate": 5e-06, "loss": 0.5786, "step": 1250 }, { "epoch": 2.9804849201655825, "grad_norm": 0.5042460817434247, "learning_rate": 5e-06, "loss": 0.572, "step": 1260 }, { "epoch": 2.9946777054997042, "eval_loss": 0.6413525938987732, "eval_runtime": 226.2058, "eval_samples_per_second": 50.348, "eval_steps_per_second": 0.393, "step": 1266 }, { "epoch": 2.9946777054997042, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6193434227887676, "train_runtime": 37811.5197, "train_samples_per_second": 17.167, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }