{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9982238010657194, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023682652457075192, "grad_norm": 11.580180719970365, "learning_rate": 5e-06, "loss": 0.8801, "step": 10 }, { "epoch": 0.047365304914150384, "grad_norm": 5.694386546554053, "learning_rate": 5e-06, "loss": 0.7848, "step": 20 }, { "epoch": 0.07104795737122557, "grad_norm": 0.7928365185740414, "learning_rate": 5e-06, "loss": 0.7523, "step": 30 }, { "epoch": 0.09473060982830077, "grad_norm": 1.0064056553225857, "learning_rate": 5e-06, "loss": 0.7311, "step": 40 }, { "epoch": 0.11841326228537596, "grad_norm": 0.8588102677301916, "learning_rate": 5e-06, "loss": 0.7244, "step": 50 }, { "epoch": 0.14209591474245115, "grad_norm": 0.7370273691645304, "learning_rate": 5e-06, "loss": 0.706, "step": 60 }, { "epoch": 0.16577856719952636, "grad_norm": 0.7411369649066072, "learning_rate": 5e-06, "loss": 0.7186, "step": 70 }, { "epoch": 0.18946121965660154, "grad_norm": 1.2615112159678439, "learning_rate": 5e-06, "loss": 0.6969, "step": 80 }, { "epoch": 0.21314387211367672, "grad_norm": 0.4974656315107545, "learning_rate": 5e-06, "loss": 0.6972, "step": 90 }, { "epoch": 0.23682652457075193, "grad_norm": 0.5147576667313654, "learning_rate": 5e-06, "loss": 0.68, "step": 100 }, { "epoch": 0.2605091770278271, "grad_norm": 0.4710033839515121, "learning_rate": 5e-06, "loss": 0.6812, "step": 110 }, { "epoch": 0.2841918294849023, "grad_norm": 0.48270825897847935, "learning_rate": 5e-06, "loss": 0.6846, "step": 120 }, { "epoch": 0.30787448194197753, "grad_norm": 0.790240185510984, "learning_rate": 5e-06, "loss": 0.676, "step": 130 }, { "epoch": 0.3315571343990527, "grad_norm": 0.8104277282528387, "learning_rate": 5e-06, "loss": 0.6742, "step": 140 }, { "epoch": 0.3552397868561279, "grad_norm": 0.578671757224867, "learning_rate": 5e-06, "loss": 0.6757, "step": 150 }, { "epoch": 0.3789224393132031, "grad_norm": 0.5874201118510068, "learning_rate": 5e-06, "loss": 0.6754, "step": 160 }, { "epoch": 0.40260509177027826, "grad_norm": 0.4917631935458892, "learning_rate": 5e-06, "loss": 0.67, "step": 170 }, { "epoch": 0.42628774422735344, "grad_norm": 0.6038925351396778, "learning_rate": 5e-06, "loss": 0.6667, "step": 180 }, { "epoch": 0.4499703966844287, "grad_norm": 0.5111399599850822, "learning_rate": 5e-06, "loss": 0.6667, "step": 190 }, { "epoch": 0.47365304914150386, "grad_norm": 0.5213320224608164, "learning_rate": 5e-06, "loss": 0.6683, "step": 200 }, { "epoch": 0.49733570159857904, "grad_norm": 0.48719340333216493, "learning_rate": 5e-06, "loss": 0.6654, "step": 210 }, { "epoch": 0.5210183540556542, "grad_norm": 0.7087017510753161, "learning_rate": 5e-06, "loss": 0.6756, "step": 220 }, { "epoch": 0.5447010065127295, "grad_norm": 0.5190899878624864, "learning_rate": 5e-06, "loss": 0.659, "step": 230 }, { "epoch": 0.5683836589698046, "grad_norm": 0.45436323557307495, "learning_rate": 5e-06, "loss": 0.667, "step": 240 }, { "epoch": 0.5920663114268798, "grad_norm": 0.5064142167200019, "learning_rate": 5e-06, "loss": 0.6657, "step": 250 }, { "epoch": 0.6157489638839551, "grad_norm": 0.634114764397902, "learning_rate": 5e-06, "loss": 0.6566, "step": 260 }, { "epoch": 0.6394316163410302, "grad_norm": 0.5074368322778877, "learning_rate": 5e-06, "loss": 0.6566, "step": 270 }, { "epoch": 0.6631142687981054, "grad_norm": 0.6267290730783063, "learning_rate": 5e-06, "loss": 0.6576, "step": 280 }, { "epoch": 0.6867969212551805, "grad_norm": 0.6348222091289437, "learning_rate": 5e-06, "loss": 0.6636, "step": 290 }, { "epoch": 0.7104795737122558, "grad_norm": 0.5413257273118547, "learning_rate": 5e-06, "loss": 0.66, "step": 300 }, { "epoch": 0.7341622261693309, "grad_norm": 0.47592664539666046, "learning_rate": 5e-06, "loss": 0.6595, "step": 310 }, { "epoch": 0.7578448786264061, "grad_norm": 0.478428145343345, "learning_rate": 5e-06, "loss": 0.6555, "step": 320 }, { "epoch": 0.7815275310834814, "grad_norm": 0.6373996447619408, "learning_rate": 5e-06, "loss": 0.6546, "step": 330 }, { "epoch": 0.8052101835405565, "grad_norm": 0.4755583991704758, "learning_rate": 5e-06, "loss": 0.6537, "step": 340 }, { "epoch": 0.8288928359976317, "grad_norm": 0.47716917902843486, "learning_rate": 5e-06, "loss": 0.6596, "step": 350 }, { "epoch": 0.8525754884547069, "grad_norm": 0.4644937496582173, "learning_rate": 5e-06, "loss": 0.6546, "step": 360 }, { "epoch": 0.8762581409117821, "grad_norm": 0.4239705934033583, "learning_rate": 5e-06, "loss": 0.6523, "step": 370 }, { "epoch": 0.8999407933688574, "grad_norm": 0.510195972449156, "learning_rate": 5e-06, "loss": 0.6545, "step": 380 }, { "epoch": 0.9236234458259325, "grad_norm": 0.8218655966870532, "learning_rate": 5e-06, "loss": 0.6575, "step": 390 }, { "epoch": 0.9473060982830077, "grad_norm": 0.778122168525512, "learning_rate": 5e-06, "loss": 0.642, "step": 400 }, { "epoch": 0.9709887507400828, "grad_norm": 0.515457826896854, "learning_rate": 5e-06, "loss": 0.6542, "step": 410 }, { "epoch": 0.9946714031971581, "grad_norm": 0.5282551277749459, "learning_rate": 5e-06, "loss": 0.6414, "step": 420 }, { "epoch": 0.9994079336885732, "eval_loss": 0.6519263386726379, "eval_runtime": 224.2129, "eval_samples_per_second": 50.742, "eval_steps_per_second": 0.397, "step": 422 }, { "epoch": 1.0183540556542332, "grad_norm": 0.6600161274484254, "learning_rate": 5e-06, "loss": 0.616, "step": 430 }, { "epoch": 1.0420367081113084, "grad_norm": 0.5013360565903009, "learning_rate": 5e-06, "loss": 0.613, "step": 440 }, { "epoch": 1.0657193605683837, "grad_norm": 0.5420218599698048, "learning_rate": 5e-06, "loss": 0.6043, "step": 450 }, { "epoch": 1.089402013025459, "grad_norm": 0.5550600850970575, "learning_rate": 5e-06, "loss": 0.6116, "step": 460 }, { "epoch": 1.1130846654825342, "grad_norm": 0.5711211097551357, "learning_rate": 5e-06, "loss": 0.6114, "step": 470 }, { "epoch": 1.1367673179396092, "grad_norm": 0.4738237703033311, "learning_rate": 5e-06, "loss": 0.6004, "step": 480 }, { "epoch": 1.1604499703966844, "grad_norm": 0.519995665132958, "learning_rate": 5e-06, "loss": 0.6069, "step": 490 }, { "epoch": 1.1841326228537596, "grad_norm": 0.5719251310502527, "learning_rate": 5e-06, "loss": 0.6152, "step": 500 }, { "epoch": 1.2078152753108349, "grad_norm": 0.42275665886846947, "learning_rate": 5e-06, "loss": 0.6117, "step": 510 }, { "epoch": 1.2314979277679101, "grad_norm": 0.5168994422728499, "learning_rate": 5e-06, "loss": 0.6188, "step": 520 }, { "epoch": 1.2551805802249851, "grad_norm": 0.4960517990662579, "learning_rate": 5e-06, "loss": 0.6074, "step": 530 }, { "epoch": 1.2788632326820604, "grad_norm": 0.4972749506521509, "learning_rate": 5e-06, "loss": 0.6124, "step": 540 }, { "epoch": 1.3025458851391356, "grad_norm": 0.4512891659935728, "learning_rate": 5e-06, "loss": 0.6116, "step": 550 }, { "epoch": 1.3262285375962108, "grad_norm": 0.5486800171063785, "learning_rate": 5e-06, "loss": 0.6118, "step": 560 }, { "epoch": 1.349911190053286, "grad_norm": 0.4960949423919287, "learning_rate": 5e-06, "loss": 0.6086, "step": 570 }, { "epoch": 1.373593842510361, "grad_norm": 0.7111942461169334, "learning_rate": 5e-06, "loss": 0.6083, "step": 580 }, { "epoch": 1.3972764949674363, "grad_norm": 0.6185227863814267, "learning_rate": 5e-06, "loss": 0.6037, "step": 590 }, { "epoch": 1.4209591474245116, "grad_norm": 0.6854895211681724, "learning_rate": 5e-06, "loss": 0.6057, "step": 600 }, { "epoch": 1.4446417998815868, "grad_norm": 0.498807007502488, "learning_rate": 5e-06, "loss": 0.6089, "step": 610 }, { "epoch": 1.468324452338662, "grad_norm": 0.4784601661555535, "learning_rate": 5e-06, "loss": 0.6047, "step": 620 }, { "epoch": 1.492007104795737, "grad_norm": 0.47899370881340453, "learning_rate": 5e-06, "loss": 0.6025, "step": 630 }, { "epoch": 1.5156897572528123, "grad_norm": 0.5028475227733583, "learning_rate": 5e-06, "loss": 0.6097, "step": 640 }, { "epoch": 1.5393724097098875, "grad_norm": 0.44305542706371875, "learning_rate": 5e-06, "loss": 0.6049, "step": 650 }, { "epoch": 1.5630550621669625, "grad_norm": 0.44715347654354887, "learning_rate": 5e-06, "loss": 0.6095, "step": 660 }, { "epoch": 1.586737714624038, "grad_norm": 0.5007577942426897, "learning_rate": 5e-06, "loss": 0.6061, "step": 670 }, { "epoch": 1.610420367081113, "grad_norm": 0.5700146554097163, "learning_rate": 5e-06, "loss": 0.6086, "step": 680 }, { "epoch": 1.6341030195381883, "grad_norm": 0.5159365931584635, "learning_rate": 5e-06, "loss": 0.6113, "step": 690 }, { "epoch": 1.6577856719952635, "grad_norm": 0.5437452800969841, "learning_rate": 5e-06, "loss": 0.6044, "step": 700 }, { "epoch": 1.6814683244523385, "grad_norm": 0.6343168684170412, "learning_rate": 5e-06, "loss": 0.6044, "step": 710 }, { "epoch": 1.705150976909414, "grad_norm": 0.47604360281895153, "learning_rate": 5e-06, "loss": 0.6065, "step": 720 }, { "epoch": 1.728833629366489, "grad_norm": 0.6030663856542576, "learning_rate": 5e-06, "loss": 0.607, "step": 730 }, { "epoch": 1.7525162818235642, "grad_norm": 0.5300503522678514, "learning_rate": 5e-06, "loss": 0.6066, "step": 740 }, { "epoch": 1.7761989342806395, "grad_norm": 0.4618010387153902, "learning_rate": 5e-06, "loss": 0.6093, "step": 750 }, { "epoch": 1.7998815867377145, "grad_norm": 0.4342864081355629, "learning_rate": 5e-06, "loss": 0.6045, "step": 760 }, { "epoch": 1.82356423919479, "grad_norm": 0.4523387457254953, "learning_rate": 5e-06, "loss": 0.6049, "step": 770 }, { "epoch": 1.847246891651865, "grad_norm": 0.5145138329314661, "learning_rate": 5e-06, "loss": 0.6056, "step": 780 }, { "epoch": 1.8709295441089402, "grad_norm": 0.43672760333779226, "learning_rate": 5e-06, "loss": 0.6136, "step": 790 }, { "epoch": 1.8946121965660154, "grad_norm": 0.5495600074734466, "learning_rate": 5e-06, "loss": 0.6057, "step": 800 }, { "epoch": 1.9182948490230904, "grad_norm": 0.559949134805628, "learning_rate": 5e-06, "loss": 0.6045, "step": 810 }, { "epoch": 1.941977501480166, "grad_norm": 0.4650635136990293, "learning_rate": 5e-06, "loss": 0.6122, "step": 820 }, { "epoch": 1.965660153937241, "grad_norm": 0.5401564592759, "learning_rate": 5e-06, "loss": 0.6092, "step": 830 }, { "epoch": 1.9893428063943162, "grad_norm": 0.456052398569066, "learning_rate": 5e-06, "loss": 0.6064, "step": 840 }, { "epoch": 1.9988158673771461, "eval_loss": 0.6422228217124939, "eval_runtime": 225.1134, "eval_samples_per_second": 50.539, "eval_steps_per_second": 0.395, "step": 844 }, { "epoch": 2.0130254588513914, "grad_norm": 0.6189993607524042, "learning_rate": 5e-06, "loss": 0.582, "step": 850 }, { "epoch": 2.0367081113084664, "grad_norm": 0.5740784935763802, "learning_rate": 5e-06, "loss": 0.5607, "step": 860 }, { "epoch": 2.060390763765542, "grad_norm": 0.5964396961918896, "learning_rate": 5e-06, "loss": 0.5646, "step": 870 }, { "epoch": 2.084073416222617, "grad_norm": 0.5507692139006432, "learning_rate": 5e-06, "loss": 0.5664, "step": 880 }, { "epoch": 2.1077560686796923, "grad_norm": 0.5018720611836012, "learning_rate": 5e-06, "loss": 0.5631, "step": 890 }, { "epoch": 2.1314387211367674, "grad_norm": 0.6003460231298862, "learning_rate": 5e-06, "loss": 0.5679, "step": 900 }, { "epoch": 2.1551213735938424, "grad_norm": 0.5290328026637183, "learning_rate": 5e-06, "loss": 0.5637, "step": 910 }, { "epoch": 2.178804026050918, "grad_norm": 0.45527710086603834, "learning_rate": 5e-06, "loss": 0.5618, "step": 920 }, { "epoch": 2.202486678507993, "grad_norm": 0.4740063474020259, "learning_rate": 5e-06, "loss": 0.5653, "step": 930 }, { "epoch": 2.2261693309650683, "grad_norm": 0.48997671873108534, "learning_rate": 5e-06, "loss": 0.565, "step": 940 }, { "epoch": 2.2498519834221433, "grad_norm": 0.49914808237901415, "learning_rate": 5e-06, "loss": 0.5675, "step": 950 }, { "epoch": 2.2735346358792183, "grad_norm": 0.596920997938112, "learning_rate": 5e-06, "loss": 0.5656, "step": 960 }, { "epoch": 2.297217288336294, "grad_norm": 0.5735564153703901, "learning_rate": 5e-06, "loss": 0.565, "step": 970 }, { "epoch": 2.320899940793369, "grad_norm": 0.5519912742369928, "learning_rate": 5e-06, "loss": 0.5713, "step": 980 }, { "epoch": 2.3445825932504443, "grad_norm": 0.4675640643082247, "learning_rate": 5e-06, "loss": 0.5755, "step": 990 }, { "epoch": 2.3682652457075193, "grad_norm": 0.4361312913430622, "learning_rate": 5e-06, "loss": 0.5717, "step": 1000 }, { "epoch": 2.3919478981645943, "grad_norm": 0.4640032816908529, "learning_rate": 5e-06, "loss": 0.5689, "step": 1010 }, { "epoch": 2.4156305506216698, "grad_norm": 0.44037996481930514, "learning_rate": 5e-06, "loss": 0.5643, "step": 1020 }, { "epoch": 2.4393132030787448, "grad_norm": 0.5045787949778652, "learning_rate": 5e-06, "loss": 0.5638, "step": 1030 }, { "epoch": 2.4629958555358202, "grad_norm": 0.4621573812643278, "learning_rate": 5e-06, "loss": 0.5587, "step": 1040 }, { "epoch": 2.4866785079928952, "grad_norm": 0.5264078685670218, "learning_rate": 5e-06, "loss": 0.563, "step": 1050 }, { "epoch": 2.5103611604499703, "grad_norm": 0.5343460789042177, "learning_rate": 5e-06, "loss": 0.5694, "step": 1060 }, { "epoch": 2.5340438129070457, "grad_norm": 0.5325193577248657, "learning_rate": 5e-06, "loss": 0.5691, "step": 1070 }, { "epoch": 2.5577264653641207, "grad_norm": 0.5468298911750293, "learning_rate": 5e-06, "loss": 0.5749, "step": 1080 }, { "epoch": 2.581409117821196, "grad_norm": 0.5781509084959986, "learning_rate": 5e-06, "loss": 0.5749, "step": 1090 }, { "epoch": 2.605091770278271, "grad_norm": 0.5270742215186335, "learning_rate": 5e-06, "loss": 0.5649, "step": 1100 }, { "epoch": 2.6287744227353462, "grad_norm": 0.5368871058088638, "learning_rate": 5e-06, "loss": 0.5653, "step": 1110 }, { "epoch": 2.6524570751924217, "grad_norm": 0.4678129428884298, "learning_rate": 5e-06, "loss": 0.5666, "step": 1120 }, { "epoch": 2.6761397276494967, "grad_norm": 0.5495858339889211, "learning_rate": 5e-06, "loss": 0.5776, "step": 1130 }, { "epoch": 2.699822380106572, "grad_norm": 0.4436260648403391, "learning_rate": 5e-06, "loss": 0.5733, "step": 1140 }, { "epoch": 2.723505032563647, "grad_norm": 0.46368684704169655, "learning_rate": 5e-06, "loss": 0.5706, "step": 1150 }, { "epoch": 2.747187685020722, "grad_norm": 0.4702868949052167, "learning_rate": 5e-06, "loss": 0.5687, "step": 1160 }, { "epoch": 2.7708703374777977, "grad_norm": 0.5006009407591486, "learning_rate": 5e-06, "loss": 0.565, "step": 1170 }, { "epoch": 2.7945529899348727, "grad_norm": 0.4916863056579115, "learning_rate": 5e-06, "loss": 0.5728, "step": 1180 }, { "epoch": 2.818235642391948, "grad_norm": 0.6631146094885297, "learning_rate": 5e-06, "loss": 0.5684, "step": 1190 }, { "epoch": 2.841918294849023, "grad_norm": 0.4673828810760241, "learning_rate": 5e-06, "loss": 0.5656, "step": 1200 }, { "epoch": 2.865600947306098, "grad_norm": 0.4232627921898123, "learning_rate": 5e-06, "loss": 0.5662, "step": 1210 }, { "epoch": 2.8892835997631736, "grad_norm": 0.45632210985931476, "learning_rate": 5e-06, "loss": 0.5649, "step": 1220 }, { "epoch": 2.9129662522202486, "grad_norm": 0.47501442992489695, "learning_rate": 5e-06, "loss": 0.5671, "step": 1230 }, { "epoch": 2.936648904677324, "grad_norm": 0.5162383441649955, "learning_rate": 5e-06, "loss": 0.5626, "step": 1240 }, { "epoch": 2.960331557134399, "grad_norm": 0.47876663087966187, "learning_rate": 5e-06, "loss": 0.5661, "step": 1250 }, { "epoch": 2.984014209591474, "grad_norm": 0.45472166434431854, "learning_rate": 5e-06, "loss": 0.5685, "step": 1260 }, { "epoch": 2.9982238010657194, "eval_loss": 0.6428163051605225, "eval_runtime": 225.5335, "eval_samples_per_second": 50.445, "eval_steps_per_second": 0.395, "step": 1266 }, { "epoch": 2.9982238010657194, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6184906803224512, "train_runtime": 37764.6257, "train_samples_per_second": 17.171, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }