{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9515938606847696, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0590318772136954, "grad_norm": 6.553243637084961, "learning_rate": 2.45e-05, "loss": 4.6761, "step": 50 }, { "epoch": 0.1180637544273908, "grad_norm": 8.582610130310059, "learning_rate": 4.9500000000000004e-05, "loss": 4.2203, "step": 100 }, { "epoch": 0.1770956316410862, "grad_norm": 9.231163024902344, "learning_rate": 4.8996312986480954e-05, "loss": 3.9576, "step": 150 }, { "epoch": 0.2361275088547816, "grad_norm": 7.166695594787598, "learning_rate": 4.7972142564522735e-05, "loss": 3.8839, "step": 200 }, { "epoch": 0.29515938606847697, "grad_norm": 7.867295265197754, "learning_rate": 4.6947972142564524e-05, "loss": 3.9156, "step": 250 }, { "epoch": 0.3541912632821724, "grad_norm": 9.70073127746582, "learning_rate": 4.592380172060631e-05, "loss": 3.7892, "step": 300 }, { "epoch": 0.4132231404958678, "grad_norm": 10.692851066589355, "learning_rate": 4.48996312986481e-05, "loss": 3.7978, "step": 350 }, { "epoch": 0.4722550177095632, "grad_norm": 8.353034019470215, "learning_rate": 4.387546087668988e-05, "loss": 3.6882, "step": 400 }, { "epoch": 0.5312868949232585, "grad_norm": 7.379666328430176, "learning_rate": 4.285129045473167e-05, "loss": 3.6928, "step": 450 }, { "epoch": 0.5903187721369539, "grad_norm": 8.379216194152832, "learning_rate": 4.182712003277345e-05, "loss": 3.697, "step": 500 }, { "epoch": 0.6493506493506493, "grad_norm": 5.806574821472168, "learning_rate": 4.080294961081524e-05, "loss": 3.6493, "step": 550 }, { "epoch": 0.7083825265643447, "grad_norm": 5.262754917144775, "learning_rate": 3.977877918885703e-05, "loss": 3.6309, "step": 600 }, { "epoch": 0.7674144037780402, "grad_norm": 7.273934364318848, "learning_rate": 3.875460876689882e-05, "loss": 3.6006, "step": 650 }, { "epoch": 0.8264462809917356, "grad_norm": 9.894654273986816, "learning_rate": 3.77304383449406e-05, "loss": 3.5184, "step": 700 }, { "epoch": 0.885478158205431, "grad_norm": 7.7802510261535645, "learning_rate": 3.670626792298239e-05, "loss": 3.5088, "step": 750 }, { "epoch": 0.9445100354191264, "grad_norm": 4.538764953613281, "learning_rate": 3.568209750102417e-05, "loss": 3.5163, "step": 800 }, { "epoch": 1.0035419126328218, "grad_norm": 9.740047454833984, "learning_rate": 3.465792707906596e-05, "loss": 3.4815, "step": 850 }, { "epoch": 1.062573789846517, "grad_norm": 5.822250843048096, "learning_rate": 3.3633756657107746e-05, "loss": 3.3865, "step": 900 }, { "epoch": 1.1216056670602126, "grad_norm": 8.263042449951172, "learning_rate": 3.260958623514953e-05, "loss": 3.3305, "step": 950 }, { "epoch": 1.1806375442739079, "grad_norm": 5.721621513366699, "learning_rate": 3.1585415813191316e-05, "loss": 3.2901, "step": 1000 }, { "epoch": 1.2396694214876034, "grad_norm": 5.100550651550293, "learning_rate": 3.05612453912331e-05, "loss": 3.2108, "step": 1050 }, { "epoch": 1.2987012987012987, "grad_norm": 6.78065299987793, "learning_rate": 2.9537074969274887e-05, "loss": 3.2782, "step": 1100 }, { "epoch": 1.3577331759149942, "grad_norm": 6.02623176574707, "learning_rate": 2.851290454731668e-05, "loss": 3.2726, "step": 1150 }, { "epoch": 1.4167650531286895, "grad_norm": 6.936172962188721, "learning_rate": 2.7488734125358463e-05, "loss": 3.3236, "step": 1200 }, { "epoch": 1.4757969303423848, "grad_norm": 6.60649299621582, "learning_rate": 2.646456370340025e-05, "loss": 3.2326, "step": 1250 }, { "epoch": 1.5348288075560803, "grad_norm": 9.457938194274902, "learning_rate": 2.5440393281442034e-05, "loss": 3.2819, "step": 1300 }, { "epoch": 1.5938606847697758, "grad_norm": 8.299750328063965, "learning_rate": 2.441622285948382e-05, "loss": 3.1736, "step": 1350 }, { "epoch": 1.6528925619834711, "grad_norm": 6.856365203857422, "learning_rate": 2.3392052437525604e-05, "loss": 3.2544, "step": 1400 }, { "epoch": 1.7119244391971664, "grad_norm": 7.17230224609375, "learning_rate": 2.236788201556739e-05, "loss": 3.2322, "step": 1450 }, { "epoch": 1.770956316410862, "grad_norm": 5.7366814613342285, "learning_rate": 2.1343711593609177e-05, "loss": 3.1887, "step": 1500 }, { "epoch": 1.8299881936245572, "grad_norm": 8.30285358428955, "learning_rate": 2.0319541171650962e-05, "loss": 3.2267, "step": 1550 }, { "epoch": 1.8890200708382525, "grad_norm": 9.261168479919434, "learning_rate": 1.929537074969275e-05, "loss": 3.1754, "step": 1600 }, { "epoch": 1.948051948051948, "grad_norm": 7.7448248863220215, "learning_rate": 1.8271200327734536e-05, "loss": 3.2218, "step": 1650 }, { "epoch": 2.0070838252656436, "grad_norm": 6.691303730010986, "learning_rate": 1.724702990577632e-05, "loss": 3.1039, "step": 1700 }, { "epoch": 2.0661157024793386, "grad_norm": 6.595968723297119, "learning_rate": 1.622285948381811e-05, "loss": 3.0747, "step": 1750 }, { "epoch": 2.125147579693034, "grad_norm": 9.761649131774902, "learning_rate": 1.5198689061859894e-05, "loss": 3.0566, "step": 1800 }, { "epoch": 2.1841794569067297, "grad_norm": 4.652196407318115, "learning_rate": 1.417451863990168e-05, "loss": 3.1184, "step": 1850 }, { "epoch": 2.243211334120425, "grad_norm": 9.410402297973633, "learning_rate": 1.3150348217943468e-05, "loss": 3.0613, "step": 1900 }, { "epoch": 2.3022432113341202, "grad_norm": 6.215067386627197, "learning_rate": 1.2126177795985253e-05, "loss": 3.1054, "step": 1950 }, { "epoch": 2.3612750885478158, "grad_norm": 7.889769077301025, "learning_rate": 1.110200737402704e-05, "loss": 2.999, "step": 2000 }, { "epoch": 2.4203069657615113, "grad_norm": 5.15559720993042, "learning_rate": 1.0077836952068826e-05, "loss": 3.0281, "step": 2050 }, { "epoch": 2.479338842975207, "grad_norm": 5.545195579528809, "learning_rate": 9.053666530110611e-06, "loss": 3.0612, "step": 2100 }, { "epoch": 2.538370720188902, "grad_norm": 9.813671112060547, "learning_rate": 8.029496108152396e-06, "loss": 3.03, "step": 2150 }, { "epoch": 2.5974025974025974, "grad_norm": 9.66518783569336, "learning_rate": 7.005325686194184e-06, "loss": 3.0564, "step": 2200 }, { "epoch": 2.656434474616293, "grad_norm": 7.902320384979248, "learning_rate": 5.981155264235969e-06, "loss": 2.9861, "step": 2250 }, { "epoch": 2.7154663518299884, "grad_norm": 5.70012092590332, "learning_rate": 4.956984842277755e-06, "loss": 3.0116, "step": 2300 }, { "epoch": 2.7744982290436835, "grad_norm": 6.998533725738525, "learning_rate": 3.9328144203195416e-06, "loss": 3.008, "step": 2350 }, { "epoch": 2.833530106257379, "grad_norm": 5.67135763168335, "learning_rate": 2.9086439983613274e-06, "loss": 3.03, "step": 2400 }, { "epoch": 2.8925619834710745, "grad_norm": 8.523286819458008, "learning_rate": 1.8844735764031136e-06, "loss": 2.9847, "step": 2450 }, { "epoch": 2.9515938606847696, "grad_norm": 4.999369144439697, "learning_rate": 8.603031544448998e-07, "loss": 3.0083, "step": 2500 } ], "logging_steps": 50, "max_steps": 2541, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2612183615668224.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }