{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9515938606847696,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0590318772136954,
      "grad_norm": 6.553243637084961,
      "learning_rate": 2.45e-05,
      "loss": 4.6761,
      "step": 50
    },
    {
      "epoch": 0.1180637544273908,
      "grad_norm": 8.582610130310059,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 4.2203,
      "step": 100
    },
    {
      "epoch": 0.1770956316410862,
      "grad_norm": 9.231163024902344,
      "learning_rate": 4.8996312986480954e-05,
      "loss": 3.9576,
      "step": 150
    },
    {
      "epoch": 0.2361275088547816,
      "grad_norm": 7.166695594787598,
      "learning_rate": 4.7972142564522735e-05,
      "loss": 3.8839,
      "step": 200
    },
    {
      "epoch": 0.29515938606847697,
      "grad_norm": 7.867295265197754,
      "learning_rate": 4.6947972142564524e-05,
      "loss": 3.9156,
      "step": 250
    },
    {
      "epoch": 0.3541912632821724,
      "grad_norm": 9.70073127746582,
      "learning_rate": 4.592380172060631e-05,
      "loss": 3.7892,
      "step": 300
    },
    {
      "epoch": 0.4132231404958678,
      "grad_norm": 10.692851066589355,
      "learning_rate": 4.48996312986481e-05,
      "loss": 3.7978,
      "step": 350
    },
    {
      "epoch": 0.4722550177095632,
      "grad_norm": 8.353034019470215,
      "learning_rate": 4.387546087668988e-05,
      "loss": 3.6882,
      "step": 400
    },
    {
      "epoch": 0.5312868949232585,
      "grad_norm": 7.379666328430176,
      "learning_rate": 4.285129045473167e-05,
      "loss": 3.6928,
      "step": 450
    },
    {
      "epoch": 0.5903187721369539,
      "grad_norm": 8.379216194152832,
      "learning_rate": 4.182712003277345e-05,
      "loss": 3.697,
      "step": 500
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 5.806574821472168,
      "learning_rate": 4.080294961081524e-05,
      "loss": 3.6493,
      "step": 550
    },
    {
      "epoch": 0.7083825265643447,
      "grad_norm": 5.262754917144775,
      "learning_rate": 3.977877918885703e-05,
      "loss": 3.6309,
      "step": 600
    },
    {
      "epoch": 0.7674144037780402,
      "grad_norm": 7.273934364318848,
      "learning_rate": 3.875460876689882e-05,
      "loss": 3.6006,
      "step": 650
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 9.894654273986816,
      "learning_rate": 3.77304383449406e-05,
      "loss": 3.5184,
      "step": 700
    },
    {
      "epoch": 0.885478158205431,
      "grad_norm": 7.7802510261535645,
      "learning_rate": 3.670626792298239e-05,
      "loss": 3.5088,
      "step": 750
    },
    {
      "epoch": 0.9445100354191264,
      "grad_norm": 4.538764953613281,
      "learning_rate": 3.568209750102417e-05,
      "loss": 3.5163,
      "step": 800
    },
    {
      "epoch": 1.0035419126328218,
      "grad_norm": 9.740047454833984,
      "learning_rate": 3.465792707906596e-05,
      "loss": 3.4815,
      "step": 850
    },
    {
      "epoch": 1.062573789846517,
      "grad_norm": 5.822250843048096,
      "learning_rate": 3.3633756657107746e-05,
      "loss": 3.3865,
      "step": 900
    },
    {
      "epoch": 1.1216056670602126,
      "grad_norm": 8.263042449951172,
      "learning_rate": 3.260958623514953e-05,
      "loss": 3.3305,
      "step": 950
    },
    {
      "epoch": 1.1806375442739079,
      "grad_norm": 5.721621513366699,
      "learning_rate": 3.1585415813191316e-05,
      "loss": 3.2901,
      "step": 1000
    },
    {
      "epoch": 1.2396694214876034,
      "grad_norm": 5.100550651550293,
      "learning_rate": 3.05612453912331e-05,
      "loss": 3.2108,
      "step": 1050
    },
    {
      "epoch": 1.2987012987012987,
      "grad_norm": 6.78065299987793,
      "learning_rate": 2.9537074969274887e-05,
      "loss": 3.2782,
      "step": 1100
    },
    {
      "epoch": 1.3577331759149942,
      "grad_norm": 6.02623176574707,
      "learning_rate": 2.851290454731668e-05,
      "loss": 3.2726,
      "step": 1150
    },
    {
      "epoch": 1.4167650531286895,
      "grad_norm": 6.936172962188721,
      "learning_rate": 2.7488734125358463e-05,
      "loss": 3.3236,
      "step": 1200
    },
    {
      "epoch": 1.4757969303423848,
      "grad_norm": 6.60649299621582,
      "learning_rate": 2.646456370340025e-05,
      "loss": 3.2326,
      "step": 1250
    },
    {
      "epoch": 1.5348288075560803,
      "grad_norm": 9.457938194274902,
      "learning_rate": 2.5440393281442034e-05,
      "loss": 3.2819,
      "step": 1300
    },
    {
      "epoch": 1.5938606847697758,
      "grad_norm": 8.299750328063965,
      "learning_rate": 2.441622285948382e-05,
      "loss": 3.1736,
      "step": 1350
    },
    {
      "epoch": 1.6528925619834711,
      "grad_norm": 6.856365203857422,
      "learning_rate": 2.3392052437525604e-05,
      "loss": 3.2544,
      "step": 1400
    },
    {
      "epoch": 1.7119244391971664,
      "grad_norm": 7.17230224609375,
      "learning_rate": 2.236788201556739e-05,
      "loss": 3.2322,
      "step": 1450
    },
    {
      "epoch": 1.770956316410862,
      "grad_norm": 5.7366814613342285,
      "learning_rate": 2.1343711593609177e-05,
      "loss": 3.1887,
      "step": 1500
    },
    {
      "epoch": 1.8299881936245572,
      "grad_norm": 8.30285358428955,
      "learning_rate": 2.0319541171650962e-05,
      "loss": 3.2267,
      "step": 1550
    },
    {
      "epoch": 1.8890200708382525,
      "grad_norm": 9.261168479919434,
      "learning_rate": 1.929537074969275e-05,
      "loss": 3.1754,
      "step": 1600
    },
    {
      "epoch": 1.948051948051948,
      "grad_norm": 7.7448248863220215,
      "learning_rate": 1.8271200327734536e-05,
      "loss": 3.2218,
      "step": 1650
    },
    {
      "epoch": 2.0070838252656436,
      "grad_norm": 6.691303730010986,
      "learning_rate": 1.724702990577632e-05,
      "loss": 3.1039,
      "step": 1700
    },
    {
      "epoch": 2.0661157024793386,
      "grad_norm": 6.595968723297119,
      "learning_rate": 1.622285948381811e-05,
      "loss": 3.0747,
      "step": 1750
    },
    {
      "epoch": 2.125147579693034,
      "grad_norm": 9.761649131774902,
      "learning_rate": 1.5198689061859894e-05,
      "loss": 3.0566,
      "step": 1800
    },
    {
      "epoch": 2.1841794569067297,
      "grad_norm": 4.652196407318115,
      "learning_rate": 1.417451863990168e-05,
      "loss": 3.1184,
      "step": 1850
    },
    {
      "epoch": 2.243211334120425,
      "grad_norm": 9.410402297973633,
      "learning_rate": 1.3150348217943468e-05,
      "loss": 3.0613,
      "step": 1900
    },
    {
      "epoch": 2.3022432113341202,
      "grad_norm": 6.215067386627197,
      "learning_rate": 1.2126177795985253e-05,
      "loss": 3.1054,
      "step": 1950
    },
    {
      "epoch": 2.3612750885478158,
      "grad_norm": 7.889769077301025,
      "learning_rate": 1.110200737402704e-05,
      "loss": 2.999,
      "step": 2000
    },
    {
      "epoch": 2.4203069657615113,
      "grad_norm": 5.15559720993042,
      "learning_rate": 1.0077836952068826e-05,
      "loss": 3.0281,
      "step": 2050
    },
    {
      "epoch": 2.479338842975207,
      "grad_norm": 5.545195579528809,
      "learning_rate": 9.053666530110611e-06,
      "loss": 3.0612,
      "step": 2100
    },
    {
      "epoch": 2.538370720188902,
      "grad_norm": 9.813671112060547,
      "learning_rate": 8.029496108152396e-06,
      "loss": 3.03,
      "step": 2150
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 9.66518783569336,
      "learning_rate": 7.005325686194184e-06,
      "loss": 3.0564,
      "step": 2200
    },
    {
      "epoch": 2.656434474616293,
      "grad_norm": 7.902320384979248,
      "learning_rate": 5.981155264235969e-06,
      "loss": 2.9861,
      "step": 2250
    },
    {
      "epoch": 2.7154663518299884,
      "grad_norm": 5.70012092590332,
      "learning_rate": 4.956984842277755e-06,
      "loss": 3.0116,
      "step": 2300
    },
    {
      "epoch": 2.7744982290436835,
      "grad_norm": 6.998533725738525,
      "learning_rate": 3.9328144203195416e-06,
      "loss": 3.008,
      "step": 2350
    },
    {
      "epoch": 2.833530106257379,
      "grad_norm": 5.67135763168335,
      "learning_rate": 2.9086439983613274e-06,
      "loss": 3.03,
      "step": 2400
    },
    {
      "epoch": 2.8925619834710745,
      "grad_norm": 8.523286819458008,
      "learning_rate": 1.8844735764031136e-06,
      "loss": 2.9847,
      "step": 2450
    },
    {
      "epoch": 2.9515938606847696,
      "grad_norm": 4.999369144439697,
      "learning_rate": 8.603031544448998e-07,
      "loss": 3.0083,
      "step": 2500
    }
  ],
  "logging_steps": 50,
  "max_steps": 2541,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2612183615668224.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}