| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 300000000000000000, |
| "global_step": 4230, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02364066193853428, |
| "grad_norm": 1506.188232421875, |
| "learning_rate": 2.978723404255319e-06, |
| "loss": 19.5964, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.04728132387706856, |
| "grad_norm": 1148.9078369140625, |
| "learning_rate": 6.4539007092198585e-06, |
| "loss": 7.2922, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07092198581560284, |
| "grad_norm": 625.9593505859375, |
| "learning_rate": 9.929078014184396e-06, |
| "loss": 2.0938, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09456264775413711, |
| "grad_norm": 190.0230255126953, |
| "learning_rate": 1.347517730496454e-05, |
| "loss": 1.5984, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.1182033096926714, |
| "grad_norm": 201.42596435546875, |
| "learning_rate": 1.7021276595744682e-05, |
| "loss": 1.4525, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.14184397163120568, |
| "grad_norm": 42.652748107910156, |
| "learning_rate": 2.0567375886524822e-05, |
| "loss": 1.299, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.16548463356973994, |
| "grad_norm": 38.504234313964844, |
| "learning_rate": 2.4113475177304965e-05, |
| "loss": 1.2265, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.18912529550827423, |
| "grad_norm": 12.436441421508789, |
| "learning_rate": 2.765957446808511e-05, |
| "loss": 1.157, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2127659574468085, |
| "grad_norm": 7.723984241485596, |
| "learning_rate": 2.9998524001298715e-05, |
| "loss": 1.0646, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2364066193853428, |
| "grad_norm": 6.159175395965576, |
| "learning_rate": 2.9977078965004553e-05, |
| "loss": 0.9783, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.26004728132387706, |
| "grad_norm": 5.534773826599121, |
| "learning_rate": 2.9930139839249263e-05, |
| "loss": 0.9361, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.28368794326241137, |
| "grad_norm": 4.956591606140137, |
| "learning_rate": 2.9857786524143804e-05, |
| "loss": 0.8943, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3073286052009456, |
| "grad_norm": 4.068881511688232, |
| "learning_rate": 2.976014218001153e-05, |
| "loss": 0.8575, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.3309692671394799, |
| "grad_norm": 6.331308364868164, |
| "learning_rate": 2.963737301774379e-05, |
| "loss": 0.8035, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.3546099290780142, |
| "grad_norm": 3.5792810916900635, |
| "learning_rate": 2.9489688015874604e-05, |
| "loss": 0.7974, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.37825059101654845, |
| "grad_norm": 4.266261577606201, |
| "learning_rate": 2.9317338564855907e-05, |
| "loss": 0.7663, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.40189125295508277, |
| "grad_norm": 3.804710865020752, |
| "learning_rate": 2.9120618039138956e-05, |
| "loss": 0.7613, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.425531914893617, |
| "grad_norm": 4.593356132507324, |
| "learning_rate": 2.889986129779028e-05, |
| "loss": 0.7473, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.4491725768321513, |
| "grad_norm": 3.493183135986328, |
| "learning_rate": 2.86554441144922e-05, |
| "loss": 0.7426, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.4728132387706856, |
| "grad_norm": 5.607442855834961, |
| "learning_rate": 2.838778253789822e-05, |
| "loss": 0.7147, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.49645390070921985, |
| "grad_norm": 5.357724189758301, |
| "learning_rate": 2.8097332183432076e-05, |
| "loss": 0.7038, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.5200945626477541, |
| "grad_norm": 4.367136478424072, |
| "learning_rate": 2.7784587457735947e-05, |
| "loss": 0.6779, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.5437352245862884, |
| "grad_norm": 5.684892654418945, |
| "learning_rate": 2.7450080717087995e-05, |
| "loss": 0.6698, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.5673758865248227, |
| "grad_norm": 9.005069732666016, |
| "learning_rate": 2.7094381361221724e-05, |
| "loss": 0.692, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.5910165484633569, |
| "grad_norm": 6.619784832000732, |
| "learning_rate": 2.6718094864089753e-05, |
| "loss": 0.6623, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.6146572104018913, |
| "grad_norm": 5.136876583099365, |
| "learning_rate": 2.63218617432218e-05, |
| "loss": 0.6518, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.6382978723404256, |
| "grad_norm": 9.187308311462402, |
| "learning_rate": 2.590635646943119e-05, |
| "loss": 0.6466, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.6619385342789598, |
| "grad_norm": 9.047890663146973, |
| "learning_rate": 2.547228631872591e-05, |
| "loss": 0.6326, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.6855791962174941, |
| "grad_norm": 5.511709690093994, |
| "learning_rate": 2.5020390168378374e-05, |
| "loss": 0.6261, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.7092198581560284, |
| "grad_norm": 11.79383659362793, |
| "learning_rate": 2.4551437239203342e-05, |
| "loss": 0.6387, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.7328605200945626, |
| "grad_norm": 11.677322387695312, |
| "learning_rate": 2.4066225786184802e-05, |
| "loss": 0.6246, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.7565011820330969, |
| "grad_norm": 6.934109687805176, |
| "learning_rate": 2.3565581739680718e-05, |
| "loss": 0.607, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.7801418439716312, |
| "grad_norm": 10.238001823425293, |
| "learning_rate": 2.3050357299518546e-05, |
| "loss": 0.5975, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.8037825059101655, |
| "grad_norm": 4.702815055847168, |
| "learning_rate": 2.2521429484374676e-05, |
| "loss": 0.5886, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.8274231678486997, |
| "grad_norm": 8.547035217285156, |
| "learning_rate": 2.197969863890705e-05, |
| "loss": 0.5869, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.851063829787234, |
| "grad_norm": 6.0442399978637695, |
| "learning_rate": 2.1426086901182144e-05, |
| "loss": 0.5837, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.8747044917257684, |
| "grad_norm": 6.5389018058776855, |
| "learning_rate": 2.086153663300503e-05, |
| "loss": 0.5885, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.8983451536643026, |
| "grad_norm": 22.425125122070312, |
| "learning_rate": 2.0287008815824495e-05, |
| "loss": 0.5859, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.9219858156028369, |
| "grad_norm": 11.52462387084961, |
| "learning_rate": 1.9703481414943606e-05, |
| "loss": 0.5714, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.9456264775413712, |
| "grad_norm": 37.39026641845703, |
| "learning_rate": 1.9111947714820277e-05, |
| "loss": 0.565, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.9692671394799054, |
| "grad_norm": 5.455526351928711, |
| "learning_rate": 1.85134146282915e-05, |
| "loss": 0.5478, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.9929078014184397, |
| "grad_norm": 22.083446502685547, |
| "learning_rate": 1.7908900982599148e-05, |
| "loss": 0.5469, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.016548463356974, |
| "grad_norm": 10.494434356689453, |
| "learning_rate": 1.7299435785135098e-05, |
| "loss": 0.5444, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.0401891252955082, |
| "grad_norm": 20.982011795043945, |
| "learning_rate": 1.6686056471857595e-05, |
| "loss": 0.5356, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.0638297872340425, |
| "grad_norm": 10.873411178588867, |
| "learning_rate": 1.606980714136041e-05, |
| "loss": 0.5466, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.0874704491725768, |
| "grad_norm": 33.547916412353516, |
| "learning_rate": 1.5451736777600882e-05, |
| "loss": 0.5157, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 14.819494247436523, |
| "learning_rate": 1.4832897464312018e-05, |
| "loss": 0.5492, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.1347517730496455, |
| "grad_norm": 12.469233512878418, |
| "learning_rate": 1.4214342594138124e-05, |
| "loss": 0.5305, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.1583924349881798, |
| "grad_norm": 6.700783729553223, |
| "learning_rate": 1.3597125075542446e-05, |
| "loss": 0.5401, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.1820330969267139, |
| "grad_norm": 7.968286514282227, |
| "learning_rate": 1.2982295540538918e-05, |
| "loss": 0.528, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.2056737588652482, |
| "grad_norm": 6.891226291656494, |
| "learning_rate": 1.237090055629899e-05, |
| "loss": 0.5242, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.2293144208037825, |
| "grad_norm": 25.54634666442871, |
| "learning_rate": 1.1763980843677541e-05, |
| "loss": 0.4995, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.2529550827423168, |
| "grad_norm": 5.665143966674805, |
| "learning_rate": 1.1162569505690563e-05, |
| "loss": 0.4916, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.2765957446808511, |
| "grad_norm": 127.18929290771484, |
| "learning_rate": 1.0567690268959864e-05, |
| "loss": 0.4954, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.3002364066193852, |
| "grad_norm": 8.059895515441895, |
| "learning_rate": 9.980355741118442e-06, |
| "loss": 0.5046, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.3238770685579198, |
| "grad_norm": 5.609386920928955, |
| "learning_rate": 9.401565687142579e-06, |
| "loss": 0.5089, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.3475177304964538, |
| "grad_norm": 12.760987281799316, |
| "learning_rate": 8.832305327544893e-06, |
| "loss": 0.4951, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.3711583924349882, |
| "grad_norm": 3.802379846572876, |
| "learning_rate": 8.27354366132499e-06, |
| "loss": 0.4945, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.3947990543735225, |
| "grad_norm": 9.105464935302734, |
| "learning_rate": 7.726231816532574e-06, |
| "loss": 0.4808, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.4184397163120568, |
| "grad_norm": 41.7253303527832, |
| "learning_rate": 7.1913014312505226e-06, |
| "loss": 0.4898, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.442080378250591, |
| "grad_norm": 3.800203800201416, |
| "learning_rate": 6.6696630677540235e-06, |
| "loss": 0.4718, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.4657210401891252, |
| "grad_norm": 8.743084907531738, |
| "learning_rate": 6.162204662544992e-06, |
| "loss": 0.488, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.4893617021276595, |
| "grad_norm": 7.492750644683838, |
| "learning_rate": 5.66979001490036e-06, |
| "loss": 0.4593, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.5130023640661938, |
| "grad_norm": 3.9157655239105225, |
| "learning_rate": 5.193257316506778e-06, |
| "loss": 0.4883, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.5366430260047281, |
| "grad_norm": 6.936612606048584, |
| "learning_rate": 4.733417724684879e-06, |
| "loss": 0.464, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.5602836879432624, |
| "grad_norm": 7.2238688468933105, |
| "learning_rate": 4.2910539816315166e-06, |
| "loss": 0.4663, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.5839243498817965, |
| "grad_norm": 3.867790460586548, |
| "learning_rate": 3.866919082030514e-06, |
| "loss": 0.4485, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.607565011820331, |
| "grad_norm": 6.582269668579102, |
| "learning_rate": 3.461734991299779e-06, |
| "loss": 0.4421, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.6312056737588652, |
| "grad_norm": 5.653420448303223, |
| "learning_rate": 3.0761914166566895e-06, |
| "loss": 0.4673, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.6548463356973995, |
| "grad_norm": 12.142635345458984, |
| "learning_rate": 2.71094463309358e-06, |
| "loss": 0.4525, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.6784869976359338, |
| "grad_norm": 9.567633628845215, |
| "learning_rate": 2.3666163662618575e-06, |
| "loss": 0.4698, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.702127659574468, |
| "grad_norm": 5.014305591583252, |
| "learning_rate": 2.043792734166174e-06, |
| "loss": 0.4669, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.7257683215130024, |
| "grad_norm": 5.084635257720947, |
| "learning_rate": 1.7430232494702537e-06, |
| "loss": 0.449, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.7494089834515365, |
| "grad_norm": 14.368584632873535, |
| "learning_rate": 1.4648198841125453e-06, |
| "loss": 0.447, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.773049645390071, |
| "grad_norm": 5.743776321411133, |
| "learning_rate": 1.209656197823985e-06, |
| "loss": 0.4455, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.7966903073286051, |
| "grad_norm": 5.783755779266357, |
| "learning_rate": 9.779665320312675e-07, |
| "loss": 0.4416, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.8203309692671394, |
| "grad_norm": 7.874058723449707, |
| "learning_rate": 7.701452705178236e-07, |
| "loss": 0.4501, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.8439716312056738, |
| "grad_norm": 8.542813301086426, |
| "learning_rate": 5.865461681009542e-07, |
| "loss": 0.4626, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.867612293144208, |
| "grad_norm": 6.454606056213379, |
| "learning_rate": 4.2748174846788724e-07, |
| "loss": 0.4527, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.8912529550827424, |
| "grad_norm": 18.86402702331543, |
| "learning_rate": 2.9322277219574145e-07, |
| "loss": 0.4503, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.9148936170212765, |
| "grad_norm": 6.615504741668701, |
| "learning_rate": 1.839977758609801e-07, |
| "loss": 0.4526, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.938534278959811, |
| "grad_norm": 11.798493385314941, |
| "learning_rate": 9.99926830228265e-08, |
| "loss": 0.4485, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.962174940898345, |
| "grad_norm": 20.15699577331543, |
| "learning_rate": 4.135048774287553e-08, |
| "loss": 0.4371, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.9858156028368794, |
| "grad_norm": 15.80203628540039, |
| "learning_rate": 8.171011179587961e-09, |
| "loss": 0.4495, |
| "step": 4200 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 4230, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1254586343864402e+18, |
| "train_batch_size": 12, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|