{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9873039581777445,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037341299477221805,
      "grad_norm": 0.9237794876098633,
      "learning_rate": 4.938386855862584e-05,
      "loss": 8.8586,
      "step": 100
    },
    {
      "epoch": 0.07468259895444361,
      "grad_norm": 1.3463598489761353,
      "learning_rate": 4.876151356733881e-05,
      "loss": 8.1136,
      "step": 200
    },
    {
      "epoch": 0.11202389843166542,
      "grad_norm": 1.2067362070083618,
      "learning_rate": 4.8139158576051786e-05,
      "loss": 7.8791,
      "step": 300
    },
    {
      "epoch": 0.14936519790888722,
      "grad_norm": 1.3990025520324707,
      "learning_rate": 4.7516803584764754e-05,
      "loss": 7.7099,
      "step": 400
    },
    {
      "epoch": 0.18670649738610903,
      "grad_norm": 1.5422577857971191,
      "learning_rate": 4.689444859347772e-05,
      "loss": 7.5067,
      "step": 500
    },
    {
      "epoch": 0.22404779686333084,
      "grad_norm": 1.439476490020752,
      "learning_rate": 4.627209360219069e-05,
      "loss": 7.4054,
      "step": 600
    },
    {
      "epoch": 0.26138909634055263,
      "grad_norm": 1.4985345602035522,
      "learning_rate": 4.564973861090366e-05,
      "loss": 7.3043,
      "step": 700
    },
    {
      "epoch": 0.29873039581777444,
      "grad_norm": 1.6559139490127563,
      "learning_rate": 4.502738361961663e-05,
      "loss": 7.1577,
      "step": 800
    },
    {
      "epoch": 0.33607169529499625,
      "grad_norm": 1.677016258239746,
      "learning_rate": 4.4405028628329605e-05,
      "loss": 7.0927,
      "step": 900
    },
    {
      "epoch": 0.37341299477221807,
      "grad_norm": 1.5764284133911133,
      "learning_rate": 4.378267363704257e-05,
      "loss": 6.9999,
      "step": 1000
    },
    {
      "epoch": 0.4107542942494399,
      "grad_norm": 1.7246273756027222,
      "learning_rate": 4.316031864575554e-05,
      "loss": 6.9177,
      "step": 1100
    },
    {
      "epoch": 0.4480955937266617,
      "grad_norm": 1.7886656522750854,
      "learning_rate": 4.253796365446851e-05,
      "loss": 6.8377,
      "step": 1200
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 1.710162878036499,
      "learning_rate": 4.191560866318148e-05,
      "loss": 6.7706,
      "step": 1300
    },
    {
      "epoch": 0.5227781926811053,
      "grad_norm": 1.8272024393081665,
      "learning_rate": 4.129325367189445e-05,
      "loss": 6.7233,
      "step": 1400
    },
    {
      "epoch": 0.5601194921583271,
      "grad_norm": 1.8057252168655396,
      "learning_rate": 4.0670898680607424e-05,
      "loss": 6.6577,
      "step": 1500
    },
    {
      "epoch": 0.5974607916355489,
      "grad_norm": 1.9049605131149292,
      "learning_rate": 4.004854368932039e-05,
      "loss": 6.6056,
      "step": 1600
    },
    {
      "epoch": 0.6348020911127707,
      "grad_norm": 1.9891490936279297,
      "learning_rate": 3.942618869803336e-05,
      "loss": 6.5396,
      "step": 1700
    },
    {
      "epoch": 0.6721433905899925,
      "grad_norm": 1.8199862241744995,
      "learning_rate": 3.880383370674633e-05,
      "loss": 6.4798,
      "step": 1800
    },
    {
      "epoch": 0.7094846900672144,
      "grad_norm": 1.8243427276611328,
      "learning_rate": 3.81814787154593e-05,
      "loss": 6.4103,
      "step": 1900
    },
    {
      "epoch": 0.7468259895444361,
      "grad_norm": 1.8216381072998047,
      "learning_rate": 3.755912372417227e-05,
      "loss": 6.4254,
      "step": 2000
    },
    {
      "epoch": 0.784167289021658,
      "grad_norm": 1.7757889032363892,
      "learning_rate": 3.693676873288524e-05,
      "loss": 6.3402,
      "step": 2100
    },
    {
      "epoch": 0.8215085884988798,
      "grad_norm": 1.9034113883972168,
      "learning_rate": 3.631441374159821e-05,
      "loss": 6.3114,
      "step": 2200
    },
    {
      "epoch": 0.8588498879761016,
      "grad_norm": 1.8567832708358765,
      "learning_rate": 3.569205875031118e-05,
      "loss": 6.2969,
      "step": 2300
    },
    {
      "epoch": 0.8961911874533234,
      "grad_norm": 1.7683762311935425,
      "learning_rate": 3.5069703759024146e-05,
      "loss": 6.2537,
      "step": 2400
    },
    {
      "epoch": 0.9335324869305451,
      "grad_norm": 1.8633127212524414,
      "learning_rate": 3.444734876773712e-05,
      "loss": 6.2365,
      "step": 2500
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 2.112578868865967,
      "learning_rate": 3.382499377645009e-05,
      "loss": 6.1906,
      "step": 2600
    },
    {
      "epoch": 1.0082150858849888,
      "grad_norm": 2.0527567863464355,
      "learning_rate": 3.320263878516306e-05,
      "loss": 6.1523,
      "step": 2700
    },
    {
      "epoch": 1.0455563853622105,
      "grad_norm": 1.8975645303726196,
      "learning_rate": 3.258028379387602e-05,
      "loss": 6.0659,
      "step": 2800
    },
    {
      "epoch": 1.0828976848394325,
      "grad_norm": 1.9186785221099854,
      "learning_rate": 3.1957928802589e-05,
      "loss": 6.0698,
      "step": 2900
    },
    {
      "epoch": 1.1202389843166543,
      "grad_norm": 1.9995322227478027,
      "learning_rate": 3.1335573811301965e-05,
      "loss": 6.019,
      "step": 3000
    },
    {
      "epoch": 1.157580283793876,
      "grad_norm": 1.8683958053588867,
      "learning_rate": 3.071321882001494e-05,
      "loss": 6.007,
      "step": 3100
    },
    {
      "epoch": 1.1949215832710978,
      "grad_norm": 1.8966848850250244,
      "learning_rate": 3.0090863828727907e-05,
      "loss": 5.9987,
      "step": 3200
    },
    {
      "epoch": 1.2322628827483197,
      "grad_norm": 2.010756731033325,
      "learning_rate": 2.9468508837440878e-05,
      "loss": 5.9598,
      "step": 3300
    },
    {
      "epoch": 1.2696041822255415,
      "grad_norm": 2.081808567047119,
      "learning_rate": 2.8846153846153845e-05,
      "loss": 5.9516,
      "step": 3400
    },
    {
      "epoch": 1.3069454817027633,
      "grad_norm": 1.923231601715088,
      "learning_rate": 2.8223798854866816e-05,
      "loss": 5.9389,
      "step": 3500
    },
    {
      "epoch": 1.344286781179985,
      "grad_norm": 1.9322913885116577,
      "learning_rate": 2.7601443863579784e-05,
      "loss": 5.9107,
      "step": 3600
    },
    {
      "epoch": 1.3816280806572068,
      "grad_norm": 1.9434425830841064,
      "learning_rate": 2.697908887229276e-05,
      "loss": 5.8912,
      "step": 3700
    },
    {
      "epoch": 1.4189693801344287,
      "grad_norm": 2.046572208404541,
      "learning_rate": 2.635673388100573e-05,
      "loss": 5.8741,
      "step": 3800
    },
    {
      "epoch": 1.4563106796116505,
      "grad_norm": 1.9436527490615845,
      "learning_rate": 2.5734378889718697e-05,
      "loss": 5.8457,
      "step": 3900
    },
    {
      "epoch": 1.4936519790888723,
      "grad_norm": 2.0540173053741455,
      "learning_rate": 2.5112023898431668e-05,
      "loss": 5.8389,
      "step": 4000
    },
    {
      "epoch": 1.5309932785660942,
      "grad_norm": 2.0585784912109375,
      "learning_rate": 2.4489668907144635e-05,
      "loss": 5.8202,
      "step": 4100
    },
    {
      "epoch": 1.568334578043316,
      "grad_norm": 2.124342441558838,
      "learning_rate": 2.3867313915857606e-05,
      "loss": 5.8081,
      "step": 4200
    },
    {
      "epoch": 1.6056758775205378,
      "grad_norm": 2.073033571243286,
      "learning_rate": 2.3244958924570577e-05,
      "loss": 5.8116,
      "step": 4300
    },
    {
      "epoch": 1.6430171769977595,
      "grad_norm": 2.0749969482421875,
      "learning_rate": 2.2622603933283545e-05,
      "loss": 5.7896,
      "step": 4400
    },
    {
      "epoch": 1.6803584764749813,
      "grad_norm": 2.076416015625,
      "learning_rate": 2.2000248941996516e-05,
      "loss": 5.7661,
      "step": 4500
    },
    {
      "epoch": 1.717699775952203,
      "grad_norm": 2.149789810180664,
      "learning_rate": 2.1377893950709483e-05,
      "loss": 5.7682,
      "step": 4600
    },
    {
      "epoch": 1.7550410754294248,
      "grad_norm": 2.0563135147094727,
      "learning_rate": 2.0755538959422454e-05,
      "loss": 5.7755,
      "step": 4700
    },
    {
      "epoch": 1.7923823749066468,
      "grad_norm": 2.032025098800659,
      "learning_rate": 2.0133183968135425e-05,
      "loss": 5.7458,
      "step": 4800
    },
    {
      "epoch": 1.8297236743838685,
      "grad_norm": 2.0097744464874268,
      "learning_rate": 1.9510828976848393e-05,
      "loss": 5.7221,
      "step": 4900
    },
    {
      "epoch": 1.8670649738610905,
      "grad_norm": 2.105161190032959,
      "learning_rate": 1.8888473985561364e-05,
      "loss": 5.7195,
      "step": 5000
    },
    {
      "epoch": 1.9044062733383122,
      "grad_norm": 2.0916309356689453,
      "learning_rate": 1.8266118994274335e-05,
      "loss": 5.7039,
      "step": 5100
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 2.057687520980835,
      "learning_rate": 1.7643764002987302e-05,
      "loss": 5.6782,
      "step": 5200
    },
    {
      "epoch": 1.9790888722927558,
      "grad_norm": 2.152930498123169,
      "learning_rate": 1.7021409011700273e-05,
      "loss": 5.6739,
      "step": 5300
    },
    {
      "epoch": 2.0164301717699775,
      "grad_norm": 2.140216827392578,
      "learning_rate": 1.6399054020413244e-05,
      "loss": 5.6407,
      "step": 5400
    },
    {
      "epoch": 2.0537714712471993,
      "grad_norm": 2.0678555965423584,
      "learning_rate": 1.5776699029126215e-05,
      "loss": 5.5972,
      "step": 5500
    },
    {
      "epoch": 2.091112770724421,
      "grad_norm": 2.1241703033447266,
      "learning_rate": 1.5154344037839185e-05,
      "loss": 5.5728,
      "step": 5600
    },
    {
      "epoch": 2.1284540702016432,
      "grad_norm": 2.1589901447296143,
      "learning_rate": 1.4531989046552156e-05,
      "loss": 5.6079,
      "step": 5700
    },
    {
      "epoch": 2.165795369678865,
      "grad_norm": 2.1543970108032227,
      "learning_rate": 1.3909634055265125e-05,
      "loss": 5.5878,
      "step": 5800
    },
    {
      "epoch": 2.2031366691560867,
      "grad_norm": 2.368490219116211,
      "learning_rate": 1.3287279063978094e-05,
      "loss": 5.5668,
      "step": 5900
    },
    {
      "epoch": 2.2404779686333085,
      "grad_norm": 2.3079488277435303,
      "learning_rate": 1.2664924072691065e-05,
      "loss": 5.5384,
      "step": 6000
    },
    {
      "epoch": 2.2778192681105303,
      "grad_norm": 2.2130212783813477,
      "learning_rate": 1.2042569081404033e-05,
      "loss": 5.5472,
      "step": 6100
    },
    {
      "epoch": 2.315160567587752,
      "grad_norm": 2.1821630001068115,
      "learning_rate": 1.1420214090117002e-05,
      "loss": 5.5582,
      "step": 6200
    },
    {
      "epoch": 2.3525018670649738,
      "grad_norm": 2.238124132156372,
      "learning_rate": 1.0797859098829975e-05,
      "loss": 5.5296,
      "step": 6300
    },
    {
      "epoch": 2.3898431665421955,
      "grad_norm": 2.233442544937134,
      "learning_rate": 1.0175504107542944e-05,
      "loss": 5.5563,
      "step": 6400
    },
    {
      "epoch": 2.4271844660194173,
      "grad_norm": 2.1606245040893555,
      "learning_rate": 9.553149116255913e-06,
      "loss": 5.5595,
      "step": 6500
    },
    {
      "epoch": 2.4645257654966395,
      "grad_norm": 2.3241500854492188,
      "learning_rate": 8.930794124968882e-06,
      "loss": 5.5406,
      "step": 6600
    },
    {
      "epoch": 2.5018670649738612,
      "grad_norm": 2.2552995681762695,
      "learning_rate": 8.308439133681853e-06,
      "loss": 5.522,
      "step": 6700
    },
    {
      "epoch": 2.539208364451083,
      "grad_norm": 2.2733113765716553,
      "learning_rate": 7.686084142394823e-06,
      "loss": 5.5386,
      "step": 6800
    },
    {
      "epoch": 2.5765496639283048,
      "grad_norm": 2.2476372718811035,
      "learning_rate": 7.063729151107793e-06,
      "loss": 5.5117,
      "step": 6900
    },
    {
      "epoch": 2.6138909634055265,
      "grad_norm": 2.283897638320923,
      "learning_rate": 6.441374159820762e-06,
      "loss": 5.5096,
      "step": 7000
    },
    {
      "epoch": 2.6512322628827483,
      "grad_norm": 2.2005655765533447,
      "learning_rate": 5.819019168533732e-06,
      "loss": 5.4929,
      "step": 7100
    },
    {
      "epoch": 2.68857356235997,
      "grad_norm": 2.318183422088623,
      "learning_rate": 5.1966641772467014e-06,
      "loss": 5.5228,
      "step": 7200
    },
    {
      "epoch": 2.725914861837192,
      "grad_norm": 2.2970705032348633,
      "learning_rate": 4.5743091859596715e-06,
      "loss": 5.5041,
      "step": 7300
    },
    {
      "epoch": 2.7632561613144135,
      "grad_norm": 2.26883602142334,
      "learning_rate": 3.951954194672642e-06,
      "loss": 5.4682,
      "step": 7400
    },
    {
      "epoch": 2.8005974607916357,
      "grad_norm": 2.2655177116394043,
      "learning_rate": 3.329599203385611e-06,
      "loss": 5.484,
      "step": 7500
    },
    {
      "epoch": 2.8379387602688575,
      "grad_norm": 2.264005184173584,
      "learning_rate": 2.707244212098581e-06,
      "loss": 5.4943,
      "step": 7600
    },
    {
      "epoch": 2.8752800597460793,
      "grad_norm": 2.2275617122650146,
      "learning_rate": 2.0848892208115507e-06,
      "loss": 5.5058,
      "step": 7700
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 2.2127552032470703,
      "learning_rate": 1.4625342295245209e-06,
      "loss": 5.4636,
      "step": 7800
    },
    {
      "epoch": 2.9499626587005228,
      "grad_norm": 2.3027756214141846,
      "learning_rate": 8.401792382374907e-07,
      "loss": 5.4856,
      "step": 7900
    },
    {
      "epoch": 2.9873039581777445,
      "grad_norm": 2.233044147491455,
      "learning_rate": 2.1782424695046054e-07,
      "loss": 5.4813,
      "step": 8000
    }
  ],
  "logging_steps": 100,
  "max_steps": 8034,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8359993562628096.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}