| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 4096, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01220703125, | |
| "grad_norm": 2.1223344802856445, | |
| "learning_rate": 0.0002964111328125, | |
| "loss": 3.4492, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0244140625, | |
| "grad_norm": 2.2851481437683105, | |
| "learning_rate": 0.00029274902343749996, | |
| "loss": 2.9104, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03662109375, | |
| "grad_norm": 2.4541029930114746, | |
| "learning_rate": 0.0002890869140625, | |
| "loss": 2.8348, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.048828125, | |
| "grad_norm": 1.7225837707519531, | |
| "learning_rate": 0.00028542480468749995, | |
| "loss": 2.632, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06103515625, | |
| "grad_norm": 1.7940356731414795, | |
| "learning_rate": 0.0002817626953125, | |
| "loss": 2.5249, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0732421875, | |
| "grad_norm": 2.038764715194702, | |
| "learning_rate": 0.0002781005859375, | |
| "loss": 2.638, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08544921875, | |
| "grad_norm": 1.749374270439148, | |
| "learning_rate": 0.00027443847656249997, | |
| "loss": 2.5429, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09765625, | |
| "grad_norm": 2.4714009761810303, | |
| "learning_rate": 0.0002707763671875, | |
| "loss": 2.493, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10986328125, | |
| "grad_norm": 2.3028531074523926, | |
| "learning_rate": 0.00026711425781249996, | |
| "loss": 2.628, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1220703125, | |
| "grad_norm": 2.187049627304077, | |
| "learning_rate": 0.0002634521484375, | |
| "loss": 2.4359, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.13427734375, | |
| "grad_norm": 1.598210334777832, | |
| "learning_rate": 0.00025979003906249996, | |
| "loss": 2.4709, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.146484375, | |
| "grad_norm": 2.5418989658355713, | |
| "learning_rate": 0.0002561279296875, | |
| "loss": 2.3846, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.15869140625, | |
| "grad_norm": 1.573973536491394, | |
| "learning_rate": 0.0002524658203125, | |
| "loss": 2.389, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.1708984375, | |
| "grad_norm": 1.9976751804351807, | |
| "learning_rate": 0.0002488037109375, | |
| "loss": 2.2888, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.18310546875, | |
| "grad_norm": 2.015338659286499, | |
| "learning_rate": 0.0002451416015625, | |
| "loss": 2.4254, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1953125, | |
| "grad_norm": 1.4985029697418213, | |
| "learning_rate": 0.00024147949218749997, | |
| "loss": 2.3687, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.20751953125, | |
| "grad_norm": 1.7180825471878052, | |
| "learning_rate": 0.0002378173828125, | |
| "loss": 2.2702, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2197265625, | |
| "grad_norm": 1.9257062673568726, | |
| "learning_rate": 0.0002341552734375, | |
| "loss": 2.3384, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.23193359375, | |
| "grad_norm": 1.661446213722229, | |
| "learning_rate": 0.00023049316406249999, | |
| "loss": 2.3008, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.244140625, | |
| "grad_norm": 1.5374796390533447, | |
| "learning_rate": 0.00022683105468749998, | |
| "loss": 2.249, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.25634765625, | |
| "grad_norm": 2.2978200912475586, | |
| "learning_rate": 0.00022316894531249998, | |
| "loss": 2.4204, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.2685546875, | |
| "grad_norm": 1.5432515144348145, | |
| "learning_rate": 0.00021950683593749998, | |
| "loss": 2.2478, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.28076171875, | |
| "grad_norm": 1.3733800649642944, | |
| "learning_rate": 0.00021584472656249997, | |
| "loss": 2.3758, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.29296875, | |
| "grad_norm": 2.20448637008667, | |
| "learning_rate": 0.00021218261718749997, | |
| "loss": 2.4319, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.30517578125, | |
| "grad_norm": 2.009197950363159, | |
| "learning_rate": 0.0002085205078125, | |
| "loss": 2.3377, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3173828125, | |
| "grad_norm": 2.2222068309783936, | |
| "learning_rate": 0.0002048583984375, | |
| "loss": 2.2057, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.32958984375, | |
| "grad_norm": 1.4453160762786865, | |
| "learning_rate": 0.00020119628906249999, | |
| "loss": 2.2579, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.341796875, | |
| "grad_norm": 1.935774564743042, | |
| "learning_rate": 0.00019753417968749998, | |
| "loss": 2.3683, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.35400390625, | |
| "grad_norm": 1.6490037441253662, | |
| "learning_rate": 0.00019387207031249998, | |
| "loss": 2.1556, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3662109375, | |
| "grad_norm": 1.5202099084854126, | |
| "learning_rate": 0.00019020996093749998, | |
| "loss": 2.3038, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.37841796875, | |
| "grad_norm": 2.076368570327759, | |
| "learning_rate": 0.00018654785156249997, | |
| "loss": 2.2289, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "grad_norm": 2.0410258769989014, | |
| "learning_rate": 0.0001828857421875, | |
| "loss": 2.2923, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.40283203125, | |
| "grad_norm": 2.3565592765808105, | |
| "learning_rate": 0.0001792236328125, | |
| "loss": 2.1986, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.4150390625, | |
| "grad_norm": 2.239086151123047, | |
| "learning_rate": 0.0001755615234375, | |
| "loss": 2.2802, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.42724609375, | |
| "grad_norm": 1.6485434770584106, | |
| "learning_rate": 0.0001718994140625, | |
| "loss": 2.2984, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.439453125, | |
| "grad_norm": 1.72841477394104, | |
| "learning_rate": 0.00016823730468749998, | |
| "loss": 2.1722, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.45166015625, | |
| "grad_norm": 2.204685926437378, | |
| "learning_rate": 0.00016457519531249998, | |
| "loss": 2.2262, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.4638671875, | |
| "grad_norm": 1.953203797340393, | |
| "learning_rate": 0.00016091308593749998, | |
| "loss": 2.1813, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.47607421875, | |
| "grad_norm": 1.9295494556427002, | |
| "learning_rate": 0.00015725097656249997, | |
| "loss": 2.2335, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.48828125, | |
| "grad_norm": 1.726406455039978, | |
| "learning_rate": 0.0001535888671875, | |
| "loss": 2.1963, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.50048828125, | |
| "grad_norm": 2.202180862426758, | |
| "learning_rate": 0.0001499267578125, | |
| "loss": 2.0174, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5126953125, | |
| "grad_norm": 1.6080197095870972, | |
| "learning_rate": 0.0001462646484375, | |
| "loss": 2.3332, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.52490234375, | |
| "grad_norm": 1.879781723022461, | |
| "learning_rate": 0.0001426025390625, | |
| "loss": 2.1724, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.537109375, | |
| "grad_norm": 1.6669050455093384, | |
| "learning_rate": 0.00013894042968749999, | |
| "loss": 2.2165, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.54931640625, | |
| "grad_norm": 2.043221950531006, | |
| "learning_rate": 0.00013527832031249998, | |
| "loss": 2.1075, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.5615234375, | |
| "grad_norm": 2.403533935546875, | |
| "learning_rate": 0.00013161621093749998, | |
| "loss": 2.1541, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.57373046875, | |
| "grad_norm": 1.7574760913848877, | |
| "learning_rate": 0.00012795410156249998, | |
| "loss": 2.0927, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.5859375, | |
| "grad_norm": 1.649275302886963, | |
| "learning_rate": 0.0001242919921875, | |
| "loss": 2.1524, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.59814453125, | |
| "grad_norm": 1.8339548110961914, | |
| "learning_rate": 0.0001206298828125, | |
| "loss": 2.1512, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6103515625, | |
| "grad_norm": 1.6969799995422363, | |
| "learning_rate": 0.00011696777343749999, | |
| "loss": 2.1217, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.62255859375, | |
| "grad_norm": 2.043633460998535, | |
| "learning_rate": 0.00011330566406249999, | |
| "loss": 2.2374, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.634765625, | |
| "grad_norm": 1.7596385478973389, | |
| "learning_rate": 0.00010964355468749999, | |
| "loss": 2.1372, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.64697265625, | |
| "grad_norm": 1.6535054445266724, | |
| "learning_rate": 0.0001059814453125, | |
| "loss": 2.1119, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.6591796875, | |
| "grad_norm": 1.773929238319397, | |
| "learning_rate": 0.0001023193359375, | |
| "loss": 2.1854, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.67138671875, | |
| "grad_norm": 2.7318737506866455, | |
| "learning_rate": 9.865722656249999e-05, | |
| "loss": 2.126, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.68359375, | |
| "grad_norm": 1.675519585609436, | |
| "learning_rate": 9.499511718749999e-05, | |
| "loss": 2.1262, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.69580078125, | |
| "grad_norm": 2.4060487747192383, | |
| "learning_rate": 9.13330078125e-05, | |
| "loss": 2.1711, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7080078125, | |
| "grad_norm": 2.342238426208496, | |
| "learning_rate": 8.76708984375e-05, | |
| "loss": 2.0191, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.72021484375, | |
| "grad_norm": 1.7769362926483154, | |
| "learning_rate": 8.400878906249999e-05, | |
| "loss": 2.2146, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.732421875, | |
| "grad_norm": 2.0888478755950928, | |
| "learning_rate": 8.034667968749999e-05, | |
| "loss": 2.1147, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.74462890625, | |
| "grad_norm": 1.8527532815933228, | |
| "learning_rate": 7.66845703125e-05, | |
| "loss": 2.2788, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.7568359375, | |
| "grad_norm": 2.0175349712371826, | |
| "learning_rate": 7.30224609375e-05, | |
| "loss": 2.0617, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.76904296875, | |
| "grad_norm": 2.087881565093994, | |
| "learning_rate": 6.936035156249999e-05, | |
| "loss": 2.1744, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 1.489088535308838, | |
| "learning_rate": 6.569824218749999e-05, | |
| "loss": 2.2375, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.79345703125, | |
| "grad_norm": 2.086857795715332, | |
| "learning_rate": 6.20361328125e-05, | |
| "loss": 2.1169, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8056640625, | |
| "grad_norm": 2.4959232807159424, | |
| "learning_rate": 5.8374023437499995e-05, | |
| "loss": 2.0883, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.81787109375, | |
| "grad_norm": 1.963571310043335, | |
| "learning_rate": 5.47119140625e-05, | |
| "loss": 2.0865, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.830078125, | |
| "grad_norm": 2.02911376953125, | |
| "learning_rate": 5.1049804687499995e-05, | |
| "loss": 2.1444, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.84228515625, | |
| "grad_norm": 1.9320927858352661, | |
| "learning_rate": 4.73876953125e-05, | |
| "loss": 2.0413, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.8544921875, | |
| "grad_norm": 2.840395450592041, | |
| "learning_rate": 4.3725585937499995e-05, | |
| "loss": 2.2372, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.86669921875, | |
| "grad_norm": 2.4306936264038086, | |
| "learning_rate": 4.00634765625e-05, | |
| "loss": 2.0752, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.87890625, | |
| "grad_norm": 2.095372200012207, | |
| "learning_rate": 3.6401367187499996e-05, | |
| "loss": 2.1668, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.89111328125, | |
| "grad_norm": 2.0222434997558594, | |
| "learning_rate": 3.27392578125e-05, | |
| "loss": 2.1369, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9033203125, | |
| "grad_norm": 2.1027109622955322, | |
| "learning_rate": 2.9077148437499996e-05, | |
| "loss": 2.2062, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.91552734375, | |
| "grad_norm": 2.095820188522339, | |
| "learning_rate": 2.5415039062499996e-05, | |
| "loss": 2.0975, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.927734375, | |
| "grad_norm": 1.7171921730041504, | |
| "learning_rate": 2.17529296875e-05, | |
| "loss": 2.0854, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.93994140625, | |
| "grad_norm": 1.5770667791366577, | |
| "learning_rate": 1.80908203125e-05, | |
| "loss": 2.0413, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.9521484375, | |
| "grad_norm": 2.3405392169952393, | |
| "learning_rate": 1.4428710937499998e-05, | |
| "loss": 2.1474, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.96435546875, | |
| "grad_norm": 1.8116958141326904, | |
| "learning_rate": 1.0766601562499998e-05, | |
| "loss": 2.2726, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.9765625, | |
| "grad_norm": 2.7648468017578125, | |
| "learning_rate": 7.104492187499999e-06, | |
| "loss": 2.0461, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.98876953125, | |
| "grad_norm": 1.8557350635528564, | |
| "learning_rate": 3.4423828124999995e-06, | |
| "loss": 2.0223, | |
| "step": 4050 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 4096, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.16016998116352e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |