{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.234146341463415,
  "eval_steps": 50,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.14482291042804718,
      "learning_rate": 4.222222222222222e-05,
      "loss": 1.0074,
      "step": 20
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 0.24155008792877197,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.9693,
      "step": 40
    },
    {
      "epoch": 0.975609756097561,
      "eval_runtime": 0.0074,
      "eval_samples_per_second": 268.521,
      "eval_steps_per_second": 134.261,
      "step": 50
    },
    {
      "epoch": 1.1560975609756097,
      "grad_norm": 0.2531428337097168,
      "learning_rate": 0.00013111111111111111,
      "loss": 0.8984,
      "step": 60
    },
    {
      "epoch": 1.5463414634146342,
      "grad_norm": 0.2627839148044586,
      "learning_rate": 0.00017555555555555556,
      "loss": 0.8435,
      "step": 80
    },
    {
      "epoch": 1.9365853658536585,
      "grad_norm": 0.24496296048164368,
      "learning_rate": 0.0001999952797459453,
      "loss": 0.8363,
      "step": 100
    },
    {
      "epoch": 1.9365853658536585,
      "eval_runtime": 0.0079,
      "eval_samples_per_second": 252.236,
      "eval_steps_per_second": 126.118,
      "step": 100
    },
    {
      "epoch": 2.3121951219512193,
      "grad_norm": 0.3583034873008728,
      "learning_rate": 0.00019995099455998424,
      "loss": 0.742,
      "step": 120
    },
    {
      "epoch": 2.7024390243902436,
      "grad_norm": 0.273295134305954,
      "learning_rate": 0.00019986011387082198,
      "loss": 0.731,
      "step": 140
    },
    {
      "epoch": 2.897560975609756,
      "eval_runtime": 0.0076,
      "eval_samples_per_second": 263.122,
      "eval_steps_per_second": 131.561,
      "step": 150
    },
    {
      "epoch": 3.078048780487805,
      "grad_norm": 0.365675687789917,
      "learning_rate": 0.0001997226800455352,
      "loss": 0.6594,
      "step": 160
    },
    {
      "epoch": 3.4682926829268292,
      "grad_norm": 0.3552844226360321,
      "learning_rate": 0.00019953875715350382,
      "loss": 0.5675,
      "step": 180
    },
    {
      "epoch": 3.8585365853658535,
      "grad_norm": 0.3008895218372345,
      "learning_rate": 0.00019930843093654305,
      "loss": 0.5654,
      "step": 200
    },
    {
      "epoch": 3.8585365853658535,
      "eval_runtime": 0.013,
      "eval_samples_per_second": 153.267,
      "eval_steps_per_second": 76.633,
      "step": 200
    },
    {
      "epoch": 4.234146341463415,
      "grad_norm": 0.3664332330226898,
      "learning_rate": 0.00019903180876893194,
      "loss": 0.4938,
      "step": 220
    },
    {
      "epoch": 4.624390243902439,
      "grad_norm": 0.39528584480285645,
      "learning_rate": 0.0001987090196073572,
      "loss": 0.4202,
      "step": 240
    },
    {
      "epoch": 4.819512195121951,
      "eval_runtime": 0.0074,
      "eval_samples_per_second": 270.592,
      "eval_steps_per_second": 135.296,
      "step": 250
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.7486668229103088,
      "learning_rate": 0.00019834021393079585,
      "loss": 0.4247,
      "step": 260
    },
    {
      "epoch": 5.390243902439025,
      "grad_norm": 0.3803549110889435,
      "learning_rate": 0.00019792556367036432,
      "loss": 0.3021,
      "step": 280
    },
    {
      "epoch": 5.780487804878049,
      "grad_norm": 0.3973694145679474,
      "learning_rate": 0.00019746526212916705,
      "loss": 0.3178,
      "step": 300
    },
    {
      "epoch": 5.780487804878049,
      "eval_runtime": 0.0079,
      "eval_samples_per_second": 253.892,
      "eval_steps_per_second": 126.946,
      "step": 300
    },
    {
      "epoch": 6.15609756097561,
      "grad_norm": 0.42896056175231934,
      "learning_rate": 0.00019695952389218167,
      "loss": 0.2677,
      "step": 320
    },
    {
      "epoch": 6.546341463414635,
      "grad_norm": 0.43547552824020386,
      "learning_rate": 0.00019640858472622316,
      "loss": 0.206,
      "step": 340
    },
    {
      "epoch": 6.741463414634146,
      "eval_runtime": 0.0085,
      "eval_samples_per_second": 235.192,
      "eval_steps_per_second": 117.596,
      "step": 350
    },
    {
      "epoch": 6.9365853658536585,
      "grad_norm": 0.3835262656211853,
      "learning_rate": 0.0001958127014700332,
      "loss": 0.1982,
      "step": 360
    },
    {
      "epoch": 7.31219512195122,
      "grad_norm": 0.4060633182525635,
      "learning_rate": 0.00019517215191454618,
      "loss": 0.1558,
      "step": 380
    },
    {
      "epoch": 7.702439024390244,
      "grad_norm": 0.45835667848587036,
      "learning_rate": 0.00019448723467338763,
      "loss": 0.1233,
      "step": 400
    },
    {
      "epoch": 7.702439024390244,
      "eval_runtime": 0.0082,
      "eval_samples_per_second": 243.141,
      "eval_steps_per_second": 121.571,
      "step": 400
    },
    {
      "epoch": 8.078048780487805,
      "grad_norm": 0.3501657247543335,
      "learning_rate": 0.00019375826904366553,
      "loss": 0.1306,
      "step": 420
    },
    {
      "epoch": 8.46829268292683,
      "grad_norm": 0.412287175655365,
      "learning_rate": 0.00019298559485711927,
      "loss": 0.0828,
      "step": 440
    },
    {
      "epoch": 8.663414634146342,
      "eval_runtime": 0.0077,
      "eval_samples_per_second": 260.524,
      "eval_steps_per_second": 130.262,
      "step": 450
    },
    {
      "epoch": 8.858536585365854,
      "grad_norm": 0.39604613184928894,
      "learning_rate": 0.0001921695723216957,
      "loss": 0.1054,
      "step": 460
    },
    {
      "epoch": 9.234146341463415,
      "grad_norm": 0.3440980017185211,
      "learning_rate": 0.00019131058185362597,
      "loss": 0.066,
      "step": 480
    },
    {
      "epoch": 9.62439024390244,
      "grad_norm": 0.3329509198665619,
      "learning_rate": 0.00019040902390008216,
      "loss": 0.0626,
      "step": 500
    },
    {
      "epoch": 9.62439024390244,
      "eval_runtime": 0.0081,
      "eval_samples_per_second": 245.842,
      "eval_steps_per_second": 122.921,
      "step": 500
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.6339950561523438,
      "learning_rate": 0.00018946531875249493,
      "loss": 0.0633,
      "step": 520
    },
    {
      "epoch": 10.390243902439025,
      "grad_norm": 0.3356594443321228,
      "learning_rate": 0.000188479906350621,
      "loss": 0.0427,
      "step": 540
    },
    {
      "epoch": 10.585365853658537,
      "eval_runtime": 0.0078,
      "eval_samples_per_second": 255.462,
      "eval_steps_per_second": 127.731,
      "step": 550
    },
    {
      "epoch": 10.78048780487805,
      "grad_norm": 0.29425719380378723,
      "learning_rate": 0.0001874532460774503,
      "loss": 0.0427,
      "step": 560
    },
    {
      "epoch": 11.15609756097561,
      "grad_norm": 0.3889470100402832,
      "learning_rate": 0.0001863858165450492,
      "loss": 0.0375,
      "step": 580
    },
    {
      "epoch": 11.546341463414635,
      "grad_norm": 0.2772677540779114,
      "learning_rate": 0.00018527811537143954,
      "loss": 0.0308,
      "step": 600
    },
    {
      "epoch": 11.546341463414635,
      "eval_runtime": 0.0083,
      "eval_samples_per_second": 240.741,
      "eval_steps_per_second": 120.37,
      "step": 600
    },
    {
      "epoch": 11.93658536585366,
      "grad_norm": 0.2893534004688263,
      "learning_rate": 0.00018413065894861728,
      "loss": 0.0305,
      "step": 620
    },
    {
      "epoch": 12.31219512195122,
      "grad_norm": 0.2675611078739166,
      "learning_rate": 0.00018294398220181917,
      "loss": 0.029,
      "step": 640
    },
    {
      "epoch": 12.507317073170732,
      "eval_runtime": 0.0121,
      "eval_samples_per_second": 165.868,
      "eval_steps_per_second": 82.934,
      "step": 650
    },
    {
      "epoch": 12.702439024390245,
      "grad_norm": 0.258488267660141,
      "learning_rate": 0.00018171863834014928,
      "loss": 0.0228,
      "step": 660
    },
    {
      "epoch": 13.078048780487805,
      "grad_norm": 0.2041754275560379,
      "learning_rate": 0.00018045519859868217,
      "loss": 0.0208,
      "step": 680
    },
    {
      "epoch": 13.46829268292683,
      "grad_norm": 0.23853613436222076,
      "learning_rate": 0.00017915425197216245,
      "loss": 0.0164,
      "step": 700
    },
    {
      "epoch": 13.46829268292683,
      "eval_runtime": 0.008,
      "eval_samples_per_second": 251.156,
      "eval_steps_per_second": 125.578,
      "step": 700
    },
    {
      "epoch": 13.858536585365854,
      "grad_norm": 0.20536492764949799,
      "learning_rate": 0.00017781640494042526,
      "loss": 0.022,
      "step": 720
    },
    {
      "epoch": 14.234146341463415,
      "grad_norm": 0.2892828583717346,
      "learning_rate": 0.00017644228118566532,
      "loss": 0.0164,
      "step": 740
    },
    {
      "epoch": 14.429268292682927,
      "eval_runtime": 0.0082,
      "eval_samples_per_second": 242.726,
      "eval_steps_per_second": 121.363,
      "step": 750
    },
    {
      "epoch": 14.62439024390244,
      "grad_norm": 0.2670450806617737,
      "learning_rate": 0.00017503252130168657,
      "loss": 0.0159,
      "step": 760
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.3871139585971832,
      "learning_rate": 0.0001735877824952679,
      "loss": 0.0144,
      "step": 780
    },
    {
      "epoch": 15.390243902439025,
      "grad_norm": 0.19563424587249756,
      "learning_rate": 0.0001721087382797844,
      "loss": 0.0101,
      "step": 800
    },
    {
      "epoch": 15.390243902439025,
      "eval_runtime": 0.008,
      "eval_samples_per_second": 248.92,
      "eval_steps_per_second": 124.46,
      "step": 800
    },
    {
      "epoch": 15.78048780487805,
      "grad_norm": 0.1579708755016327,
      "learning_rate": 0.00017059607816122618,
      "loss": 0.0125,
      "step": 820
    },
    {
      "epoch": 16.15609756097561,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.00016905050731676245,
      "loss": 0.01,
      "step": 840
    },
    {
      "epoch": 16.351219512195122,
      "eval_runtime": 0.0082,
      "eval_samples_per_second": 243.282,
      "eval_steps_per_second": 121.641,
      "step": 850
    },
    {
      "epoch": 16.546341463414635,
      "grad_norm": 0.13275454938411713,
      "learning_rate": 0.0001674727462659993,
      "loss": 0.0088,
      "step": 860
    },
    {
      "epoch": 16.93658536585366,
      "grad_norm": 0.14907070994377136,
      "learning_rate": 0.0001658635305350855,
      "loss": 0.0099,
      "step": 880
    },
    {
      "epoch": 17.31219512195122,
      "grad_norm": 0.1718103587627411,
      "learning_rate": 0.00016422361031382217,
      "loss": 0.0073,
      "step": 900
    },
    {
      "epoch": 17.31219512195122,
      "eval_runtime": 0.0077,
      "eval_samples_per_second": 258.397,
      "eval_steps_per_second": 129.199,
      "step": 900
    },
    {
      "epoch": 17.702439024390245,
      "grad_norm": 0.14583879709243774,
      "learning_rate": 0.00016255375010593704,
      "loss": 0.0075,
      "step": 920
    },
    {
      "epoch": 18.078048780487805,
      "grad_norm": 0.15573625266551971,
      "learning_rate": 0.00016085472837268502,
      "loss": 0.0059,
      "step": 940
    },
    {
      "epoch": 18.273170731707317,
      "eval_runtime": 0.0078,
      "eval_samples_per_second": 256.415,
      "eval_steps_per_second": 128.207,
      "step": 950
    },
    {
      "epoch": 18.46829268292683,
      "grad_norm": 0.1077326312661171,
      "learning_rate": 0.00015912733716994275,
      "loss": 0.0069,
      "step": 960
    },
    {
      "epoch": 18.858536585365854,
      "grad_norm": 0.14486464858055115,
      "learning_rate": 0.0001573723817789649,
      "loss": 0.0063,
      "step": 980
    },
    {
      "epoch": 19.234146341463415,
      "grad_norm": 0.08889542520046234,
      "learning_rate": 0.00015559068033097582,
      "loss": 0.0057,
      "step": 1000
    },
    {
      "epoch": 19.234146341463415,
      "eval_runtime": 0.0078,
      "eval_samples_per_second": 257.873,
      "eval_steps_per_second": 128.936,
      "step": 1000
    }
  ],
  "logging_steps": 20,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 58,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.340812514582446e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}