{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 19.234146341463415, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3902439024390244, "grad_norm": 0.14482291042804718, "learning_rate": 4.222222222222222e-05, "loss": 1.0074, "step": 20 }, { "epoch": 0.7804878048780488, "grad_norm": 0.24155008792877197, "learning_rate": 8.666666666666667e-05, "loss": 0.9693, "step": 40 }, { "epoch": 0.975609756097561, "eval_runtime": 0.0074, "eval_samples_per_second": 268.521, "eval_steps_per_second": 134.261, "step": 50 }, { "epoch": 1.1560975609756097, "grad_norm": 0.2531428337097168, "learning_rate": 0.00013111111111111111, "loss": 0.8984, "step": 60 }, { "epoch": 1.5463414634146342, "grad_norm": 0.2627839148044586, "learning_rate": 0.00017555555555555556, "loss": 0.8435, "step": 80 }, { "epoch": 1.9365853658536585, "grad_norm": 0.24496296048164368, "learning_rate": 0.0001999952797459453, "loss": 0.8363, "step": 100 }, { "epoch": 1.9365853658536585, "eval_runtime": 0.0079, "eval_samples_per_second": 252.236, "eval_steps_per_second": 126.118, "step": 100 }, { "epoch": 2.3121951219512193, "grad_norm": 0.3583034873008728, "learning_rate": 0.00019995099455998424, "loss": 0.742, "step": 120 }, { "epoch": 2.7024390243902436, "grad_norm": 0.273295134305954, "learning_rate": 0.00019986011387082198, "loss": 0.731, "step": 140 }, { "epoch": 2.897560975609756, "eval_runtime": 0.0076, "eval_samples_per_second": 263.122, "eval_steps_per_second": 131.561, "step": 150 }, { "epoch": 3.078048780487805, "grad_norm": 0.365675687789917, "learning_rate": 0.0001997226800455352, "loss": 0.6594, "step": 160 }, { "epoch": 3.4682926829268292, "grad_norm": 0.3552844226360321, "learning_rate": 0.00019953875715350382, "loss": 0.5675, "step": 180 }, { "epoch": 3.8585365853658535, "grad_norm": 0.3008895218372345, "learning_rate": 0.00019930843093654305, "loss": 0.5654, "step": 200 }, { "epoch": 3.8585365853658535, "eval_runtime": 0.013, "eval_samples_per_second": 153.267, "eval_steps_per_second": 76.633, "step": 200 }, { "epoch": 4.234146341463415, "grad_norm": 0.3664332330226898, "learning_rate": 0.00019903180876893194, "loss": 0.4938, "step": 220 }, { "epoch": 4.624390243902439, "grad_norm": 0.39528584480285645, "learning_rate": 0.0001987090196073572, "loss": 0.4202, "step": 240 }, { "epoch": 4.819512195121951, "eval_runtime": 0.0074, "eval_samples_per_second": 270.592, "eval_steps_per_second": 135.296, "step": 250 }, { "epoch": 5.0, "grad_norm": 0.7486668229103088, "learning_rate": 0.00019834021393079585, "loss": 0.4247, "step": 260 }, { "epoch": 5.390243902439025, "grad_norm": 0.3803549110889435, "learning_rate": 0.00019792556367036432, "loss": 0.3021, "step": 280 }, { "epoch": 5.780487804878049, "grad_norm": 0.3973694145679474, "learning_rate": 0.00019746526212916705, "loss": 0.3178, "step": 300 }, { "epoch": 5.780487804878049, "eval_runtime": 0.0079, "eval_samples_per_second": 253.892, "eval_steps_per_second": 126.946, "step": 300 }, { "epoch": 6.15609756097561, "grad_norm": 0.42896056175231934, "learning_rate": 0.00019695952389218167, "loss": 0.2677, "step": 320 }, { "epoch": 6.546341463414635, "grad_norm": 0.43547552824020386, "learning_rate": 0.00019640858472622316, "loss": 0.206, "step": 340 }, { "epoch": 6.741463414634146, "eval_runtime": 0.0085, "eval_samples_per_second": 235.192, "eval_steps_per_second": 117.596, "step": 350 }, { "epoch": 6.9365853658536585, "grad_norm": 0.3835262656211853, "learning_rate": 0.0001958127014700332, "loss": 0.1982, "step": 360 }, { "epoch": 7.31219512195122, "grad_norm": 0.4060633182525635, "learning_rate": 0.00019517215191454618, "loss": 0.1558, "step": 380 }, { "epoch": 7.702439024390244, "grad_norm": 0.45835667848587036, "learning_rate": 0.00019448723467338763, "loss": 0.1233, "step": 400 }, { "epoch": 7.702439024390244, "eval_runtime": 0.0082, "eval_samples_per_second": 243.141, "eval_steps_per_second": 121.571, "step": 400 }, { "epoch": 8.078048780487805, "grad_norm": 0.3501657247543335, "learning_rate": 0.00019375826904366553, "loss": 0.1306, "step": 420 }, { "epoch": 8.46829268292683, "grad_norm": 0.412287175655365, "learning_rate": 0.00019298559485711927, "loss": 0.0828, "step": 440 }, { "epoch": 8.663414634146342, "eval_runtime": 0.0077, "eval_samples_per_second": 260.524, "eval_steps_per_second": 130.262, "step": 450 }, { "epoch": 8.858536585365854, "grad_norm": 0.39604613184928894, "learning_rate": 0.0001921695723216957, "loss": 0.1054, "step": 460 }, { "epoch": 9.234146341463415, "grad_norm": 0.3440980017185211, "learning_rate": 0.00019131058185362597, "loss": 0.066, "step": 480 }, { "epoch": 9.62439024390244, "grad_norm": 0.3329509198665619, "learning_rate": 0.00019040902390008216, "loss": 0.0626, "step": 500 }, { "epoch": 9.62439024390244, "eval_runtime": 0.0081, "eval_samples_per_second": 245.842, "eval_steps_per_second": 122.921, "step": 500 }, { "epoch": 10.0, "grad_norm": 0.6339950561523438, "learning_rate": 0.00018946531875249493, "loss": 0.0633, "step": 520 }, { "epoch": 10.390243902439025, "grad_norm": 0.3356594443321228, "learning_rate": 0.000188479906350621, "loss": 0.0427, "step": 540 }, { "epoch": 10.585365853658537, "eval_runtime": 0.0078, "eval_samples_per_second": 255.462, "eval_steps_per_second": 127.731, "step": 550 }, { "epoch": 10.78048780487805, "grad_norm": 0.29425719380378723, "learning_rate": 0.0001874532460774503, "loss": 0.0427, "step": 560 }, { "epoch": 11.15609756097561, "grad_norm": 0.3889470100402832, "learning_rate": 0.0001863858165450492, "loss": 0.0375, "step": 580 }, { "epoch": 11.546341463414635, "grad_norm": 0.2772677540779114, "learning_rate": 0.00018527811537143954, "loss": 0.0308, "step": 600 }, { "epoch": 11.546341463414635, "eval_runtime": 0.0083, "eval_samples_per_second": 240.741, "eval_steps_per_second": 120.37, "step": 600 }, { "epoch": 11.93658536585366, "grad_norm": 0.2893534004688263, "learning_rate": 0.00018413065894861728, "loss": 0.0305, "step": 620 }, { "epoch": 12.31219512195122, "grad_norm": 0.2675611078739166, "learning_rate": 0.00018294398220181917, "loss": 0.029, "step": 640 }, { "epoch": 12.507317073170732, "eval_runtime": 0.0121, "eval_samples_per_second": 165.868, "eval_steps_per_second": 82.934, "step": 650 }, { "epoch": 12.702439024390245, "grad_norm": 0.258488267660141, "learning_rate": 0.00018171863834014928, "loss": 0.0228, "step": 660 }, { "epoch": 13.078048780487805, "grad_norm": 0.2041754275560379, "learning_rate": 0.00018045519859868217, "loss": 0.0208, "step": 680 }, { "epoch": 13.46829268292683, "grad_norm": 0.23853613436222076, "learning_rate": 0.00017915425197216245, "loss": 0.0164, "step": 700 }, { "epoch": 13.46829268292683, "eval_runtime": 0.008, "eval_samples_per_second": 251.156, "eval_steps_per_second": 125.578, "step": 700 }, { "epoch": 13.858536585365854, "grad_norm": 0.20536492764949799, "learning_rate": 0.00017781640494042526, "loss": 0.022, "step": 720 }, { "epoch": 14.234146341463415, "grad_norm": 0.2892828583717346, "learning_rate": 0.00017644228118566532, "loss": 0.0164, "step": 740 }, { "epoch": 14.429268292682927, "eval_runtime": 0.0082, "eval_samples_per_second": 242.726, "eval_steps_per_second": 121.363, "step": 750 }, { "epoch": 14.62439024390244, "grad_norm": 0.2670450806617737, "learning_rate": 0.00017503252130168657, "loss": 0.0159, "step": 760 }, { "epoch": 15.0, "grad_norm": 0.3871139585971832, "learning_rate": 0.0001735877824952679, "loss": 0.0144, "step": 780 }, { "epoch": 15.390243902439025, "grad_norm": 0.19563424587249756, "learning_rate": 0.0001721087382797844, "loss": 0.0101, "step": 800 }, { "epoch": 15.390243902439025, "eval_runtime": 0.008, "eval_samples_per_second": 248.92, "eval_steps_per_second": 124.46, "step": 800 }, { "epoch": 15.78048780487805, "grad_norm": 0.1579708755016327, "learning_rate": 0.00017059607816122618, "loss": 0.0125, "step": 820 }, { "epoch": 16.15609756097561, "grad_norm": 0.1630859375, "learning_rate": 0.00016905050731676245, "loss": 0.01, "step": 840 }, { "epoch": 16.351219512195122, "eval_runtime": 0.0082, "eval_samples_per_second": 243.282, "eval_steps_per_second": 121.641, "step": 850 }, { "epoch": 16.546341463414635, "grad_norm": 0.13275454938411713, "learning_rate": 0.0001674727462659993, "loss": 0.0088, "step": 860 }, { "epoch": 16.93658536585366, "grad_norm": 0.14907070994377136, "learning_rate": 0.0001658635305350855, "loss": 0.0099, "step": 880 }, { "epoch": 17.31219512195122, "grad_norm": 0.1718103587627411, "learning_rate": 0.00016422361031382217, "loss": 0.0073, "step": 900 }, { "epoch": 17.31219512195122, "eval_runtime": 0.0077, "eval_samples_per_second": 258.397, "eval_steps_per_second": 129.199, "step": 900 }, { "epoch": 17.702439024390245, "grad_norm": 0.14583879709243774, "learning_rate": 0.00016255375010593704, "loss": 0.0075, "step": 920 }, { "epoch": 18.078048780487805, "grad_norm": 0.15573625266551971, "learning_rate": 0.00016085472837268502, "loss": 0.0059, "step": 940 }, { "epoch": 18.273170731707317, "eval_runtime": 0.0078, "eval_samples_per_second": 256.415, "eval_steps_per_second": 128.207, "step": 950 }, { "epoch": 18.46829268292683, "grad_norm": 0.1077326312661171, "learning_rate": 0.00015912733716994275, "loss": 0.0069, "step": 960 }, { "epoch": 18.858536585365854, "grad_norm": 0.14486464858055115, "learning_rate": 0.0001573723817789649, "loss": 0.0063, "step": 980 }, { "epoch": 19.234146341463415, "grad_norm": 0.08889542520046234, "learning_rate": 0.00015559068033097582, "loss": 0.0057, "step": 1000 }, { "epoch": 19.234146341463415, "eval_runtime": 0.0078, "eval_samples_per_second": 257.873, "eval_steps_per_second": 128.936, "step": 1000 } ], "logging_steps": 20, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 58, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.340812514582446e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }