{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 0.5966607332229614, "learning_rate": 9.8125e-05, "loss": 1.9853, "mean_token_accuracy": 0.7986810088157654, "num_tokens": 81920.0, "step": 10 }, { "epoch": 0.125, "grad_norm": 0.517964780330658, "learning_rate": 9.604166666666668e-05, "loss": 0.1767, "mean_token_accuracy": 0.9620053827762604, "num_tokens": 163840.0, "step": 20 }, { "epoch": 0.1875, "grad_norm": 0.3082728683948517, "learning_rate": 9.395833333333333e-05, "loss": 0.0745, "mean_token_accuracy": 0.9809599459171295, "num_tokens": 245760.0, "step": 30 }, { "epoch": 0.25, "grad_norm": 0.1719624400138855, "learning_rate": 9.1875e-05, "loss": 0.0512, "mean_token_accuracy": 0.9850024461746216, "num_tokens": 327680.0, "step": 40 }, { "epoch": 0.3125, "grad_norm": 0.16888852417469025, "learning_rate": 8.979166666666668e-05, "loss": 0.0432, "mean_token_accuracy": 0.9855642437934875, "num_tokens": 409600.0, "step": 50 }, { "epoch": 0.375, "grad_norm": 0.18019415438175201, "learning_rate": 8.770833333333334e-05, "loss": 0.0358, "mean_token_accuracy": 0.9871152937412262, "num_tokens": 491520.0, "step": 60 }, { "epoch": 0.4375, "grad_norm": 0.14396421611309052, "learning_rate": 8.5625e-05, "loss": 0.0322, "mean_token_accuracy": 0.9876770913600922, "num_tokens": 573440.0, "step": 70 }, { "epoch": 0.5, "grad_norm": 0.1333707869052887, "learning_rate": 8.354166666666667e-05, "loss": 0.0307, "mean_token_accuracy": 0.9872984886169434, "num_tokens": 655360.0, "step": 80 }, { "epoch": 0.5625, "grad_norm": 0.118904247879982, "learning_rate": 8.145833333333334e-05, "loss": 0.0298, "mean_token_accuracy": 0.9874206185340881, "num_tokens": 737280.0, "step": 90 }, { "epoch": 0.625, "grad_norm": 0.10813665390014648, "learning_rate": 7.9375e-05, "loss": 0.0288, "mean_token_accuracy": 0.9879335641860962, "num_tokens": 819200.0, "step": 100 }, { "epoch": 0.6875, "grad_norm": 0.11911392956972122, "learning_rate": 7.729166666666667e-05, "loss": 0.0287, "mean_token_accuracy": 0.987469470500946, "num_tokens": 901120.0, "step": 110 }, { "epoch": 0.75, "grad_norm": 0.11651206016540527, "learning_rate": 7.520833333333334e-05, "loss": 0.0281, "mean_token_accuracy": 0.9881045460700989, "num_tokens": 983040.0, "step": 120 }, { "epoch": 0.8125, "grad_norm": 0.12028653174638748, "learning_rate": 7.3125e-05, "loss": 0.0279, "mean_token_accuracy": 0.9879335641860962, "num_tokens": 1064960.0, "step": 130 }, { "epoch": 0.875, "grad_norm": 0.10548015683889389, "learning_rate": 7.104166666666667e-05, "loss": 0.0276, "mean_token_accuracy": 0.9877259433269501, "num_tokens": 1146880.0, "step": 140 }, { "epoch": 0.9375, "grad_norm": 0.10059994459152222, "learning_rate": 6.895833333333333e-05, "loss": 0.0276, "mean_token_accuracy": 0.9879824161529541, "num_tokens": 1228800.0, "step": 150 }, { "epoch": 1.0, "grad_norm": 0.09910538047552109, "learning_rate": 6.6875e-05, "loss": 0.0275, "mean_token_accuracy": 0.9881045460700989, "num_tokens": 1310720.0, "step": 160 }, { "epoch": 1.0, "eval_runtime": 11.8471, "eval_samples_per_second": 13.505, "eval_steps_per_second": 0.844, "step": 160 }, { "epoch": 1.0, "eval_runtime": 10.9136, "eval_samples_per_second": 14.661, "eval_steps_per_second": 0.916, "step": 160 }, { "epoch": 1.0625, "grad_norm": 0.10032763332128525, "learning_rate": 6.479166666666668e-05, "loss": 0.0267, "mean_token_accuracy": 0.9882633149623871, "num_tokens": 1392640.0, "step": 170 }, { "epoch": 1.125, "grad_norm": 0.10291949659585953, "learning_rate": 6.270833333333333e-05, "loss": 0.0271, "mean_token_accuracy": 0.9882144629955292, "num_tokens": 1474560.0, "step": 180 }, { "epoch": 1.1875, "grad_norm": 0.10503465682268143, "learning_rate": 6.0624999999999996e-05, "loss": 0.027, "mean_token_accuracy": 0.988031268119812, "num_tokens": 1556480.0, "step": 190 }, { "epoch": 1.25, "grad_norm": 0.09680064767599106, "learning_rate": 5.8541666666666676e-05, "loss": 0.0268, "mean_token_accuracy": 0.9883976578712463, "num_tokens": 1638400.0, "step": 200 }, { "epoch": 1.3125, "grad_norm": 0.09701237827539444, "learning_rate": 5.6458333333333335e-05, "loss": 0.0268, "mean_token_accuracy": 0.9880923330783844, "num_tokens": 1720320.0, "step": 210 }, { "epoch": 1.375, "grad_norm": 0.09592857956886292, "learning_rate": 5.4375e-05, "loss": 0.027, "mean_token_accuracy": 0.9881533980369568, "num_tokens": 1802240.0, "step": 220 }, { "epoch": 1.4375, "grad_norm": 0.09052903950214386, "learning_rate": 5.229166666666667e-05, "loss": 0.027, "mean_token_accuracy": 0.9882999539375306, "num_tokens": 1884160.0, "step": 230 }, { "epoch": 1.5, "grad_norm": 0.1032903790473938, "learning_rate": 5.020833333333333e-05, "loss": 0.0268, "mean_token_accuracy": 0.988312166929245, "num_tokens": 1966080.0, "step": 240 }, { "epoch": 1.5625, "grad_norm": 0.10449512302875519, "learning_rate": 4.8125000000000004e-05, "loss": 0.0266, "mean_token_accuracy": 0.9881900370121002, "num_tokens": 2048000.0, "step": 250 }, { "epoch": 1.625, "grad_norm": 0.09428944438695908, "learning_rate": 4.604166666666666e-05, "loss": 0.0267, "mean_token_accuracy": 0.9882511019706726, "num_tokens": 2129920.0, "step": 260 }, { "epoch": 1.6875, "grad_norm": 0.10462497174739838, "learning_rate": 4.3958333333333336e-05, "loss": 0.0266, "mean_token_accuracy": 0.9879091382026672, "num_tokens": 2211840.0, "step": 270 }, { "epoch": 1.75, "grad_norm": 0.09638702869415283, "learning_rate": 4.1875e-05, "loss": 0.0266, "mean_token_accuracy": 0.9883488059043884, "num_tokens": 2293760.0, "step": 280 }, { "epoch": 1.8125, "grad_norm": 0.10269024223089218, "learning_rate": 3.979166666666667e-05, "loss": 0.0265, "mean_token_accuracy": 0.9881533980369568, "num_tokens": 2375680.0, "step": 290 }, { "epoch": 1.875, "grad_norm": 0.09432139992713928, "learning_rate": 3.770833333333333e-05, "loss": 0.0264, "mean_token_accuracy": 0.9882633149623871, "num_tokens": 2457600.0, "step": 300 }, { "epoch": 1.9375, "grad_norm": 0.10591922700405121, "learning_rate": 3.5625000000000005e-05, "loss": 0.0265, "mean_token_accuracy": 0.9878358602523803, "num_tokens": 2539520.0, "step": 310 }, { "epoch": 2.0, "grad_norm": 0.0988362580537796, "learning_rate": 3.3541666666666664e-05, "loss": 0.0264, "mean_token_accuracy": 0.9880923330783844, "num_tokens": 2621440.0, "step": 320 }, { "epoch": 2.0, "eval_runtime": 10.7547, "eval_samples_per_second": 14.877, "eval_steps_per_second": 0.93, "step": 320 }, { "epoch": 2.0625, "grad_norm": 0.08879899233579636, "learning_rate": 3.145833333333334e-05, "loss": 0.026, "mean_token_accuracy": 0.9881656110286713, "num_tokens": 2703360.0, "step": 330 }, { "epoch": 2.125, "grad_norm": 0.09961431473493576, "learning_rate": 2.9375000000000003e-05, "loss": 0.0257, "mean_token_accuracy": 0.9886052787303925, "num_tokens": 2785280.0, "step": 340 }, { "epoch": 2.1875, "grad_norm": 0.1122380793094635, "learning_rate": 2.7291666666666665e-05, "loss": 0.0261, "mean_token_accuracy": 0.9886541306972504, "num_tokens": 2867200.0, "step": 350 }, { "epoch": 2.25, "grad_norm": 0.09964418411254883, "learning_rate": 2.5208333333333334e-05, "loss": 0.0255, "mean_token_accuracy": 0.9889594554901123, "num_tokens": 2949120.0, "step": 360 }, { "epoch": 2.3125, "grad_norm": 0.09933824837207794, "learning_rate": 2.3125000000000003e-05, "loss": 0.0259, "mean_token_accuracy": 0.9884220838546753, "num_tokens": 3031040.0, "step": 370 }, { "epoch": 2.375, "grad_norm": 0.09340930730104446, "learning_rate": 2.104166666666667e-05, "loss": 0.0262, "mean_token_accuracy": 0.9882755279541016, "num_tokens": 3112960.0, "step": 380 }, { "epoch": 2.4375, "grad_norm": 0.09159277379512787, "learning_rate": 1.8958333333333334e-05, "loss": 0.0259, "mean_token_accuracy": 0.9886297047138214, "num_tokens": 3194880.0, "step": 390 }, { "epoch": 2.5, "grad_norm": 0.10940947383642197, "learning_rate": 1.6875000000000004e-05, "loss": 0.0258, "mean_token_accuracy": 0.9885075747966766, "num_tokens": 3276800.0, "step": 400 }, { "epoch": 2.5625, "grad_norm": 0.09535407274961472, "learning_rate": 1.4791666666666668e-05, "loss": 0.0259, "mean_token_accuracy": 0.9886174917221069, "num_tokens": 3358720.0, "step": 410 }, { "epoch": 2.625, "grad_norm": 0.08938491344451904, "learning_rate": 1.2708333333333333e-05, "loss": 0.0257, "mean_token_accuracy": 0.9884831488132477, "num_tokens": 3440640.0, "step": 420 }, { "epoch": 2.6875, "grad_norm": 0.09536239504814148, "learning_rate": 1.0625e-05, "loss": 0.0257, "mean_token_accuracy": 0.9886052787303925, "num_tokens": 3522560.0, "step": 430 }, { "epoch": 2.75, "grad_norm": 0.0934009775519371, "learning_rate": 8.541666666666666e-06, "loss": 0.0257, "mean_token_accuracy": 0.9886907696723938, "num_tokens": 3604480.0, "step": 440 }, { "epoch": 2.8125, "grad_norm": 0.09570059180259705, "learning_rate": 6.458333333333334e-06, "loss": 0.0255, "mean_token_accuracy": 0.9888617515563964, "num_tokens": 3686400.0, "step": 450 }, { "epoch": 2.875, "grad_norm": 0.09678570926189423, "learning_rate": 4.375e-06, "loss": 0.0255, "mean_token_accuracy": 0.988361018896103, "num_tokens": 3768320.0, "step": 460 }, { "epoch": 2.9375, "grad_norm": 0.0881657674908638, "learning_rate": 2.2916666666666666e-06, "loss": 0.0255, "mean_token_accuracy": 0.9887884736061097, "num_tokens": 3850240.0, "step": 470 }, { "epoch": 3.0, "grad_norm": 0.10601094365119934, "learning_rate": 2.0833333333333333e-07, "loss": 0.0257, "mean_token_accuracy": 0.9887518346309662, "num_tokens": 3932160.0, "step": 480 }, { "epoch": 3.0, "eval_runtime": 10.7566, "eval_samples_per_second": 14.875, "eval_steps_per_second": 0.93, "step": 480 }, { "epoch": 3.0, "step": 480, "total_flos": 0.0, "train_loss": 0.07282223819444576, "train_runtime": 407.0393, "train_samples_per_second": 4.717, "train_steps_per_second": 1.179 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }