| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 480, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.5966607332229614, | |
| "learning_rate": 9.8125e-05, | |
| "loss": 1.9853, | |
| "mean_token_accuracy": 0.7986810088157654, | |
| "num_tokens": 81920.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.517964780330658, | |
| "learning_rate": 9.604166666666668e-05, | |
| "loss": 0.1767, | |
| "mean_token_accuracy": 0.9620053827762604, | |
| "num_tokens": 163840.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.3082728683948517, | |
| "learning_rate": 9.395833333333333e-05, | |
| "loss": 0.0745, | |
| "mean_token_accuracy": 0.9809599459171295, | |
| "num_tokens": 245760.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.1719624400138855, | |
| "learning_rate": 9.1875e-05, | |
| "loss": 0.0512, | |
| "mean_token_accuracy": 0.9850024461746216, | |
| "num_tokens": 327680.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.16888852417469025, | |
| "learning_rate": 8.979166666666668e-05, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9855642437934875, | |
| "num_tokens": 409600.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.18019415438175201, | |
| "learning_rate": 8.770833333333334e-05, | |
| "loss": 0.0358, | |
| "mean_token_accuracy": 0.9871152937412262, | |
| "num_tokens": 491520.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 0.14396421611309052, | |
| "learning_rate": 8.5625e-05, | |
| "loss": 0.0322, | |
| "mean_token_accuracy": 0.9876770913600922, | |
| "num_tokens": 573440.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.1333707869052887, | |
| "learning_rate": 8.354166666666667e-05, | |
| "loss": 0.0307, | |
| "mean_token_accuracy": 0.9872984886169434, | |
| "num_tokens": 655360.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 0.118904247879982, | |
| "learning_rate": 8.145833333333334e-05, | |
| "loss": 0.0298, | |
| "mean_token_accuracy": 0.9874206185340881, | |
| "num_tokens": 737280.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.10813665390014648, | |
| "learning_rate": 7.9375e-05, | |
| "loss": 0.0288, | |
| "mean_token_accuracy": 0.9879335641860962, | |
| "num_tokens": 819200.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6875, | |
| "grad_norm": 0.11911392956972122, | |
| "learning_rate": 7.729166666666667e-05, | |
| "loss": 0.0287, | |
| "mean_token_accuracy": 0.987469470500946, | |
| "num_tokens": 901120.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.11651206016540527, | |
| "learning_rate": 7.520833333333334e-05, | |
| "loss": 0.0281, | |
| "mean_token_accuracy": 0.9881045460700989, | |
| "num_tokens": 983040.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.8125, | |
| "grad_norm": 0.12028653174638748, | |
| "learning_rate": 7.3125e-05, | |
| "loss": 0.0279, | |
| "mean_token_accuracy": 0.9879335641860962, | |
| "num_tokens": 1064960.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 0.10548015683889389, | |
| "learning_rate": 7.104166666666667e-05, | |
| "loss": 0.0276, | |
| "mean_token_accuracy": 0.9877259433269501, | |
| "num_tokens": 1146880.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 0.10059994459152222, | |
| "learning_rate": 6.895833333333333e-05, | |
| "loss": 0.0276, | |
| "mean_token_accuracy": 0.9879824161529541, | |
| "num_tokens": 1228800.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.09910538047552109, | |
| "learning_rate": 6.6875e-05, | |
| "loss": 0.0275, | |
| "mean_token_accuracy": 0.9881045460700989, | |
| "num_tokens": 1310720.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_runtime": 11.8471, | |
| "eval_samples_per_second": 13.505, | |
| "eval_steps_per_second": 0.844, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_runtime": 10.9136, | |
| "eval_samples_per_second": 14.661, | |
| "eval_steps_per_second": 0.916, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0625, | |
| "grad_norm": 0.10032763332128525, | |
| "learning_rate": 6.479166666666668e-05, | |
| "loss": 0.0267, | |
| "mean_token_accuracy": 0.9882633149623871, | |
| "num_tokens": 1392640.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.125, | |
| "grad_norm": 0.10291949659585953, | |
| "learning_rate": 6.270833333333333e-05, | |
| "loss": 0.0271, | |
| "mean_token_accuracy": 0.9882144629955292, | |
| "num_tokens": 1474560.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.1875, | |
| "grad_norm": 0.10503465682268143, | |
| "learning_rate": 6.0624999999999996e-05, | |
| "loss": 0.027, | |
| "mean_token_accuracy": 0.988031268119812, | |
| "num_tokens": 1556480.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.09680064767599106, | |
| "learning_rate": 5.8541666666666676e-05, | |
| "loss": 0.0268, | |
| "mean_token_accuracy": 0.9883976578712463, | |
| "num_tokens": 1638400.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.3125, | |
| "grad_norm": 0.09701237827539444, | |
| "learning_rate": 5.6458333333333335e-05, | |
| "loss": 0.0268, | |
| "mean_token_accuracy": 0.9880923330783844, | |
| "num_tokens": 1720320.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.375, | |
| "grad_norm": 0.09592857956886292, | |
| "learning_rate": 5.4375e-05, | |
| "loss": 0.027, | |
| "mean_token_accuracy": 0.9881533980369568, | |
| "num_tokens": 1802240.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.4375, | |
| "grad_norm": 0.09052903950214386, | |
| "learning_rate": 5.229166666666667e-05, | |
| "loss": 0.027, | |
| "mean_token_accuracy": 0.9882999539375306, | |
| "num_tokens": 1884160.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.1032903790473938, | |
| "learning_rate": 5.020833333333333e-05, | |
| "loss": 0.0268, | |
| "mean_token_accuracy": 0.988312166929245, | |
| "num_tokens": 1966080.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "grad_norm": 0.10449512302875519, | |
| "learning_rate": 4.8125000000000004e-05, | |
| "loss": 0.0266, | |
| "mean_token_accuracy": 0.9881900370121002, | |
| "num_tokens": 2048000.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.625, | |
| "grad_norm": 0.09428944438695908, | |
| "learning_rate": 4.604166666666666e-05, | |
| "loss": 0.0267, | |
| "mean_token_accuracy": 0.9882511019706726, | |
| "num_tokens": 2129920.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.6875, | |
| "grad_norm": 0.10462497174739838, | |
| "learning_rate": 4.3958333333333336e-05, | |
| "loss": 0.0266, | |
| "mean_token_accuracy": 0.9879091382026672, | |
| "num_tokens": 2211840.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.09638702869415283, | |
| "learning_rate": 4.1875e-05, | |
| "loss": 0.0266, | |
| "mean_token_accuracy": 0.9883488059043884, | |
| "num_tokens": 2293760.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.8125, | |
| "grad_norm": 0.10269024223089218, | |
| "learning_rate": 3.979166666666667e-05, | |
| "loss": 0.0265, | |
| "mean_token_accuracy": 0.9881533980369568, | |
| "num_tokens": 2375680.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 0.09432139992713928, | |
| "learning_rate": 3.770833333333333e-05, | |
| "loss": 0.0264, | |
| "mean_token_accuracy": 0.9882633149623871, | |
| "num_tokens": 2457600.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9375, | |
| "grad_norm": 0.10591922700405121, | |
| "learning_rate": 3.5625000000000005e-05, | |
| "loss": 0.0265, | |
| "mean_token_accuracy": 0.9878358602523803, | |
| "num_tokens": 2539520.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.0988362580537796, | |
| "learning_rate": 3.3541666666666664e-05, | |
| "loss": 0.0264, | |
| "mean_token_accuracy": 0.9880923330783844, | |
| "num_tokens": 2621440.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_runtime": 10.7547, | |
| "eval_samples_per_second": 14.877, | |
| "eval_steps_per_second": 0.93, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.0625, | |
| "grad_norm": 0.08879899233579636, | |
| "learning_rate": 3.145833333333334e-05, | |
| "loss": 0.026, | |
| "mean_token_accuracy": 0.9881656110286713, | |
| "num_tokens": 2703360.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.125, | |
| "grad_norm": 0.09961431473493576, | |
| "learning_rate": 2.9375000000000003e-05, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9886052787303925, | |
| "num_tokens": 2785280.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.1875, | |
| "grad_norm": 0.1122380793094635, | |
| "learning_rate": 2.7291666666666665e-05, | |
| "loss": 0.0261, | |
| "mean_token_accuracy": 0.9886541306972504, | |
| "num_tokens": 2867200.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.09964418411254883, | |
| "learning_rate": 2.5208333333333334e-05, | |
| "loss": 0.0255, | |
| "mean_token_accuracy": 0.9889594554901123, | |
| "num_tokens": 2949120.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.3125, | |
| "grad_norm": 0.09933824837207794, | |
| "learning_rate": 2.3125000000000003e-05, | |
| "loss": 0.0259, | |
| "mean_token_accuracy": 0.9884220838546753, | |
| "num_tokens": 3031040.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.375, | |
| "grad_norm": 0.09340930730104446, | |
| "learning_rate": 2.104166666666667e-05, | |
| "loss": 0.0262, | |
| "mean_token_accuracy": 0.9882755279541016, | |
| "num_tokens": 3112960.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.4375, | |
| "grad_norm": 0.09159277379512787, | |
| "learning_rate": 1.8958333333333334e-05, | |
| "loss": 0.0259, | |
| "mean_token_accuracy": 0.9886297047138214, | |
| "num_tokens": 3194880.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.10940947383642197, | |
| "learning_rate": 1.6875000000000004e-05, | |
| "loss": 0.0258, | |
| "mean_token_accuracy": 0.9885075747966766, | |
| "num_tokens": 3276800.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.5625, | |
| "grad_norm": 0.09535407274961472, | |
| "learning_rate": 1.4791666666666668e-05, | |
| "loss": 0.0259, | |
| "mean_token_accuracy": 0.9886174917221069, | |
| "num_tokens": 3358720.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.625, | |
| "grad_norm": 0.08938491344451904, | |
| "learning_rate": 1.2708333333333333e-05, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9884831488132477, | |
| "num_tokens": 3440640.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.6875, | |
| "grad_norm": 0.09536239504814148, | |
| "learning_rate": 1.0625e-05, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9886052787303925, | |
| "num_tokens": 3522560.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.0934009775519371, | |
| "learning_rate": 8.541666666666666e-06, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9886907696723938, | |
| "num_tokens": 3604480.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.8125, | |
| "grad_norm": 0.09570059180259705, | |
| "learning_rate": 6.458333333333334e-06, | |
| "loss": 0.0255, | |
| "mean_token_accuracy": 0.9888617515563964, | |
| "num_tokens": 3686400.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.875, | |
| "grad_norm": 0.09678570926189423, | |
| "learning_rate": 4.375e-06, | |
| "loss": 0.0255, | |
| "mean_token_accuracy": 0.988361018896103, | |
| "num_tokens": 3768320.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.9375, | |
| "grad_norm": 0.0881657674908638, | |
| "learning_rate": 2.2916666666666666e-06, | |
| "loss": 0.0255, | |
| "mean_token_accuracy": 0.9887884736061097, | |
| "num_tokens": 3850240.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.10601094365119934, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9887518346309662, | |
| "num_tokens": 3932160.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_runtime": 10.7566, | |
| "eval_samples_per_second": 14.875, | |
| "eval_steps_per_second": 0.93, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 480, | |
| "total_flos": 0.0, | |
| "train_loss": 0.07282223819444576, | |
| "train_runtime": 407.0393, | |
| "train_samples_per_second": 4.717, | |
| "train_steps_per_second": 1.179 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 480, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |