{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 1332,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.4947163558006284,
      "epoch": 0.07507507507507508,
      "grad_norm": 2.2692031860351562,
      "learning_rate": 1.99976055134129e-05,
      "loss": 9.666,
      "mean_token_accuracy": 0.3714864462614059,
      "num_tokens": 175430.0,
      "step": 50
    },
    {
      "entropy": 4.957374696731567,
      "epoch": 0.15015015015015015,
      "grad_norm": 0.4004897177219391,
      "learning_rate": 1.98972684724999e-05,
      "loss": 3.687,
      "mean_token_accuracy": 0.5028938430547715,
      "num_tokens": 350404.0,
      "step": 100
    },
    {
      "entropy": 4.707881135940552,
      "epoch": 0.22522522522522523,
      "grad_norm": 0.4421725571155548,
      "learning_rate": 1.9650816346189154e-05,
      "loss": 3.4827,
      "mean_token_accuracy": 0.5126789313554764,
      "num_tokens": 526514.0,
      "step": 150
    },
    {
      "entropy": 4.64449327468872,
      "epoch": 0.3003003003003003,
      "grad_norm": 0.4718657433986664,
      "learning_rate": 1.926188754982551e-05,
      "loss": 3.4128,
      "mean_token_accuracy": 0.5216181075572968,
      "num_tokens": 701236.0,
      "step": 200
    },
    {
      "entropy": 4.598890199661255,
      "epoch": 0.37537537537537535,
      "grad_norm": 0.4660639762878418,
      "learning_rate": 1.8736223906463698e-05,
      "loss": 3.3675,
      "mean_token_accuracy": 0.528342125415802,
      "num_tokens": 876499.0,
      "step": 250
    },
    {
      "entropy": 4.603140239715576,
      "epoch": 0.45045045045045046,
      "grad_norm": 0.47321391105651855,
      "learning_rate": 1.8081585879342008e-05,
      "loss": 3.3821,
      "mean_token_accuracy": 0.5266077440977096,
      "num_tokens": 1051864.0,
      "step": 300
    },
    {
      "entropy": 4.587698755264282,
      "epoch": 0.5255255255255256,
      "grad_norm": 0.5947113633155823,
      "learning_rate": 1.7307638002821942e-05,
      "loss": 3.3509,
      "mean_token_accuracy": 0.5320688891410827,
      "num_tokens": 1226112.0,
      "step": 350
    },
    {
      "entropy": 4.596508531570435,
      "epoch": 0.6006006006006006,
      "grad_norm": 0.5403749942779541,
      "learning_rate": 1.6425806203196734e-05,
      "loss": 3.3672,
      "mean_token_accuracy": 0.5314443480968475,
      "num_tokens": 1401199.0,
      "step": 400
    },
    {
      "entropy": 4.5638793849945065,
      "epoch": 0.6756756756756757,
      "grad_norm": 0.5813198685646057,
      "learning_rate": 1.544910911576629e-05,
      "loss": 3.3328,
      "mean_token_accuracy": 0.5359585750102996,
      "num_tokens": 1576517.0,
      "step": 450
    },
    {
      "entropy": 4.571099786758423,
      "epoch": 0.7507507507507507,
      "grad_norm": 0.5679024457931519,
      "learning_rate": 1.4391965888473705e-05,
      "loss": 3.3438,
      "mean_token_accuracy": 0.5345500099658966,
      "num_tokens": 1751444.0,
      "step": 500
    },
    {
      "entropy": 4.5622615623474125,
      "epoch": 0.8258258258258259,
      "grad_norm": 0.6713469624519348,
      "learning_rate": 1.3269983309531584e-05,
      "loss": 3.3411,
      "mean_token_accuracy": 0.5363436281681061,
      "num_tokens": 1926907.0,
      "step": 550
    },
    {
      "entropy": 4.562127561569214,
      "epoch": 0.9009009009009009,
      "grad_norm": 0.6330661177635193,
      "learning_rate": 1.2099725401709685e-05,
      "loss": 3.3382,
      "mean_token_accuracy": 0.5368253135681152,
      "num_tokens": 2102132.0,
      "step": 600
    },
    {
      "entropy": 4.561514482498169,
      "epoch": 0.975975975975976,
      "grad_norm": 0.7525476813316345,
      "learning_rate": 1.0898468884803366e-05,
      "loss": 3.3411,
      "mean_token_accuracy": 0.537065578699112,
      "num_tokens": 2277537.0,
      "step": 650
    },
    {
      "entropy": 4.548706264495849,
      "epoch": 1.0510510510510511,
      "grad_norm": 0.6428335309028625,
      "learning_rate": 9.683948116432609e-06,
      "loss": 3.3205,
      "mean_token_accuracy": 0.5413484466075897,
      "num_tokens": 2449566.0,
      "step": 700
    },
    {
      "entropy": 4.5432649517059325,
      "epoch": 1.1261261261261262,
      "grad_norm": 0.6926820278167725,
      "learning_rate": 8.474093276654764e-06,
      "loss": 3.3025,
      "mean_token_accuracy": 0.5445451009273529,
      "num_tokens": 2623865.0,
      "step": 750
    },
    {
      "entropy": 4.537814807891846,
      "epoch": 1.2012012012012012,
      "grad_norm": 0.6473987698554993,
      "learning_rate": 7.286765661616761e-06,
      "loss": 3.3031,
      "mean_token_accuracy": 0.5444674134254456,
      "num_tokens": 2798793.0,
      "step": 800
    },
    {
      "entropy": 4.533882551193237,
      "epoch": 1.2762762762762763,
      "grad_norm": 0.7260850667953491,
      "learning_rate": 6.139493994152428e-06,
      "loss": 3.3086,
      "mean_token_accuracy": 0.5432299053668976,
      "num_tokens": 2975077.0,
      "step": 850
    },
    {
      "entropy": 4.509051198959351,
      "epoch": 1.3513513513513513,
      "grad_norm": 0.7649775743484497,
      "learning_rate": 5.0492156442170914e-06,
      "loss": 3.262,
      "mean_token_accuracy": 0.5496996784210205,
      "num_tokens": 3149867.0,
      "step": 900
    },
    {
      "entropy": 4.533229093551636,
      "epoch": 1.4264264264264264,
      "grad_norm": 0.7427539229393005,
      "learning_rate": 4.0320265795669815e-06,
      "loss": 3.2888,
      "mean_token_accuracy": 0.5470581102371216,
      "num_tokens": 3324230.0,
      "step": 950
    },
    {
      "entropy": 4.5299335289001466,
      "epoch": 1.5015015015015014,
      "grad_norm": 0.7631803154945374,
      "learning_rate": 3.1029437382047368e-06,
      "loss": 3.2959,
      "mean_token_accuracy": 0.5465954422950745,
      "num_tokens": 3499600.0,
      "step": 1000
    },
    {
      "entropy": 4.519344367980957,
      "epoch": 1.5765765765765765,
      "grad_norm": 0.7936219573020935,
      "learning_rate": 2.275683330727697e-06,
      "loss": 3.2927,
      "mean_token_accuracy": 0.5452944958209991,
      "num_tokens": 3676089.0,
      "step": 1050
    },
    {
      "entropy": 4.532338409423828,
      "epoch": 1.6516516516516515,
      "grad_norm": 0.735207200050354,
      "learning_rate": 1.562458345539739e-06,
      "loss": 3.3022,
      "mean_token_accuracy": 0.5458044326305389,
      "num_tokens": 3851878.0,
      "step": 1100
    },
    {
      "entropy": 4.519860811233521,
      "epoch": 1.7267267267267268,
      "grad_norm": 0.9440169930458069,
      "learning_rate": 9.737982463922102e-07,
      "loss": 3.2767,
      "mean_token_accuracy": 0.5487184035778045,
      "num_tokens": 4026785.0,
      "step": 1150
    },
    {
      "entropy": 4.5121307468414305,
      "epoch": 1.8018018018018018,
      "grad_norm": 0.8288739919662476,
      "learning_rate": 5.183935240903415e-07,
      "loss": 3.2591,
      "mean_token_accuracy": 0.5512541651725769,
      "num_tokens": 4200908.0,
      "step": 1200
    },
    {
      "entropy": 4.517954025268555,
      "epoch": 1.8768768768768769,
      "grad_norm": 0.7847370505332947,
      "learning_rate": 2.0296739727517335e-07,
      "loss": 3.2798,
      "mean_token_accuracy": 0.5484513938426971,
      "num_tokens": 4376428.0,
      "step": 1250
    },
    {
      "entropy": 4.50932373046875,
      "epoch": 1.951951951951952,
      "grad_norm": 0.7920858860015869,
      "learning_rate": 3.217655638451112e-08,
      "loss": 3.2683,
      "mean_token_accuracy": 0.5490582883358002,
      "num_tokens": 4551678.0,
      "step": 1300
    }
  ],
  "logging_steps": 50,
  "max_steps": 1332,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.767706644103712e+17,
  "train_batch_size": 24,
  "trial_name": null,
  "trial_params": null
}