{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 43960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.494273384451866, "epoch": 0.022747952684258416, "grad_norm": 18.375, "learning_rate": 6.991084023229653e-05, "loss": 2.56, "mean_token_accuracy": 0.47454266212880614, "num_tokens": 137746.0, "step": 1000 }, { "entropy": 2.1963223298788073, "epoch": 0.04549590536851683, "grad_norm": 36.75, "learning_rate": 6.964345916042913e-05, "loss": 2.2203, "mean_token_accuracy": 0.5251638530790805, "num_tokens": 274474.0, "step": 2000 }, { "entropy": 2.071790741562843, "epoch": 0.06824385805277525, "grad_norm": 18.625, "learning_rate": 6.919922168729661e-05, "loss": 2.0846, "mean_token_accuracy": 0.5476762084960938, "num_tokens": 410511.0, "step": 3000 }, { "entropy": 1.9665795323848725, "epoch": 0.09099181073703366, "grad_norm": 19.375, "learning_rate": 6.858039566497577e-05, "loss": 1.9827, "mean_token_accuracy": 0.5608653167784214, "num_tokens": 546387.0, "step": 4000 }, { "entropy": 1.8746658574938775, "epoch": 0.11373976342129208, "grad_norm": 14.5, "learning_rate": 6.779014022785937e-05, "loss": 1.878, "mean_token_accuracy": 0.5797940475940704, "num_tokens": 685188.0, "step": 5000 }, { "entropy": 1.8111666428446769, "epoch": 0.1364877161055505, "grad_norm": 25.375, "learning_rate": 6.683248966513431e-05, "loss": 1.8199, "mean_token_accuracy": 0.5904675594568253, "num_tokens": 823629.0, "step": 6000 }, { "entropy": 1.7517779313921928, "epoch": 0.1592356687898089, "grad_norm": 15.0625, "learning_rate": 6.571233282555582e-05, "loss": 1.7533, "mean_token_accuracy": 0.6037232976555824, "num_tokens": 962974.0, "step": 7000 }, { "entropy": 1.7067503632903098, "epoch": 0.18198362147406733, "grad_norm": 15.25, "learning_rate": 6.443538815965688e-05, "loss": 1.7095, "mean_token_accuracy": 0.6099362835586071, "num_tokens": 1097086.0, "step": 8000 }, { "entropy": 1.6539168149232863, "epoch": 0.20473157415832574, "grad_norm": 21.25, "learning_rate": 6.300817452680371e-05, "loss": 1.6574, "mean_token_accuracy": 0.6192078518867493, "num_tokens": 1241889.0, "step": 9000 }, { "entropy": 1.6205334926843644, "epoch": 0.22747952684258416, "grad_norm": 11.5625, "learning_rate": 6.143797791612843e-05, "loss": 1.623, "mean_token_accuracy": 0.6248027366697788, "num_tokens": 1378131.0, "step": 10000 }, { "entropy": 1.5953178288936616, "epoch": 0.2502274795268426, "grad_norm": 16.25, "learning_rate": 5.9732814251230294e-05, "loss": 1.5932, "mean_token_accuracy": 0.6300182574093341, "num_tokens": 1513496.0, "step": 11000 }, { "entropy": 1.5745678083896637, "epoch": 0.272975432211101, "grad_norm": 16.5, "learning_rate": 5.7901388468528755e-05, "loss": 1.5751, "mean_token_accuracy": 0.6331984028220177, "num_tokens": 1651603.0, "step": 12000 }, { "entropy": 1.576361115962267, "epoch": 0.29572338489535943, "grad_norm": 19.5, "learning_rate": 5.595305007817556e-05, "loss": 1.5795, "mean_token_accuracy": 0.6308481710553169, "num_tokens": 1790851.0, "step": 13000 }, { "entropy": 1.506616612225771, "epoch": 0.3184713375796178, "grad_norm": 15.8125, "learning_rate": 5.3897745434389274e-05, "loss": 1.504, "mean_token_accuracy": 0.6438729543685913, "num_tokens": 1926179.0, "step": 14000 }, { "entropy": 1.4861243069171906, "epoch": 0.34121929026387626, "grad_norm": 17.75, "learning_rate": 5.1745966958874055e-05, "loss": 1.4832, "mean_token_accuracy": 0.6503438740372658, "num_tokens": 2062340.0, "step": 15000 }, { "entropy": 1.4704010844826698, "epoch": 0.36396724294813465, "grad_norm": 21.0, "learning_rate": 4.9508699576539914e-05, "loss": 1.4694, "mean_token_accuracy": 0.6534121400117874, "num_tokens": 2207639.0, "step": 16000 }, { "entropy": 1.4741187323331832, "epoch": 0.3867151956323931, "grad_norm": 19.375, "learning_rate": 4.7197364636971925e-05, "loss": 1.4749, "mean_token_accuracy": 0.6518012735545635, "num_tokens": 2348887.0, "step": 17000 }, { "entropy": 1.4345429268479348, "epoch": 0.4094631483166515, "grad_norm": 19.25, "learning_rate": 4.482376160793216e-05, "loss": 1.4354, "mean_token_accuracy": 0.6596047645211219, "num_tokens": 2483718.0, "step": 18000 }, { "entropy": 1.4003090425133704, "epoch": 0.4322111010009099, "grad_norm": 20.625, "learning_rate": 4.240000783855147e-05, "loss": 1.3979, "mean_token_accuracy": 0.6681542407870292, "num_tokens": 2622656.0, "step": 19000 }, { "entropy": 1.4163592108488083, "epoch": 0.4549590536851683, "grad_norm": 16.5, "learning_rate": 3.993847669972281e-05, "loss": 1.4083, "mean_token_accuracy": 0.664802198857069, "num_tokens": 2760831.0, "step": 20000 }, { "entropy": 1.4052879491746426, "epoch": 0.47770700636942676, "grad_norm": 23.875, "learning_rate": 3.745173441749185e-05, "loss": 1.4074, "mean_token_accuracy": 0.6652796367108822, "num_tokens": 2900387.0, "step": 21000 }, { "entropy": 1.3997035399377347, "epoch": 0.5004549590536852, "grad_norm": 25.875, "learning_rate": 3.495247592191375e-05, "loss": 1.3972, "mean_token_accuracy": 0.6675955319404602, "num_tokens": 3037204.0, "step": 22000 }, { "entropy": 1.415595789283514, "epoch": 0.5232029117379435, "grad_norm": 26.75, "learning_rate": 3.245346003886994e-05, "loss": 1.4129, "mean_token_accuracy": 0.6636776500046253, "num_tokens": 3179823.0, "step": 23000 }, { "entropy": 1.379241144567728, "epoch": 0.545950864422202, "grad_norm": 20.0, "learning_rate": 2.996744435569409e-05, "loss": 1.3739, "mean_token_accuracy": 0.6730512301325798, "num_tokens": 3318434.0, "step": 24000 }, { "entropy": 1.3975665314793586, "epoch": 0.5686988171064604, "grad_norm": 20.875, "learning_rate": 2.7507120093120825e-05, "loss": 1.3976, "mean_token_accuracy": 0.6678333807885647, "num_tokens": 3461132.0, "step": 25000 }, { "entropy": 1.380986104875803, "epoch": 0.5914467697907189, "grad_norm": 14.6875, "learning_rate": 2.5085047316038814e-05, "loss": 1.3817, "mean_token_accuracy": 0.6719385531246662, "num_tokens": 3601560.0, "step": 26000 }, { "entropy": 1.385195587992668, "epoch": 0.6141947224749773, "grad_norm": 21.25, "learning_rate": 2.271359081380046e-05, "loss": 1.3815, "mean_token_accuracy": 0.6699076734781265, "num_tokens": 3745377.0, "step": 27000 }, { "entropy": 1.3656318633258342, "epoch": 0.6369426751592356, "grad_norm": 22.5, "learning_rate": 2.040485697742177e-05, "loss": 1.357, "mean_token_accuracy": 0.6759414212107658, "num_tokens": 3879126.0, "step": 28000 }, { "entropy": 1.3772443866729736, "epoch": 0.6596906278434941, "grad_norm": 26.0, "learning_rate": 1.8170631995917233e-05, "loss": 1.3745, "mean_token_accuracy": 0.6744600256979465, "num_tokens": 4017925.0, "step": 29000 }, { "entropy": 1.365410826742649, "epoch": 0.6824385805277525, "grad_norm": 22.5, "learning_rate": 1.602232168728024e-05, "loss": 1.3584, "mean_token_accuracy": 0.675611907929182, "num_tokens": 4156599.0, "step": 30000 }, { "entropy": 1.3873618737459184, "epoch": 0.705186533212011, "grad_norm": 13.25, "learning_rate": 1.3970893271274471e-05, "loss": 1.3803, "mean_token_accuracy": 0.6701690441966057, "num_tokens": 4295734.0, "step": 31000 }, { "entropy": 1.359022274851799, "epoch": 0.7279344858962693, "grad_norm": 16.75, "learning_rate": 1.202681938128876e-05, "loss": 1.3556, "mean_token_accuracy": 0.6761138562858104, "num_tokens": 4431490.0, "step": 32000 }, { "entropy": 1.3703680724203586, "epoch": 0.7506824385805277, "grad_norm": 19.0, "learning_rate": 1.0200024601077386e-05, "loss": 1.362, "mean_token_accuracy": 0.6743936349153519, "num_tokens": 4571660.0, "step": 33000 }, { "entropy": 1.3698465181291104, "epoch": 0.7734303912647862, "grad_norm": 24.25, "learning_rate": 8.49983479931827e-06, "loss": 1.3757, "mean_token_accuracy": 0.6719635992050171, "num_tokens": 4707985.0, "step": 34000 }, { "entropy": 1.3624781457483768, "epoch": 0.7961783439490446, "grad_norm": 16.875, "learning_rate": 6.9349295206380985e-06, "loss": 1.3518, "mean_token_accuracy": 0.6769196209311485, "num_tokens": 4844765.0, "step": 35000 }, { "entropy": 1.3442125609219073, "epoch": 0.818926296633303, "grad_norm": 15.25, "learning_rate": 5.513297676150713e-06, "loss": 1.3335, "mean_token_accuracy": 0.6805432761013508, "num_tokens": 4978615.0, "step": 36000 }, { "entropy": 1.355837997198105, "epoch": 0.8416742493175614, "grad_norm": 16.0, "learning_rate": 4.242196759710179e-06, "loss": 1.3377, "mean_token_accuracy": 0.6823569060564041, "num_tokens": 5117536.0, "step": 37000 }, { "entropy": 1.3855277094841003, "epoch": 0.8644222020018199, "grad_norm": 17.625, "learning_rate": 3.1281157980815473e-06, "loss": 1.3731, "mean_token_accuracy": 0.6734900210201741, "num_tokens": 5258021.0, "step": 38000 }, { "entropy": 1.3616491684913634, "epoch": 0.8871701546860783, "grad_norm": 33.25, "learning_rate": 2.1767422241703795e-06, "loss": 1.3462, "mean_token_accuracy": 0.6789434304237366, "num_tokens": 5395368.0, "step": 39000 }, { "entropy": 1.3761241393685342, "epoch": 0.9099181073703366, "grad_norm": 16.0, "learning_rate": 1.392932842424574e-06, "loss": 1.3787, "mean_token_accuracy": 0.6720339168906212, "num_tokens": 5533958.0, "step": 40000 }, { "entropy": 1.3613062560856342, "epoch": 0.9326660600545951, "grad_norm": 31.375, "learning_rate": 7.806890346314221e-07, "loss": 1.3621, "mean_token_accuracy": 0.6752942685186863, "num_tokens": 5668914.0, "step": 41000 }, { "entropy": 1.3506658849716187, "epoch": 0.9554140127388535, "grad_norm": 13.8125, "learning_rate": 3.431363326854719e-07, "loss": 1.3478, "mean_token_accuracy": 0.6773408466875553, "num_tokens": 5806407.0, "step": 42000 }, { "entropy": 1.3391023662090302, "epoch": 0.978161965423112, "grad_norm": 13.125, "learning_rate": 8.250846260903998e-08, "loss": 1.3275, "mean_token_accuracy": 0.68198375582695, "num_tokens": 5942532.0, "step": 43000 } ], "logging_steps": 1000, "max_steps": 43960, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3040032742440704e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }