| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 43960, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 2.494273384451866, | |
| "epoch": 0.022747952684258416, | |
| "grad_norm": 18.375, | |
| "learning_rate": 6.991084023229653e-05, | |
| "loss": 2.56, | |
| "mean_token_accuracy": 0.47454266212880614, | |
| "num_tokens": 137746.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 2.1963223298788073, | |
| "epoch": 0.04549590536851683, | |
| "grad_norm": 36.75, | |
| "learning_rate": 6.964345916042913e-05, | |
| "loss": 2.2203, | |
| "mean_token_accuracy": 0.5251638530790805, | |
| "num_tokens": 274474.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 2.071790741562843, | |
| "epoch": 0.06824385805277525, | |
| "grad_norm": 18.625, | |
| "learning_rate": 6.919922168729661e-05, | |
| "loss": 2.0846, | |
| "mean_token_accuracy": 0.5476762084960938, | |
| "num_tokens": 410511.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.9665795323848725, | |
| "epoch": 0.09099181073703366, | |
| "grad_norm": 19.375, | |
| "learning_rate": 6.858039566497577e-05, | |
| "loss": 1.9827, | |
| "mean_token_accuracy": 0.5608653167784214, | |
| "num_tokens": 546387.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.8746658574938775, | |
| "epoch": 0.11373976342129208, | |
| "grad_norm": 14.5, | |
| "learning_rate": 6.779014022785937e-05, | |
| "loss": 1.878, | |
| "mean_token_accuracy": 0.5797940475940704, | |
| "num_tokens": 685188.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.8111666428446769, | |
| "epoch": 0.1364877161055505, | |
| "grad_norm": 25.375, | |
| "learning_rate": 6.683248966513431e-05, | |
| "loss": 1.8199, | |
| "mean_token_accuracy": 0.5904675594568253, | |
| "num_tokens": 823629.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 1.7517779313921928, | |
| "epoch": 0.1592356687898089, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 6.571233282555582e-05, | |
| "loss": 1.7533, | |
| "mean_token_accuracy": 0.6037232976555824, | |
| "num_tokens": 962974.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 1.7067503632903098, | |
| "epoch": 0.18198362147406733, | |
| "grad_norm": 15.25, | |
| "learning_rate": 6.443538815965688e-05, | |
| "loss": 1.7095, | |
| "mean_token_accuracy": 0.6099362835586071, | |
| "num_tokens": 1097086.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "entropy": 1.6539168149232863, | |
| "epoch": 0.20473157415832574, | |
| "grad_norm": 21.25, | |
| "learning_rate": 6.300817452680371e-05, | |
| "loss": 1.6574, | |
| "mean_token_accuracy": 0.6192078518867493, | |
| "num_tokens": 1241889.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "entropy": 1.6205334926843644, | |
| "epoch": 0.22747952684258416, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 6.143797791612843e-05, | |
| "loss": 1.623, | |
| "mean_token_accuracy": 0.6248027366697788, | |
| "num_tokens": 1378131.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "entropy": 1.5953178288936616, | |
| "epoch": 0.2502274795268426, | |
| "grad_norm": 16.25, | |
| "learning_rate": 5.9732814251230294e-05, | |
| "loss": 1.5932, | |
| "mean_token_accuracy": 0.6300182574093341, | |
| "num_tokens": 1513496.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "entropy": 1.5745678083896637, | |
| "epoch": 0.272975432211101, | |
| "grad_norm": 16.5, | |
| "learning_rate": 5.7901388468528755e-05, | |
| "loss": 1.5751, | |
| "mean_token_accuracy": 0.6331984028220177, | |
| "num_tokens": 1651603.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "entropy": 1.576361115962267, | |
| "epoch": 0.29572338489535943, | |
| "grad_norm": 19.5, | |
| "learning_rate": 5.595305007817556e-05, | |
| "loss": 1.5795, | |
| "mean_token_accuracy": 0.6308481710553169, | |
| "num_tokens": 1790851.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "entropy": 1.506616612225771, | |
| "epoch": 0.3184713375796178, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 5.3897745434389274e-05, | |
| "loss": 1.504, | |
| "mean_token_accuracy": 0.6438729543685913, | |
| "num_tokens": 1926179.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "entropy": 1.4861243069171906, | |
| "epoch": 0.34121929026387626, | |
| "grad_norm": 17.75, | |
| "learning_rate": 5.1745966958874055e-05, | |
| "loss": 1.4832, | |
| "mean_token_accuracy": 0.6503438740372658, | |
| "num_tokens": 2062340.0, | |
| "step": 15000 | |
| }, | |
| { | |
| "entropy": 1.4704010844826698, | |
| "epoch": 0.36396724294813465, | |
| "grad_norm": 21.0, | |
| "learning_rate": 4.9508699576539914e-05, | |
| "loss": 1.4694, | |
| "mean_token_accuracy": 0.6534121400117874, | |
| "num_tokens": 2207639.0, | |
| "step": 16000 | |
| }, | |
| { | |
| "entropy": 1.4741187323331832, | |
| "epoch": 0.3867151956323931, | |
| "grad_norm": 19.375, | |
| "learning_rate": 4.7197364636971925e-05, | |
| "loss": 1.4749, | |
| "mean_token_accuracy": 0.6518012735545635, | |
| "num_tokens": 2348887.0, | |
| "step": 17000 | |
| }, | |
| { | |
| "entropy": 1.4345429268479348, | |
| "epoch": 0.4094631483166515, | |
| "grad_norm": 19.25, | |
| "learning_rate": 4.482376160793216e-05, | |
| "loss": 1.4354, | |
| "mean_token_accuracy": 0.6596047645211219, | |
| "num_tokens": 2483718.0, | |
| "step": 18000 | |
| }, | |
| { | |
| "entropy": 1.4003090425133704, | |
| "epoch": 0.4322111010009099, | |
| "grad_norm": 20.625, | |
| "learning_rate": 4.240000783855147e-05, | |
| "loss": 1.3979, | |
| "mean_token_accuracy": 0.6681542407870292, | |
| "num_tokens": 2622656.0, | |
| "step": 19000 | |
| }, | |
| { | |
| "entropy": 1.4163592108488083, | |
| "epoch": 0.4549590536851683, | |
| "grad_norm": 16.5, | |
| "learning_rate": 3.993847669972281e-05, | |
| "loss": 1.4083, | |
| "mean_token_accuracy": 0.664802198857069, | |
| "num_tokens": 2760831.0, | |
| "step": 20000 | |
| }, | |
| { | |
| "entropy": 1.4052879491746426, | |
| "epoch": 0.47770700636942676, | |
| "grad_norm": 23.875, | |
| "learning_rate": 3.745173441749185e-05, | |
| "loss": 1.4074, | |
| "mean_token_accuracy": 0.6652796367108822, | |
| "num_tokens": 2900387.0, | |
| "step": 21000 | |
| }, | |
| { | |
| "entropy": 1.3997035399377347, | |
| "epoch": 0.5004549590536852, | |
| "grad_norm": 25.875, | |
| "learning_rate": 3.495247592191375e-05, | |
| "loss": 1.3972, | |
| "mean_token_accuracy": 0.6675955319404602, | |
| "num_tokens": 3037204.0, | |
| "step": 22000 | |
| }, | |
| { | |
| "entropy": 1.415595789283514, | |
| "epoch": 0.5232029117379435, | |
| "grad_norm": 26.75, | |
| "learning_rate": 3.245346003886994e-05, | |
| "loss": 1.4129, | |
| "mean_token_accuracy": 0.6636776500046253, | |
| "num_tokens": 3179823.0, | |
| "step": 23000 | |
| }, | |
| { | |
| "entropy": 1.379241144567728, | |
| "epoch": 0.545950864422202, | |
| "grad_norm": 20.0, | |
| "learning_rate": 2.996744435569409e-05, | |
| "loss": 1.3739, | |
| "mean_token_accuracy": 0.6730512301325798, | |
| "num_tokens": 3318434.0, | |
| "step": 24000 | |
| }, | |
| { | |
| "entropy": 1.3975665314793586, | |
| "epoch": 0.5686988171064604, | |
| "grad_norm": 20.875, | |
| "learning_rate": 2.7507120093120825e-05, | |
| "loss": 1.3976, | |
| "mean_token_accuracy": 0.6678333807885647, | |
| "num_tokens": 3461132.0, | |
| "step": 25000 | |
| }, | |
| { | |
| "entropy": 1.380986104875803, | |
| "epoch": 0.5914467697907189, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 2.5085047316038814e-05, | |
| "loss": 1.3817, | |
| "mean_token_accuracy": 0.6719385531246662, | |
| "num_tokens": 3601560.0, | |
| "step": 26000 | |
| }, | |
| { | |
| "entropy": 1.385195587992668, | |
| "epoch": 0.6141947224749773, | |
| "grad_norm": 21.25, | |
| "learning_rate": 2.271359081380046e-05, | |
| "loss": 1.3815, | |
| "mean_token_accuracy": 0.6699076734781265, | |
| "num_tokens": 3745377.0, | |
| "step": 27000 | |
| }, | |
| { | |
| "entropy": 1.3656318633258342, | |
| "epoch": 0.6369426751592356, | |
| "grad_norm": 22.5, | |
| "learning_rate": 2.040485697742177e-05, | |
| "loss": 1.357, | |
| "mean_token_accuracy": 0.6759414212107658, | |
| "num_tokens": 3879126.0, | |
| "step": 28000 | |
| }, | |
| { | |
| "entropy": 1.3772443866729736, | |
| "epoch": 0.6596906278434941, | |
| "grad_norm": 26.0, | |
| "learning_rate": 1.8170631995917233e-05, | |
| "loss": 1.3745, | |
| "mean_token_accuracy": 0.6744600256979465, | |
| "num_tokens": 4017925.0, | |
| "step": 29000 | |
| }, | |
| { | |
| "entropy": 1.365410826742649, | |
| "epoch": 0.6824385805277525, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.602232168728024e-05, | |
| "loss": 1.3584, | |
| "mean_token_accuracy": 0.675611907929182, | |
| "num_tokens": 4156599.0, | |
| "step": 30000 | |
| }, | |
| { | |
| "entropy": 1.3873618737459184, | |
| "epoch": 0.705186533212011, | |
| "grad_norm": 13.25, | |
| "learning_rate": 1.3970893271274471e-05, | |
| "loss": 1.3803, | |
| "mean_token_accuracy": 0.6701690441966057, | |
| "num_tokens": 4295734.0, | |
| "step": 31000 | |
| }, | |
| { | |
| "entropy": 1.359022274851799, | |
| "epoch": 0.7279344858962693, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.202681938128876e-05, | |
| "loss": 1.3556, | |
| "mean_token_accuracy": 0.6761138562858104, | |
| "num_tokens": 4431490.0, | |
| "step": 32000 | |
| }, | |
| { | |
| "entropy": 1.3703680724203586, | |
| "epoch": 0.7506824385805277, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.0200024601077386e-05, | |
| "loss": 1.362, | |
| "mean_token_accuracy": 0.6743936349153519, | |
| "num_tokens": 4571660.0, | |
| "step": 33000 | |
| }, | |
| { | |
| "entropy": 1.3698465181291104, | |
| "epoch": 0.7734303912647862, | |
| "grad_norm": 24.25, | |
| "learning_rate": 8.49983479931827e-06, | |
| "loss": 1.3757, | |
| "mean_token_accuracy": 0.6719635992050171, | |
| "num_tokens": 4707985.0, | |
| "step": 34000 | |
| }, | |
| { | |
| "entropy": 1.3624781457483768, | |
| "epoch": 0.7961783439490446, | |
| "grad_norm": 16.875, | |
| "learning_rate": 6.9349295206380985e-06, | |
| "loss": 1.3518, | |
| "mean_token_accuracy": 0.6769196209311485, | |
| "num_tokens": 4844765.0, | |
| "step": 35000 | |
| }, | |
| { | |
| "entropy": 1.3442125609219073, | |
| "epoch": 0.818926296633303, | |
| "grad_norm": 15.25, | |
| "learning_rate": 5.513297676150713e-06, | |
| "loss": 1.3335, | |
| "mean_token_accuracy": 0.6805432761013508, | |
| "num_tokens": 4978615.0, | |
| "step": 36000 | |
| }, | |
| { | |
| "entropy": 1.355837997198105, | |
| "epoch": 0.8416742493175614, | |
| "grad_norm": 16.0, | |
| "learning_rate": 4.242196759710179e-06, | |
| "loss": 1.3377, | |
| "mean_token_accuracy": 0.6823569060564041, | |
| "num_tokens": 5117536.0, | |
| "step": 37000 | |
| }, | |
| { | |
| "entropy": 1.3855277094841003, | |
| "epoch": 0.8644222020018199, | |
| "grad_norm": 17.625, | |
| "learning_rate": 3.1281157980815473e-06, | |
| "loss": 1.3731, | |
| "mean_token_accuracy": 0.6734900210201741, | |
| "num_tokens": 5258021.0, | |
| "step": 38000 | |
| }, | |
| { | |
| "entropy": 1.3616491684913634, | |
| "epoch": 0.8871701546860783, | |
| "grad_norm": 33.25, | |
| "learning_rate": 2.1767422241703795e-06, | |
| "loss": 1.3462, | |
| "mean_token_accuracy": 0.6789434304237366, | |
| "num_tokens": 5395368.0, | |
| "step": 39000 | |
| }, | |
| { | |
| "entropy": 1.3761241393685342, | |
| "epoch": 0.9099181073703366, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.392932842424574e-06, | |
| "loss": 1.3787, | |
| "mean_token_accuracy": 0.6720339168906212, | |
| "num_tokens": 5533958.0, | |
| "step": 40000 | |
| }, | |
| { | |
| "entropy": 1.3613062560856342, | |
| "epoch": 0.9326660600545951, | |
| "grad_norm": 31.375, | |
| "learning_rate": 7.806890346314221e-07, | |
| "loss": 1.3621, | |
| "mean_token_accuracy": 0.6752942685186863, | |
| "num_tokens": 5668914.0, | |
| "step": 41000 | |
| }, | |
| { | |
| "entropy": 1.3506658849716187, | |
| "epoch": 0.9554140127388535, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 3.431363326854719e-07, | |
| "loss": 1.3478, | |
| "mean_token_accuracy": 0.6773408466875553, | |
| "num_tokens": 5806407.0, | |
| "step": 42000 | |
| }, | |
| { | |
| "entropy": 1.3391023662090302, | |
| "epoch": 0.978161965423112, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8.250846260903998e-08, | |
| "loss": 1.3275, | |
| "mean_token_accuracy": 0.68198375582695, | |
| "num_tokens": 5942532.0, | |
| "step": 43000 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 43960, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3040032742440704e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |