{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 275,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03636363636363636,
      "grad_norm": 3.3212404251098633,
      "learning_rate": 0.00019345454545454546,
      "loss": 1.5788,
      "mean_token_accuracy": 0.6589842185378074,
      "num_tokens": 11127.0,
      "step": 10
    },
    {
      "epoch": 0.07272727272727272,
      "grad_norm": 2.2878754138946533,
      "learning_rate": 0.0001861818181818182,
      "loss": 1.2084,
      "mean_token_accuracy": 0.7237757340073585,
      "num_tokens": 22177.0,
      "step": 20
    },
    {
      "epoch": 0.10909090909090909,
      "grad_norm": 3.08931303024292,
      "learning_rate": 0.00017890909090909093,
      "loss": 0.9623,
      "mean_token_accuracy": 0.7773568660020829,
      "num_tokens": 33167.0,
      "step": 30
    },
    {
      "epoch": 0.14545454545454545,
      "grad_norm": 3.2146613597869873,
      "learning_rate": 0.00017163636363636364,
      "loss": 0.6221,
      "mean_token_accuracy": 0.8660929881036281,
      "num_tokens": 44818.0,
      "step": 40
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 2.5239977836608887,
      "learning_rate": 0.00016436363636363637,
      "loss": 0.7484,
      "mean_token_accuracy": 0.8272845461964607,
      "num_tokens": 55735.0,
      "step": 50
    },
    {
      "epoch": 0.21818181818181817,
      "grad_norm": 2.0953986644744873,
      "learning_rate": 0.00015709090909090908,
      "loss": 0.4557,
      "mean_token_accuracy": 0.9092939600348473,
      "num_tokens": 66765.0,
      "step": 60
    },
    {
      "epoch": 0.2545454545454545,
      "grad_norm": 2.588249444961548,
      "learning_rate": 0.00014981818181818184,
      "loss": 0.6584,
      "mean_token_accuracy": 0.8400282755494117,
      "num_tokens": 77331.0,
      "step": 70
    },
    {
      "epoch": 0.2909090909090909,
      "grad_norm": 2.2984700202941895,
      "learning_rate": 0.00014254545454545455,
      "loss": 0.6106,
      "mean_token_accuracy": 0.8571661487221718,
      "num_tokens": 88227.0,
      "step": 80
    },
    {
      "epoch": 0.32727272727272727,
      "grad_norm": 2.475771427154541,
      "learning_rate": 0.00013527272727272729,
      "loss": 0.5752,
      "mean_token_accuracy": 0.8697961375117302,
      "num_tokens": 98777.0,
      "step": 90
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 2.535799980163574,
      "learning_rate": 0.00012800000000000002,
      "loss": 0.4911,
      "mean_token_accuracy": 0.89216850399971,
      "num_tokens": 109773.0,
      "step": 100
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.4313271045684814,
      "learning_rate": 0.00012072727272727273,
      "loss": 0.6944,
      "mean_token_accuracy": 0.8319280952215194,
      "num_tokens": 120110.0,
      "step": 110
    },
    {
      "epoch": 0.43636363636363634,
      "grad_norm": 2.3609564304351807,
      "learning_rate": 0.00011345454545454545,
      "loss": 0.568,
      "mean_token_accuracy": 0.8562197998166085,
      "num_tokens": 130750.0,
      "step": 120
    },
    {
      "epoch": 0.4727272727272727,
      "grad_norm": 2.44640851020813,
      "learning_rate": 0.00010618181818181819,
      "loss": 0.452,
      "mean_token_accuracy": 0.8784870430827141,
      "num_tokens": 141506.0,
      "step": 130
    },
    {
      "epoch": 0.509090909090909,
      "grad_norm": 2.0682320594787598,
      "learning_rate": 9.890909090909092e-05,
      "loss": 0.6779,
      "mean_token_accuracy": 0.8436270222067833,
      "num_tokens": 152717.0,
      "step": 140
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 1.1546458005905151,
      "learning_rate": 9.163636363636364e-05,
      "loss": 0.4282,
      "mean_token_accuracy": 0.8993956357240677,
      "num_tokens": 164030.0,
      "step": 150
    },
    {
      "epoch": 0.5818181818181818,
      "grad_norm": 1.2640266418457031,
      "learning_rate": 8.436363636363637e-05,
      "loss": 0.538,
      "mean_token_accuracy": 0.8718804702162742,
      "num_tokens": 175458.0,
      "step": 160
    },
    {
      "epoch": 0.6181818181818182,
      "grad_norm": 1.5814223289489746,
      "learning_rate": 7.709090909090909e-05,
      "loss": 0.4352,
      "mean_token_accuracy": 0.8879004895687104,
      "num_tokens": 186778.0,
      "step": 170
    },
    {
      "epoch": 0.6545454545454545,
      "grad_norm": 1.8492006063461304,
      "learning_rate": 6.981818181818182e-05,
      "loss": 0.5202,
      "mean_token_accuracy": 0.8709954559803009,
      "num_tokens": 197630.0,
      "step": 180
    },
    {
      "epoch": 0.6909090909090909,
      "grad_norm": 1.8375712633132935,
      "learning_rate": 6.254545454545456e-05,
      "loss": 0.4214,
      "mean_token_accuracy": 0.8964368030428886,
      "num_tokens": 208414.0,
      "step": 190
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 1.5674086809158325,
      "learning_rate": 5.527272727272727e-05,
      "loss": 0.5739,
      "mean_token_accuracy": 0.8617855116724968,
      "num_tokens": 219498.0,
      "step": 200
    },
    {
      "epoch": 0.7636363636363637,
      "grad_norm": 1.284372329711914,
      "learning_rate": 4.8e-05,
      "loss": 0.4049,
      "mean_token_accuracy": 0.8968110710382462,
      "num_tokens": 229984.0,
      "step": 210
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.6603507995605469,
      "learning_rate": 4.072727272727273e-05,
      "loss": 0.3727,
      "mean_token_accuracy": 0.9041761666536331,
      "num_tokens": 241090.0,
      "step": 220
    },
    {
      "epoch": 0.8363636363636363,
      "grad_norm": 1.6871552467346191,
      "learning_rate": 3.345454545454546e-05,
      "loss": 0.421,
      "mean_token_accuracy": 0.8905564934015274,
      "num_tokens": 252564.0,
      "step": 230
    },
    {
      "epoch": 0.8727272727272727,
      "grad_norm": 1.2738430500030518,
      "learning_rate": 2.6181818181818187e-05,
      "loss": 0.4663,
      "mean_token_accuracy": 0.884245416522026,
      "num_tokens": 263652.0,
      "step": 240
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 1.5028655529022217,
      "learning_rate": 1.890909090909091e-05,
      "loss": 0.2951,
      "mean_token_accuracy": 0.9238881275057793,
      "num_tokens": 274892.0,
      "step": 250
    },
    {
      "epoch": 0.9454545454545454,
      "grad_norm": 1.8719642162322998,
      "learning_rate": 1.1636363636363637e-05,
      "loss": 0.624,
      "mean_token_accuracy": 0.8429773986339569,
      "num_tokens": 285890.0,
      "step": 260
    },
    {
      "epoch": 0.9818181818181818,
      "grad_norm": 0.8801164031028748,
      "learning_rate": 4.363636363636364e-06,
      "loss": 0.4423,
      "mean_token_accuracy": 0.888069073855877,
      "num_tokens": 297643.0,
      "step": 270
    }
  ],
  "logging_steps": 10,
  "max_steps": 275,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4648058643857408e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|