| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 15, |
| "global_step": 60, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.181640625, |
| "epoch": 0.03361344537815126, |
| "grad_norm": 1.7928742767831207, |
| "learning_rate": 0.0, |
| "loss": 1.3131, |
| "mean_token_accuracy": 0.6369074434041977, |
| "num_tokens": 75310.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.119140625, |
| "epoch": 0.06722689075630252, |
| "grad_norm": 1.6506218424352948, |
| "learning_rate": 3e-05, |
| "loss": 1.3469, |
| "mean_token_accuracy": 0.6300242394208908, |
| "num_tokens": 153607.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.1728515625, |
| "epoch": 0.13445378151260504, |
| "grad_norm": 1.3796606244503395, |
| "learning_rate": 2.9482758620689654e-05, |
| "loss": 1.289, |
| "mean_token_accuracy": 0.6425867825746536, |
| "num_tokens": 289000.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.2216796875, |
| "epoch": 0.20168067226890757, |
| "grad_norm": 3.4206735699743493, |
| "learning_rate": 2.8448275862068966e-05, |
| "loss": 1.2113, |
| "mean_token_accuracy": 0.6542532071471214, |
| "num_tokens": 438702.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.259765625, |
| "epoch": 0.2689075630252101, |
| "grad_norm": 2.2379649541100193, |
| "learning_rate": 2.741379310344828e-05, |
| "loss": 1.1519, |
| "mean_token_accuracy": 0.663833349943161, |
| "num_tokens": 580037.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.2392578125, |
| "epoch": 0.33613445378151263, |
| "grad_norm": 1.1181315509763536, |
| "learning_rate": 2.6379310344827588e-05, |
| "loss": 1.097, |
| "mean_token_accuracy": 0.6762372255325317, |
| "num_tokens": 724254.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.1494140625, |
| "epoch": 0.40336134453781514, |
| "grad_norm": 1.191042616939082, |
| "learning_rate": 2.5344827586206897e-05, |
| "loss": 1.088, |
| "mean_token_accuracy": 0.6783676072955132, |
| "num_tokens": 875036.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.154296875, |
| "epoch": 0.47058823529411764, |
| "grad_norm": 1.249507656809331, |
| "learning_rate": 2.4310344827586206e-05, |
| "loss": 1.0712, |
| "mean_token_accuracy": 0.676607720553875, |
| "num_tokens": 1023337.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.5042016806722689, |
| "eval_entropy": 1.1395089285714286, |
| "eval_loss": 1.0697113275527954, |
| "eval_mean_token_accuracy": 0.6743263857705253, |
| "eval_num_tokens": 1094042.0, |
| "eval_runtime": 14.7587, |
| "eval_samples_per_second": 3.388, |
| "eval_steps_per_second": 0.474, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.1630859375, |
| "epoch": 0.5378151260504201, |
| "grad_norm": 1.3489797487532325, |
| "learning_rate": 2.327586206896552e-05, |
| "loss": 1.0461, |
| "mean_token_accuracy": 0.6843359917402267, |
| "num_tokens": 1168826.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 1.1298828125, |
| "epoch": 0.6050420168067226, |
| "grad_norm": 1.4108132020883717, |
| "learning_rate": 2.2241379310344828e-05, |
| "loss": 1.0388, |
| "mean_token_accuracy": 0.6870746314525604, |
| "num_tokens": 1322246.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 1.1669921875, |
| "epoch": 0.6722689075630253, |
| "grad_norm": 1.3354043806449676, |
| "learning_rate": 2.1206896551724137e-05, |
| "loss": 1.0249, |
| "mean_token_accuracy": 0.688920646905899, |
| "num_tokens": 1464063.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.16015625, |
| "epoch": 0.7394957983193278, |
| "grad_norm": 1.666062653107746, |
| "learning_rate": 2.017241379310345e-05, |
| "loss": 1.0323, |
| "mean_token_accuracy": 0.6870173960924149, |
| "num_tokens": 1600863.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 1.1806640625, |
| "epoch": 0.8067226890756303, |
| "grad_norm": 1.2035574738536507, |
| "learning_rate": 1.9137931034482762e-05, |
| "loss": 1.0128, |
| "mean_token_accuracy": 0.6902513056993484, |
| "num_tokens": 1772011.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 1.1025390625, |
| "epoch": 0.8739495798319328, |
| "grad_norm": 1.0664858711055925, |
| "learning_rate": 1.8103448275862068e-05, |
| "loss": 1.014, |
| "mean_token_accuracy": 0.6933257803320885, |
| "num_tokens": 1928733.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 1.12744140625, |
| "epoch": 0.9411764705882353, |
| "grad_norm": 1.070630709251991, |
| "learning_rate": 1.706896551724138e-05, |
| "loss": 0.994, |
| "mean_token_accuracy": 0.6967712193727493, |
| "num_tokens": 2068126.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 1.1819196428571428, |
| "epoch": 1.0, |
| "grad_norm": 2.6295169243673575, |
| "learning_rate": 1.603448275862069e-05, |
| "loss": 0.9902, |
| "mean_token_accuracy": 0.6980065788541522, |
| "num_tokens": 2182360.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 1.125, |
| "eval_loss": 1.0098644495010376, |
| "eval_mean_token_accuracy": 0.6880264622824532, |
| "eval_num_tokens": 2182360.0, |
| "eval_runtime": 14.5734, |
| "eval_samples_per_second": 3.431, |
| "eval_steps_per_second": 0.48, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.099609375, |
| "epoch": 1.0672268907563025, |
| "grad_norm": 2.050074542522579, |
| "learning_rate": 1.5e-05, |
| "loss": 0.8933, |
| "mean_token_accuracy": 0.7218180298805237, |
| "num_tokens": 2325564.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 1.16015625, |
| "epoch": 1.134453781512605, |
| "grad_norm": 1.0945630850199306, |
| "learning_rate": 1.396551724137931e-05, |
| "loss": 0.9022, |
| "mean_token_accuracy": 0.7185544371604919, |
| "num_tokens": 2478474.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 1.0927734375, |
| "epoch": 1.2016806722689075, |
| "grad_norm": 1.4418349180955692, |
| "learning_rate": 1.293103448275862e-05, |
| "loss": 0.8661, |
| "mean_token_accuracy": 0.7244188115000725, |
| "num_tokens": 2626615.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 1.052734375, |
| "epoch": 1.26890756302521, |
| "grad_norm": 1.1722563745135475, |
| "learning_rate": 1.1896551724137931e-05, |
| "loss": 0.8564, |
| "mean_token_accuracy": 0.7331436201930046, |
| "num_tokens": 2778957.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 1.0771484375, |
| "epoch": 1.3361344537815127, |
| "grad_norm": 1.2206493994010927, |
| "learning_rate": 1.0862068965517242e-05, |
| "loss": 0.8483, |
| "mean_token_accuracy": 0.731599785387516, |
| "num_tokens": 2934688.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 1.403361344537815, |
| "grad_norm": 1.213607284992565, |
| "learning_rate": 9.827586206896551e-06, |
| "loss": 0.8423, |
| "mean_token_accuracy": 0.7331928312778473, |
| "num_tokens": 3078722.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 1.0634765625, |
| "epoch": 1.4705882352941178, |
| "grad_norm": 1.8432998759311154, |
| "learning_rate": 8.793103448275862e-06, |
| "loss": 0.8403, |
| "mean_token_accuracy": 0.7300194650888443, |
| "num_tokens": 3222738.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 1.504201680672269, |
| "eval_entropy": 1.05859375, |
| "eval_loss": 0.999748706817627, |
| "eval_mean_token_accuracy": 0.6920892340796334, |
| "eval_num_tokens": 3300341.0, |
| "eval_runtime": 14.5813, |
| "eval_samples_per_second": 3.429, |
| "eval_steps_per_second": 0.48, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 1.53781512605042, |
| "grad_norm": 1.2747974851902248, |
| "learning_rate": 7.758620689655173e-06, |
| "loss": 0.8441, |
| "mean_token_accuracy": 0.7343461066484451, |
| "num_tokens": 3379604.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 1.03125, |
| "epoch": 1.6050420168067228, |
| "grad_norm": 1.5541516528133, |
| "learning_rate": 6.724137931034483e-06, |
| "loss": 0.8433, |
| "mean_token_accuracy": 0.7312941700220108, |
| "num_tokens": 3524464.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 1.099609375, |
| "epoch": 1.6722689075630253, |
| "grad_norm": 1.4541114741349364, |
| "learning_rate": 5.689655172413793e-06, |
| "loss": 0.8468, |
| "mean_token_accuracy": 0.7327947691082954, |
| "num_tokens": 3671601.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.0830078125, |
| "epoch": 1.7394957983193278, |
| "grad_norm": 1.3250573197747766, |
| "learning_rate": 4.655172413793104e-06, |
| "loss": 0.8337, |
| "mean_token_accuracy": 0.7332699969410896, |
| "num_tokens": 3831435.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 1.8067226890756303, |
| "grad_norm": 1.9274897237742625, |
| "learning_rate": 3.620689655172414e-06, |
| "loss": 0.8375, |
| "mean_token_accuracy": 0.7363650351762772, |
| "num_tokens": 3970690.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 1.0380859375, |
| "epoch": 1.8739495798319328, |
| "grad_norm": 1.3947288532125532, |
| "learning_rate": 2.5862068965517246e-06, |
| "loss": 0.8176, |
| "mean_token_accuracy": 0.7397993430495262, |
| "num_tokens": 4116319.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 1.0576171875, |
| "epoch": 1.9411764705882353, |
| "grad_norm": 1.2609408019593042, |
| "learning_rate": 1.5517241379310346e-06, |
| "loss": 0.8193, |
| "mean_token_accuracy": 0.7374719008803368, |
| "num_tokens": 4261278.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 1.0569196428571428, |
| "epoch": 2.0, |
| "grad_norm": 2.203503634167548, |
| "learning_rate": 5.172413793103448e-07, |
| "loss": 0.8165, |
| "mean_token_accuracy": 0.7374913011278424, |
| "num_tokens": 4371243.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 1.0647321428571428, |
| "eval_loss": 0.9897834658622742, |
| "eval_mean_token_accuracy": 0.6942669238362994, |
| "eval_num_tokens": 4371243.0, |
| "eval_runtime": 14.5645, |
| "eval_samples_per_second": 3.433, |
| "eval_steps_per_second": 0.481, |
| "step": 60 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 60, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 30, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 209132409847808.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|