{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.790419161676647,
  "eval_steps": 500,
  "global_step": 369,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 0.7001179456710815,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.4338,
      "step": 10
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 0.27172571420669556,
      "learning_rate": 9.990516643685222e-05,
      "loss": 0.8267,
      "step": 20
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 0.19638711214065552,
      "learning_rate": 9.944154131125642e-05,
      "loss": 0.4881,
      "step": 30
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.20754235982894897,
      "learning_rate": 9.859528969650738e-05,
      "loss": 0.3591,
      "step": 40
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9034878200745683,
      "eval_loss": 0.39884674549102783,
      "eval_runtime": 30.4717,
      "eval_samples_per_second": 12.635,
      "eval_steps_per_second": 2.133,
      "step": 42
    },
    {
      "epoch": 1.1916167664670658,
      "grad_norm": 0.16468322277069092,
      "learning_rate": 9.737296070648186e-05,
      "loss": 0.4172,
      "step": 50
    },
    {
      "epoch": 1.4311377245508983,
      "grad_norm": 0.1612997055053711,
      "learning_rate": 9.57840139057007e-05,
      "loss": 0.3893,
      "step": 60
    },
    {
      "epoch": 1.6706586826347305,
      "grad_norm": 0.16499397158622742,
      "learning_rate": 9.384074610206495e-05,
      "loss": 0.3658,
      "step": 70
    },
    {
      "epoch": 1.910179640718563,
      "grad_norm": 0.14762192964553833,
      "learning_rate": 9.155819618225708e-05,
      "loss": 0.2673,
      "step": 80
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9196909824354998,
      "eval_loss": 0.325968861579895,
      "eval_runtime": 30.3583,
      "eval_samples_per_second": 12.682,
      "eval_steps_per_second": 2.141,
      "step": 84
    },
    {
      "epoch": 2.143712574850299,
      "grad_norm": 0.1777992993593216,
      "learning_rate": 8.895402872628352e-05,
      "loss": 0.3165,
      "step": 90
    },
    {
      "epoch": 2.3832335329341316,
      "grad_norm": 0.2070295810699463,
      "learning_rate": 8.604839730186125e-05,
      "loss": 0.336,
      "step": 100
    },
    {
      "epoch": 2.622754491017964,
      "grad_norm": 0.1750418245792389,
      "learning_rate": 8.286378849660896e-05,
      "loss": 0.288,
      "step": 110
    },
    {
      "epoch": 2.8622754491017965,
      "grad_norm": 0.20214907824993134,
      "learning_rate": 7.942484789507283e-05,
      "loss": 0.2425,
      "step": 120
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9289047847079928,
      "eval_loss": 0.289765864610672,
      "eval_runtime": 30.3541,
      "eval_samples_per_second": 12.684,
      "eval_steps_per_second": 2.141,
      "step": 126
    },
    {
      "epoch": 3.095808383233533,
      "grad_norm": 0.2640642523765564,
      "learning_rate": 7.57581893473448e-05,
      "loss": 0.2304,
      "step": 130
    },
    {
      "epoch": 3.3353293413173652,
      "grad_norm": 0.26811081171035767,
      "learning_rate": 7.18921890053375e-05,
      "loss": 0.2905,
      "step": 140
    },
    {
      "epoch": 3.5748502994011977,
      "grad_norm": 0.27033841609954834,
      "learning_rate": 6.785676572066225e-05,
      "loss": 0.2336,
      "step": 150
    },
    {
      "epoch": 3.81437125748503,
      "grad_norm": 0.22121788561344147,
      "learning_rate": 6.368314950360415e-05,
      "loss": 0.2069,
      "step": 160
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9345709163732674,
      "eval_loss": 0.2659221291542053,
      "eval_runtime": 30.3929,
      "eval_samples_per_second": 12.667,
      "eval_steps_per_second": 2.139,
      "step": 168
    },
    {
      "epoch": 4.047904191616767,
      "grad_norm": 0.2626728117465973,
      "learning_rate": 5.940363983508257e-05,
      "loss": 0.1815,
      "step": 170
    },
    {
      "epoch": 4.287425149700598,
      "grad_norm": 0.2664365768432617,
      "learning_rate": 5.5051355702012893e-05,
      "loss": 0.2426,
      "step": 180
    },
    {
      "epoch": 4.526946107784431,
      "grad_norm": 0.2793976962566376,
      "learning_rate": 5.0659979290537954e-05,
      "loss": 0.1998,
      "step": 190
    },
    {
      "epoch": 4.766467065868263,
      "grad_norm": 0.2911984622478485,
      "learning_rate": 4.626349532067879e-05,
      "loss": 0.1713,
      "step": 200
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.2870716452598572,
      "learning_rate": 4.189592803968563e-05,
      "loss": 0.138,
      "step": 210
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9391251537470097,
      "eval_loss": 0.2524815797805786,
      "eval_runtime": 30.4065,
      "eval_samples_per_second": 12.662,
      "eval_steps_per_second": 2.138,
      "step": 210
    },
    {
      "epoch": 5.2395209580838324,
      "grad_norm": 0.3472362756729126,
      "learning_rate": 3.759107790948882e-05,
      "loss": 0.199,
      "step": 220
    },
    {
      "epoch": 5.479041916167665,
      "grad_norm": 0.33083346486091614,
      "learning_rate": 3.338226002601703e-05,
      "loss": 0.1654,
      "step": 230
    },
    {
      "epoch": 5.718562874251497,
      "grad_norm": 0.3220432996749878,
      "learning_rate": 2.9302046294747497e-05,
      "loss": 0.1459,
      "step": 240
    },
    {
      "epoch": 5.95808383233533,
      "grad_norm": 0.35546302795410156,
      "learning_rate": 2.5382013357782893e-05,
      "loss": 0.1251,
      "step": 250
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9430640763868036,
      "eval_loss": 0.24969537556171417,
      "eval_runtime": 30.3846,
      "eval_samples_per_second": 12.671,
      "eval_steps_per_second": 2.139,
      "step": 252
    },
    {
      "epoch": 6.191616766467066,
      "grad_norm": 0.35284000635147095,
      "learning_rate": 2.1652498223239427e-05,
      "loss": 0.1486,
      "step": 260
    },
    {
      "epoch": 6.431137724550898,
      "grad_norm": 0.31098905205726624,
      "learning_rate": 1.814236348812211e-05,
      "loss": 0.1527,
      "step": 270
    },
    {
      "epoch": 6.6706586826347305,
      "grad_norm": 0.34438222646713257,
      "learning_rate": 1.4878773971620074e-05,
      "loss": 0.1344,
      "step": 280
    },
    {
      "epoch": 6.910179640718563,
      "grad_norm": 0.2907264232635498,
      "learning_rate": 1.1886986487449475e-05,
      "loss": 0.1072,
      "step": 290
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.9438642485491895,
      "eval_loss": 0.24749523401260376,
      "eval_runtime": 30.43,
      "eval_samples_per_second": 12.652,
      "eval_steps_per_second": 2.136,
      "step": 294
    },
    {
      "epoch": 7.1437125748503,
      "grad_norm": 0.3019481301307678,
      "learning_rate": 9.190154382188921e-06,
      "loss": 0.1224,
      "step": 300
    },
    {
      "epoch": 7.383233532934132,
      "grad_norm": 0.29105937480926514,
      "learning_rate": 6.809148352279182e-06,
      "loss": 0.1493,
      "step": 310
    },
    {
      "epoch": 7.6227544910179645,
      "grad_norm": 0.2779920697212219,
      "learning_rate": 4.762394926378477e-06,
      "loss": 0.1126,
      "step": 320
    },
    {
      "epoch": 7.862275449101796,
      "grad_norm": 0.3359215557575226,
      "learning_rate": 3.065733863053072e-06,
      "loss": 0.1059,
      "step": 330
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.944612993482476,
      "eval_loss": 0.2483372986316681,
      "eval_runtime": 30.4006,
      "eval_samples_per_second": 12.664,
      "eval_steps_per_second": 2.138,
      "step": 336
    },
    {
      "epoch": 8.095808383233534,
      "grad_norm": 0.26059141755104065,
      "learning_rate": 1.7322955673980678e-06,
      "loss": 0.0986,
      "step": 340
    },
    {
      "epoch": 8.335329341317365,
      "grad_norm": 0.26576822996139526,
      "learning_rate": 7.723994752570462e-07,
      "loss": 0.1465,
      "step": 350
    },
    {
      "epoch": 8.574850299401197,
      "grad_norm": 0.2771126329898834,
      "learning_rate": 1.9347419144180035e-07,
      "loss": 0.1073,
      "step": 360
    },
    {
      "epoch": 8.790419161676647,
      "eval_accuracy": 0.9448452683547727,
      "eval_loss": 0.24962204694747925,
      "eval_runtime": 30.226,
      "eval_samples_per_second": 12.737,
      "eval_steps_per_second": 2.15,
      "step": 369
    },
    {
      "epoch": 8.790419161676647,
      "step": 369,
      "total_flos": 3.210182982232965e+17,
      "train_loss": 0.2637010880602085,
      "train_runtime": 2751.4148,
      "train_samples_per_second": 3.271,
      "train_steps_per_second": 0.134
    }
  ],
  "logging_steps": 10,
  "max_steps": 369,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.210182982232965e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}