| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19225, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02600780234070221, |
| "grad_norm": 1.6643455028533936, |
| "learning_rate": 4.869960988296489e-05, |
| "loss": 4.3874, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05201560468140442, |
| "grad_norm": 1.7501577138900757, |
| "learning_rate": 4.7399219765929784e-05, |
| "loss": 2.9077, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05201560468140442, |
| "eval_accuracy": 0.4548191024569696, |
| "eval_loss": 2.4249117374420166, |
| "eval_runtime": 53.3056, |
| "eval_samples_per_second": 116.066, |
| "eval_steps_per_second": 3.639, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07802340702210664, |
| "grad_norm": 1.7988471984863281, |
| "learning_rate": 4.609882964889467e-05, |
| "loss": 2.2562, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10403120936280884, |
| "grad_norm": 1.8898983001708984, |
| "learning_rate": 4.479843953185956e-05, |
| "loss": 1.9481, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10403120936280884, |
| "eval_accuracy": 0.575744146154528, |
| "eval_loss": 1.7689398527145386, |
| "eval_runtime": 54.0862, |
| "eval_samples_per_second": 114.391, |
| "eval_steps_per_second": 3.587, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.13003901170351106, |
| "grad_norm": 1.4796483516693115, |
| "learning_rate": 4.349804941482445e-05, |
| "loss": 1.8022, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15604681404421328, |
| "grad_norm": 1.6372228860855103, |
| "learning_rate": 4.219765929778934e-05, |
| "loss": 1.7051, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15604681404421328, |
| "eval_accuracy": 0.607199120408399, |
| "eval_loss": 1.5896528959274292, |
| "eval_runtime": 54.2028, |
| "eval_samples_per_second": 114.145, |
| "eval_steps_per_second": 3.579, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18205461638491546, |
| "grad_norm": 1.3465585708618164, |
| "learning_rate": 4.089726918075423e-05, |
| "loss": 1.6377, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20806241872561768, |
| "grad_norm": 1.3859813213348389, |
| "learning_rate": 3.9596879063719113e-05, |
| "loss": 1.5909, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20806241872561768, |
| "eval_accuracy": 0.6242940571162597, |
| "eval_loss": 1.4958508014678955, |
| "eval_runtime": 54.1171, |
| "eval_samples_per_second": 114.326, |
| "eval_steps_per_second": 3.585, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2340702210663199, |
| "grad_norm": 1.2944873571395874, |
| "learning_rate": 3.8296488946684004e-05, |
| "loss": 1.5494, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.26007802340702213, |
| "grad_norm": 1.2166898250579834, |
| "learning_rate": 3.6996098829648895e-05, |
| "loss": 1.5165, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.26007802340702213, |
| "eval_accuracy": 0.6359300971781876, |
| "eval_loss": 1.4326342344284058, |
| "eval_runtime": 54.1288, |
| "eval_samples_per_second": 114.301, |
| "eval_steps_per_second": 3.584, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.28608582574772434, |
| "grad_norm": 1.2316468954086304, |
| "learning_rate": 3.5695708712613785e-05, |
| "loss": 1.49, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.31209362808842656, |
| "grad_norm": 1.2356523275375366, |
| "learning_rate": 3.4395318595578676e-05, |
| "loss": 1.4645, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.31209362808842656, |
| "eval_accuracy": 0.64413021279917, |
| "eval_loss": 1.3879565000534058, |
| "eval_runtime": 54.3409, |
| "eval_samples_per_second": 113.855, |
| "eval_steps_per_second": 3.57, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3381014304291287, |
| "grad_norm": 1.2244901657104492, |
| "learning_rate": 3.3094928478543566e-05, |
| "loss": 1.4444, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3641092327698309, |
| "grad_norm": 1.1649054288864136, |
| "learning_rate": 3.179453836150845e-05, |
| "loss": 1.4312, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3641092327698309, |
| "eval_accuracy": 0.6512030317407878, |
| "eval_loss": 1.3517647981643677, |
| "eval_runtime": 54.3348, |
| "eval_samples_per_second": 113.868, |
| "eval_steps_per_second": 3.57, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.39011703511053314, |
| "grad_norm": 1.2162460088729858, |
| "learning_rate": 3.049414824447334e-05, |
| "loss": 1.4078, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.41612483745123535, |
| "grad_norm": 1.1526248455047607, |
| "learning_rate": 2.919375812743823e-05, |
| "loss": 1.3952, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.41612483745123535, |
| "eval_accuracy": 0.65624023885102, |
| "eval_loss": 1.3251394033432007, |
| "eval_runtime": 54.0133, |
| "eval_samples_per_second": 114.546, |
| "eval_steps_per_second": 3.592, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.44213263979193757, |
| "grad_norm": 1.1357614994049072, |
| "learning_rate": 2.7893368010403122e-05, |
| "loss": 1.3816, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4681404421326398, |
| "grad_norm": 1.1306383609771729, |
| "learning_rate": 2.659297789336801e-05, |
| "loss": 1.3699, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4681404421326398, |
| "eval_accuracy": 0.6608040919526501, |
| "eval_loss": 1.3029407262802124, |
| "eval_runtime": 53.9707, |
| "eval_samples_per_second": 114.636, |
| "eval_steps_per_second": 3.595, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.494148244473342, |
| "grad_norm": 1.142600655555725, |
| "learning_rate": 2.52925877763329e-05, |
| "loss": 1.3575, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5201560468140443, |
| "grad_norm": 1.1064521074295044, |
| "learning_rate": 2.399219765929779e-05, |
| "loss": 1.3497, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5201560468140443, |
| "eval_accuracy": 0.6642810951793887, |
| "eval_loss": 1.283758521080017, |
| "eval_runtime": 54.92, |
| "eval_samples_per_second": 112.655, |
| "eval_steps_per_second": 3.532, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5461638491547465, |
| "grad_norm": 1.1795347929000854, |
| "learning_rate": 2.269180754226268e-05, |
| "loss": 1.3358, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5721716514954487, |
| "grad_norm": 1.1179834604263306, |
| "learning_rate": 2.1391417425227568e-05, |
| "loss": 1.3312, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5721716514954487, |
| "eval_accuracy": 0.6679980933123578, |
| "eval_loss": 1.2663341760635376, |
| "eval_runtime": 53.252, |
| "eval_samples_per_second": 116.184, |
| "eval_steps_per_second": 3.643, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5981794538361509, |
| "grad_norm": 1.1356099843978882, |
| "learning_rate": 2.009102730819246e-05, |
| "loss": 1.3224, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6241872561768531, |
| "grad_norm": 1.1126354932785034, |
| "learning_rate": 1.879063719115735e-05, |
| "loss": 1.3141, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6241872561768531, |
| "eval_accuracy": 0.670771701330052, |
| "eval_loss": 1.251522183418274, |
| "eval_runtime": 53.6198, |
| "eval_samples_per_second": 115.386, |
| "eval_steps_per_second": 3.618, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6501950585175552, |
| "grad_norm": 1.1169648170471191, |
| "learning_rate": 1.7490247074122236e-05, |
| "loss": 1.3075, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6762028608582574, |
| "grad_norm": 1.0920188426971436, |
| "learning_rate": 1.6189856957087127e-05, |
| "loss": 1.3035, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6762028608582574, |
| "eval_accuracy": 0.6731488990648414, |
| "eval_loss": 1.2399502992630005, |
| "eval_runtime": 54.3769, |
| "eval_samples_per_second": 113.78, |
| "eval_steps_per_second": 3.568, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.7022106631989596, |
| "grad_norm": 1.0997040271759033, |
| "learning_rate": 1.4889466840052016e-05, |
| "loss": 1.2977, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7282184655396619, |
| "grad_norm": 1.1275485754013062, |
| "learning_rate": 1.3589076723016905e-05, |
| "loss": 1.2874, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7282184655396619, |
| "eval_accuracy": 0.6752162679575517, |
| "eval_loss": 1.229522705078125, |
| "eval_runtime": 53.8249, |
| "eval_samples_per_second": 114.947, |
| "eval_steps_per_second": 3.604, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7542262678803641, |
| "grad_norm": 1.1439367532730103, |
| "learning_rate": 1.2288686605981795e-05, |
| "loss": 1.2856, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7802340702210663, |
| "grad_norm": 1.1231589317321777, |
| "learning_rate": 1.0988296488946684e-05, |
| "loss": 1.28, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7802340702210663, |
| "eval_accuracy": 0.6771842577877083, |
| "eval_loss": 1.2204456329345703, |
| "eval_runtime": 53.3077, |
| "eval_samples_per_second": 116.062, |
| "eval_steps_per_second": 3.639, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.8062418725617685, |
| "grad_norm": 1.1076003313064575, |
| "learning_rate": 9.687906371911575e-06, |
| "loss": 1.2779, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8322496749024707, |
| "grad_norm": 1.0912443399429321, |
| "learning_rate": 8.387516254876464e-06, |
| "loss": 1.2689, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8322496749024707, |
| "eval_accuracy": 0.6791140127480112, |
| "eval_loss": 1.2115237712860107, |
| "eval_runtime": 53.1926, |
| "eval_samples_per_second": 116.313, |
| "eval_steps_per_second": 3.647, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8582574772431729, |
| "grad_norm": 1.1728153228759766, |
| "learning_rate": 7.087126137841353e-06, |
| "loss": 1.2673, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8842652795838751, |
| "grad_norm": 1.1343005895614624, |
| "learning_rate": 5.786736020806242e-06, |
| "loss": 1.2625, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8842652795838751, |
| "eval_accuracy": 0.6804928379926947, |
| "eval_loss": 1.2050906419754028, |
| "eval_runtime": 53.314, |
| "eval_samples_per_second": 116.048, |
| "eval_steps_per_second": 3.639, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.9102730819245773, |
| "grad_norm": 1.148543119430542, |
| "learning_rate": 4.486345903771132e-06, |
| "loss": 1.2607, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9362808842652796, |
| "grad_norm": 1.121073603630066, |
| "learning_rate": 3.185955786736021e-06, |
| "loss": 1.2536, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9362808842652796, |
| "eval_accuracy": 0.6816515757427242, |
| "eval_loss": 1.1991944313049316, |
| "eval_runtime": 53.7355, |
| "eval_samples_per_second": 115.138, |
| "eval_steps_per_second": 3.61, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9622886866059818, |
| "grad_norm": 1.117564082145691, |
| "learning_rate": 1.8855656697009103e-06, |
| "loss": 1.2541, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.988296488946684, |
| "grad_norm": 1.1206711530685425, |
| "learning_rate": 5.851755526657998e-07, |
| "loss": 1.2535, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.988296488946684, |
| "eval_accuracy": 0.6824358645607154, |
| "eval_loss": 1.1952418088912964, |
| "eval_runtime": 53.4734, |
| "eval_samples_per_second": 115.702, |
| "eval_steps_per_second": 3.628, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19225, |
| "total_flos": 3.21490058084352e+17, |
| "train_loss": 1.5375442913635808, |
| "train_runtime": 7514.5868, |
| "train_samples_per_second": 81.867, |
| "train_steps_per_second": 2.558 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19225, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.21490058084352e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|