| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19818, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02522958926228681, |
| "grad_norm": 1.6351510286331177, |
| "learning_rate": 4.873852053688566e-05, |
| "loss": 4.2934, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05045917852457362, |
| "grad_norm": 2.1481850147247314, |
| "learning_rate": 4.747704107377132e-05, |
| "loss": 2.8305, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05045917852457362, |
| "eval_accuracy": 0.46500522497769375, |
| "eval_loss": 2.379011869430542, |
| "eval_runtime": 56.6734, |
| "eval_samples_per_second": 112.469, |
| "eval_steps_per_second": 3.529, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07568876778686043, |
| "grad_norm": 1.5696136951446533, |
| "learning_rate": 4.6215561610656984e-05, |
| "loss": 2.2245, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10091835704914724, |
| "grad_norm": 1.5901012420654297, |
| "learning_rate": 4.4954082147542644e-05, |
| "loss": 1.9272, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10091835704914724, |
| "eval_accuracy": 0.5808557246708203, |
| "eval_loss": 1.750130534172058, |
| "eval_runtime": 55.5938, |
| "eval_samples_per_second": 114.653, |
| "eval_steps_per_second": 3.598, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12614794631143406, |
| "grad_norm": 1.5128012895584106, |
| "learning_rate": 4.3692602684428305e-05, |
| "loss": 1.7748, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15137753557372086, |
| "grad_norm": 1.318622350692749, |
| "learning_rate": 4.243112322131396e-05, |
| "loss": 1.6793, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15137753557372086, |
| "eval_accuracy": 0.6136108291841765, |
| "eval_loss": 1.568946123123169, |
| "eval_runtime": 54.4039, |
| "eval_samples_per_second": 117.161, |
| "eval_steps_per_second": 3.676, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.17660712483600766, |
| "grad_norm": 1.4798802137374878, |
| "learning_rate": 4.116964375819962e-05, |
| "loss": 1.613, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20183671409829448, |
| "grad_norm": 1.2613641023635864, |
| "learning_rate": 3.990816429508528e-05, |
| "loss": 1.5605, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20183671409829448, |
| "eval_accuracy": 0.6316407595495017, |
| "eval_loss": 1.4698580503463745, |
| "eval_runtime": 55.0048, |
| "eval_samples_per_second": 115.881, |
| "eval_steps_per_second": 3.636, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.22706630336058128, |
| "grad_norm": 1.1746481657028198, |
| "learning_rate": 3.864668483197093e-05, |
| "loss": 1.5228, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2522958926228681, |
| "grad_norm": 1.2291498184204102, |
| "learning_rate": 3.738520536885659e-05, |
| "loss": 1.4891, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2522958926228681, |
| "eval_accuracy": 0.642200060669245, |
| "eval_loss": 1.4112207889556885, |
| "eval_runtime": 54.5565, |
| "eval_samples_per_second": 116.833, |
| "eval_steps_per_second": 3.666, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2775254818851549, |
| "grad_norm": 1.257660150527954, |
| "learning_rate": 3.6123725905742254e-05, |
| "loss": 1.4607, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.3027550711474417, |
| "grad_norm": 1.1654913425445557, |
| "learning_rate": 3.4862246442627914e-05, |
| "loss": 1.4391, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3027550711474417, |
| "eval_accuracy": 0.6513591843207115, |
| "eval_loss": 1.364630103111267, |
| "eval_runtime": 54.7084, |
| "eval_samples_per_second": 116.509, |
| "eval_steps_per_second": 3.656, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3279846604097285, |
| "grad_norm": 1.112001657485962, |
| "learning_rate": 3.3600766979513575e-05, |
| "loss": 1.4193, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3532142496720153, |
| "grad_norm": 1.1872960329055786, |
| "learning_rate": 3.2339287516399235e-05, |
| "loss": 1.3995, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3532142496720153, |
| "eval_accuracy": 0.6575222655822269, |
| "eval_loss": 1.3316864967346191, |
| "eval_runtime": 54.512, |
| "eval_samples_per_second": 116.928, |
| "eval_steps_per_second": 3.669, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.37844383893430217, |
| "grad_norm": 1.129939079284668, |
| "learning_rate": 3.1077808053284896e-05, |
| "loss": 1.3852, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.40367342819658897, |
| "grad_norm": 1.1093947887420654, |
| "learning_rate": 2.9816328590170556e-05, |
| "loss": 1.3707, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.40367342819658897, |
| "eval_accuracy": 0.663146746266679, |
| "eval_loss": 1.3020933866500854, |
| "eval_runtime": 54.3588, |
| "eval_samples_per_second": 117.258, |
| "eval_steps_per_second": 3.679, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.42890301745887577, |
| "grad_norm": 1.1245774030685425, |
| "learning_rate": 2.855484912705621e-05, |
| "loss": 1.3574, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.45413260672116257, |
| "grad_norm": 1.1140567064285278, |
| "learning_rate": 2.729336966394187e-05, |
| "loss": 1.3424, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.45413260672116257, |
| "eval_accuracy": 0.6674909770600935, |
| "eval_loss": 1.2805943489074707, |
| "eval_runtime": 54.0957, |
| "eval_samples_per_second": 117.828, |
| "eval_steps_per_second": 3.697, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.47936219598344937, |
| "grad_norm": 1.1032965183258057, |
| "learning_rate": 2.603189020082753e-05, |
| "loss": 1.3331, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5045917852457362, |
| "grad_norm": 1.1406043767929077, |
| "learning_rate": 2.477041073771319e-05, |
| "loss": 1.3242, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5045917852457362, |
| "eval_accuracy": 0.6713588714661621, |
| "eval_loss": 1.2613232135772705, |
| "eval_runtime": 54.8004, |
| "eval_samples_per_second": 116.313, |
| "eval_steps_per_second": 3.65, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.529821374508023, |
| "grad_norm": 1.0637890100479126, |
| "learning_rate": 2.350893127459885e-05, |
| "loss": 1.316, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5550509637703098, |
| "grad_norm": 1.0851575136184692, |
| "learning_rate": 2.224745181148451e-05, |
| "loss": 1.3058, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5550509637703098, |
| "eval_accuracy": 0.6747553370072272, |
| "eval_loss": 1.2435107231140137, |
| "eval_runtime": 54.5981, |
| "eval_samples_per_second": 116.744, |
| "eval_steps_per_second": 3.663, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5802805530325966, |
| "grad_norm": 1.1059494018554688, |
| "learning_rate": 2.098597234837017e-05, |
| "loss": 1.298, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6055101422948834, |
| "grad_norm": 1.1078968048095703, |
| "learning_rate": 1.972449288525583e-05, |
| "loss": 1.2888, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6055101422948834, |
| "eval_accuracy": 0.6777041444946341, |
| "eval_loss": 1.229060173034668, |
| "eval_runtime": 55.4356, |
| "eval_samples_per_second": 114.98, |
| "eval_steps_per_second": 3.608, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6307397315571702, |
| "grad_norm": 1.1068435907363892, |
| "learning_rate": 1.846301342214149e-05, |
| "loss": 1.2855, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.655969320819457, |
| "grad_norm": 1.0919195413589478, |
| "learning_rate": 1.7201533959027147e-05, |
| "loss": 1.2748, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.655969320819457, |
| "eval_accuracy": 0.6801306075727364, |
| "eval_loss": 1.2177754640579224, |
| "eval_runtime": 55.3404, |
| "eval_samples_per_second": 115.178, |
| "eval_steps_per_second": 3.614, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6811989100817438, |
| "grad_norm": 1.0631417036056519, |
| "learning_rate": 1.5940054495912807e-05, |
| "loss": 1.2717, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7064284993440306, |
| "grad_norm": 1.1203536987304688, |
| "learning_rate": 1.4678575032798466e-05, |
| "loss": 1.2654, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7064284993440306, |
| "eval_accuracy": 0.6820543563309032, |
| "eval_loss": 1.2076771259307861, |
| "eval_runtime": 54.4302, |
| "eval_samples_per_second": 117.104, |
| "eval_steps_per_second": 3.674, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7316580886063175, |
| "grad_norm": 1.0601987838745117, |
| "learning_rate": 1.3417095569684127e-05, |
| "loss": 1.2592, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7568876778686043, |
| "grad_norm": 1.0845381021499634, |
| "learning_rate": 1.2155616106569785e-05, |
| "loss": 1.2549, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7568876778686043, |
| "eval_accuracy": 0.6842664220266779, |
| "eval_loss": 1.196437120437622, |
| "eval_runtime": 54.7305, |
| "eval_samples_per_second": 116.462, |
| "eval_steps_per_second": 3.654, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7821172671308911, |
| "grad_norm": 1.1175463199615479, |
| "learning_rate": 1.0894136643455446e-05, |
| "loss": 1.2499, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8073468563931779, |
| "grad_norm": 1.1143847703933716, |
| "learning_rate": 9.632657180341105e-06, |
| "loss": 1.2459, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8073468563931779, |
| "eval_accuracy": 0.6860334367900387, |
| "eval_loss": 1.1877895593643188, |
| "eval_runtime": 54.8073, |
| "eval_samples_per_second": 116.298, |
| "eval_steps_per_second": 3.649, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8325764456554647, |
| "grad_norm": 1.0714515447616577, |
| "learning_rate": 8.371177717226763e-06, |
| "loss": 1.2401, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8578060349177515, |
| "grad_norm": 1.0937442779541016, |
| "learning_rate": 7.109698254112424e-06, |
| "loss": 1.2385, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8578060349177515, |
| "eval_accuracy": 0.6873077056382217, |
| "eval_loss": 1.1818801164627075, |
| "eval_runtime": 54.4299, |
| "eval_samples_per_second": 117.105, |
| "eval_steps_per_second": 3.674, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8830356241800383, |
| "grad_norm": 1.125985026359558, |
| "learning_rate": 5.848218790998083e-06, |
| "loss": 1.2335, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9082652134423251, |
| "grad_norm": 1.0988380908966064, |
| "learning_rate": 4.586739327883742e-06, |
| "loss": 1.2286, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9082652134423251, |
| "eval_accuracy": 0.6885514558318389, |
| "eval_loss": 1.1755918264389038, |
| "eval_runtime": 54.168, |
| "eval_samples_per_second": 117.671, |
| "eval_steps_per_second": 3.692, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9334948027046119, |
| "grad_norm": 1.0741068124771118, |
| "learning_rate": 3.325259864769402e-06, |
| "loss": 1.2284, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9587243919668987, |
| "grad_norm": 1.1670308113098145, |
| "learning_rate": 2.063780401655061e-06, |
| "loss": 1.2246, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9587243919668987, |
| "eval_accuracy": 0.6896297305064778, |
| "eval_loss": 1.1708202362060547, |
| "eval_runtime": 54.2124, |
| "eval_samples_per_second": 117.575, |
| "eval_steps_per_second": 3.689, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9839539812291855, |
| "grad_norm": 1.1070556640625, |
| "learning_rate": 8.023009385407206e-07, |
| "loss": 1.2213, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19818, |
| "total_flos": 3.31395116433408e+17, |
| "train_loss": 1.5000046812714452, |
| "train_runtime": 7707.6508, |
| "train_samples_per_second": 82.275, |
| "train_steps_per_second": 2.571 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19818, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.31395116433408e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|