| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2293267752843529, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.06958419799804688, |
| "epoch": 0.06148170919151552, |
| "grad_norm": 0.12353515625, |
| "learning_rate": 0.0002, |
| "loss": 0.0684, |
| "mean_token_accuracy": 0.98111328125, |
| "num_tokens": 102400.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06148170919151552, |
| "eval_entropy": 0.06598355625215815, |
| "eval_loss": 0.09076400101184845, |
| "eval_mean_token_accuracy": 0.9780192506906077, |
| "eval_num_tokens": 102400.0, |
| "eval_runtime": 58.8573, |
| "eval_samples_per_second": 6.15, |
| "eval_steps_per_second": 6.15, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.05923065185546875, |
| "epoch": 0.12296341838303104, |
| "grad_norm": 0.140625, |
| "learning_rate": 0.0002, |
| "loss": 0.0597, |
| "mean_token_accuracy": 0.9837109375, |
| "num_tokens": 204739.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.12296341838303104, |
| "eval_entropy": 0.0683226190219268, |
| "eval_loss": 0.0902482345700264, |
| "eval_mean_token_accuracy": 0.9778681802486188, |
| "eval_num_tokens": 204739.0, |
| "eval_runtime": 45.3855, |
| "eval_samples_per_second": 7.976, |
| "eval_steps_per_second": 7.976, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.0568572998046875, |
| "epoch": 0.18444512757454656, |
| "grad_norm": 0.1689453125, |
| "learning_rate": 0.0002, |
| "loss": 0.0531, |
| "mean_token_accuracy": 0.98462890625, |
| "num_tokens": 307139.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.18444512757454656, |
| "eval_entropy": 0.06631065073592887, |
| "eval_loss": 0.09246724843978882, |
| "eval_mean_token_accuracy": 0.9779005524861878, |
| "eval_num_tokens": 307139.0, |
| "eval_runtime": 45.6788, |
| "eval_samples_per_second": 7.925, |
| "eval_steps_per_second": 7.925, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.06324066162109375, |
| "epoch": 0.24592683676606208, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.0002, |
| "loss": 0.0581, |
| "mean_token_accuracy": 0.98373046875, |
| "num_tokens": 409539.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.24592683676606208, |
| "eval_entropy": 0.07637689917127072, |
| "eval_loss": 0.08831820636987686, |
| "eval_mean_token_accuracy": 0.9779868784530387, |
| "eval_num_tokens": 409539.0, |
| "eval_runtime": 45.2583, |
| "eval_samples_per_second": 7.999, |
| "eval_steps_per_second": 7.999, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.06784759521484375, |
| "epoch": 0.3074085459575776, |
| "grad_norm": 0.1826171875, |
| "learning_rate": 0.0002, |
| "loss": 0.0584, |
| "mean_token_accuracy": 0.98369140625, |
| "num_tokens": 511939.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3074085459575776, |
| "eval_entropy": 0.07653134172133978, |
| "eval_loss": 0.08888135105371475, |
| "eval_mean_token_accuracy": 0.9780300414364641, |
| "eval_num_tokens": 511939.0, |
| "eval_runtime": 45.6823, |
| "eval_samples_per_second": 7.924, |
| "eval_steps_per_second": 7.924, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.065264892578125, |
| "epoch": 0.3688902551490931, |
| "grad_norm": 0.193359375, |
| "learning_rate": 0.0002, |
| "loss": 0.0516, |
| "mean_token_accuracy": 0.98486328125, |
| "num_tokens": 614339.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3688902551490931, |
| "eval_entropy": 0.07568089606353591, |
| "eval_loss": 0.09014976769685745, |
| "eval_mean_token_accuracy": 0.9778897617403315, |
| "eval_num_tokens": 614339.0, |
| "eval_runtime": 45.9066, |
| "eval_samples_per_second": 7.886, |
| "eval_steps_per_second": 7.886, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.0712152099609375, |
| "epoch": 0.43037196434060865, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.0002, |
| "loss": 0.0584, |
| "mean_token_accuracy": 0.983046875, |
| "num_tokens": 716739.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.43037196434060865, |
| "eval_entropy": 0.07403969106094613, |
| "eval_loss": 0.09142003953456879, |
| "eval_mean_token_accuracy": 0.977965296961326, |
| "eval_num_tokens": 716739.0, |
| "eval_runtime": 45.4178, |
| "eval_samples_per_second": 7.97, |
| "eval_steps_per_second": 7.97, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.0648382568359375, |
| "epoch": 0.49185367353212417, |
| "grad_norm": 0.162109375, |
| "learning_rate": 0.0002, |
| "loss": 0.0512, |
| "mean_token_accuracy": 0.9851953125, |
| "num_tokens": 819139.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.49185367353212417, |
| "eval_entropy": 0.07564245403142265, |
| "eval_loss": 0.08956116437911987, |
| "eval_mean_token_accuracy": 0.977846598756906, |
| "eval_num_tokens": 819139.0, |
| "eval_runtime": 45.1314, |
| "eval_samples_per_second": 8.021, |
| "eval_steps_per_second": 8.021, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.0652081298828125, |
| "epoch": 0.5533353827236397, |
| "grad_norm": 0.1474609375, |
| "learning_rate": 0.0002, |
| "loss": 0.0503, |
| "mean_token_accuracy": 0.98572265625, |
| "num_tokens": 921539.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5533353827236397, |
| "eval_entropy": 0.07643860874913674, |
| "eval_loss": 0.09081660211086273, |
| "eval_mean_token_accuracy": 0.977781854281768, |
| "eval_num_tokens": 921539.0, |
| "eval_runtime": 45.2589, |
| "eval_samples_per_second": 7.998, |
| "eval_steps_per_second": 7.998, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.0689361572265625, |
| "epoch": 0.6148170919151552, |
| "grad_norm": 0.1279296875, |
| "learning_rate": 0.0002, |
| "loss": 0.0547, |
| "mean_token_accuracy": 0.98447265625, |
| "num_tokens": 1023939.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6148170919151552, |
| "eval_entropy": 0.07796246439053868, |
| "eval_loss": 0.09036962687969208, |
| "eval_mean_token_accuracy": 0.9782674378453039, |
| "eval_num_tokens": 1023939.0, |
| "eval_runtime": 45.7559, |
| "eval_samples_per_second": 7.912, |
| "eval_steps_per_second": 7.912, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.07148681640625, |
| "epoch": 0.6762988011066707, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 0.0002, |
| "loss": 0.0556, |
| "mean_token_accuracy": 0.98369140625, |
| "num_tokens": 1126339.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6762988011066707, |
| "eval_entropy": 0.07977396085117404, |
| "eval_loss": 0.0895102471113205, |
| "eval_mean_token_accuracy": 0.9782026933701657, |
| "eval_num_tokens": 1126339.0, |
| "eval_runtime": 45.3177, |
| "eval_samples_per_second": 7.988, |
| "eval_steps_per_second": 7.988, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.0679693603515625, |
| "epoch": 0.7377805102981863, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.0002, |
| "loss": 0.0534, |
| "mean_token_accuracy": 0.98482421875, |
| "num_tokens": 1228739.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7377805102981863, |
| "eval_entropy": 0.07723847278573895, |
| "eval_loss": 0.09167025238275528, |
| "eval_mean_token_accuracy": 0.9781595303867403, |
| "eval_num_tokens": 1228739.0, |
| "eval_runtime": 45.0604, |
| "eval_samples_per_second": 8.034, |
| "eval_steps_per_second": 8.034, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.0679254150390625, |
| "epoch": 0.7992622194897018, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.0002, |
| "loss": 0.0537, |
| "mean_token_accuracy": 0.98462890625, |
| "num_tokens": 1331139.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7992622194897018, |
| "eval_entropy": 0.07550386038933012, |
| "eval_loss": 0.09077057242393494, |
| "eval_mean_token_accuracy": 0.9782026933701657, |
| "eval_num_tokens": 1331139.0, |
| "eval_runtime": 45.1835, |
| "eval_samples_per_second": 8.012, |
| "eval_steps_per_second": 8.012, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.0667388916015625, |
| "epoch": 0.8607439286812173, |
| "grad_norm": 0.166015625, |
| "learning_rate": 0.0002, |
| "loss": 0.0497, |
| "mean_token_accuracy": 0.98517578125, |
| "num_tokens": 1433475.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8607439286812173, |
| "eval_entropy": 0.07443018117662292, |
| "eval_loss": 0.0910056084394455, |
| "eval_mean_token_accuracy": 0.9782026933701657, |
| "eval_num_tokens": 1433475.0, |
| "eval_runtime": 46.1219, |
| "eval_samples_per_second": 7.849, |
| "eval_steps_per_second": 7.849, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.065689697265625, |
| "epoch": 0.9222256378727328, |
| "grad_norm": 0.130859375, |
| "learning_rate": 0.0002, |
| "loss": 0.0491, |
| "mean_token_accuracy": 0.985703125, |
| "num_tokens": 1535875.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9222256378727328, |
| "eval_entropy": 0.07454584448377072, |
| "eval_loss": 0.09148883074522018, |
| "eval_mean_token_accuracy": 0.9779976691988951, |
| "eval_num_tokens": 1535875.0, |
| "eval_runtime": 45.4996, |
| "eval_samples_per_second": 7.956, |
| "eval_steps_per_second": 7.956, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.06522216796875, |
| "epoch": 0.9837073470642483, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.0002, |
| "loss": 0.0503, |
| "mean_token_accuracy": 0.98533203125, |
| "num_tokens": 1638213.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9837073470642483, |
| "eval_entropy": 0.08157905030645718, |
| "eval_loss": 0.08965592086315155, |
| "eval_mean_token_accuracy": 0.9780300414364641, |
| "eval_num_tokens": 1638213.0, |
| "eval_runtime": 46.8838, |
| "eval_samples_per_second": 7.721, |
| "eval_steps_per_second": 7.721, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.0635839107647613, |
| "epoch": 1.0448816477098064, |
| "grad_norm": 0.166015625, |
| "learning_rate": 0.0002, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9866323806532663, |
| "num_tokens": 1740101.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0448816477098064, |
| "eval_entropy": 0.07249357950621547, |
| "eval_loss": 0.09257431328296661, |
| "eval_mean_token_accuracy": 0.9786019509668509, |
| "eval_num_tokens": 1740101.0, |
| "eval_runtime": 46.53, |
| "eval_samples_per_second": 7.78, |
| "eval_steps_per_second": 7.78, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.0614752197265625, |
| "epoch": 1.1063633569013218, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.0002, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.98642578125, |
| "num_tokens": 1842501.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.1063633569013218, |
| "eval_entropy": 0.06955445010359115, |
| "eval_loss": 0.09410356730222702, |
| "eval_mean_token_accuracy": 0.9780732044198895, |
| "eval_num_tokens": 1842501.0, |
| "eval_runtime": 45.9671, |
| "eval_samples_per_second": 7.875, |
| "eval_steps_per_second": 7.875, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.059293212890625, |
| "epoch": 1.1678450660928374, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.0002, |
| "loss": 0.0437, |
| "mean_token_accuracy": 0.986953125, |
| "num_tokens": 1944901.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.1678450660928374, |
| "eval_entropy": 0.0688324817636395, |
| "eval_loss": 0.09622900187969208, |
| "eval_mean_token_accuracy": 0.9781379488950276, |
| "eval_num_tokens": 1944901.0, |
| "eval_runtime": 46.9487, |
| "eval_samples_per_second": 7.711, |
| "eval_steps_per_second": 7.711, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.063258056640625, |
| "epoch": 1.2293267752843529, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.0002, |
| "loss": 0.0472, |
| "mean_token_accuracy": 0.9858984375, |
| "num_tokens": 2047301.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.2293267752843529, |
| "eval_entropy": 0.06529750086325967, |
| "eval_loss": 0.09772183746099472, |
| "eval_mean_token_accuracy": 0.9782458563535912, |
| "eval_num_tokens": 2047301.0, |
| "eval_runtime": 46.937, |
| "eval_samples_per_second": 7.712, |
| "eval_steps_per_second": 7.712, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 3254, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2602922377362944e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|