| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 1000, | |
| "global_step": 19042, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02625774603508035, | |
| "grad_norm": 1.9965242147445679, | |
| "learning_rate": 4.868711269824598e-05, | |
| "loss": 4.4037, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0525154920701607, | |
| "grad_norm": 1.66657555103302, | |
| "learning_rate": 4.737422539649197e-05, | |
| "loss": 2.9308, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0525154920701607, | |
| "eval_accuracy": 0.4407849232661833, | |
| "eval_loss": 2.475210666656494, | |
| "eval_runtime": 52.4616, | |
| "eval_samples_per_second": 116.809, | |
| "eval_steps_per_second": 3.66, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07877323810524105, | |
| "grad_norm": 1.709204912185669, | |
| "learning_rate": 4.606133809473795e-05, | |
| "loss": 2.3035, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1050309841403214, | |
| "grad_norm": 1.9202218055725098, | |
| "learning_rate": 4.4748450792983934e-05, | |
| "loss": 1.9919, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1050309841403214, | |
| "eval_accuracy": 0.5647940067737086, | |
| "eval_loss": 1.8136076927185059, | |
| "eval_runtime": 52.2754, | |
| "eval_samples_per_second": 117.225, | |
| "eval_steps_per_second": 3.673, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.13128873017540174, | |
| "grad_norm": 1.549422025680542, | |
| "learning_rate": 4.3435563491229914e-05, | |
| "loss": 1.8363, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1575464762104821, | |
| "grad_norm": 1.7258355617523193, | |
| "learning_rate": 4.2122676189475893e-05, | |
| "loss": 1.7406, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1575464762104821, | |
| "eval_accuracy": 0.5983668062755066, | |
| "eval_loss": 1.6234792470932007, | |
| "eval_runtime": 52.6459, | |
| "eval_samples_per_second": 116.4, | |
| "eval_steps_per_second": 3.647, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.18380422224556245, | |
| "grad_norm": 1.4436798095703125, | |
| "learning_rate": 4.080978888772188e-05, | |
| "loss": 1.6711, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.2100619682806428, | |
| "grad_norm": 1.3306225538253784, | |
| "learning_rate": 3.949690158596786e-05, | |
| "loss": 1.6185, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2100619682806428, | |
| "eval_accuracy": 0.6164799047495081, | |
| "eval_loss": 1.5258336067199707, | |
| "eval_runtime": 52.3533, | |
| "eval_samples_per_second": 117.051, | |
| "eval_steps_per_second": 3.667, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.23631971431572313, | |
| "grad_norm": 1.235955834388733, | |
| "learning_rate": 3.8184014284213846e-05, | |
| "loss": 1.5788, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.2625774603508035, | |
| "grad_norm": 1.1904724836349487, | |
| "learning_rate": 3.6871126982459825e-05, | |
| "loss": 1.5461, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2625774603508035, | |
| "eval_accuracy": 0.6282321233049777, | |
| "eval_loss": 1.4625129699707031, | |
| "eval_runtime": 52.375, | |
| "eval_samples_per_second": 117.002, | |
| "eval_steps_per_second": 3.666, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.28883520638588384, | |
| "grad_norm": 1.2717902660369873, | |
| "learning_rate": 3.555823968070581e-05, | |
| "loss": 1.5198, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.3150929524209642, | |
| "grad_norm": 1.1882010698318481, | |
| "learning_rate": 3.42453523789518e-05, | |
| "loss": 1.4955, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3150929524209642, | |
| "eval_accuracy": 0.6368267127605542, | |
| "eval_loss": 1.4170297384262085, | |
| "eval_runtime": 52.2927, | |
| "eval_samples_per_second": 117.187, | |
| "eval_steps_per_second": 3.672, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.34135069845604454, | |
| "grad_norm": 1.1391606330871582, | |
| "learning_rate": 3.293246507719778e-05, | |
| "loss": 1.4737, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.3676084444911249, | |
| "grad_norm": 1.325378179550171, | |
| "learning_rate": 3.161957777544376e-05, | |
| "loss": 1.4553, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.3676084444911249, | |
| "eval_accuracy": 0.643254111059215, | |
| "eval_loss": 1.3824151754379272, | |
| "eval_runtime": 52.618, | |
| "eval_samples_per_second": 116.462, | |
| "eval_steps_per_second": 3.649, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.39386619052620525, | |
| "grad_norm": 1.1828022003173828, | |
| "learning_rate": 3.030669047368974e-05, | |
| "loss": 1.4368, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.4201239365612856, | |
| "grad_norm": 1.1762062311172485, | |
| "learning_rate": 2.8993803171935723e-05, | |
| "loss": 1.4218, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.4201239365612856, | |
| "eval_accuracy": 0.6492348631603664, | |
| "eval_loss": 1.3531708717346191, | |
| "eval_runtime": 52.4914, | |
| "eval_samples_per_second": 116.743, | |
| "eval_steps_per_second": 3.658, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.4463816825963659, | |
| "grad_norm": 1.1989212036132812, | |
| "learning_rate": 2.7680915870181706e-05, | |
| "loss": 1.4113, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.47263942863144626, | |
| "grad_norm": 1.2929445505142212, | |
| "learning_rate": 2.636802856842769e-05, | |
| "loss": 1.3986, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.47263942863144626, | |
| "eval_accuracy": 0.6536616055271829, | |
| "eval_loss": 1.3304531574249268, | |
| "eval_runtime": 52.4713, | |
| "eval_samples_per_second": 116.788, | |
| "eval_steps_per_second": 3.659, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.4988971746665266, | |
| "grad_norm": 1.1339648962020874, | |
| "learning_rate": 2.5055141266673672e-05, | |
| "loss": 1.3867, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.525154920701607, | |
| "grad_norm": 1.1032936573028564, | |
| "learning_rate": 2.374225396491965e-05, | |
| "loss": 1.3722, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.525154920701607, | |
| "eval_accuracy": 0.6575196715746703, | |
| "eval_loss": 1.310016393661499, | |
| "eval_runtime": 52.3673, | |
| "eval_samples_per_second": 117.02, | |
| "eval_steps_per_second": 3.666, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5514126667366873, | |
| "grad_norm": 1.0910552740097046, | |
| "learning_rate": 2.2429366663165635e-05, | |
| "loss": 1.3665, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.5776704127717677, | |
| "grad_norm": 1.133002758026123, | |
| "learning_rate": 2.1116479361411618e-05, | |
| "loss": 1.3573, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5776704127717677, | |
| "eval_accuracy": 0.660813846797802, | |
| "eval_loss": 1.2933967113494873, | |
| "eval_runtime": 52.0286, | |
| "eval_samples_per_second": 117.781, | |
| "eval_steps_per_second": 3.69, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.603928158806848, | |
| "grad_norm": 1.1310782432556152, | |
| "learning_rate": 1.98035920596576e-05, | |
| "loss": 1.3486, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.6301859048419284, | |
| "grad_norm": 1.0979714393615723, | |
| "learning_rate": 1.8490704757903583e-05, | |
| "loss": 1.3448, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6301859048419284, | |
| "eval_accuracy": 0.6639449961588427, | |
| "eval_loss": 1.2785269021987915, | |
| "eval_runtime": 52.6733, | |
| "eval_samples_per_second": 116.34, | |
| "eval_steps_per_second": 3.645, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6564436508770087, | |
| "grad_norm": 1.1284676790237427, | |
| "learning_rate": 1.7177817456149563e-05, | |
| "loss": 1.3377, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.6827013969120891, | |
| "grad_norm": 1.0903546810150146, | |
| "learning_rate": 1.5864930154395546e-05, | |
| "loss": 1.3291, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.6827013969120891, | |
| "eval_accuracy": 0.6664656758777874, | |
| "eval_loss": 1.2657496929168701, | |
| "eval_runtime": 53.0667, | |
| "eval_samples_per_second": 115.477, | |
| "eval_steps_per_second": 3.618, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.7089591429471694, | |
| "grad_norm": 1.1001683473587036, | |
| "learning_rate": 1.4552042852641529e-05, | |
| "loss": 1.3225, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.7352168889822498, | |
| "grad_norm": 1.1466083526611328, | |
| "learning_rate": 1.3239155550887514e-05, | |
| "loss": 1.3174, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7352168889822498, | |
| "eval_accuracy": 0.6685889999974477, | |
| "eval_loss": 1.2551158666610718, | |
| "eval_runtime": 52.488, | |
| "eval_samples_per_second": 116.75, | |
| "eval_steps_per_second": 3.658, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7614746350173301, | |
| "grad_norm": 1.1432747840881348, | |
| "learning_rate": 1.1926268249133495e-05, | |
| "loss": 1.3109, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.7877323810524105, | |
| "grad_norm": 1.111801266670227, | |
| "learning_rate": 1.0613380947379476e-05, | |
| "loss": 1.3052, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.7877323810524105, | |
| "eval_accuracy": 0.670427906199194, | |
| "eval_loss": 1.2463113069534302, | |
| "eval_runtime": 51.9603, | |
| "eval_samples_per_second": 117.936, | |
| "eval_steps_per_second": 3.695, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.8139901270874909, | |
| "grad_norm": 1.1736992597579956, | |
| "learning_rate": 9.30049364562546e-06, | |
| "loss": 1.3038, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.8402478731225712, | |
| "grad_norm": 1.1293760538101196, | |
| "learning_rate": 7.987606343871442e-06, | |
| "loss": 1.2968, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.8402478731225712, | |
| "eval_accuracy": 0.6724532871884005, | |
| "eval_loss": 1.236586570739746, | |
| "eval_runtime": 52.6147, | |
| "eval_samples_per_second": 116.469, | |
| "eval_steps_per_second": 3.649, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.8665056191576515, | |
| "grad_norm": 1.1446099281311035, | |
| "learning_rate": 6.674719042117425e-06, | |
| "loss": 1.295, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.8927633651927318, | |
| "grad_norm": 1.1120586395263672, | |
| "learning_rate": 5.361831740363407e-06, | |
| "loss": 1.2856, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.8927633651927318, | |
| "eval_accuracy": 0.673523483380933, | |
| "eval_loss": 1.230779767036438, | |
| "eval_runtime": 52.5434, | |
| "eval_samples_per_second": 116.627, | |
| "eval_steps_per_second": 3.654, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.9190211112278122, | |
| "grad_norm": 1.148032546043396, | |
| "learning_rate": 4.04894443860939e-06, | |
| "loss": 1.2862, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.9452788572628925, | |
| "grad_norm": 1.116765022277832, | |
| "learning_rate": 2.7360571368553723e-06, | |
| "loss": 1.2817, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.9452788572628925, | |
| "eval_accuracy": 0.6749144672531769, | |
| "eval_loss": 1.2248890399932861, | |
| "eval_runtime": 52.4248, | |
| "eval_samples_per_second": 116.891, | |
| "eval_steps_per_second": 3.662, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.9715366032979729, | |
| "grad_norm": 1.1118154525756836, | |
| "learning_rate": 1.423169835101355e-06, | |
| "loss": 1.2816, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.9977943493330532, | |
| "grad_norm": 1.120948076248169, | |
| "learning_rate": 1.1028253334733746e-07, | |
| "loss": 1.2814, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.9977943493330532, | |
| "eval_accuracy": 0.6756562189740409, | |
| "eval_loss": 1.2216291427612305, | |
| "eval_runtime": 53.1168, | |
| "eval_samples_per_second": 115.368, | |
| "eval_steps_per_second": 3.615, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 19042, | |
| "total_flos": 3.18433463894016e+17, | |
| "train_loss": 1.5689714319777832, | |
| "train_runtime": 7397.2028, | |
| "train_samples_per_second": 82.375, | |
| "train_steps_per_second": 2.574 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 19042, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.18433463894016e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |