{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.995667244367418,
  "eval_steps": 500,
  "global_step": 864,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03466204506065858,
      "grad_norm": 1.8127493746878947,
      "learning_rate": 5e-06,
      "loss": 1.0247,
      "step": 10
    },
    {
      "epoch": 0.06932409012131716,
      "grad_norm": 1.892292516893664,
      "learning_rate": 5e-06,
      "loss": 0.9061,
      "step": 20
    },
    {
      "epoch": 0.10398613518197573,
      "grad_norm": 2.439531384426821,
      "learning_rate": 5e-06,
      "loss": 0.8729,
      "step": 30
    },
    {
      "epoch": 0.1386481802426343,
      "grad_norm": 1.0105961240246888,
      "learning_rate": 5e-06,
      "loss": 0.8422,
      "step": 40
    },
    {
      "epoch": 0.1733102253032929,
      "grad_norm": 1.0541427004735089,
      "learning_rate": 5e-06,
      "loss": 0.8226,
      "step": 50
    },
    {
      "epoch": 0.20797227036395147,
      "grad_norm": 1.185839497666169,
      "learning_rate": 5e-06,
      "loss": 0.8035,
      "step": 60
    },
    {
      "epoch": 0.24263431542461006,
      "grad_norm": 1.1025425925379357,
      "learning_rate": 5e-06,
      "loss": 0.7971,
      "step": 70
    },
    {
      "epoch": 0.2772963604852686,
      "grad_norm": 1.0489992399782422,
      "learning_rate": 5e-06,
      "loss": 0.7891,
      "step": 80
    },
    {
      "epoch": 0.3119584055459272,
      "grad_norm": 0.668603535180152,
      "learning_rate": 5e-06,
      "loss": 0.7823,
      "step": 90
    },
    {
      "epoch": 0.3466204506065858,
      "grad_norm": 0.569376657027902,
      "learning_rate": 5e-06,
      "loss": 0.7697,
      "step": 100
    },
    {
      "epoch": 0.38128249566724437,
      "grad_norm": 0.6909448594076529,
      "learning_rate": 5e-06,
      "loss": 0.7641,
      "step": 110
    },
    {
      "epoch": 0.41594454072790293,
      "grad_norm": 0.6532682299793838,
      "learning_rate": 5e-06,
      "loss": 0.7695,
      "step": 120
    },
    {
      "epoch": 0.4506065857885615,
      "grad_norm": 0.9741442291789676,
      "learning_rate": 5e-06,
      "loss": 0.7702,
      "step": 130
    },
    {
      "epoch": 0.4852686308492201,
      "grad_norm": 0.8963697523822133,
      "learning_rate": 5e-06,
      "loss": 0.7635,
      "step": 140
    },
    {
      "epoch": 0.5199306759098787,
      "grad_norm": 0.7797154633763044,
      "learning_rate": 5e-06,
      "loss": 0.7577,
      "step": 150
    },
    {
      "epoch": 0.5545927209705372,
      "grad_norm": 0.8410605236589601,
      "learning_rate": 5e-06,
      "loss": 0.7597,
      "step": 160
    },
    {
      "epoch": 0.5892547660311959,
      "grad_norm": 0.7051595274843617,
      "learning_rate": 5e-06,
      "loss": 0.752,
      "step": 170
    },
    {
      "epoch": 0.6239168110918544,
      "grad_norm": 0.6800181939208395,
      "learning_rate": 5e-06,
      "loss": 0.7527,
      "step": 180
    },
    {
      "epoch": 0.658578856152513,
      "grad_norm": 0.7986625471943152,
      "learning_rate": 5e-06,
      "loss": 0.7491,
      "step": 190
    },
    {
      "epoch": 0.6932409012131716,
      "grad_norm": 0.8468221058427845,
      "learning_rate": 5e-06,
      "loss": 0.7471,
      "step": 200
    },
    {
      "epoch": 0.7279029462738301,
      "grad_norm": 0.7527636890957969,
      "learning_rate": 5e-06,
      "loss": 0.7488,
      "step": 210
    },
    {
      "epoch": 0.7625649913344887,
      "grad_norm": 0.672904711451661,
      "learning_rate": 5e-06,
      "loss": 0.744,
      "step": 220
    },
    {
      "epoch": 0.7972270363951474,
      "grad_norm": 0.9298264839873263,
      "learning_rate": 5e-06,
      "loss": 0.7438,
      "step": 230
    },
    {
      "epoch": 0.8318890814558059,
      "grad_norm": 0.6925885250176548,
      "learning_rate": 5e-06,
      "loss": 0.7402,
      "step": 240
    },
    {
      "epoch": 0.8665511265164645,
      "grad_norm": 0.6976668007067893,
      "learning_rate": 5e-06,
      "loss": 0.7449,
      "step": 250
    },
    {
      "epoch": 0.901213171577123,
      "grad_norm": 0.7134513511376641,
      "learning_rate": 5e-06,
      "loss": 0.7378,
      "step": 260
    },
    {
      "epoch": 0.9358752166377816,
      "grad_norm": 0.5758590804698668,
      "learning_rate": 5e-06,
      "loss": 0.7439,
      "step": 270
    },
    {
      "epoch": 0.9705372616984402,
      "grad_norm": 0.7076061848472048,
      "learning_rate": 5e-06,
      "loss": 0.7382,
      "step": 280
    },
    {
      "epoch": 0.9982668977469671,
      "eval_loss": 0.7344536185264587,
      "eval_runtime": 308.2237,
      "eval_samples_per_second": 25.215,
      "eval_steps_per_second": 0.396,
      "step": 288
    },
    {
      "epoch": 1.005632582322357,
      "grad_norm": 0.7185681802232957,
      "learning_rate": 5e-06,
      "loss": 0.779,
      "step": 290
    },
    {
      "epoch": 1.0402946273830156,
      "grad_norm": 0.9393905325717241,
      "learning_rate": 5e-06,
      "loss": 0.6889,
      "step": 300
    },
    {
      "epoch": 1.074956672443674,
      "grad_norm": 0.8787089784301063,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 310
    },
    {
      "epoch": 1.1096187175043328,
      "grad_norm": 0.7560092649402328,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 320
    },
    {
      "epoch": 1.1442807625649913,
      "grad_norm": 0.6643286211815734,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 330
    },
    {
      "epoch": 1.1789428076256498,
      "grad_norm": 0.7127668776455044,
      "learning_rate": 5e-06,
      "loss": 0.684,
      "step": 340
    },
    {
      "epoch": 1.2136048526863086,
      "grad_norm": 0.655292316893117,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 350
    },
    {
      "epoch": 1.248266897746967,
      "grad_norm": 0.8839088016848645,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 360
    },
    {
      "epoch": 1.2829289428076256,
      "grad_norm": 0.624864756502428,
      "learning_rate": 5e-06,
      "loss": 0.6819,
      "step": 370
    },
    {
      "epoch": 1.317590987868284,
      "grad_norm": 0.7439571552243042,
      "learning_rate": 5e-06,
      "loss": 0.6851,
      "step": 380
    },
    {
      "epoch": 1.3522530329289428,
      "grad_norm": 0.5854034874524795,
      "learning_rate": 5e-06,
      "loss": 0.6868,
      "step": 390
    },
    {
      "epoch": 1.3869150779896013,
      "grad_norm": 0.6734106560005542,
      "learning_rate": 5e-06,
      "loss": 0.6834,
      "step": 400
    },
    {
      "epoch": 1.4215771230502598,
      "grad_norm": 0.6926581209135775,
      "learning_rate": 5e-06,
      "loss": 0.6832,
      "step": 410
    },
    {
      "epoch": 1.4562391681109186,
      "grad_norm": 1.1324386970749247,
      "learning_rate": 5e-06,
      "loss": 0.6842,
      "step": 420
    },
    {
      "epoch": 1.490901213171577,
      "grad_norm": 0.7226777314119034,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 430
    },
    {
      "epoch": 1.5255632582322356,
      "grad_norm": 0.7481904787146205,
      "learning_rate": 5e-06,
      "loss": 0.6791,
      "step": 440
    },
    {
      "epoch": 1.5602253032928943,
      "grad_norm": 0.6135505957665759,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 450
    },
    {
      "epoch": 1.5948873483535528,
      "grad_norm": 0.7553340277380959,
      "learning_rate": 5e-06,
      "loss": 0.684,
      "step": 460
    },
    {
      "epoch": 1.6295493934142113,
      "grad_norm": 0.7233556793224363,
      "learning_rate": 5e-06,
      "loss": 0.681,
      "step": 470
    },
    {
      "epoch": 1.66421143847487,
      "grad_norm": 0.5547213886367687,
      "learning_rate": 5e-06,
      "loss": 0.6806,
      "step": 480
    },
    {
      "epoch": 1.6988734835355286,
      "grad_norm": 0.6625866861543885,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 490
    },
    {
      "epoch": 1.733535528596187,
      "grad_norm": 0.8682937684926717,
      "learning_rate": 5e-06,
      "loss": 0.6789,
      "step": 500
    },
    {
      "epoch": 1.7681975736568458,
      "grad_norm": 0.6685275937902929,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 510
    },
    {
      "epoch": 1.8028596187175043,
      "grad_norm": 1.0295956431263236,
      "learning_rate": 5e-06,
      "loss": 0.6825,
      "step": 520
    },
    {
      "epoch": 1.8375216637781628,
      "grad_norm": 0.784814610980589,
      "learning_rate": 5e-06,
      "loss": 0.6769,
      "step": 530
    },
    {
      "epoch": 1.8721837088388216,
      "grad_norm": 0.7570247170470147,
      "learning_rate": 5e-06,
      "loss": 0.6782,
      "step": 540
    },
    {
      "epoch": 1.90684575389948,
      "grad_norm": 0.5807065830422653,
      "learning_rate": 5e-06,
      "loss": 0.6846,
      "step": 550
    },
    {
      "epoch": 1.9415077989601386,
      "grad_norm": 0.6301636959503909,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 560
    },
    {
      "epoch": 1.9761698440207973,
      "grad_norm": 0.6686036844283325,
      "learning_rate": 5e-06,
      "loss": 0.6785,
      "step": 570
    },
    {
      "epoch": 1.9969670710571923,
      "eval_loss": 0.7195846438407898,
      "eval_runtime": 306.775,
      "eval_samples_per_second": 25.335,
      "eval_steps_per_second": 0.398,
      "step": 576
    },
    {
      "epoch": 2.011265164644714,
      "grad_norm": 1.229519224308769,
      "learning_rate": 5e-06,
      "loss": 0.7064,
      "step": 580
    },
    {
      "epoch": 2.0459272097053725,
      "grad_norm": 0.9032143353484438,
      "learning_rate": 5e-06,
      "loss": 0.6269,
      "step": 590
    },
    {
      "epoch": 2.080589254766031,
      "grad_norm": 0.7420064693943627,
      "learning_rate": 5e-06,
      "loss": 0.6204,
      "step": 600
    },
    {
      "epoch": 2.11525129982669,
      "grad_norm": 1.2914353849457911,
      "learning_rate": 5e-06,
      "loss": 0.6251,
      "step": 610
    },
    {
      "epoch": 2.149913344887348,
      "grad_norm": 0.7778946515270286,
      "learning_rate": 5e-06,
      "loss": 0.6274,
      "step": 620
    },
    {
      "epoch": 2.184575389948007,
      "grad_norm": 0.7043162671127772,
      "learning_rate": 5e-06,
      "loss": 0.6267,
      "step": 630
    },
    {
      "epoch": 2.2192374350086657,
      "grad_norm": 0.5973234209878199,
      "learning_rate": 5e-06,
      "loss": 0.6284,
      "step": 640
    },
    {
      "epoch": 2.253899480069324,
      "grad_norm": 0.666095113544406,
      "learning_rate": 5e-06,
      "loss": 0.6334,
      "step": 650
    },
    {
      "epoch": 2.2885615251299827,
      "grad_norm": 0.6767024363263829,
      "learning_rate": 5e-06,
      "loss": 0.6292,
      "step": 660
    },
    {
      "epoch": 2.3232235701906414,
      "grad_norm": 0.5737190679416464,
      "learning_rate": 5e-06,
      "loss": 0.6299,
      "step": 670
    },
    {
      "epoch": 2.3578856152512997,
      "grad_norm": 0.5600750074108755,
      "learning_rate": 5e-06,
      "loss": 0.6342,
      "step": 680
    },
    {
      "epoch": 2.3925476603119584,
      "grad_norm": 0.5910347547974553,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 690
    },
    {
      "epoch": 2.427209705372617,
      "grad_norm": 0.6226740928701757,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 700
    },
    {
      "epoch": 2.4618717504332754,
      "grad_norm": 0.6210136062823411,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 710
    },
    {
      "epoch": 2.496533795493934,
      "grad_norm": 0.5748749993993215,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 720
    },
    {
      "epoch": 2.5311958405545925,
      "grad_norm": 0.6967001634339309,
      "learning_rate": 5e-06,
      "loss": 0.6362,
      "step": 730
    },
    {
      "epoch": 2.565857885615251,
      "grad_norm": 0.6258079849864094,
      "learning_rate": 5e-06,
      "loss": 0.6303,
      "step": 740
    },
    {
      "epoch": 2.60051993067591,
      "grad_norm": 0.6125604920957239,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 750
    },
    {
      "epoch": 2.635181975736568,
      "grad_norm": 0.5972379433259742,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 760
    },
    {
      "epoch": 2.669844020797227,
      "grad_norm": 0.6758633252723798,
      "learning_rate": 5e-06,
      "loss": 0.6326,
      "step": 770
    },
    {
      "epoch": 2.7045060658578857,
      "grad_norm": 0.6607811157555928,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 780
    },
    {
      "epoch": 2.739168110918544,
      "grad_norm": 0.7251327172929152,
      "learning_rate": 5e-06,
      "loss": 0.6253,
      "step": 790
    },
    {
      "epoch": 2.7738301559792027,
      "grad_norm": 0.5734616475373774,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 800
    },
    {
      "epoch": 2.8084922010398614,
      "grad_norm": 0.5940604342669007,
      "learning_rate": 5e-06,
      "loss": 0.6346,
      "step": 810
    },
    {
      "epoch": 2.8431542461005197,
      "grad_norm": 0.6989887403612659,
      "learning_rate": 5e-06,
      "loss": 0.6331,
      "step": 820
    },
    {
      "epoch": 2.8778162911611784,
      "grad_norm": 0.592871012328308,
      "learning_rate": 5e-06,
      "loss": 0.6274,
      "step": 830
    },
    {
      "epoch": 2.912478336221837,
      "grad_norm": 0.7052513186995701,
      "learning_rate": 5e-06,
      "loss": 0.632,
      "step": 840
    },
    {
      "epoch": 2.9471403812824954,
      "grad_norm": 0.6220289067550866,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 850
    },
    {
      "epoch": 2.981802426343154,
      "grad_norm": 0.6904590828336521,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 860
    },
    {
      "epoch": 2.995667244367418,
      "eval_loss": 0.720524787902832,
      "eval_runtime": 307.7205,
      "eval_samples_per_second": 25.257,
      "eval_steps_per_second": 0.396,
      "step": 864
    },
    {
      "epoch": 2.995667244367418,
      "step": 864,
      "total_flos": 1447022800404480.0,
      "train_loss": 0.6994864764036955,
      "train_runtime": 51001.1529,
      "train_samples_per_second": 8.686,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 864,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1447022800404480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}