{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995667244367418, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03466204506065858, "grad_norm": 1.8127493746878947, "learning_rate": 5e-06, "loss": 1.0247, "step": 10 }, { "epoch": 0.06932409012131716, "grad_norm": 1.892292516893664, "learning_rate": 5e-06, "loss": 0.9061, "step": 20 }, { "epoch": 0.10398613518197573, "grad_norm": 2.439531384426821, "learning_rate": 5e-06, "loss": 0.8729, "step": 30 }, { "epoch": 0.1386481802426343, "grad_norm": 1.0105961240246888, "learning_rate": 5e-06, "loss": 0.8422, "step": 40 }, { "epoch": 0.1733102253032929, "grad_norm": 1.0541427004735089, "learning_rate": 5e-06, "loss": 0.8226, "step": 50 }, { "epoch": 0.20797227036395147, "grad_norm": 1.185839497666169, "learning_rate": 5e-06, "loss": 0.8035, "step": 60 }, { "epoch": 0.24263431542461006, "grad_norm": 1.1025425925379357, "learning_rate": 5e-06, "loss": 0.7971, "step": 70 }, { "epoch": 0.2772963604852686, "grad_norm": 1.0489992399782422, "learning_rate": 5e-06, "loss": 0.7891, "step": 80 }, { "epoch": 0.3119584055459272, "grad_norm": 0.668603535180152, "learning_rate": 5e-06, "loss": 0.7823, "step": 90 }, { "epoch": 0.3466204506065858, "grad_norm": 0.569376657027902, "learning_rate": 5e-06, "loss": 0.7697, "step": 100 }, { "epoch": 0.38128249566724437, "grad_norm": 0.6909448594076529, "learning_rate": 5e-06, "loss": 0.7641, "step": 110 }, { "epoch": 0.41594454072790293, "grad_norm": 0.6532682299793838, "learning_rate": 5e-06, "loss": 0.7695, "step": 120 }, { "epoch": 0.4506065857885615, "grad_norm": 0.9741442291789676, "learning_rate": 5e-06, "loss": 0.7702, "step": 130 }, { "epoch": 0.4852686308492201, "grad_norm": 0.8963697523822133, "learning_rate": 5e-06, "loss": 0.7635, "step": 140 }, { "epoch": 0.5199306759098787, "grad_norm": 0.7797154633763044, "learning_rate": 5e-06, "loss": 0.7577, "step": 150 }, { "epoch": 0.5545927209705372, "grad_norm": 0.8410605236589601, "learning_rate": 5e-06, "loss": 0.7597, "step": 160 }, { "epoch": 0.5892547660311959, "grad_norm": 0.7051595274843617, "learning_rate": 5e-06, "loss": 0.752, "step": 170 }, { "epoch": 0.6239168110918544, "grad_norm": 0.6800181939208395, "learning_rate": 5e-06, "loss": 0.7527, "step": 180 }, { "epoch": 0.658578856152513, "grad_norm": 0.7986625471943152, "learning_rate": 5e-06, "loss": 0.7491, "step": 190 }, { "epoch": 0.6932409012131716, "grad_norm": 0.8468221058427845, "learning_rate": 5e-06, "loss": 0.7471, "step": 200 }, { "epoch": 0.7279029462738301, "grad_norm": 0.7527636890957969, "learning_rate": 5e-06, "loss": 0.7488, "step": 210 }, { "epoch": 0.7625649913344887, "grad_norm": 0.672904711451661, "learning_rate": 5e-06, "loss": 0.744, "step": 220 }, { "epoch": 0.7972270363951474, "grad_norm": 0.9298264839873263, "learning_rate": 5e-06, "loss": 0.7438, "step": 230 }, { "epoch": 0.8318890814558059, "grad_norm": 0.6925885250176548, "learning_rate": 5e-06, "loss": 0.7402, "step": 240 }, { "epoch": 0.8665511265164645, "grad_norm": 0.6976668007067893, "learning_rate": 5e-06, "loss": 0.7449, "step": 250 }, { "epoch": 0.901213171577123, "grad_norm": 0.7134513511376641, "learning_rate": 5e-06, "loss": 0.7378, "step": 260 }, { "epoch": 0.9358752166377816, "grad_norm": 0.5758590804698668, "learning_rate": 5e-06, "loss": 0.7439, "step": 270 }, { "epoch": 0.9705372616984402, "grad_norm": 0.7076061848472048, "learning_rate": 5e-06, "loss": 0.7382, "step": 280 }, { "epoch": 0.9982668977469671, "eval_loss": 0.7344536185264587, "eval_runtime": 308.2237, "eval_samples_per_second": 25.215, "eval_steps_per_second": 0.396, "step": 288 }, { "epoch": 1.005632582322357, "grad_norm": 0.7185681802232957, "learning_rate": 5e-06, "loss": 0.779, "step": 290 }, { "epoch": 1.0402946273830156, "grad_norm": 0.9393905325717241, "learning_rate": 5e-06, "loss": 0.6889, "step": 300 }, { "epoch": 1.074956672443674, "grad_norm": 0.8787089784301063, "learning_rate": 5e-06, "loss": 0.687, "step": 310 }, { "epoch": 1.1096187175043328, "grad_norm": 0.7560092649402328, "learning_rate": 5e-06, "loss": 0.6872, "step": 320 }, { "epoch": 1.1442807625649913, "grad_norm": 0.6643286211815734, "learning_rate": 5e-06, "loss": 0.6858, "step": 330 }, { "epoch": 1.1789428076256498, "grad_norm": 0.7127668776455044, "learning_rate": 5e-06, "loss": 0.684, "step": 340 }, { "epoch": 1.2136048526863086, "grad_norm": 0.655292316893117, "learning_rate": 5e-06, "loss": 0.6855, "step": 350 }, { "epoch": 1.248266897746967, "grad_norm": 0.8839088016848645, "learning_rate": 5e-06, "loss": 0.686, "step": 360 }, { "epoch": 1.2829289428076256, "grad_norm": 0.624864756502428, "learning_rate": 5e-06, "loss": 0.6819, "step": 370 }, { "epoch": 1.317590987868284, "grad_norm": 0.7439571552243042, "learning_rate": 5e-06, "loss": 0.6851, "step": 380 }, { "epoch": 1.3522530329289428, "grad_norm": 0.5854034874524795, "learning_rate": 5e-06, "loss": 0.6868, "step": 390 }, { "epoch": 1.3869150779896013, "grad_norm": 0.6734106560005542, "learning_rate": 5e-06, "loss": 0.6834, "step": 400 }, { "epoch": 1.4215771230502598, "grad_norm": 0.6926581209135775, "learning_rate": 5e-06, "loss": 0.6832, "step": 410 }, { "epoch": 1.4562391681109186, "grad_norm": 1.1324386970749247, "learning_rate": 5e-06, "loss": 0.6842, "step": 420 }, { "epoch": 1.490901213171577, "grad_norm": 0.7226777314119034, "learning_rate": 5e-06, "loss": 0.6844, "step": 430 }, { "epoch": 1.5255632582322356, "grad_norm": 0.7481904787146205, "learning_rate": 5e-06, "loss": 0.6791, "step": 440 }, { "epoch": 1.5602253032928943, "grad_norm": 0.6135505957665759, "learning_rate": 5e-06, "loss": 0.6817, "step": 450 }, { "epoch": 1.5948873483535528, "grad_norm": 0.7553340277380959, "learning_rate": 5e-06, "loss": 0.684, "step": 460 }, { "epoch": 1.6295493934142113, "grad_norm": 0.7233556793224363, "learning_rate": 5e-06, "loss": 0.681, "step": 470 }, { "epoch": 1.66421143847487, "grad_norm": 0.5547213886367687, "learning_rate": 5e-06, "loss": 0.6806, "step": 480 }, { "epoch": 1.6988734835355286, "grad_norm": 0.6625866861543885, "learning_rate": 5e-06, "loss": 0.6792, "step": 490 }, { "epoch": 1.733535528596187, "grad_norm": 0.8682937684926717, "learning_rate": 5e-06, "loss": 0.6789, "step": 500 }, { "epoch": 1.7681975736568458, "grad_norm": 0.6685275937902929, "learning_rate": 5e-06, "loss": 0.6822, "step": 510 }, { "epoch": 1.8028596187175043, "grad_norm": 1.0295956431263236, "learning_rate": 5e-06, "loss": 0.6825, "step": 520 }, { "epoch": 1.8375216637781628, "grad_norm": 0.784814610980589, "learning_rate": 5e-06, "loss": 0.6769, "step": 530 }, { "epoch": 1.8721837088388216, "grad_norm": 0.7570247170470147, "learning_rate": 5e-06, "loss": 0.6782, "step": 540 }, { "epoch": 1.90684575389948, "grad_norm": 0.5807065830422653, "learning_rate": 5e-06, "loss": 0.6846, "step": 550 }, { "epoch": 1.9415077989601386, "grad_norm": 0.6301636959503909, "learning_rate": 5e-06, "loss": 0.6787, "step": 560 }, { "epoch": 1.9761698440207973, "grad_norm": 0.6686036844283325, "learning_rate": 5e-06, "loss": 0.6785, "step": 570 }, { "epoch": 1.9969670710571923, "eval_loss": 0.7195846438407898, "eval_runtime": 306.775, "eval_samples_per_second": 25.335, "eval_steps_per_second": 0.398, "step": 576 }, { "epoch": 2.011265164644714, "grad_norm": 1.229519224308769, "learning_rate": 5e-06, "loss": 0.7064, "step": 580 }, { "epoch": 2.0459272097053725, "grad_norm": 0.9032143353484438, "learning_rate": 5e-06, "loss": 0.6269, "step": 590 }, { "epoch": 2.080589254766031, "grad_norm": 0.7420064693943627, "learning_rate": 5e-06, "loss": 0.6204, "step": 600 }, { "epoch": 2.11525129982669, "grad_norm": 1.2914353849457911, "learning_rate": 5e-06, "loss": 0.6251, "step": 610 }, { "epoch": 2.149913344887348, "grad_norm": 0.7778946515270286, "learning_rate": 5e-06, "loss": 0.6274, "step": 620 }, { "epoch": 2.184575389948007, "grad_norm": 0.7043162671127772, "learning_rate": 5e-06, "loss": 0.6267, "step": 630 }, { "epoch": 2.2192374350086657, "grad_norm": 0.5973234209878199, "learning_rate": 5e-06, "loss": 0.6284, "step": 640 }, { "epoch": 2.253899480069324, "grad_norm": 0.666095113544406, "learning_rate": 5e-06, "loss": 0.6334, "step": 650 }, { "epoch": 2.2885615251299827, "grad_norm": 0.6767024363263829, "learning_rate": 5e-06, "loss": 0.6292, "step": 660 }, { "epoch": 2.3232235701906414, "grad_norm": 0.5737190679416464, "learning_rate": 5e-06, "loss": 0.6299, "step": 670 }, { "epoch": 2.3578856152512997, "grad_norm": 0.5600750074108755, "learning_rate": 5e-06, "loss": 0.6342, "step": 680 }, { "epoch": 2.3925476603119584, "grad_norm": 0.5910347547974553, "learning_rate": 5e-06, "loss": 0.6315, "step": 690 }, { "epoch": 2.427209705372617, "grad_norm": 0.6226740928701757, "learning_rate": 5e-06, "loss": 0.631, "step": 700 }, { "epoch": 2.4618717504332754, "grad_norm": 0.6210136062823411, "learning_rate": 5e-06, "loss": 0.6295, "step": 710 }, { "epoch": 2.496533795493934, "grad_norm": 0.5748749993993215, "learning_rate": 5e-06, "loss": 0.6315, "step": 720 }, { "epoch": 2.5311958405545925, "grad_norm": 0.6967001634339309, "learning_rate": 5e-06, "loss": 0.6362, "step": 730 }, { "epoch": 2.565857885615251, "grad_norm": 0.6258079849864094, "learning_rate": 5e-06, "loss": 0.6303, "step": 740 }, { "epoch": 2.60051993067591, "grad_norm": 0.6125604920957239, "learning_rate": 5e-06, "loss": 0.6285, "step": 750 }, { "epoch": 2.635181975736568, "grad_norm": 0.5972379433259742, "learning_rate": 5e-06, "loss": 0.6339, "step": 760 }, { "epoch": 2.669844020797227, "grad_norm": 0.6758633252723798, "learning_rate": 5e-06, "loss": 0.6326, "step": 770 }, { "epoch": 2.7045060658578857, "grad_norm": 0.6607811157555928, "learning_rate": 5e-06, "loss": 0.6295, "step": 780 }, { "epoch": 2.739168110918544, "grad_norm": 0.7251327172929152, "learning_rate": 5e-06, "loss": 0.6253, "step": 790 }, { "epoch": 2.7738301559792027, "grad_norm": 0.5734616475373774, "learning_rate": 5e-06, "loss": 0.6328, "step": 800 }, { "epoch": 2.8084922010398614, "grad_norm": 0.5940604342669007, "learning_rate": 5e-06, "loss": 0.6346, "step": 810 }, { "epoch": 2.8431542461005197, "grad_norm": 0.6989887403612659, "learning_rate": 5e-06, "loss": 0.6331, "step": 820 }, { "epoch": 2.8778162911611784, "grad_norm": 0.592871012328308, "learning_rate": 5e-06, "loss": 0.6274, "step": 830 }, { "epoch": 2.912478336221837, "grad_norm": 0.7052513186995701, "learning_rate": 5e-06, "loss": 0.632, "step": 840 }, { "epoch": 2.9471403812824954, "grad_norm": 0.6220289067550866, "learning_rate": 5e-06, "loss": 0.6307, "step": 850 }, { "epoch": 2.981802426343154, "grad_norm": 0.6904590828336521, "learning_rate": 5e-06, "loss": 0.6293, "step": 860 }, { "epoch": 2.995667244367418, "eval_loss": 0.720524787902832, "eval_runtime": 307.7205, "eval_samples_per_second": 25.257, "eval_steps_per_second": 0.396, "step": 864 }, { "epoch": 2.995667244367418, "step": 864, "total_flos": 1447022800404480.0, "train_loss": 0.6994864764036955, "train_runtime": 51001.1529, "train_samples_per_second": 8.686, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1447022800404480.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }