| { | |
| "best_global_step": 28000, | |
| "best_metric": 2.380680799484253, | |
| "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-28000", | |
| "epoch": 0.56, | |
| "eval_steps": 100, | |
| "global_step": 28000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 27.027176292677446, | |
| "learning_rate": 4.8e-08, | |
| "loss": 3.52, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 22.883614597253285, | |
| "learning_rate": 9.8e-08, | |
| "loss": 3.4361, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "grad_norm": 14.88008652186332, | |
| "learning_rate": 1.4800000000000003e-07, | |
| "loss": 3.2752, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 8.829920836438578, | |
| "learning_rate": 1.9800000000000003e-07, | |
| "loss": 3.073, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_loss": 2.8928089141845703, | |
| "eval_runtime": 31.5789, | |
| "eval_samples_per_second": 3.23, | |
| "eval_steps_per_second": 1.615, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 6.672581323543055, | |
| "learning_rate": 2.48e-07, | |
| "loss": 2.8787, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 3.485187933164644, | |
| "learning_rate": 2.9800000000000005e-07, | |
| "loss": 2.7569, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "grad_norm": 1.6514027733962566, | |
| "learning_rate": 3.48e-07, | |
| "loss": 2.683, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 1.714322054077562, | |
| "learning_rate": 3.9800000000000004e-07, | |
| "loss": 2.6417, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_loss": 2.608551263809204, | |
| "eval_runtime": 31.7434, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.607, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "grad_norm": 1.1166252829937406, | |
| "learning_rate": 4.4800000000000004e-07, | |
| "loss": 2.6075, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 1.2360541139925998, | |
| "learning_rate": 4.98e-07, | |
| "loss": 2.5833, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "grad_norm": 1.1186934925325145, | |
| "learning_rate": 5.480000000000001e-07, | |
| "loss": 2.568, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 2.2165517261683907, | |
| "learning_rate": 5.98e-07, | |
| "loss": 2.5488, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 2.532663345336914, | |
| "eval_runtime": 31.7717, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "grad_norm": 1.9955553189401838, | |
| "learning_rate": 6.48e-07, | |
| "loss": 2.5186, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 1.7134269827298882, | |
| "learning_rate": 6.98e-07, | |
| "loss": 2.5133, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 4.086994695184575, | |
| "learning_rate": 7.480000000000001e-07, | |
| "loss": 2.4979, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 2.2539165526987732, | |
| "learning_rate": 7.98e-07, | |
| "loss": 2.49, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 2.4952430725097656, | |
| "eval_runtime": 31.9652, | |
| "eval_samples_per_second": 3.191, | |
| "eval_steps_per_second": 1.595, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "grad_norm": 1.138897058010547, | |
| "learning_rate": 8.480000000000001e-07, | |
| "loss": 2.4748, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 1.0112216946364496, | |
| "learning_rate": 8.980000000000001e-07, | |
| "loss": 2.4801, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "grad_norm": 1.3243191157122005, | |
| "learning_rate": 9.480000000000001e-07, | |
| "loss": 2.4699, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.2276747327077127, | |
| "learning_rate": 9.98e-07, | |
| "loss": 2.468, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 2.4748668670654297, | |
| "eval_runtime": 31.7813, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.605, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "grad_norm": 0.845856364918703, | |
| "learning_rate": 1.0480000000000002e-06, | |
| "loss": 2.4738, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 1.3677643157822397, | |
| "learning_rate": 1.0980000000000001e-06, | |
| "loss": 2.4535, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "grad_norm": 2.919464896391848, | |
| "learning_rate": 1.148e-06, | |
| "loss": 2.4558, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 0.9435018771336037, | |
| "learning_rate": 1.1980000000000002e-06, | |
| "loss": 2.4568, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 2.4655494689941406, | |
| "eval_runtime": 31.7457, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.607, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.844314601352543, | |
| "learning_rate": 1.248e-06, | |
| "loss": 2.4493, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.8266693044311944, | |
| "learning_rate": 1.2980000000000001e-06, | |
| "loss": 2.4491, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "grad_norm": 0.9456226537014805, | |
| "learning_rate": 1.348e-06, | |
| "loss": 2.4538, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 1.241067240172021, | |
| "learning_rate": 1.3980000000000002e-06, | |
| "loss": 2.441, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 2.459726572036743, | |
| "eval_runtime": 31.7996, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "grad_norm": 0.8214981637560076, | |
| "learning_rate": 1.4480000000000002e-06, | |
| "loss": 2.4375, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.8463041725741063, | |
| "learning_rate": 1.498e-06, | |
| "loss": 2.4476, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "grad_norm": 1.0459233803315569, | |
| "learning_rate": 1.548e-06, | |
| "loss": 2.4388, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.7899668512736558, | |
| "learning_rate": 1.5980000000000002e-06, | |
| "loss": 2.4376, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 2.4541866779327393, | |
| "eval_runtime": 31.8537, | |
| "eval_samples_per_second": 3.202, | |
| "eval_steps_per_second": 1.601, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "grad_norm": 0.8397014905084252, | |
| "learning_rate": 1.6480000000000001e-06, | |
| "loss": 2.436, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.7623848831497283, | |
| "learning_rate": 1.6980000000000003e-06, | |
| "loss": 2.4384, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.7990535915346776, | |
| "learning_rate": 1.7480000000000002e-06, | |
| "loss": 2.4388, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 1.1027343926443682, | |
| "learning_rate": 1.798e-06, | |
| "loss": 2.4195, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 2.4497900009155273, | |
| "eval_runtime": 32.04, | |
| "eval_samples_per_second": 3.184, | |
| "eval_steps_per_second": 1.592, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "grad_norm": 1.0518607606934676, | |
| "learning_rate": 1.8480000000000001e-06, | |
| "loss": 2.441, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 0.7969899064558551, | |
| "learning_rate": 1.898e-06, | |
| "loss": 2.4416, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "grad_norm": 0.6779464500616844, | |
| "learning_rate": 1.9480000000000002e-06, | |
| "loss": 2.4397, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.8662904314628106, | |
| "learning_rate": 1.998e-06, | |
| "loss": 2.4316, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 2.4468765258789062, | |
| "eval_runtime": 31.891, | |
| "eval_samples_per_second": 3.198, | |
| "eval_steps_per_second": 1.599, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "grad_norm": 0.6931713924838875, | |
| "learning_rate": 2.048e-06, | |
| "loss": 2.4456, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 0.6887441871643851, | |
| "learning_rate": 2.098e-06, | |
| "loss": 2.4253, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "grad_norm": 0.7500338911423412, | |
| "learning_rate": 2.148e-06, | |
| "loss": 2.431, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.7458051760406093, | |
| "learning_rate": 2.198e-06, | |
| "loss": 2.4164, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 2.4442293643951416, | |
| "eval_runtime": 31.8697, | |
| "eval_samples_per_second": 3.201, | |
| "eval_steps_per_second": 1.6, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.8345425864188605, | |
| "learning_rate": 2.2480000000000003e-06, | |
| "loss": 2.4241, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 0.6997049438769294, | |
| "learning_rate": 2.2980000000000003e-06, | |
| "loss": 2.43, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "grad_norm": 0.7476759709197881, | |
| "learning_rate": 2.3480000000000002e-06, | |
| "loss": 2.4342, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 0.6735584083816767, | |
| "learning_rate": 2.398e-06, | |
| "loss": 2.4274, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 2.4423961639404297, | |
| "eval_runtime": 31.6272, | |
| "eval_samples_per_second": 3.225, | |
| "eval_steps_per_second": 1.613, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "grad_norm": 0.7414830106555006, | |
| "learning_rate": 2.448e-06, | |
| "loss": 2.4363, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.7852755880662065, | |
| "learning_rate": 2.498e-06, | |
| "loss": 2.4356, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "grad_norm": 0.6550676975591231, | |
| "learning_rate": 2.5480000000000004e-06, | |
| "loss": 2.4219, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.6708503716821785, | |
| "learning_rate": 2.598e-06, | |
| "loss": 2.4442, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 2.440678358078003, | |
| "eval_runtime": 31.7661, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "grad_norm": 0.6923805904104993, | |
| "learning_rate": 2.648e-06, | |
| "loss": 2.4317, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.6600109660858106, | |
| "learning_rate": 2.6980000000000003e-06, | |
| "loss": 2.432, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.841715383150229, | |
| "learning_rate": 2.748e-06, | |
| "loss": 2.4196, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.6392005959511108, | |
| "learning_rate": 2.798e-06, | |
| "loss": 2.4274, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 2.439229726791382, | |
| "eval_runtime": 32.0465, | |
| "eval_samples_per_second": 3.183, | |
| "eval_steps_per_second": 1.591, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "grad_norm": 0.6653339947473879, | |
| "learning_rate": 2.848e-06, | |
| "loss": 2.4209, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 0.6607591145573396, | |
| "learning_rate": 2.8980000000000005e-06, | |
| "loss": 2.4111, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "grad_norm": 0.6492342012137399, | |
| "learning_rate": 2.9480000000000004e-06, | |
| "loss": 2.4319, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.6418256237639189, | |
| "learning_rate": 2.9980000000000003e-06, | |
| "loss": 2.4257, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 2.4380884170532227, | |
| "eval_runtime": 32.1017, | |
| "eval_samples_per_second": 3.177, | |
| "eval_steps_per_second": 1.589, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "grad_norm": 0.719808061901716, | |
| "learning_rate": 3.0480000000000003e-06, | |
| "loss": 2.4305, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.6138892760464039, | |
| "learning_rate": 3.0980000000000007e-06, | |
| "loss": 2.4253, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "grad_norm": 0.7179717159222389, | |
| "learning_rate": 3.1480000000000006e-06, | |
| "loss": 2.4286, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.6337699388954209, | |
| "learning_rate": 3.198e-06, | |
| "loss": 2.4281, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 2.4367759227752686, | |
| "eval_runtime": 32.1865, | |
| "eval_samples_per_second": 3.169, | |
| "eval_steps_per_second": 1.585, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.6399383081078225, | |
| "learning_rate": 3.248e-06, | |
| "loss": 2.4127, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.6239480160142674, | |
| "learning_rate": 3.298e-06, | |
| "loss": 2.4271, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "grad_norm": 0.6964721038747086, | |
| "learning_rate": 3.348e-06, | |
| "loss": 2.4168, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 0.6246300346633158, | |
| "learning_rate": 3.3980000000000003e-06, | |
| "loss": 2.4312, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 2.43576717376709, | |
| "eval_runtime": 32.35, | |
| "eval_samples_per_second": 3.153, | |
| "eval_steps_per_second": 1.577, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "grad_norm": 0.6609046760569887, | |
| "learning_rate": 3.4480000000000003e-06, | |
| "loss": 2.4201, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.611833218468793, | |
| "learning_rate": 3.4980000000000002e-06, | |
| "loss": 2.4248, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "grad_norm": 0.6374610168215615, | |
| "learning_rate": 3.548e-06, | |
| "loss": 2.4195, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.608911757784224, | |
| "learning_rate": 3.5980000000000005e-06, | |
| "loss": 2.4207, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 2.4352190494537354, | |
| "eval_runtime": 32.4107, | |
| "eval_samples_per_second": 3.147, | |
| "eval_steps_per_second": 1.574, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "grad_norm": 0.7277576842118675, | |
| "learning_rate": 3.6480000000000005e-06, | |
| "loss": 2.429, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.6177267450079238, | |
| "learning_rate": 3.6980000000000004e-06, | |
| "loss": 2.4216, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.6909621222715888, | |
| "learning_rate": 3.7480000000000004e-06, | |
| "loss": 2.4141, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 0.6271064789808471, | |
| "learning_rate": 3.7980000000000007e-06, | |
| "loss": 2.4204, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 2.434185743331909, | |
| "eval_runtime": 32.1923, | |
| "eval_samples_per_second": 3.168, | |
| "eval_steps_per_second": 1.584, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "grad_norm": 4.465543129416645, | |
| "learning_rate": 3.848e-06, | |
| "loss": 2.4278, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.59428248175071, | |
| "learning_rate": 3.898e-06, | |
| "loss": 2.4231, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "grad_norm": 0.6300066797920092, | |
| "learning_rate": 3.948e-06, | |
| "loss": 2.4163, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.5995770487766363, | |
| "learning_rate": 3.9980000000000005e-06, | |
| "loss": 2.4236, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 2.433772563934326, | |
| "eval_runtime": 32.062, | |
| "eval_samples_per_second": 3.181, | |
| "eval_steps_per_second": 1.591, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "grad_norm": 0.595289417756029, | |
| "learning_rate": 4.048e-06, | |
| "loss": 2.424, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 0.6134282240517589, | |
| "learning_rate": 4.098e-06, | |
| "loss": 2.4255, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "grad_norm": 0.6629564791128602, | |
| "learning_rate": 4.148000000000001e-06, | |
| "loss": 2.4097, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.621927005205136, | |
| "learning_rate": 4.198e-06, | |
| "loss": 2.4268, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 2.433004379272461, | |
| "eval_runtime": 32.0064, | |
| "eval_samples_per_second": 3.187, | |
| "eval_steps_per_second": 1.593, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.5955395744872489, | |
| "learning_rate": 4.248000000000001e-06, | |
| "loss": 2.4134, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.630503522814338, | |
| "learning_rate": 4.298e-06, | |
| "loss": 2.4195, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "grad_norm": 0.6187515125513555, | |
| "learning_rate": 4.3480000000000006e-06, | |
| "loss": 2.4258, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 0.7454395191545767, | |
| "learning_rate": 4.398000000000001e-06, | |
| "loss": 2.4226, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 2.4322543144226074, | |
| "eval_runtime": 31.9813, | |
| "eval_samples_per_second": 3.189, | |
| "eval_steps_per_second": 1.595, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "grad_norm": 0.6347211303495337, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 2.4191, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.6135245446733344, | |
| "learning_rate": 4.498e-06, | |
| "loss": 2.4229, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "grad_norm": 0.6009500019971098, | |
| "learning_rate": 4.548e-06, | |
| "loss": 2.42, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.661258489557284, | |
| "learning_rate": 4.598e-06, | |
| "loss": 2.4129, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 2.432189464569092, | |
| "eval_runtime": 31.9429, | |
| "eval_samples_per_second": 3.193, | |
| "eval_steps_per_second": 1.597, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "grad_norm": 0.6139592783182132, | |
| "learning_rate": 4.648e-06, | |
| "loss": 2.4104, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.583220993400474, | |
| "learning_rate": 4.698000000000001e-06, | |
| "loss": 2.4244, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.6293186545915876, | |
| "learning_rate": 4.748e-06, | |
| "loss": 2.4225, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.5798657043139257, | |
| "learning_rate": 4.7980000000000005e-06, | |
| "loss": 2.4283, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 2.4312729835510254, | |
| "eval_runtime": 31.7379, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "grad_norm": 0.6301056488676946, | |
| "learning_rate": 4.848000000000001e-06, | |
| "loss": 2.4238, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 0.6050753634716797, | |
| "learning_rate": 4.898e-06, | |
| "loss": 2.4209, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "grad_norm": 0.5954330421177886, | |
| "learning_rate": 4.948000000000001e-06, | |
| "loss": 2.4208, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.6115913011006808, | |
| "learning_rate": 4.998e-06, | |
| "loss": 2.4199, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 2.430593490600586, | |
| "eval_runtime": 31.7859, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "grad_norm": 0.6088167798442012, | |
| "learning_rate": 5.048000000000001e-06, | |
| "loss": 2.4204, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.5886456022713933, | |
| "learning_rate": 5.098000000000001e-06, | |
| "loss": 2.4233, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "grad_norm": 0.5755814876588983, | |
| "learning_rate": 5.1480000000000005e-06, | |
| "loss": 2.414, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.6101796511458513, | |
| "learning_rate": 5.198000000000001e-06, | |
| "loss": 2.4134, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 2.430147886276245, | |
| "eval_runtime": 31.667, | |
| "eval_samples_per_second": 3.221, | |
| "eval_steps_per_second": 1.611, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.5829483894700689, | |
| "learning_rate": 5.248000000000001e-06, | |
| "loss": 2.4176, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 0.5756679405925968, | |
| "learning_rate": 5.298000000000001e-06, | |
| "loss": 2.4196, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "grad_norm": 0.6203149656143291, | |
| "learning_rate": 5.348000000000001e-06, | |
| "loss": 2.4128, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.6107431848759605, | |
| "learning_rate": 5.398e-06, | |
| "loss": 2.4066, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 2.4298744201660156, | |
| "eval_runtime": 31.8888, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 1.599, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "grad_norm": 0.6313360362618398, | |
| "learning_rate": 5.448e-06, | |
| "loss": 2.4116, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.7567581764202255, | |
| "learning_rate": 5.498e-06, | |
| "loss": 2.4137, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "grad_norm": 0.5808819096916863, | |
| "learning_rate": 5.548e-06, | |
| "loss": 2.4261, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.7401050453151701, | |
| "learning_rate": 5.5980000000000004e-06, | |
| "loss": 2.4102, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 2.429075002670288, | |
| "eval_runtime": 31.9187, | |
| "eval_samples_per_second": 3.196, | |
| "eval_steps_per_second": 1.598, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "grad_norm": 0.6100412128745759, | |
| "learning_rate": 5.648e-06, | |
| "loss": 2.4205, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 0.6038298357908357, | |
| "learning_rate": 5.698e-06, | |
| "loss": 2.4104, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.6294303689076208, | |
| "learning_rate": 5.748e-06, | |
| "loss": 2.4101, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.6000316496044382, | |
| "learning_rate": 5.798e-06, | |
| "loss": 2.4116, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 2.428636074066162, | |
| "eval_runtime": 31.9776, | |
| "eval_samples_per_second": 3.19, | |
| "eval_steps_per_second": 1.595, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "grad_norm": 0.6662370599985865, | |
| "learning_rate": 5.848000000000001e-06, | |
| "loss": 2.4271, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.6065686333783092, | |
| "learning_rate": 5.898e-06, | |
| "loss": 2.4141, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "grad_norm": 0.5896191268179571, | |
| "learning_rate": 5.9480000000000005e-06, | |
| "loss": 2.4194, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.5984986372167933, | |
| "learning_rate": 5.998000000000001e-06, | |
| "loss": 2.4107, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 2.428344488143921, | |
| "eval_runtime": 31.827, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.602, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.0605, | |
| "grad_norm": 0.6057904687423932, | |
| "learning_rate": 6.048e-06, | |
| "loss": 2.4231, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 0.5775023699888965, | |
| "learning_rate": 6.098000000000001e-06, | |
| "loss": 2.4193, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.0615, | |
| "grad_norm": 0.5945486563983137, | |
| "learning_rate": 6.148e-06, | |
| "loss": 2.4101, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 0.5893073406656858, | |
| "learning_rate": 6.198000000000001e-06, | |
| "loss": 2.41, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "eval_loss": 2.4278364181518555, | |
| "eval_runtime": 31.4582, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.6413551002827471, | |
| "learning_rate": 6.248000000000001e-06, | |
| "loss": 2.4155, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "grad_norm": 0.5799664342522566, | |
| "learning_rate": 6.2980000000000005e-06, | |
| "loss": 2.409, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.0635, | |
| "grad_norm": 0.5811811320062699, | |
| "learning_rate": 6.348000000000001e-06, | |
| "loss": 2.4103, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 1.7009375984265656, | |
| "learning_rate": 6.398000000000001e-06, | |
| "loss": 2.4063, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "eval_loss": 2.4270801544189453, | |
| "eval_runtime": 31.5638, | |
| "eval_samples_per_second": 3.232, | |
| "eval_steps_per_second": 1.616, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.0645, | |
| "grad_norm": 0.5922661228031734, | |
| "learning_rate": 6.448000000000001e-06, | |
| "loss": 2.4146, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.6108654698148237, | |
| "learning_rate": 6.498000000000001e-06, | |
| "loss": 2.4202, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.0655, | |
| "grad_norm": 0.5882408729466215, | |
| "learning_rate": 6.548000000000001e-06, | |
| "loss": 2.4226, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 0.6095634937429834, | |
| "learning_rate": 6.598000000000001e-06, | |
| "loss": 2.4175, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "eval_loss": 2.4271743297576904, | |
| "eval_runtime": 31.605, | |
| "eval_samples_per_second": 3.227, | |
| "eval_steps_per_second": 1.614, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.0665, | |
| "grad_norm": 0.584006486469731, | |
| "learning_rate": 6.648e-06, | |
| "loss": 2.4183, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "grad_norm": 0.6183114977641251, | |
| "learning_rate": 6.698e-06, | |
| "loss": 2.4074, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.6102359150325862, | |
| "learning_rate": 6.7480000000000004e-06, | |
| "loss": 2.4168, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 0.6988080460632056, | |
| "learning_rate": 6.798e-06, | |
| "loss": 2.433, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "eval_loss": 2.4267990589141846, | |
| "eval_runtime": 31.5337, | |
| "eval_samples_per_second": 3.235, | |
| "eval_steps_per_second": 1.617, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.0685, | |
| "grad_norm": 0.5923385092093629, | |
| "learning_rate": 6.848e-06, | |
| "loss": 2.4137, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "grad_norm": 0.5873912274008383, | |
| "learning_rate": 6.898e-06, | |
| "loss": 2.4183, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.0695, | |
| "grad_norm": 0.5885684717655756, | |
| "learning_rate": 6.948e-06, | |
| "loss": 2.4282, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.6026217656458652, | |
| "learning_rate": 6.998000000000001e-06, | |
| "loss": 2.4234, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_loss": 2.4262564182281494, | |
| "eval_runtime": 31.7503, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.606, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0705, | |
| "grad_norm": 0.5820881270462898, | |
| "learning_rate": 7.048e-06, | |
| "loss": 2.413, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "grad_norm": 0.6178510668793894, | |
| "learning_rate": 7.0980000000000005e-06, | |
| "loss": 2.3954, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.0715, | |
| "grad_norm": 0.6186160369787075, | |
| "learning_rate": 7.148000000000001e-06, | |
| "loss": 2.4153, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.6053079331192983, | |
| "learning_rate": 7.198e-06, | |
| "loss": 2.4061, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "eval_loss": 2.4260003566741943, | |
| "eval_runtime": 31.4103, | |
| "eval_samples_per_second": 3.247, | |
| "eval_steps_per_second": 1.624, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.6002224672812325, | |
| "learning_rate": 7.248000000000001e-06, | |
| "loss": 2.4062, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "grad_norm": 0.616881726200715, | |
| "learning_rate": 7.298e-06, | |
| "loss": 2.4167, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.0735, | |
| "grad_norm": 0.6148731575970318, | |
| "learning_rate": 7.348000000000001e-06, | |
| "loss": 2.4123, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.6221338587681139, | |
| "learning_rate": 7.398000000000001e-06, | |
| "loss": 2.4199, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "eval_loss": 2.4258594512939453, | |
| "eval_runtime": 31.717, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.0745, | |
| "grad_norm": 0.6024880998969679, | |
| "learning_rate": 7.4480000000000005e-06, | |
| "loss": 2.4187, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.5998431875234804, | |
| "learning_rate": 7.498000000000001e-06, | |
| "loss": 2.4045, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.0755, | |
| "grad_norm": 0.5963168253580089, | |
| "learning_rate": 7.548000000000001e-06, | |
| "loss": 2.4161, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 0.5891194096424622, | |
| "learning_rate": 7.598000000000001e-06, | |
| "loss": 2.4217, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "eval_loss": 2.425435781478882, | |
| "eval_runtime": 32.0333, | |
| "eval_samples_per_second": 3.184, | |
| "eval_steps_per_second": 1.592, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.0765, | |
| "grad_norm": 0.6220515512248757, | |
| "learning_rate": 7.648e-06, | |
| "loss": 2.4115, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "grad_norm": 0.592208980582776, | |
| "learning_rate": 7.698000000000002e-06, | |
| "loss": 2.4123, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.6050688229723428, | |
| "learning_rate": 7.748000000000001e-06, | |
| "loss": 2.4124, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 0.6128946719272819, | |
| "learning_rate": 7.798e-06, | |
| "loss": 2.4167, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "eval_loss": 2.4252073764801025, | |
| "eval_runtime": 31.7629, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.606, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.0785, | |
| "grad_norm": 0.6300203936594084, | |
| "learning_rate": 7.848000000000002e-06, | |
| "loss": 2.4253, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "grad_norm": 0.622492494084331, | |
| "learning_rate": 7.898e-06, | |
| "loss": 2.4126, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.0795, | |
| "grad_norm": 0.6054040520886763, | |
| "learning_rate": 7.948e-06, | |
| "loss": 2.4082, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.5997365393444213, | |
| "learning_rate": 7.998e-06, | |
| "loss": 2.4187, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 2.4248712062835693, | |
| "eval_runtime": 31.7678, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.0805, | |
| "grad_norm": 0.5914805613039377, | |
| "learning_rate": 8.048e-06, | |
| "loss": 2.4136, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "grad_norm": 0.6868999656119101, | |
| "learning_rate": 8.098000000000001e-06, | |
| "loss": 2.4071, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.0815, | |
| "grad_norm": 0.6116238023737347, | |
| "learning_rate": 8.148e-06, | |
| "loss": 2.399, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 0.6278682082032867, | |
| "learning_rate": 8.198e-06, | |
| "loss": 2.4147, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "eval_loss": 2.424673318862915, | |
| "eval_runtime": 31.702, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.609, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.652529340562497, | |
| "learning_rate": 8.248e-06, | |
| "loss": 2.4122, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "grad_norm": 0.6241764244719189, | |
| "learning_rate": 8.298000000000001e-06, | |
| "loss": 2.4034, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.0835, | |
| "grad_norm": 0.6093599459247064, | |
| "learning_rate": 8.348e-06, | |
| "loss": 2.4184, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 0.6145457262520279, | |
| "learning_rate": 8.398e-06, | |
| "loss": 2.4099, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "eval_loss": 2.424262046813965, | |
| "eval_runtime": 31.7126, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.0845, | |
| "grad_norm": 0.6094287468338311, | |
| "learning_rate": 8.448000000000001e-06, | |
| "loss": 2.413, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.6138052906293812, | |
| "learning_rate": 8.498e-06, | |
| "loss": 2.3935, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.0855, | |
| "grad_norm": 0.6122465571930669, | |
| "learning_rate": 8.548e-06, | |
| "loss": 2.4061, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 0.612830490698143, | |
| "learning_rate": 8.598000000000001e-06, | |
| "loss": 2.4112, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "eval_loss": 2.4238767623901367, | |
| "eval_runtime": 31.7292, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.607, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.0865, | |
| "grad_norm": 0.628133619898939, | |
| "learning_rate": 8.648000000000001e-06, | |
| "loss": 2.4046, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "grad_norm": 0.6496528950628708, | |
| "learning_rate": 8.698e-06, | |
| "loss": 2.4068, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.5799286999894695, | |
| "learning_rate": 8.748000000000002e-06, | |
| "loss": 2.4072, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.5910425054287555, | |
| "learning_rate": 8.798000000000001e-06, | |
| "loss": 2.3926, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "eval_loss": 2.4238674640655518, | |
| "eval_runtime": 31.7606, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.0885, | |
| "grad_norm": 0.6159620367072861, | |
| "learning_rate": 8.848e-06, | |
| "loss": 2.4115, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "grad_norm": 0.6972746637095123, | |
| "learning_rate": 8.898000000000002e-06, | |
| "loss": 2.4105, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.0895, | |
| "grad_norm": 0.585353172093314, | |
| "learning_rate": 8.948000000000001e-06, | |
| "loss": 2.4198, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.6059468344943013, | |
| "learning_rate": 8.998000000000001e-06, | |
| "loss": 2.4069, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_loss": 2.42350435256958, | |
| "eval_runtime": 31.6869, | |
| "eval_samples_per_second": 3.219, | |
| "eval_steps_per_second": 1.609, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.0905, | |
| "grad_norm": 0.6015924987371338, | |
| "learning_rate": 9.048e-06, | |
| "loss": 2.4081, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "grad_norm": 0.6006000726208087, | |
| "learning_rate": 9.098000000000002e-06, | |
| "loss": 2.4079, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.0915, | |
| "grad_norm": 0.6334216081429662, | |
| "learning_rate": 9.148e-06, | |
| "loss": 2.4021, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 0.618758486975248, | |
| "learning_rate": 9.198e-06, | |
| "loss": 2.4191, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "eval_loss": 2.42366361618042, | |
| "eval_runtime": 31.7351, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.5982185393268022, | |
| "learning_rate": 9.248e-06, | |
| "loss": 2.4131, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "grad_norm": 0.5778256378393931, | |
| "learning_rate": 9.298e-06, | |
| "loss": 2.4105, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.0935, | |
| "grad_norm": 0.5892823966497687, | |
| "learning_rate": 9.348000000000001e-06, | |
| "loss": 2.4146, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 0.6000897787974973, | |
| "learning_rate": 9.398e-06, | |
| "loss": 2.4141, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "eval_loss": 2.4225125312805176, | |
| "eval_runtime": 31.7008, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.0945, | |
| "grad_norm": 0.6317324097500899, | |
| "learning_rate": 9.448e-06, | |
| "loss": 2.4157, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.6157270042215848, | |
| "learning_rate": 9.498000000000001e-06, | |
| "loss": 2.4091, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.0955, | |
| "grad_norm": 0.5753740107095965, | |
| "learning_rate": 9.548e-06, | |
| "loss": 2.4142, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.6173977503240126, | |
| "learning_rate": 9.598e-06, | |
| "loss": 2.4083, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "eval_loss": 2.422691583633423, | |
| "eval_runtime": 31.4709, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.621, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.0965, | |
| "grad_norm": 0.5942953368600239, | |
| "learning_rate": 9.648000000000001e-06, | |
| "loss": 2.4087, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "grad_norm": 0.6555799317672051, | |
| "learning_rate": 9.698000000000001e-06, | |
| "loss": 2.4014, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.5757950367748221, | |
| "learning_rate": 9.748e-06, | |
| "loss": 2.4068, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 0.632774385045014, | |
| "learning_rate": 9.798e-06, | |
| "loss": 2.4087, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "eval_loss": 2.4220755100250244, | |
| "eval_runtime": 31.4352, | |
| "eval_samples_per_second": 3.245, | |
| "eval_steps_per_second": 1.622, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.0985, | |
| "grad_norm": 0.5781361622989438, | |
| "learning_rate": 9.848000000000001e-06, | |
| "loss": 2.4143, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "grad_norm": 0.6262568188074606, | |
| "learning_rate": 9.898e-06, | |
| "loss": 2.4142, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.0995, | |
| "grad_norm": 0.6349024994263993, | |
| "learning_rate": 9.948e-06, | |
| "loss": 2.4086, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.5902257687086163, | |
| "learning_rate": 9.998000000000002e-06, | |
| "loss": 2.4075, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 2.4221627712249756, | |
| "eval_runtime": 31.4547, | |
| "eval_samples_per_second": 3.243, | |
| "eval_steps_per_second": 1.621, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1005, | |
| "grad_norm": 0.6096554216132576, | |
| "learning_rate": 9.994666666666668e-06, | |
| "loss": 2.4056, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "grad_norm": 0.6157713116203616, | |
| "learning_rate": 9.989111111111111e-06, | |
| "loss": 2.4104, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.1015, | |
| "grad_norm": 0.6100961136574927, | |
| "learning_rate": 9.983555555555556e-06, | |
| "loss": 2.4041, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 0.5965243725355741, | |
| "learning_rate": 9.978000000000002e-06, | |
| "loss": 2.406, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "eval_loss": 2.4214208126068115, | |
| "eval_runtime": 31.4633, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.7288147495415569, | |
| "learning_rate": 9.972444444444445e-06, | |
| "loss": 2.419, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "grad_norm": 0.6027052437896476, | |
| "learning_rate": 9.966888888888889e-06, | |
| "loss": 2.4149, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.1035, | |
| "grad_norm": 0.6351514057651396, | |
| "learning_rate": 9.961333333333334e-06, | |
| "loss": 2.4053, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.5912339833990681, | |
| "learning_rate": 9.95577777777778e-06, | |
| "loss": 2.4099, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "eval_loss": 2.4213571548461914, | |
| "eval_runtime": 31.7689, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.1045, | |
| "grad_norm": 0.6252419519280321, | |
| "learning_rate": 9.950222222222223e-06, | |
| "loss": 2.4044, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.5932871252062307, | |
| "learning_rate": 9.944666666666668e-06, | |
| "loss": 2.4041, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.1055, | |
| "grad_norm": 0.6265014889786313, | |
| "learning_rate": 9.939111111111112e-06, | |
| "loss": 2.4121, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 0.5586876350334784, | |
| "learning_rate": 9.933555555555557e-06, | |
| "loss": 2.4005, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "eval_loss": 2.4209611415863037, | |
| "eval_runtime": 31.4697, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.621, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.1065, | |
| "grad_norm": 0.6208578145519013, | |
| "learning_rate": 9.928e-06, | |
| "loss": 2.4095, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "grad_norm": 0.5761711209442947, | |
| "learning_rate": 9.922444444444446e-06, | |
| "loss": 2.411, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.6259961321288001, | |
| "learning_rate": 9.91688888888889e-06, | |
| "loss": 2.4062, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 0.6636296843455429, | |
| "learning_rate": 9.911333333333335e-06, | |
| "loss": 2.411, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "eval_loss": 2.420535087585449, | |
| "eval_runtime": 31.4447, | |
| "eval_samples_per_second": 3.244, | |
| "eval_steps_per_second": 1.622, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.1085, | |
| "grad_norm": 0.5977322049971575, | |
| "learning_rate": 9.905777777777778e-06, | |
| "loss": 2.4073, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "grad_norm": 0.605286836273461, | |
| "learning_rate": 9.900222222222223e-06, | |
| "loss": 2.4023, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.1095, | |
| "grad_norm": 0.6244785501127309, | |
| "learning_rate": 9.894666666666669e-06, | |
| "loss": 2.4084, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.6135442380195029, | |
| "learning_rate": 9.889111111111112e-06, | |
| "loss": 2.4068, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_loss": 2.4201102256774902, | |
| "eval_runtime": 31.806, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.1105, | |
| "grad_norm": 0.6260082123047037, | |
| "learning_rate": 9.883555555555556e-06, | |
| "loss": 2.4053, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "grad_norm": 0.5956336151974914, | |
| "learning_rate": 9.878000000000001e-06, | |
| "loss": 2.4152, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.1115, | |
| "grad_norm": 0.6149620176113736, | |
| "learning_rate": 9.872444444444446e-06, | |
| "loss": 2.4055, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.6326092489345128, | |
| "learning_rate": 9.86688888888889e-06, | |
| "loss": 2.3968, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "eval_loss": 2.420125722885132, | |
| "eval_runtime": 31.8082, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.6390446494212693, | |
| "learning_rate": 9.861333333333333e-06, | |
| "loss": 2.4045, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "grad_norm": 0.6670896967232433, | |
| "learning_rate": 9.855777777777779e-06, | |
| "loss": 2.4013, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.1135, | |
| "grad_norm": 0.6185087617978746, | |
| "learning_rate": 9.850222222222224e-06, | |
| "loss": 2.4015, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 0.6040525454825223, | |
| "learning_rate": 9.844666666666667e-06, | |
| "loss": 2.4109, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "eval_loss": 2.419764518737793, | |
| "eval_runtime": 31.7256, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.608, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.1145, | |
| "grad_norm": 0.6010942125132981, | |
| "learning_rate": 9.839111111111111e-06, | |
| "loss": 2.4092, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.602852118998295, | |
| "learning_rate": 9.833555555555556e-06, | |
| "loss": 2.414, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.1155, | |
| "grad_norm": 0.6189454944937772, | |
| "learning_rate": 9.828000000000001e-06, | |
| "loss": 2.4112, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 0.5871735622958322, | |
| "learning_rate": 9.822444444444445e-06, | |
| "loss": 2.3993, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "eval_loss": 2.419255495071411, | |
| "eval_runtime": 31.7146, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.1165, | |
| "grad_norm": 0.5719116548117884, | |
| "learning_rate": 9.81688888888889e-06, | |
| "loss": 2.4128, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "grad_norm": 0.5855276996729913, | |
| "learning_rate": 9.811333333333334e-06, | |
| "loss": 2.4127, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.5948413134062237, | |
| "learning_rate": 9.805777777777779e-06, | |
| "loss": 2.4028, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 0.6114053718118341, | |
| "learning_rate": 9.800222222222223e-06, | |
| "loss": 2.4085, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "eval_loss": 2.4192631244659424, | |
| "eval_runtime": 31.8221, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.603, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.1185, | |
| "grad_norm": 0.6171839632107143, | |
| "learning_rate": 9.794666666666668e-06, | |
| "loss": 2.4063, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "grad_norm": 0.5985426708940325, | |
| "learning_rate": 9.789111111111111e-06, | |
| "loss": 2.401, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.1195, | |
| "grad_norm": 0.6242757087701617, | |
| "learning_rate": 9.783555555555557e-06, | |
| "loss": 2.3977, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.6472329844420622, | |
| "learning_rate": 9.778e-06, | |
| "loss": 2.4066, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 2.4190170764923096, | |
| "eval_runtime": 31.973, | |
| "eval_samples_per_second": 3.19, | |
| "eval_steps_per_second": 1.595, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.1205, | |
| "grad_norm": 0.5979904516506753, | |
| "learning_rate": 9.772444444444445e-06, | |
| "loss": 2.4044, | |
| "step": 6025 | |
| }, | |
| { | |
| "epoch": 0.121, | |
| "grad_norm": 0.5980588594331456, | |
| "learning_rate": 9.76688888888889e-06, | |
| "loss": 2.41, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.1215, | |
| "grad_norm": 0.6344150039672136, | |
| "learning_rate": 9.761333333333334e-06, | |
| "loss": 2.4, | |
| "step": 6075 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "grad_norm": 0.6035110768502723, | |
| "learning_rate": 9.755777777777778e-06, | |
| "loss": 2.4148, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "eval_loss": 2.418259382247925, | |
| "eval_runtime": 31.784, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.605, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 0.5792932239951794, | |
| "learning_rate": 9.750222222222223e-06, | |
| "loss": 2.4061, | |
| "step": 6125 | |
| }, | |
| { | |
| "epoch": 0.123, | |
| "grad_norm": 0.6529554995007899, | |
| "learning_rate": 9.744666666666668e-06, | |
| "loss": 2.4036, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.1235, | |
| "grad_norm": 0.5946064726146467, | |
| "learning_rate": 9.739111111111112e-06, | |
| "loss": 2.4014, | |
| "step": 6175 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 0.5739473618849045, | |
| "learning_rate": 9.733555555555555e-06, | |
| "loss": 2.4057, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "eval_loss": 2.4179208278656006, | |
| "eval_runtime": 31.6981, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.1245, | |
| "grad_norm": 0.6907211114020956, | |
| "learning_rate": 9.728e-06, | |
| "loss": 2.393, | |
| "step": 6225 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.6225931887903327, | |
| "learning_rate": 9.722444444444446e-06, | |
| "loss": 2.4147, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.1255, | |
| "grad_norm": 0.568397246680531, | |
| "learning_rate": 9.71688888888889e-06, | |
| "loss": 2.4024, | |
| "step": 6275 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "grad_norm": 0.5842879344272728, | |
| "learning_rate": 9.711333333333333e-06, | |
| "loss": 2.404, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "eval_loss": 2.4178576469421387, | |
| "eval_runtime": 31.7994, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.1265, | |
| "grad_norm": 0.5805192382099048, | |
| "learning_rate": 9.705777777777778e-06, | |
| "loss": 2.4063, | |
| "step": 6325 | |
| }, | |
| { | |
| "epoch": 0.127, | |
| "grad_norm": 0.6600294122711824, | |
| "learning_rate": 9.700222222222224e-06, | |
| "loss": 2.4078, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 0.6263098682936462, | |
| "learning_rate": 9.694666666666667e-06, | |
| "loss": 2.3961, | |
| "step": 6375 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.6961912679129473, | |
| "learning_rate": 9.68911111111111e-06, | |
| "loss": 2.4127, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "eval_loss": 2.417247772216797, | |
| "eval_runtime": 31.7325, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.1285, | |
| "grad_norm": 0.6396950069271417, | |
| "learning_rate": 9.683555555555556e-06, | |
| "loss": 2.4041, | |
| "step": 6425 | |
| }, | |
| { | |
| "epoch": 0.129, | |
| "grad_norm": 0.6164180606933177, | |
| "learning_rate": 9.678000000000001e-06, | |
| "loss": 2.4, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.1295, | |
| "grad_norm": 0.6120640198257105, | |
| "learning_rate": 9.672444444444445e-06, | |
| "loss": 2.3966, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.6013045247718226, | |
| "learning_rate": 9.66688888888889e-06, | |
| "loss": 2.3991, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "eval_loss": 2.417280673980713, | |
| "eval_runtime": 31.8112, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.1305, | |
| "grad_norm": 0.6061836537875764, | |
| "learning_rate": 9.661333333333334e-06, | |
| "loss": 2.4161, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 0.131, | |
| "grad_norm": 0.6100864625060891, | |
| "learning_rate": 9.655777777777779e-06, | |
| "loss": 2.4052, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.1315, | |
| "grad_norm": 0.6932893052541476, | |
| "learning_rate": 9.650222222222222e-06, | |
| "loss": 2.4036, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 0.5859072202807338, | |
| "learning_rate": 9.644666666666668e-06, | |
| "loss": 2.4045, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "eval_loss": 2.416877031326294, | |
| "eval_runtime": 31.5203, | |
| "eval_samples_per_second": 3.236, | |
| "eval_steps_per_second": 1.618, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.579002436095642, | |
| "learning_rate": 9.639111111111113e-06, | |
| "loss": 2.4015, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 0.133, | |
| "grad_norm": 0.5968858601649685, | |
| "learning_rate": 9.633555555555556e-06, | |
| "loss": 2.3986, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.1335, | |
| "grad_norm": 0.5964714549861985, | |
| "learning_rate": 9.628e-06, | |
| "loss": 2.4062, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "grad_norm": 0.6126102944808797, | |
| "learning_rate": 9.622444444444445e-06, | |
| "loss": 2.4033, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "eval_loss": 2.4164350032806396, | |
| "eval_runtime": 31.4543, | |
| "eval_samples_per_second": 3.243, | |
| "eval_steps_per_second": 1.621, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.1345, | |
| "grad_norm": 0.5774452345333466, | |
| "learning_rate": 9.61688888888889e-06, | |
| "loss": 2.3997, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.6227260743975279, | |
| "learning_rate": 9.611333333333334e-06, | |
| "loss": 2.4018, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.1355, | |
| "grad_norm": 0.5846707991616706, | |
| "learning_rate": 9.605777777777778e-06, | |
| "loss": 2.3985, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 0.6172483484063671, | |
| "learning_rate": 9.600222222222223e-06, | |
| "loss": 2.4213, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "eval_loss": 2.41625714302063, | |
| "eval_runtime": 31.5517, | |
| "eval_samples_per_second": 3.233, | |
| "eval_steps_per_second": 1.616, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.1365, | |
| "grad_norm": 0.5965299711032601, | |
| "learning_rate": 9.594666666666668e-06, | |
| "loss": 2.3976, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 0.137, | |
| "grad_norm": 0.5884739304234496, | |
| "learning_rate": 9.589111111111112e-06, | |
| "loss": 2.3947, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.5737065693146471, | |
| "learning_rate": 9.583555555555555e-06, | |
| "loss": 2.3983, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "grad_norm": 0.6249698819825935, | |
| "learning_rate": 9.578e-06, | |
| "loss": 2.4008, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "eval_loss": 2.4156551361083984, | |
| "eval_runtime": 31.5071, | |
| "eval_samples_per_second": 3.237, | |
| "eval_steps_per_second": 1.619, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.1385, | |
| "grad_norm": 0.5930008566650997, | |
| "learning_rate": 9.572444444444446e-06, | |
| "loss": 2.3951, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 0.139, | |
| "grad_norm": 0.6564746022716046, | |
| "learning_rate": 9.56688888888889e-06, | |
| "loss": 2.4083, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.1395, | |
| "grad_norm": 0.611311960098376, | |
| "learning_rate": 9.561333333333333e-06, | |
| "loss": 2.4032, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.594692534551516, | |
| "learning_rate": 9.555777777777778e-06, | |
| "loss": 2.41, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "eval_loss": 2.415269374847412, | |
| "eval_runtime": 31.7535, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.1405, | |
| "grad_norm": 0.5975652527083385, | |
| "learning_rate": 9.550222222222223e-06, | |
| "loss": 2.398, | |
| "step": 7025 | |
| }, | |
| { | |
| "epoch": 0.141, | |
| "grad_norm": 0.5642285559875744, | |
| "learning_rate": 9.544666666666667e-06, | |
| "loss": 2.3907, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.1415, | |
| "grad_norm": 0.5977243463765347, | |
| "learning_rate": 9.539111111111112e-06, | |
| "loss": 2.4063, | |
| "step": 7075 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "grad_norm": 0.5938091922766982, | |
| "learning_rate": 9.533555555555556e-06, | |
| "loss": 2.4064, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "eval_loss": 2.4153244495391846, | |
| "eval_runtime": 31.6856, | |
| "eval_samples_per_second": 3.219, | |
| "eval_steps_per_second": 1.61, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.6203811817044198, | |
| "learning_rate": 9.528000000000001e-06, | |
| "loss": 2.3995, | |
| "step": 7125 | |
| }, | |
| { | |
| "epoch": 0.143, | |
| "grad_norm": 0.5748373728564159, | |
| "learning_rate": 9.522444444444444e-06, | |
| "loss": 2.4052, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.1435, | |
| "grad_norm": 0.6318360721408016, | |
| "learning_rate": 9.51688888888889e-06, | |
| "loss": 2.396, | |
| "step": 7175 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.5777480191110791, | |
| "learning_rate": 9.511333333333335e-06, | |
| "loss": 2.3966, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "eval_loss": 2.414691209793091, | |
| "eval_runtime": 31.5495, | |
| "eval_samples_per_second": 3.233, | |
| "eval_steps_per_second": 1.617, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.1445, | |
| "grad_norm": 0.5896122820881663, | |
| "learning_rate": 9.505777777777779e-06, | |
| "loss": 2.4018, | |
| "step": 7225 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.6081675838061575, | |
| "learning_rate": 9.500222222222222e-06, | |
| "loss": 2.4036, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.1455, | |
| "grad_norm": 0.6032973832585987, | |
| "learning_rate": 9.494666666666667e-06, | |
| "loss": 2.4025, | |
| "step": 7275 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "grad_norm": 0.6283775464354142, | |
| "learning_rate": 9.489111111111113e-06, | |
| "loss": 2.4078, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "eval_loss": 2.4143505096435547, | |
| "eval_runtime": 31.4643, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.1465, | |
| "grad_norm": 0.5969038728051346, | |
| "learning_rate": 9.483555555555556e-06, | |
| "loss": 2.4066, | |
| "step": 7325 | |
| }, | |
| { | |
| "epoch": 0.147, | |
| "grad_norm": 0.6048317665387537, | |
| "learning_rate": 9.478e-06, | |
| "loss": 2.4007, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 0.5721050600021237, | |
| "learning_rate": 9.472444444444445e-06, | |
| "loss": 2.4146, | |
| "step": 7375 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 0.6019256818391423, | |
| "learning_rate": 9.46688888888889e-06, | |
| "loss": 2.399, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "eval_loss": 2.414281129837036, | |
| "eval_runtime": 31.7034, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.609, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.1485, | |
| "grad_norm": 0.6386043502919573, | |
| "learning_rate": 9.461333333333334e-06, | |
| "loss": 2.3957, | |
| "step": 7425 | |
| }, | |
| { | |
| "epoch": 0.149, | |
| "grad_norm": 0.5819226766027404, | |
| "learning_rate": 9.455777777777777e-06, | |
| "loss": 2.4001, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.1495, | |
| "grad_norm": 0.6372396676223023, | |
| "learning_rate": 9.450222222222223e-06, | |
| "loss": 2.3976, | |
| "step": 7475 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.5888017578283452, | |
| "learning_rate": 9.444666666666668e-06, | |
| "loss": 2.4008, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_loss": 2.414154291152954, | |
| "eval_runtime": 31.8152, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.1505, | |
| "grad_norm": 0.6132781564549638, | |
| "learning_rate": 9.439111111111111e-06, | |
| "loss": 2.4077, | |
| "step": 7525 | |
| }, | |
| { | |
| "epoch": 0.151, | |
| "grad_norm": 0.6063002641957036, | |
| "learning_rate": 9.433555555555557e-06, | |
| "loss": 2.3889, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.1515, | |
| "grad_norm": 0.614169638364484, | |
| "learning_rate": 9.428e-06, | |
| "loss": 2.4121, | |
| "step": 7575 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.5826866596297434, | |
| "learning_rate": 9.422444444444445e-06, | |
| "loss": 2.4075, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "eval_loss": 2.414039134979248, | |
| "eval_runtime": 31.7985, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.5964985955677213, | |
| "learning_rate": 9.41688888888889e-06, | |
| "loss": 2.3976, | |
| "step": 7625 | |
| }, | |
| { | |
| "epoch": 0.153, | |
| "grad_norm": 0.5946671745059025, | |
| "learning_rate": 9.411333333333334e-06, | |
| "loss": 2.3947, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.1535, | |
| "grad_norm": 0.5894909865358033, | |
| "learning_rate": 9.405777777777778e-06, | |
| "loss": 2.4079, | |
| "step": 7675 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "grad_norm": 0.6048420481174572, | |
| "learning_rate": 9.400222222222223e-06, | |
| "loss": 2.4015, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "eval_loss": 2.413475275039673, | |
| "eval_runtime": 31.9136, | |
| "eval_samples_per_second": 3.196, | |
| "eval_steps_per_second": 1.598, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.1545, | |
| "grad_norm": 0.617559481688582, | |
| "learning_rate": 9.394666666666668e-06, | |
| "loss": 2.4036, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.6350332331451685, | |
| "learning_rate": 9.389111111111112e-06, | |
| "loss": 2.3989, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.1555, | |
| "grad_norm": 0.6034892604414784, | |
| "learning_rate": 9.383555555555557e-06, | |
| "loss": 2.398, | |
| "step": 7775 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 0.5879016941841427, | |
| "learning_rate": 9.378e-06, | |
| "loss": 2.3989, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "eval_loss": 2.4134128093719482, | |
| "eval_runtime": 31.7809, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.605, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.1565, | |
| "grad_norm": 0.5957060592966067, | |
| "learning_rate": 9.372444444444446e-06, | |
| "loss": 2.3951, | |
| "step": 7825 | |
| }, | |
| { | |
| "epoch": 0.157, | |
| "grad_norm": 0.6127788552445546, | |
| "learning_rate": 9.36688888888889e-06, | |
| "loss": 2.3966, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.6103495429829666, | |
| "learning_rate": 9.361333333333335e-06, | |
| "loss": 2.3974, | |
| "step": 7875 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "grad_norm": 0.5940303847498369, | |
| "learning_rate": 9.355777777777778e-06, | |
| "loss": 2.3982, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "eval_loss": 2.4130520820617676, | |
| "eval_runtime": 31.8718, | |
| "eval_samples_per_second": 3.2, | |
| "eval_steps_per_second": 1.6, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.1585, | |
| "grad_norm": 0.5967208318826438, | |
| "learning_rate": 9.350222222222224e-06, | |
| "loss": 2.3963, | |
| "step": 7925 | |
| }, | |
| { | |
| "epoch": 0.159, | |
| "grad_norm": 0.6074697420049116, | |
| "learning_rate": 9.344666666666667e-06, | |
| "loss": 2.4004, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.1595, | |
| "grad_norm": 0.6007548308453654, | |
| "learning_rate": 9.339111111111112e-06, | |
| "loss": 2.3972, | |
| "step": 7975 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.6058573477149505, | |
| "learning_rate": 9.333555555555558e-06, | |
| "loss": 2.4, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 2.4125914573669434, | |
| "eval_runtime": 31.8819, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 1.6, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.1605, | |
| "grad_norm": 0.5861319558312379, | |
| "learning_rate": 9.328000000000001e-06, | |
| "loss": 2.3883, | |
| "step": 8025 | |
| }, | |
| { | |
| "epoch": 0.161, | |
| "grad_norm": 0.5836976562991806, | |
| "learning_rate": 9.322444444444445e-06, | |
| "loss": 2.3858, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.1615, | |
| "grad_norm": 0.5844356099514875, | |
| "learning_rate": 9.31688888888889e-06, | |
| "loss": 2.408, | |
| "step": 8075 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "grad_norm": 0.5898038882596441, | |
| "learning_rate": 9.311333333333335e-06, | |
| "loss": 2.3979, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "eval_loss": 2.4123263359069824, | |
| "eval_runtime": 31.7798, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.6072648398087778, | |
| "learning_rate": 9.305777777777779e-06, | |
| "loss": 2.3904, | |
| "step": 8125 | |
| }, | |
| { | |
| "epoch": 0.163, | |
| "grad_norm": 0.5947190221089934, | |
| "learning_rate": 9.300222222222222e-06, | |
| "loss": 2.3908, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.1635, | |
| "grad_norm": 0.5923294532719955, | |
| "learning_rate": 9.294666666666668e-06, | |
| "loss": 2.3994, | |
| "step": 8175 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 0.6238957997579533, | |
| "learning_rate": 9.289111111111113e-06, | |
| "loss": 2.3935, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "eval_loss": 2.4118340015411377, | |
| "eval_runtime": 31.8145, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.1645, | |
| "grad_norm": 0.576622489198895, | |
| "learning_rate": 9.283555555555556e-06, | |
| "loss": 2.396, | |
| "step": 8225 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.6185118704471244, | |
| "learning_rate": 9.278e-06, | |
| "loss": 2.4035, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.1655, | |
| "grad_norm": 0.5796535805449304, | |
| "learning_rate": 9.272444444444445e-06, | |
| "loss": 2.3943, | |
| "step": 8275 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "grad_norm": 0.6173375014397958, | |
| "learning_rate": 9.26688888888889e-06, | |
| "loss": 2.3935, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "eval_loss": 2.4114973545074463, | |
| "eval_runtime": 31.7754, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.1665, | |
| "grad_norm": 0.5618534321843206, | |
| "learning_rate": 9.261333333333334e-06, | |
| "loss": 2.3974, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 0.167, | |
| "grad_norm": 0.6009214777241336, | |
| "learning_rate": 9.25577777777778e-06, | |
| "loss": 2.4, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 0.5772198441104387, | |
| "learning_rate": 9.250222222222223e-06, | |
| "loss": 2.3991, | |
| "step": 8375 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 0.5740163940994337, | |
| "learning_rate": 9.244666666666668e-06, | |
| "loss": 2.3947, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "eval_loss": 2.411425828933716, | |
| "eval_runtime": 31.5099, | |
| "eval_samples_per_second": 3.237, | |
| "eval_steps_per_second": 1.619, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.1685, | |
| "grad_norm": 0.5687873679002051, | |
| "learning_rate": 9.239111111111112e-06, | |
| "loss": 2.3966, | |
| "step": 8425 | |
| }, | |
| { | |
| "epoch": 0.169, | |
| "grad_norm": 0.5610136891748577, | |
| "learning_rate": 9.233555555555557e-06, | |
| "loss": 2.3998, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.1695, | |
| "grad_norm": 0.6032713755890403, | |
| "learning_rate": 9.228e-06, | |
| "loss": 2.3943, | |
| "step": 8475 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.5964144518891603, | |
| "learning_rate": 9.222444444444446e-06, | |
| "loss": 2.3883, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_loss": 2.411017656326294, | |
| "eval_runtime": 31.5307, | |
| "eval_samples_per_second": 3.235, | |
| "eval_steps_per_second": 1.617, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.1705, | |
| "grad_norm": 0.6150332993234658, | |
| "learning_rate": 9.21688888888889e-06, | |
| "loss": 2.3947, | |
| "step": 8525 | |
| }, | |
| { | |
| "epoch": 0.171, | |
| "grad_norm": 0.5996705331900282, | |
| "learning_rate": 9.211333333333334e-06, | |
| "loss": 2.3767, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.1715, | |
| "grad_norm": 0.5824632831455251, | |
| "learning_rate": 9.20577777777778e-06, | |
| "loss": 2.3872, | |
| "step": 8575 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 0.606207861483595, | |
| "learning_rate": 9.200222222222223e-06, | |
| "loss": 2.4039, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "eval_loss": 2.4107751846313477, | |
| "eval_runtime": 31.4387, | |
| "eval_samples_per_second": 3.244, | |
| "eval_steps_per_second": 1.622, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 0.576823131255562, | |
| "learning_rate": 9.194666666666667e-06, | |
| "loss": 2.3954, | |
| "step": 8625 | |
| }, | |
| { | |
| "epoch": 0.173, | |
| "grad_norm": 0.56597712239854, | |
| "learning_rate": 9.189111111111112e-06, | |
| "loss": 2.4072, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.1735, | |
| "grad_norm": 0.5825959007699376, | |
| "learning_rate": 9.183555555555557e-06, | |
| "loss": 2.4081, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "grad_norm": 0.5776918671405765, | |
| "learning_rate": 9.178000000000001e-06, | |
| "loss": 2.4091, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "eval_loss": 2.410761594772339, | |
| "eval_runtime": 31.7246, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.608, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1745, | |
| "grad_norm": 0.6256369047041809, | |
| "learning_rate": 9.172444444444444e-06, | |
| "loss": 2.3953, | |
| "step": 8725 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.5964709475887552, | |
| "learning_rate": 9.16688888888889e-06, | |
| "loss": 2.39, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.1755, | |
| "grad_norm": 0.5775755843795828, | |
| "learning_rate": 9.161333333333335e-06, | |
| "loss": 2.391, | |
| "step": 8775 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.6655706627980364, | |
| "learning_rate": 9.155777777777779e-06, | |
| "loss": 2.4048, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "eval_loss": 2.4105958938598633, | |
| "eval_runtime": 31.4248, | |
| "eval_samples_per_second": 3.246, | |
| "eval_steps_per_second": 1.623, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.1765, | |
| "grad_norm": 0.5865172878151053, | |
| "learning_rate": 9.150222222222222e-06, | |
| "loss": 2.3878, | |
| "step": 8825 | |
| }, | |
| { | |
| "epoch": 0.177, | |
| "grad_norm": 0.584391124965856, | |
| "learning_rate": 9.144666666666667e-06, | |
| "loss": 2.401, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 0.5726598382185046, | |
| "learning_rate": 9.139111111111113e-06, | |
| "loss": 2.4018, | |
| "step": 8875 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "grad_norm": 0.5690725395770588, | |
| "learning_rate": 9.133555555555556e-06, | |
| "loss": 2.4034, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "eval_loss": 2.4101033210754395, | |
| "eval_runtime": 31.4686, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.621, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.1785, | |
| "grad_norm": 0.5978143013011991, | |
| "learning_rate": 9.128e-06, | |
| "loss": 2.4014, | |
| "step": 8925 | |
| }, | |
| { | |
| "epoch": 0.179, | |
| "grad_norm": 0.6085180927490662, | |
| "learning_rate": 9.122444444444445e-06, | |
| "loss": 2.3924, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.1795, | |
| "grad_norm": 0.5720265034599029, | |
| "learning_rate": 9.11688888888889e-06, | |
| "loss": 2.3977, | |
| "step": 8975 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.5739306861609581, | |
| "learning_rate": 9.111333333333334e-06, | |
| "loss": 2.3992, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "eval_loss": 2.410008430480957, | |
| "eval_runtime": 32.192, | |
| "eval_samples_per_second": 3.168, | |
| "eval_steps_per_second": 1.584, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.1805, | |
| "grad_norm": 0.5908878679870805, | |
| "learning_rate": 9.105777777777779e-06, | |
| "loss": 2.3938, | |
| "step": 9025 | |
| }, | |
| { | |
| "epoch": 0.181, | |
| "grad_norm": 0.5496267273049, | |
| "learning_rate": 9.100222222222223e-06, | |
| "loss": 2.3961, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.1815, | |
| "grad_norm": 0.5979695738071065, | |
| "learning_rate": 9.094666666666668e-06, | |
| "loss": 2.3858, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 0.182, | |
| "grad_norm": 0.5938166893318079, | |
| "learning_rate": 9.089111111111111e-06, | |
| "loss": 2.3862, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.182, | |
| "eval_loss": 2.410053253173828, | |
| "eval_runtime": 32.2577, | |
| "eval_samples_per_second": 3.162, | |
| "eval_steps_per_second": 1.581, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 0.5958942390294701, | |
| "learning_rate": 9.083555555555557e-06, | |
| "loss": 2.3928, | |
| "step": 9125 | |
| }, | |
| { | |
| "epoch": 0.183, | |
| "grad_norm": 0.5859164810125311, | |
| "learning_rate": 9.078000000000002e-06, | |
| "loss": 2.4022, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.1835, | |
| "grad_norm": 0.5798241289951321, | |
| "learning_rate": 9.072444444444445e-06, | |
| "loss": 2.3928, | |
| "step": 9175 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 0.5882407091400851, | |
| "learning_rate": 9.066888888888889e-06, | |
| "loss": 2.3973, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "eval_loss": 2.409634590148926, | |
| "eval_runtime": 32.249, | |
| "eval_samples_per_second": 3.163, | |
| "eval_steps_per_second": 1.581, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.1845, | |
| "grad_norm": 0.5903772748051019, | |
| "learning_rate": 9.061333333333334e-06, | |
| "loss": 2.3831, | |
| "step": 9225 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.6211646089814673, | |
| "learning_rate": 9.05577777777778e-06, | |
| "loss": 2.3983, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.1855, | |
| "grad_norm": 0.6172378815389531, | |
| "learning_rate": 9.050222222222223e-06, | |
| "loss": 2.3961, | |
| "step": 9275 | |
| }, | |
| { | |
| "epoch": 0.186, | |
| "grad_norm": 0.6117693503941964, | |
| "learning_rate": 9.044666666666667e-06, | |
| "loss": 2.3991, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.186, | |
| "eval_loss": 2.4100780487060547, | |
| "eval_runtime": 31.6698, | |
| "eval_samples_per_second": 3.221, | |
| "eval_steps_per_second": 1.61, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.1865, | |
| "grad_norm": 0.5955035334939845, | |
| "learning_rate": 9.039111111111112e-06, | |
| "loss": 2.4013, | |
| "step": 9325 | |
| }, | |
| { | |
| "epoch": 0.187, | |
| "grad_norm": 0.6304889803867978, | |
| "learning_rate": 9.033555555555557e-06, | |
| "loss": 2.4045, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.5650857479280212, | |
| "learning_rate": 9.028e-06, | |
| "loss": 2.3993, | |
| "step": 9375 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": 0.6102368092141387, | |
| "learning_rate": 9.022444444444444e-06, | |
| "loss": 2.3969, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "eval_loss": 2.4091312885284424, | |
| "eval_runtime": 31.7427, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.607, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.1885, | |
| "grad_norm": 0.5647006274355659, | |
| "learning_rate": 9.01688888888889e-06, | |
| "loss": 2.3962, | |
| "step": 9425 | |
| }, | |
| { | |
| "epoch": 0.189, | |
| "grad_norm": 0.639478683787589, | |
| "learning_rate": 9.011333333333335e-06, | |
| "loss": 2.3957, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.1895, | |
| "grad_norm": 0.5788568545073746, | |
| "learning_rate": 9.005777777777778e-06, | |
| "loss": 2.3914, | |
| "step": 9475 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.7290164754099147, | |
| "learning_rate": 9.000222222222222e-06, | |
| "loss": 2.386, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "eval_loss": 2.4086694717407227, | |
| "eval_runtime": 31.8061, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.1905, | |
| "grad_norm": 0.5817637514180484, | |
| "learning_rate": 8.994666666666667e-06, | |
| "loss": 2.4006, | |
| "step": 9525 | |
| }, | |
| { | |
| "epoch": 0.191, | |
| "grad_norm": 0.5697879107784812, | |
| "learning_rate": 8.989111111111112e-06, | |
| "loss": 2.3899, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.1915, | |
| "grad_norm": 0.584610269954786, | |
| "learning_rate": 8.983555555555556e-06, | |
| "loss": 2.3944, | |
| "step": 9575 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.608795413325502, | |
| "learning_rate": 8.978000000000001e-06, | |
| "loss": 2.398, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "eval_loss": 2.408263683319092, | |
| "eval_runtime": 31.6859, | |
| "eval_samples_per_second": 3.219, | |
| "eval_steps_per_second": 1.61, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 0.5915130204472873, | |
| "learning_rate": 8.972444444444445e-06, | |
| "loss": 2.407, | |
| "step": 9625 | |
| }, | |
| { | |
| "epoch": 0.193, | |
| "grad_norm": 0.59521034646126, | |
| "learning_rate": 8.96688888888889e-06, | |
| "loss": 2.3924, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.1935, | |
| "grad_norm": 0.6050238690396914, | |
| "learning_rate": 8.961333333333333e-06, | |
| "loss": 2.3869, | |
| "step": 9675 | |
| }, | |
| { | |
| "epoch": 0.194, | |
| "grad_norm": 0.5691067223521449, | |
| "learning_rate": 8.955777777777779e-06, | |
| "loss": 2.3874, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.194, | |
| "eval_loss": 2.408264398574829, | |
| "eval_runtime": 31.8579, | |
| "eval_samples_per_second": 3.202, | |
| "eval_steps_per_second": 1.601, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.1945, | |
| "grad_norm": 0.5753054034666798, | |
| "learning_rate": 8.950222222222224e-06, | |
| "loss": 2.4027, | |
| "step": 9725 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.5864767839913545, | |
| "learning_rate": 8.944666666666668e-06, | |
| "loss": 2.3924, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.1955, | |
| "grad_norm": 0.6642807256080032, | |
| "learning_rate": 8.939111111111111e-06, | |
| "loss": 2.3709, | |
| "step": 9775 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": 0.6084139101409156, | |
| "learning_rate": 8.933555555555556e-06, | |
| "loss": 2.3958, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "eval_loss": 2.4076178073883057, | |
| "eval_runtime": 31.7733, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.1965, | |
| "grad_norm": 0.592206064244208, | |
| "learning_rate": 8.928000000000002e-06, | |
| "loss": 2.3922, | |
| "step": 9825 | |
| }, | |
| { | |
| "epoch": 0.197, | |
| "grad_norm": 0.5685236067589632, | |
| "learning_rate": 8.922444444444445e-06, | |
| "loss": 2.3908, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 0.6034821273699428, | |
| "learning_rate": 8.916888888888889e-06, | |
| "loss": 2.3903, | |
| "step": 9875 | |
| }, | |
| { | |
| "epoch": 0.198, | |
| "grad_norm": 0.5910198540350765, | |
| "learning_rate": 8.911333333333334e-06, | |
| "loss": 2.3767, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.198, | |
| "eval_loss": 2.407928705215454, | |
| "eval_runtime": 31.7033, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.609, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.1985, | |
| "grad_norm": 0.5849079897115376, | |
| "learning_rate": 8.90577777777778e-06, | |
| "loss": 2.3956, | |
| "step": 9925 | |
| }, | |
| { | |
| "epoch": 0.199, | |
| "grad_norm": 0.5683901924605945, | |
| "learning_rate": 8.900222222222223e-06, | |
| "loss": 2.3884, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.1995, | |
| "grad_norm": 0.6037241225699064, | |
| "learning_rate": 8.894666666666666e-06, | |
| "loss": 2.3934, | |
| "step": 9975 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.5807810374364664, | |
| "learning_rate": 8.889111111111112e-06, | |
| "loss": 2.3999, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 2.40779447555542, | |
| "eval_runtime": 31.7288, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.607, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.2005, | |
| "grad_norm": 0.5959223333719136, | |
| "learning_rate": 8.883555555555557e-06, | |
| "loss": 2.387, | |
| "step": 10025 | |
| }, | |
| { | |
| "epoch": 0.201, | |
| "grad_norm": 0.604008744038432, | |
| "learning_rate": 8.878e-06, | |
| "loss": 2.4016, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.2015, | |
| "grad_norm": 0.5721419521050413, | |
| "learning_rate": 8.872444444444444e-06, | |
| "loss": 2.3884, | |
| "step": 10075 | |
| }, | |
| { | |
| "epoch": 0.202, | |
| "grad_norm": 0.5986167284289824, | |
| "learning_rate": 8.86688888888889e-06, | |
| "loss": 2.3945, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.202, | |
| "eval_loss": 2.4074654579162598, | |
| "eval_runtime": 31.8658, | |
| "eval_samples_per_second": 3.201, | |
| "eval_steps_per_second": 1.6, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 0.6046479507995179, | |
| "learning_rate": 8.861333333333334e-06, | |
| "loss": 2.3858, | |
| "step": 10125 | |
| }, | |
| { | |
| "epoch": 0.203, | |
| "grad_norm": 0.5633013817443194, | |
| "learning_rate": 8.855777777777778e-06, | |
| "loss": 2.3879, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.2035, | |
| "grad_norm": 0.5953174401982892, | |
| "learning_rate": 8.850222222222223e-06, | |
| "loss": 2.3967, | |
| "step": 10175 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": 0.6306212647705982, | |
| "learning_rate": 8.844666666666667e-06, | |
| "loss": 2.3927, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "eval_loss": 2.407031297683716, | |
| "eval_runtime": 31.7801, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.2045, | |
| "grad_norm": 0.5605617492602121, | |
| "learning_rate": 8.839111111111112e-06, | |
| "loss": 2.4081, | |
| "step": 10225 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 0.5739246143474902, | |
| "learning_rate": 8.833555555555556e-06, | |
| "loss": 2.3841, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.2055, | |
| "grad_norm": 0.5938549959471341, | |
| "learning_rate": 8.828000000000001e-06, | |
| "loss": 2.3902, | |
| "step": 10275 | |
| }, | |
| { | |
| "epoch": 0.206, | |
| "grad_norm": 0.5902936931354175, | |
| "learning_rate": 8.822444444444446e-06, | |
| "loss": 2.3905, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.206, | |
| "eval_loss": 2.4066004753112793, | |
| "eval_runtime": 31.7707, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.2065, | |
| "grad_norm": 0.5697435057211838, | |
| "learning_rate": 8.81688888888889e-06, | |
| "loss": 2.3854, | |
| "step": 10325 | |
| }, | |
| { | |
| "epoch": 0.207, | |
| "grad_norm": 0.5879126074250441, | |
| "learning_rate": 8.811333333333333e-06, | |
| "loss": 2.3917, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 0.5800642153182343, | |
| "learning_rate": 8.805777777777778e-06, | |
| "loss": 2.3929, | |
| "step": 10375 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.5794546973922929, | |
| "learning_rate": 8.800222222222224e-06, | |
| "loss": 2.3912, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "eval_loss": 2.4065024852752686, | |
| "eval_runtime": 31.7191, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.2085, | |
| "grad_norm": 0.5776454190712899, | |
| "learning_rate": 8.794666666666667e-06, | |
| "loss": 2.386, | |
| "step": 10425 | |
| }, | |
| { | |
| "epoch": 0.209, | |
| "grad_norm": 0.5578455228918948, | |
| "learning_rate": 8.78911111111111e-06, | |
| "loss": 2.3869, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.2095, | |
| "grad_norm": 0.5721674793656858, | |
| "learning_rate": 8.783555555555556e-06, | |
| "loss": 2.3779, | |
| "step": 10475 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.5950633442730316, | |
| "learning_rate": 8.778000000000001e-06, | |
| "loss": 2.3845, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "eval_loss": 2.4065566062927246, | |
| "eval_runtime": 31.8091, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.2105, | |
| "grad_norm": 0.605078293663896, | |
| "learning_rate": 8.772444444444445e-06, | |
| "loss": 2.3913, | |
| "step": 10525 | |
| }, | |
| { | |
| "epoch": 0.211, | |
| "grad_norm": 0.567849892850204, | |
| "learning_rate": 8.766888888888888e-06, | |
| "loss": 2.3966, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.2115, | |
| "grad_norm": 0.6876645024191659, | |
| "learning_rate": 8.761333333333334e-06, | |
| "loss": 2.3993, | |
| "step": 10575 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": 0.5841938304908528, | |
| "learning_rate": 8.755777777777779e-06, | |
| "loss": 2.3916, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "eval_loss": 2.4061877727508545, | |
| "eval_runtime": 31.8484, | |
| "eval_samples_per_second": 3.203, | |
| "eval_steps_per_second": 1.601, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.5649004204666818, | |
| "learning_rate": 8.750222222222223e-06, | |
| "loss": 2.381, | |
| "step": 10625 | |
| }, | |
| { | |
| "epoch": 0.213, | |
| "grad_norm": 0.5678489376050115, | |
| "learning_rate": 8.744666666666666e-06, | |
| "loss": 2.3995, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.2135, | |
| "grad_norm": 0.5712733595317334, | |
| "learning_rate": 8.739111111111111e-06, | |
| "loss": 2.3954, | |
| "step": 10675 | |
| }, | |
| { | |
| "epoch": 0.214, | |
| "grad_norm": 0.573353636066434, | |
| "learning_rate": 8.733555555555557e-06, | |
| "loss": 2.379, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.214, | |
| "eval_loss": 2.4055771827697754, | |
| "eval_runtime": 31.8192, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.2145, | |
| "grad_norm": 0.6133309651928519, | |
| "learning_rate": 8.728e-06, | |
| "loss": 2.3946, | |
| "step": 10725 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.6033931866035528, | |
| "learning_rate": 8.722444444444445e-06, | |
| "loss": 2.3935, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.2155, | |
| "grad_norm": 0.6008672136487845, | |
| "learning_rate": 8.716888888888889e-06, | |
| "loss": 2.3872, | |
| "step": 10775 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 0.5728704483928734, | |
| "learning_rate": 8.711333333333334e-06, | |
| "loss": 2.3917, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "eval_loss": 2.4059016704559326, | |
| "eval_runtime": 31.7995, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.2165, | |
| "grad_norm": 0.5888944153423502, | |
| "learning_rate": 8.705777777777778e-06, | |
| "loss": 2.3946, | |
| "step": 10825 | |
| }, | |
| { | |
| "epoch": 0.217, | |
| "grad_norm": 0.5947880979306366, | |
| "learning_rate": 8.700222222222223e-06, | |
| "loss": 2.3736, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 0.6163696606959644, | |
| "learning_rate": 8.694666666666668e-06, | |
| "loss": 2.3838, | |
| "step": 10875 | |
| }, | |
| { | |
| "epoch": 0.218, | |
| "grad_norm": 0.6004092938812543, | |
| "learning_rate": 8.689111111111112e-06, | |
| "loss": 2.3942, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.218, | |
| "eval_loss": 2.4055566787719727, | |
| "eval_runtime": 31.7386, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.2185, | |
| "grad_norm": 0.5786273641598245, | |
| "learning_rate": 8.683555555555555e-06, | |
| "loss": 2.3938, | |
| "step": 10925 | |
| }, | |
| { | |
| "epoch": 0.219, | |
| "grad_norm": 0.5764162885826465, | |
| "learning_rate": 8.678e-06, | |
| "loss": 2.3939, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.2195, | |
| "grad_norm": 0.5923291223123188, | |
| "learning_rate": 8.672444444444446e-06, | |
| "loss": 2.3847, | |
| "step": 10975 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.6102815146568634, | |
| "learning_rate": 8.66688888888889e-06, | |
| "loss": 2.3901, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_loss": 2.405616044998169, | |
| "eval_runtime": 31.7048, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.609, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.2205, | |
| "grad_norm": 0.619520274382602, | |
| "learning_rate": 8.661333333333335e-06, | |
| "loss": 2.3868, | |
| "step": 11025 | |
| }, | |
| { | |
| "epoch": 0.221, | |
| "grad_norm": 0.5973378822756289, | |
| "learning_rate": 8.655777777777778e-06, | |
| "loss": 2.398, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.2215, | |
| "grad_norm": 0.6143187669490118, | |
| "learning_rate": 8.650222222222223e-06, | |
| "loss": 2.387, | |
| "step": 11075 | |
| }, | |
| { | |
| "epoch": 0.222, | |
| "grad_norm": 0.5804040103557917, | |
| "learning_rate": 8.644666666666669e-06, | |
| "loss": 2.3951, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.222, | |
| "eval_loss": 2.4050545692443848, | |
| "eval_runtime": 31.7713, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 0.5833158956225722, | |
| "learning_rate": 8.639111111111112e-06, | |
| "loss": 2.3854, | |
| "step": 11125 | |
| }, | |
| { | |
| "epoch": 0.223, | |
| "grad_norm": 0.5741811771851818, | |
| "learning_rate": 8.633555555555556e-06, | |
| "loss": 2.3866, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.2235, | |
| "grad_norm": 0.5856955103294486, | |
| "learning_rate": 8.628000000000001e-06, | |
| "loss": 2.4058, | |
| "step": 11175 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.5685596699989746, | |
| "learning_rate": 8.622444444444446e-06, | |
| "loss": 2.3953, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "eval_loss": 2.4051928520202637, | |
| "eval_runtime": 35.481, | |
| "eval_samples_per_second": 2.875, | |
| "eval_steps_per_second": 1.437, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.2245, | |
| "grad_norm": 0.5854297741723825, | |
| "learning_rate": 8.61688888888889e-06, | |
| "loss": 2.3977, | |
| "step": 11225 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.582929503102295, | |
| "learning_rate": 8.611333333333333e-06, | |
| "loss": 2.3948, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.2255, | |
| "grad_norm": 0.5839207937169353, | |
| "learning_rate": 8.605777777777779e-06, | |
| "loss": 2.4104, | |
| "step": 11275 | |
| }, | |
| { | |
| "epoch": 0.226, | |
| "grad_norm": 0.5568849917729087, | |
| "learning_rate": 8.600222222222224e-06, | |
| "loss": 2.4011, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.226, | |
| "eval_loss": 2.404717445373535, | |
| "eval_runtime": 31.9835, | |
| "eval_samples_per_second": 3.189, | |
| "eval_steps_per_second": 1.595, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.2265, | |
| "grad_norm": 0.5549969270675909, | |
| "learning_rate": 8.594666666666668e-06, | |
| "loss": 2.3965, | |
| "step": 11325 | |
| }, | |
| { | |
| "epoch": 0.227, | |
| "grad_norm": 0.5606539732290856, | |
| "learning_rate": 8.589111111111111e-06, | |
| "loss": 2.3921, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.2275, | |
| "grad_norm": 0.5626929771754517, | |
| "learning_rate": 8.583555555555556e-06, | |
| "loss": 2.3912, | |
| "step": 11375 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": 0.5731631708828652, | |
| "learning_rate": 8.578000000000002e-06, | |
| "loss": 2.3926, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "eval_loss": 2.4047322273254395, | |
| "eval_runtime": 31.8245, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.603, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.2285, | |
| "grad_norm": 0.5661654100374769, | |
| "learning_rate": 8.572444444444445e-06, | |
| "loss": 2.3951, | |
| "step": 11425 | |
| }, | |
| { | |
| "epoch": 0.229, | |
| "grad_norm": 0.5602181256620924, | |
| "learning_rate": 8.56688888888889e-06, | |
| "loss": 2.3812, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.2295, | |
| "grad_norm": 0.5950733473289397, | |
| "learning_rate": 8.561333333333334e-06, | |
| "loss": 2.3963, | |
| "step": 11475 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.5733938863696743, | |
| "learning_rate": 8.55577777777778e-06, | |
| "loss": 2.3932, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "eval_loss": 2.403830051422119, | |
| "eval_runtime": 31.7862, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.2305, | |
| "grad_norm": 0.5702512759518216, | |
| "learning_rate": 8.550222222222223e-06, | |
| "loss": 2.3824, | |
| "step": 11525 | |
| }, | |
| { | |
| "epoch": 0.231, | |
| "grad_norm": 0.5749933738625221, | |
| "learning_rate": 8.544666666666668e-06, | |
| "loss": 2.3674, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.2315, | |
| "grad_norm": 0.563814842108926, | |
| "learning_rate": 8.539111111111112e-06, | |
| "loss": 2.3866, | |
| "step": 11575 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 0.601764608458657, | |
| "learning_rate": 8.533555555555557e-06, | |
| "loss": 2.3949, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "eval_loss": 2.4035561084747314, | |
| "eval_runtime": 31.7077, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.608, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.2325, | |
| "grad_norm": 0.5674229084100237, | |
| "learning_rate": 8.528e-06, | |
| "loss": 2.3782, | |
| "step": 11625 | |
| }, | |
| { | |
| "epoch": 0.233, | |
| "grad_norm": 0.5660025767055805, | |
| "learning_rate": 8.522444444444446e-06, | |
| "loss": 2.3811, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.2335, | |
| "grad_norm": 0.5776196117388842, | |
| "learning_rate": 8.51688888888889e-06, | |
| "loss": 2.3964, | |
| "step": 11675 | |
| }, | |
| { | |
| "epoch": 0.234, | |
| "grad_norm": 0.5815076886720436, | |
| "learning_rate": 8.511333333333334e-06, | |
| "loss": 2.3907, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.234, | |
| "eval_loss": 2.4035725593566895, | |
| "eval_runtime": 31.7541, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.2345, | |
| "grad_norm": 0.5810635532925048, | |
| "learning_rate": 8.505777777777778e-06, | |
| "loss": 2.3921, | |
| "step": 11725 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 0.5635380257098753, | |
| "learning_rate": 8.500222222222223e-06, | |
| "loss": 2.4062, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.2355, | |
| "grad_norm": 0.5985004911332629, | |
| "learning_rate": 8.494666666666668e-06, | |
| "loss": 2.3853, | |
| "step": 11775 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": 0.580078413647693, | |
| "learning_rate": 8.489111111111112e-06, | |
| "loss": 2.3826, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "eval_loss": 2.403505325317383, | |
| "eval_runtime": 31.7265, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.607, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.2365, | |
| "grad_norm": 0.5560334145179444, | |
| "learning_rate": 8.483555555555556e-06, | |
| "loss": 2.3829, | |
| "step": 11825 | |
| }, | |
| { | |
| "epoch": 0.237, | |
| "grad_norm": 0.5870934042209253, | |
| "learning_rate": 8.478e-06, | |
| "loss": 2.374, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 0.5745342448568999, | |
| "learning_rate": 8.472444444444446e-06, | |
| "loss": 2.3797, | |
| "step": 11875 | |
| }, | |
| { | |
| "epoch": 0.238, | |
| "grad_norm": 0.5676573173578097, | |
| "learning_rate": 8.46688888888889e-06, | |
| "loss": 2.3867, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.238, | |
| "eval_loss": 2.403400421142578, | |
| "eval_runtime": 31.8105, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.2385, | |
| "grad_norm": 0.5701256243606029, | |
| "learning_rate": 8.461333333333333e-06, | |
| "loss": 2.3832, | |
| "step": 11925 | |
| }, | |
| { | |
| "epoch": 0.239, | |
| "grad_norm": 0.5839965205220576, | |
| "learning_rate": 8.455777777777778e-06, | |
| "loss": 2.3939, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.2395, | |
| "grad_norm": 0.581600775004578, | |
| "learning_rate": 8.450222222222224e-06, | |
| "loss": 2.382, | |
| "step": 11975 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5945113931788275, | |
| "learning_rate": 8.444666666666667e-06, | |
| "loss": 2.3947, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 2.4031572341918945, | |
| "eval_runtime": 31.7154, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.2405, | |
| "grad_norm": 0.5687487747515707, | |
| "learning_rate": 8.43911111111111e-06, | |
| "loss": 2.3859, | |
| "step": 12025 | |
| }, | |
| { | |
| "epoch": 0.241, | |
| "grad_norm": 0.6156971193882954, | |
| "learning_rate": 8.433555555555556e-06, | |
| "loss": 2.3936, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.2415, | |
| "grad_norm": 0.5735725917481376, | |
| "learning_rate": 8.428000000000001e-06, | |
| "loss": 2.3867, | |
| "step": 12075 | |
| }, | |
| { | |
| "epoch": 0.242, | |
| "grad_norm": 0.5900311312717111, | |
| "learning_rate": 8.422444444444445e-06, | |
| "loss": 2.381, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.242, | |
| "eval_loss": 2.402616262435913, | |
| "eval_runtime": 31.728, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.607, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.2425, | |
| "grad_norm": 0.6210456413331185, | |
| "learning_rate": 8.41688888888889e-06, | |
| "loss": 2.3897, | |
| "step": 12125 | |
| }, | |
| { | |
| "epoch": 0.243, | |
| "grad_norm": 0.564076844370536, | |
| "learning_rate": 8.411333333333334e-06, | |
| "loss": 2.3789, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.2435, | |
| "grad_norm": 0.5787670607206897, | |
| "learning_rate": 8.405777777777779e-06, | |
| "loss": 2.3927, | |
| "step": 12175 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": 0.557686861390105, | |
| "learning_rate": 8.400222222222222e-06, | |
| "loss": 2.3761, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "eval_loss": 2.4025542736053467, | |
| "eval_runtime": 31.8116, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.2445, | |
| "grad_norm": 0.5642621664909974, | |
| "learning_rate": 8.394666666666668e-06, | |
| "loss": 2.3787, | |
| "step": 12225 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 0.5812642245692796, | |
| "learning_rate": 8.389111111111113e-06, | |
| "loss": 2.3888, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.2455, | |
| "grad_norm": 0.5903665572148793, | |
| "learning_rate": 8.383555555555557e-06, | |
| "loss": 2.3874, | |
| "step": 12275 | |
| }, | |
| { | |
| "epoch": 0.246, | |
| "grad_norm": 0.5752826274496151, | |
| "learning_rate": 8.378e-06, | |
| "loss": 2.3851, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.246, | |
| "eval_loss": 2.4024178981781006, | |
| "eval_runtime": 31.9538, | |
| "eval_samples_per_second": 3.192, | |
| "eval_steps_per_second": 1.596, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.2465, | |
| "grad_norm": 0.5625780105871633, | |
| "learning_rate": 8.372444444444445e-06, | |
| "loss": 2.3857, | |
| "step": 12325 | |
| }, | |
| { | |
| "epoch": 0.247, | |
| "grad_norm": 0.5516059110433715, | |
| "learning_rate": 8.36688888888889e-06, | |
| "loss": 2.387, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.2475, | |
| "grad_norm": 0.5743651124710031, | |
| "learning_rate": 8.361333333333334e-06, | |
| "loss": 2.3899, | |
| "step": 12375 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.6065509345211424, | |
| "learning_rate": 8.355777777777778e-06, | |
| "loss": 2.3811, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "eval_loss": 2.402189254760742, | |
| "eval_runtime": 31.7357, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.2485, | |
| "grad_norm": 0.569411806780091, | |
| "learning_rate": 8.350222222222223e-06, | |
| "loss": 2.3891, | |
| "step": 12425 | |
| }, | |
| { | |
| "epoch": 0.249, | |
| "grad_norm": 0.5781227404353481, | |
| "learning_rate": 8.344666666666668e-06, | |
| "loss": 2.3799, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.2495, | |
| "grad_norm": 0.5882770416548074, | |
| "learning_rate": 8.339111111111112e-06, | |
| "loss": 2.3921, | |
| "step": 12475 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.6053137792053689, | |
| "learning_rate": 8.333555555555555e-06, | |
| "loss": 2.3923, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": 2.401906967163086, | |
| "eval_runtime": 31.7052, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.609, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.2505, | |
| "grad_norm": 0.5493940361276148, | |
| "learning_rate": 8.328e-06, | |
| "loss": 2.3872, | |
| "step": 12525 | |
| }, | |
| { | |
| "epoch": 0.251, | |
| "grad_norm": 0.5844453837465953, | |
| "learning_rate": 8.322444444444446e-06, | |
| "loss": 2.3859, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.2515, | |
| "grad_norm": 0.589694030674745, | |
| "learning_rate": 8.31688888888889e-06, | |
| "loss": 2.3852, | |
| "step": 12575 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": 0.5985872367130171, | |
| "learning_rate": 8.311333333333333e-06, | |
| "loss": 2.378, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "eval_loss": 2.4017632007598877, | |
| "eval_runtime": 31.8059, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.2525, | |
| "grad_norm": 0.6246560097732429, | |
| "learning_rate": 8.305777777777778e-06, | |
| "loss": 2.3891, | |
| "step": 12625 | |
| }, | |
| { | |
| "epoch": 0.253, | |
| "grad_norm": 0.5977851115835912, | |
| "learning_rate": 8.300222222222223e-06, | |
| "loss": 2.3884, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.2535, | |
| "grad_norm": 0.5535634109353079, | |
| "learning_rate": 8.294666666666667e-06, | |
| "loss": 2.3894, | |
| "step": 12675 | |
| }, | |
| { | |
| "epoch": 0.254, | |
| "grad_norm": 0.5647542662126371, | |
| "learning_rate": 8.289111111111112e-06, | |
| "loss": 2.3889, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.254, | |
| "eval_loss": 2.4015073776245117, | |
| "eval_runtime": 31.6682, | |
| "eval_samples_per_second": 3.221, | |
| "eval_steps_per_second": 1.61, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.2545, | |
| "grad_norm": 0.5689860381748764, | |
| "learning_rate": 8.283555555555556e-06, | |
| "loss": 2.391, | |
| "step": 12725 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 0.5788815220722723, | |
| "learning_rate": 8.278000000000001e-06, | |
| "loss": 2.3746, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.2555, | |
| "grad_norm": 0.5746385277305921, | |
| "learning_rate": 8.272444444444445e-06, | |
| "loss": 2.3884, | |
| "step": 12775 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.5952261074381101, | |
| "learning_rate": 8.26688888888889e-06, | |
| "loss": 2.387, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "eval_loss": 2.401090383529663, | |
| "eval_runtime": 31.7518, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.2565, | |
| "grad_norm": 0.581914246490724, | |
| "learning_rate": 8.261333333333335e-06, | |
| "loss": 2.3879, | |
| "step": 12825 | |
| }, | |
| { | |
| "epoch": 0.257, | |
| "grad_norm": 0.5582195018164189, | |
| "learning_rate": 8.255777777777779e-06, | |
| "loss": 2.3783, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.2575, | |
| "grad_norm": 0.5633036552978725, | |
| "learning_rate": 8.250222222222222e-06, | |
| "loss": 2.3845, | |
| "step": 12875 | |
| }, | |
| { | |
| "epoch": 0.258, | |
| "grad_norm": 0.5613155523789654, | |
| "learning_rate": 8.244666666666667e-06, | |
| "loss": 2.3942, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.258, | |
| "eval_loss": 2.4014108180999756, | |
| "eval_runtime": 31.8052, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.604, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.2585, | |
| "grad_norm": 0.5906307979751212, | |
| "learning_rate": 8.239111111111113e-06, | |
| "loss": 2.3807, | |
| "step": 12925 | |
| }, | |
| { | |
| "epoch": 0.259, | |
| "grad_norm": 0.5786593603781868, | |
| "learning_rate": 8.233555555555556e-06, | |
| "loss": 2.3848, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.2595, | |
| "grad_norm": 0.5739057988147651, | |
| "learning_rate": 8.228e-06, | |
| "loss": 2.3841, | |
| "step": 12975 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5727067411665359, | |
| "learning_rate": 8.222444444444445e-06, | |
| "loss": 2.3771, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "eval_loss": 2.4009385108947754, | |
| "eval_runtime": 31.8075, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.2605, | |
| "grad_norm": 0.5758550911461594, | |
| "learning_rate": 8.21688888888889e-06, | |
| "loss": 2.39, | |
| "step": 13025 | |
| }, | |
| { | |
| "epoch": 0.261, | |
| "grad_norm": 0.5506335078390368, | |
| "learning_rate": 8.211333333333334e-06, | |
| "loss": 2.3879, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.2615, | |
| "grad_norm": 0.578047700560021, | |
| "learning_rate": 8.205777777777777e-06, | |
| "loss": 2.3772, | |
| "step": 13075 | |
| }, | |
| { | |
| "epoch": 0.262, | |
| "grad_norm": 0.5517825098879646, | |
| "learning_rate": 8.200222222222223e-06, | |
| "loss": 2.3751, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.262, | |
| "eval_loss": 2.4008378982543945, | |
| "eval_runtime": 31.8219, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.603, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 0.6060142395322289, | |
| "learning_rate": 8.194666666666668e-06, | |
| "loss": 2.3859, | |
| "step": 13125 | |
| }, | |
| { | |
| "epoch": 0.263, | |
| "grad_norm": 0.6151379264003006, | |
| "learning_rate": 8.189111111111111e-06, | |
| "loss": 2.3906, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.2635, | |
| "grad_norm": 0.5889091981712471, | |
| "learning_rate": 8.183555555555555e-06, | |
| "loss": 2.3813, | |
| "step": 13175 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 0.7021686085407579, | |
| "learning_rate": 8.178e-06, | |
| "loss": 2.3844, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "eval_loss": 2.400826930999756, | |
| "eval_runtime": 31.7255, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.608, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.2645, | |
| "grad_norm": 0.5738899506070113, | |
| "learning_rate": 8.172444444444446e-06, | |
| "loss": 2.3974, | |
| "step": 13225 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 0.618543215020873, | |
| "learning_rate": 8.166888888888889e-06, | |
| "loss": 2.3846, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.2655, | |
| "grad_norm": 0.5529480549821216, | |
| "learning_rate": 8.161333333333334e-06, | |
| "loss": 2.3816, | |
| "step": 13275 | |
| }, | |
| { | |
| "epoch": 0.266, | |
| "grad_norm": 0.569904631452621, | |
| "learning_rate": 8.155777777777778e-06, | |
| "loss": 2.3809, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.266, | |
| "eval_loss": 2.4002933502197266, | |
| "eval_runtime": 31.6983, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.2665, | |
| "grad_norm": 0.5743878084278218, | |
| "learning_rate": 8.150222222222223e-06, | |
| "loss": 2.3941, | |
| "step": 13325 | |
| }, | |
| { | |
| "epoch": 0.267, | |
| "grad_norm": 0.5594243149898632, | |
| "learning_rate": 8.144666666666667e-06, | |
| "loss": 2.3878, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.2675, | |
| "grad_norm": 0.5810666087448406, | |
| "learning_rate": 8.139111111111112e-06, | |
| "loss": 2.381, | |
| "step": 13375 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": 0.5595852108101106, | |
| "learning_rate": 8.133555555555557e-06, | |
| "loss": 2.3792, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "eval_loss": 2.400261878967285, | |
| "eval_runtime": 31.6975, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.2685, | |
| "grad_norm": 0.5789530002361615, | |
| "learning_rate": 8.128e-06, | |
| "loss": 2.3759, | |
| "step": 13425 | |
| }, | |
| { | |
| "epoch": 0.269, | |
| "grad_norm": 0.5662301407639397, | |
| "learning_rate": 8.122444444444444e-06, | |
| "loss": 2.3791, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.2695, | |
| "grad_norm": 0.6131145841315326, | |
| "learning_rate": 8.11688888888889e-06, | |
| "loss": 2.3833, | |
| "step": 13475 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.5607318024001929, | |
| "learning_rate": 8.111333333333335e-06, | |
| "loss": 2.3724, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "eval_loss": 2.4000020027160645, | |
| "eval_runtime": 31.71, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.608, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.2705, | |
| "grad_norm": 0.5692755244185855, | |
| "learning_rate": 8.105777777777778e-06, | |
| "loss": 2.3788, | |
| "step": 13525 | |
| }, | |
| { | |
| "epoch": 0.271, | |
| "grad_norm": 0.5647342769538716, | |
| "learning_rate": 8.100222222222222e-06, | |
| "loss": 2.3799, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.2715, | |
| "grad_norm": 0.5976773519089553, | |
| "learning_rate": 8.094666666666667e-06, | |
| "loss": 2.3828, | |
| "step": 13575 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.5642506953063758, | |
| "learning_rate": 8.089111111111112e-06, | |
| "loss": 2.3835, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "eval_loss": 2.400066614151001, | |
| "eval_runtime": 31.8128, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.2725, | |
| "grad_norm": 0.5616659241704035, | |
| "learning_rate": 8.083555555555556e-06, | |
| "loss": 2.3801, | |
| "step": 13625 | |
| }, | |
| { | |
| "epoch": 0.273, | |
| "grad_norm": 0.5878315825498157, | |
| "learning_rate": 8.078e-06, | |
| "loss": 2.3781, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.2735, | |
| "grad_norm": 0.5716337786191225, | |
| "learning_rate": 8.072444444444445e-06, | |
| "loss": 2.3932, | |
| "step": 13675 | |
| }, | |
| { | |
| "epoch": 0.274, | |
| "grad_norm": 0.5636757577555458, | |
| "learning_rate": 8.06688888888889e-06, | |
| "loss": 2.4041, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.274, | |
| "eval_loss": 2.3997650146484375, | |
| "eval_runtime": 31.4871, | |
| "eval_samples_per_second": 3.239, | |
| "eval_steps_per_second": 1.62, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.2745, | |
| "grad_norm": 0.5564992808480433, | |
| "learning_rate": 8.061333333333334e-06, | |
| "loss": 2.3971, | |
| "step": 13725 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.5736246457745038, | |
| "learning_rate": 8.055777777777777e-06, | |
| "loss": 2.3847, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.2755, | |
| "grad_norm": 0.5423430973262378, | |
| "learning_rate": 8.050222222222222e-06, | |
| "loss": 2.3786, | |
| "step": 13775 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": 0.5672815850751382, | |
| "learning_rate": 8.044666666666668e-06, | |
| "loss": 2.3945, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "eval_loss": 2.399338483810425, | |
| "eval_runtime": 31.3741, | |
| "eval_samples_per_second": 3.251, | |
| "eval_steps_per_second": 1.626, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.2765, | |
| "grad_norm": 0.5919813611615313, | |
| "learning_rate": 8.039111111111111e-06, | |
| "loss": 2.3738, | |
| "step": 13825 | |
| }, | |
| { | |
| "epoch": 0.277, | |
| "grad_norm": 0.5679311638374708, | |
| "learning_rate": 8.033555555555556e-06, | |
| "loss": 2.3771, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.2775, | |
| "grad_norm": 0.5533203763453908, | |
| "learning_rate": 8.028e-06, | |
| "loss": 2.3831, | |
| "step": 13875 | |
| }, | |
| { | |
| "epoch": 0.278, | |
| "grad_norm": 0.5674818164725537, | |
| "learning_rate": 8.022444444444445e-06, | |
| "loss": 2.3811, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.278, | |
| "eval_loss": 2.3990118503570557, | |
| "eval_runtime": 31.47, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.621, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.2785, | |
| "grad_norm": 0.5664699981127816, | |
| "learning_rate": 8.016888888888889e-06, | |
| "loss": 2.3848, | |
| "step": 13925 | |
| }, | |
| { | |
| "epoch": 0.279, | |
| "grad_norm": 0.6085875103795902, | |
| "learning_rate": 8.011333333333334e-06, | |
| "loss": 2.3822, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.2795, | |
| "grad_norm": 0.561160479481643, | |
| "learning_rate": 8.00577777777778e-06, | |
| "loss": 2.3722, | |
| "step": 13975 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.566395855978902, | |
| "learning_rate": 8.000222222222223e-06, | |
| "loss": 2.3922, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": 2.3991119861602783, | |
| "eval_runtime": 31.6591, | |
| "eval_samples_per_second": 3.222, | |
| "eval_steps_per_second": 1.611, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.2805, | |
| "grad_norm": 0.5680524398621669, | |
| "learning_rate": 7.994666666666666e-06, | |
| "loss": 2.382, | |
| "step": 14025 | |
| }, | |
| { | |
| "epoch": 0.281, | |
| "grad_norm": 0.5577808062612865, | |
| "learning_rate": 7.989111111111112e-06, | |
| "loss": 2.3817, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.2815, | |
| "grad_norm": 0.5609272583996402, | |
| "learning_rate": 7.983555555555557e-06, | |
| "loss": 2.3807, | |
| "step": 14075 | |
| }, | |
| { | |
| "epoch": 0.282, | |
| "grad_norm": 0.5572862450140419, | |
| "learning_rate": 7.978e-06, | |
| "loss": 2.3883, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.282, | |
| "eval_loss": 2.399045467376709, | |
| "eval_runtime": 31.4262, | |
| "eval_samples_per_second": 3.246, | |
| "eval_steps_per_second": 1.623, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.2825, | |
| "grad_norm": 0.5548825232758766, | |
| "learning_rate": 7.972444444444444e-06, | |
| "loss": 2.3906, | |
| "step": 14125 | |
| }, | |
| { | |
| "epoch": 0.283, | |
| "grad_norm": 0.5699464235282781, | |
| "learning_rate": 7.96688888888889e-06, | |
| "loss": 2.3985, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.2835, | |
| "grad_norm": 0.5949860745449153, | |
| "learning_rate": 7.961333333333335e-06, | |
| "loss": 2.384, | |
| "step": 14175 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": 1.207767068552352, | |
| "learning_rate": 7.955777777777778e-06, | |
| "loss": 2.3897, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "eval_loss": 2.3988163471221924, | |
| "eval_runtime": 31.5331, | |
| "eval_samples_per_second": 3.235, | |
| "eval_steps_per_second": 1.617, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.2845, | |
| "grad_norm": 0.5734778733619218, | |
| "learning_rate": 7.950222222222222e-06, | |
| "loss": 2.3995, | |
| "step": 14225 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 0.5809053174835214, | |
| "learning_rate": 7.944666666666667e-06, | |
| "loss": 2.3935, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.2855, | |
| "grad_norm": 0.5721177604701749, | |
| "learning_rate": 7.939111111111112e-06, | |
| "loss": 2.3831, | |
| "step": 14275 | |
| }, | |
| { | |
| "epoch": 0.286, | |
| "grad_norm": 0.5870187369085319, | |
| "learning_rate": 7.933555555555556e-06, | |
| "loss": 2.3876, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.286, | |
| "eval_loss": 2.3985910415649414, | |
| "eval_runtime": 31.8276, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.602, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.2865, | |
| "grad_norm": 0.5540420732959112, | |
| "learning_rate": 7.928e-06, | |
| "loss": 2.3894, | |
| "step": 14325 | |
| }, | |
| { | |
| "epoch": 0.287, | |
| "grad_norm": 0.5771375830109964, | |
| "learning_rate": 7.922444444444445e-06, | |
| "loss": 2.3919, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 0.558274829145414, | |
| "learning_rate": 7.91688888888889e-06, | |
| "loss": 2.3792, | |
| "step": 14375 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.5489382411994304, | |
| "learning_rate": 7.911333333333333e-06, | |
| "loss": 2.382, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "eval_loss": 2.398547887802124, | |
| "eval_runtime": 31.7859, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.2885, | |
| "grad_norm": 0.5437020470565486, | |
| "learning_rate": 7.905777777777779e-06, | |
| "loss": 2.391, | |
| "step": 14425 | |
| }, | |
| { | |
| "epoch": 0.289, | |
| "grad_norm": 0.5822012645571201, | |
| "learning_rate": 7.900222222222222e-06, | |
| "loss": 2.3774, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.2895, | |
| "grad_norm": 0.5662409547337693, | |
| "learning_rate": 7.894666666666667e-06, | |
| "loss": 2.3754, | |
| "step": 14475 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.574336415517884, | |
| "learning_rate": 7.889111111111113e-06, | |
| "loss": 2.3696, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "eval_loss": 2.3984858989715576, | |
| "eval_runtime": 31.7473, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.606, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.2905, | |
| "grad_norm": 0.5564392509678192, | |
| "learning_rate": 7.883555555555556e-06, | |
| "loss": 2.3856, | |
| "step": 14525 | |
| }, | |
| { | |
| "epoch": 0.291, | |
| "grad_norm": 0.5518394045498354, | |
| "learning_rate": 7.878e-06, | |
| "loss": 2.3972, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.2915, | |
| "grad_norm": 0.5795808696759357, | |
| "learning_rate": 7.872444444444445e-06, | |
| "loss": 2.3831, | |
| "step": 14575 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": 0.5601055983017486, | |
| "learning_rate": 7.86688888888889e-06, | |
| "loss": 2.3844, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "eval_loss": 2.3982439041137695, | |
| "eval_runtime": 31.6763, | |
| "eval_samples_per_second": 3.22, | |
| "eval_steps_per_second": 1.61, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.2925, | |
| "grad_norm": 0.5964235234322374, | |
| "learning_rate": 7.861333333333334e-06, | |
| "loss": 2.3899, | |
| "step": 14625 | |
| }, | |
| { | |
| "epoch": 0.293, | |
| "grad_norm": 0.5610795516162878, | |
| "learning_rate": 7.855777777777779e-06, | |
| "loss": 2.3838, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.2935, | |
| "grad_norm": 0.5670881867616083, | |
| "learning_rate": 7.850222222222223e-06, | |
| "loss": 2.3825, | |
| "step": 14675 | |
| }, | |
| { | |
| "epoch": 0.294, | |
| "grad_norm": 0.5643624181789829, | |
| "learning_rate": 7.844666666666668e-06, | |
| "loss": 2.3882, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.294, | |
| "eval_loss": 2.398089647293091, | |
| "eval_runtime": 31.7677, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.2945, | |
| "grad_norm": 0.5686315690402087, | |
| "learning_rate": 7.839111111111111e-06, | |
| "loss": 2.3745, | |
| "step": 14725 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 0.5893983725540548, | |
| "learning_rate": 7.833555555555557e-06, | |
| "loss": 2.378, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.2955, | |
| "grad_norm": 0.5972901998200331, | |
| "learning_rate": 7.828000000000002e-06, | |
| "loss": 2.377, | |
| "step": 14775 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 0.5804879541179684, | |
| "learning_rate": 7.822444444444446e-06, | |
| "loss": 2.3911, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "eval_loss": 2.397839069366455, | |
| "eval_runtime": 31.7602, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.2965, | |
| "grad_norm": 0.577463980570899, | |
| "learning_rate": 7.816888888888889e-06, | |
| "loss": 2.3896, | |
| "step": 14825 | |
| }, | |
| { | |
| "epoch": 0.297, | |
| "grad_norm": 0.5800702741538564, | |
| "learning_rate": 7.811333333333334e-06, | |
| "loss": 2.3838, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.2975, | |
| "grad_norm": 0.6037725626202978, | |
| "learning_rate": 7.80577777777778e-06, | |
| "loss": 2.3827, | |
| "step": 14875 | |
| }, | |
| { | |
| "epoch": 0.298, | |
| "grad_norm": 0.5862145198472817, | |
| "learning_rate": 7.800222222222223e-06, | |
| "loss": 2.3801, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.298, | |
| "eval_loss": 2.3976035118103027, | |
| "eval_runtime": 31.751, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.2985, | |
| "grad_norm": 0.5670781074548332, | |
| "learning_rate": 7.794666666666667e-06, | |
| "loss": 2.3819, | |
| "step": 14925 | |
| }, | |
| { | |
| "epoch": 0.299, | |
| "grad_norm": 0.5571823653622203, | |
| "learning_rate": 7.789111111111112e-06, | |
| "loss": 2.3835, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.2995, | |
| "grad_norm": 0.5733242457342494, | |
| "learning_rate": 7.783555555555557e-06, | |
| "loss": 2.3728, | |
| "step": 14975 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.5619677124489769, | |
| "learning_rate": 7.778e-06, | |
| "loss": 2.3794, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 2.397136688232422, | |
| "eval_runtime": 31.7183, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3005, | |
| "grad_norm": 0.5657448746286691, | |
| "learning_rate": 7.772444444444444e-06, | |
| "loss": 2.3897, | |
| "step": 15025 | |
| }, | |
| { | |
| "epoch": 0.301, | |
| "grad_norm": 0.5523525627604269, | |
| "learning_rate": 7.76688888888889e-06, | |
| "loss": 2.3795, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.3015, | |
| "grad_norm": 0.5950789860717867, | |
| "learning_rate": 7.761333333333335e-06, | |
| "loss": 2.3914, | |
| "step": 15075 | |
| }, | |
| { | |
| "epoch": 0.302, | |
| "grad_norm": 0.5999400034143391, | |
| "learning_rate": 7.755777777777778e-06, | |
| "loss": 2.3769, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.302, | |
| "eval_loss": 2.396873950958252, | |
| "eval_runtime": 31.7696, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.3025, | |
| "grad_norm": 0.558834977842146, | |
| "learning_rate": 7.750222222222222e-06, | |
| "loss": 2.3854, | |
| "step": 15125 | |
| }, | |
| { | |
| "epoch": 0.303, | |
| "grad_norm": 0.5582295283472423, | |
| "learning_rate": 7.744666666666667e-06, | |
| "loss": 2.3821, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.3035, | |
| "grad_norm": 0.5632905015995245, | |
| "learning_rate": 7.739111111111112e-06, | |
| "loss": 2.3798, | |
| "step": 15175 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.5514118333084079, | |
| "learning_rate": 7.733555555555556e-06, | |
| "loss": 2.3788, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "eval_loss": 2.3965888023376465, | |
| "eval_runtime": 31.7152, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.3045, | |
| "grad_norm": 0.5649018768322466, | |
| "learning_rate": 7.728000000000001e-06, | |
| "loss": 2.3912, | |
| "step": 15225 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 0.581561230195339, | |
| "learning_rate": 7.722444444444445e-06, | |
| "loss": 2.3766, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.3055, | |
| "grad_norm": 0.5604985750115082, | |
| "learning_rate": 7.71688888888889e-06, | |
| "loss": 2.3852, | |
| "step": 15275 | |
| }, | |
| { | |
| "epoch": 0.306, | |
| "grad_norm": 0.5602736035393524, | |
| "learning_rate": 7.711333333333334e-06, | |
| "loss": 2.3867, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.306, | |
| "eval_loss": 2.3968026638031006, | |
| "eval_runtime": 31.8105, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.3065, | |
| "grad_norm": 0.5404472339052024, | |
| "learning_rate": 7.705777777777779e-06, | |
| "loss": 2.3835, | |
| "step": 15325 | |
| }, | |
| { | |
| "epoch": 0.307, | |
| "grad_norm": 0.5732167481475767, | |
| "learning_rate": 7.700222222222224e-06, | |
| "loss": 2.386, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.3075, | |
| "grad_norm": 0.5668975128857069, | |
| "learning_rate": 7.694666666666668e-06, | |
| "loss": 2.3838, | |
| "step": 15375 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": 0.5478312505357384, | |
| "learning_rate": 7.689111111111111e-06, | |
| "loss": 2.4068, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "eval_loss": 2.39662766456604, | |
| "eval_runtime": 31.4625, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.3085, | |
| "grad_norm": 0.5853236703412803, | |
| "learning_rate": 7.683555555555556e-06, | |
| "loss": 2.3781, | |
| "step": 15425 | |
| }, | |
| { | |
| "epoch": 0.309, | |
| "grad_norm": 0.566498029803985, | |
| "learning_rate": 7.678000000000002e-06, | |
| "loss": 2.3825, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.3095, | |
| "grad_norm": 0.5876295223419085, | |
| "learning_rate": 7.672444444444445e-06, | |
| "loss": 2.3821, | |
| "step": 15475 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.5308633915785282, | |
| "learning_rate": 7.666888888888889e-06, | |
| "loss": 2.3762, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "eval_loss": 2.39650559425354, | |
| "eval_runtime": 31.6255, | |
| "eval_samples_per_second": 3.225, | |
| "eval_steps_per_second": 1.613, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.3105, | |
| "grad_norm": 1.090575647217174, | |
| "learning_rate": 7.661333333333334e-06, | |
| "loss": 2.3854, | |
| "step": 15525 | |
| }, | |
| { | |
| "epoch": 0.311, | |
| "grad_norm": 0.5608565584872227, | |
| "learning_rate": 7.65577777777778e-06, | |
| "loss": 2.3909, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.3115, | |
| "grad_norm": 0.5664910219445479, | |
| "learning_rate": 7.650222222222223e-06, | |
| "loss": 2.3876, | |
| "step": 15575 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.5743138998726522, | |
| "learning_rate": 7.644666666666666e-06, | |
| "loss": 2.3891, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "eval_loss": 2.395846128463745, | |
| "eval_runtime": 31.422, | |
| "eval_samples_per_second": 3.246, | |
| "eval_steps_per_second": 1.623, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.5838966503811626, | |
| "learning_rate": 7.639111111111112e-06, | |
| "loss": 2.3744, | |
| "step": 15625 | |
| }, | |
| { | |
| "epoch": 0.313, | |
| "grad_norm": 0.5861982665217826, | |
| "learning_rate": 7.633555555555557e-06, | |
| "loss": 2.386, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.3135, | |
| "grad_norm": 0.5623110973377239, | |
| "learning_rate": 7.628000000000001e-06, | |
| "loss": 2.3729, | |
| "step": 15675 | |
| }, | |
| { | |
| "epoch": 0.314, | |
| "grad_norm": 0.5546807091447383, | |
| "learning_rate": 7.622444444444445e-06, | |
| "loss": 2.3758, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.314, | |
| "eval_loss": 2.396050453186035, | |
| "eval_runtime": 31.4839, | |
| "eval_samples_per_second": 3.24, | |
| "eval_steps_per_second": 1.62, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.3145, | |
| "grad_norm": 0.566357543453858, | |
| "learning_rate": 7.616888888888889e-06, | |
| "loss": 2.3814, | |
| "step": 15725 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 0.5863021742964364, | |
| "learning_rate": 7.611333333333334e-06, | |
| "loss": 2.3912, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.3155, | |
| "grad_norm": 0.5448091994015362, | |
| "learning_rate": 7.605777777777779e-06, | |
| "loss": 2.3949, | |
| "step": 15775 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": 0.5571622234957405, | |
| "learning_rate": 7.600222222222223e-06, | |
| "loss": 2.3893, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "eval_loss": 2.3957884311676025, | |
| "eval_runtime": 31.4676, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.621, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.3165, | |
| "grad_norm": 0.6175149611764096, | |
| "learning_rate": 7.594666666666667e-06, | |
| "loss": 2.3858, | |
| "step": 15825 | |
| }, | |
| { | |
| "epoch": 0.317, | |
| "grad_norm": 0.5811416818392343, | |
| "learning_rate": 7.589111111111111e-06, | |
| "loss": 2.3893, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.3175, | |
| "grad_norm": 0.5685262674194088, | |
| "learning_rate": 7.5835555555555566e-06, | |
| "loss": 2.3895, | |
| "step": 15875 | |
| }, | |
| { | |
| "epoch": 0.318, | |
| "grad_norm": 0.5726231388910242, | |
| "learning_rate": 7.578000000000001e-06, | |
| "loss": 2.3924, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.318, | |
| "eval_loss": 2.3957200050354004, | |
| "eval_runtime": 31.6833, | |
| "eval_samples_per_second": 3.219, | |
| "eval_steps_per_second": 1.61, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.3185, | |
| "grad_norm": 0.5881014617899262, | |
| "learning_rate": 7.572444444444445e-06, | |
| "loss": 2.3719, | |
| "step": 15925 | |
| }, | |
| { | |
| "epoch": 0.319, | |
| "grad_norm": 0.5635459036409981, | |
| "learning_rate": 7.566888888888889e-06, | |
| "loss": 2.378, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.3195, | |
| "grad_norm": 0.5604907919572244, | |
| "learning_rate": 7.561333333333334e-06, | |
| "loss": 2.3744, | |
| "step": 15975 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.5743956921241223, | |
| "learning_rate": 7.555777777777779e-06, | |
| "loss": 2.3872, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 2.3958442211151123, | |
| "eval_runtime": 31.9703, | |
| "eval_samples_per_second": 3.19, | |
| "eval_steps_per_second": 1.595, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.3205, | |
| "grad_norm": 0.5490540509150809, | |
| "learning_rate": 7.550222222222223e-06, | |
| "loss": 2.3908, | |
| "step": 16025 | |
| }, | |
| { | |
| "epoch": 0.321, | |
| "grad_norm": 0.5604566538327537, | |
| "learning_rate": 7.5446666666666665e-06, | |
| "loss": 2.3816, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.3215, | |
| "grad_norm": 0.5482351645184266, | |
| "learning_rate": 7.539111111111112e-06, | |
| "loss": 2.3783, | |
| "step": 16075 | |
| }, | |
| { | |
| "epoch": 0.322, | |
| "grad_norm": 0.5738611670880387, | |
| "learning_rate": 7.533555555555556e-06, | |
| "loss": 2.3807, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.322, | |
| "eval_loss": 2.3955187797546387, | |
| "eval_runtime": 31.7782, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.3225, | |
| "grad_norm": 0.6007459037823811, | |
| "learning_rate": 7.528000000000001e-06, | |
| "loss": 2.3908, | |
| "step": 16125 | |
| }, | |
| { | |
| "epoch": 0.323, | |
| "grad_norm": 0.5719140015142068, | |
| "learning_rate": 7.522444444444446e-06, | |
| "loss": 2.379, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.3235, | |
| "grad_norm": 0.5722843141001409, | |
| "learning_rate": 7.516888888888889e-06, | |
| "loss": 2.3831, | |
| "step": 16175 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": 0.5500359198684006, | |
| "learning_rate": 7.511333333333334e-06, | |
| "loss": 2.3899, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "eval_loss": 2.3954145908355713, | |
| "eval_runtime": 31.9265, | |
| "eval_samples_per_second": 3.195, | |
| "eval_steps_per_second": 1.597, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.3245, | |
| "grad_norm": 0.5988197648020003, | |
| "learning_rate": 7.505777777777778e-06, | |
| "loss": 2.3768, | |
| "step": 16225 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.566314534087209, | |
| "learning_rate": 7.5002222222222235e-06, | |
| "loss": 2.3731, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.3255, | |
| "grad_norm": 0.5462158611596983, | |
| "learning_rate": 7.494666666666667e-06, | |
| "loss": 2.3821, | |
| "step": 16275 | |
| }, | |
| { | |
| "epoch": 0.326, | |
| "grad_norm": 0.5546038414202229, | |
| "learning_rate": 7.4891111111111114e-06, | |
| "loss": 2.3725, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.326, | |
| "eval_loss": 2.395524501800537, | |
| "eval_runtime": 31.8126, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.3265, | |
| "grad_norm": 0.5596467845027929, | |
| "learning_rate": 7.483555555555556e-06, | |
| "loss": 2.3843, | |
| "step": 16325 | |
| }, | |
| { | |
| "epoch": 0.327, | |
| "grad_norm": 0.5815120805791782, | |
| "learning_rate": 7.478000000000001e-06, | |
| "loss": 2.3815, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.3275, | |
| "grad_norm": 0.5597449596999192, | |
| "learning_rate": 7.4724444444444455e-06, | |
| "loss": 2.3732, | |
| "step": 16375 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 0.5818958282150155, | |
| "learning_rate": 7.466888888888889e-06, | |
| "loss": 2.3793, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "eval_loss": 2.3949294090270996, | |
| "eval_runtime": 31.7738, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.3285, | |
| "grad_norm": 0.5662000485734395, | |
| "learning_rate": 7.4613333333333334e-06, | |
| "loss": 2.3812, | |
| "step": 16425 | |
| }, | |
| { | |
| "epoch": 0.329, | |
| "grad_norm": 0.5563577533028059, | |
| "learning_rate": 7.455777777777779e-06, | |
| "loss": 2.3761, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.3295, | |
| "grad_norm": 0.5687992956190129, | |
| "learning_rate": 7.450222222222223e-06, | |
| "loss": 2.381, | |
| "step": 16475 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.5487444076942639, | |
| "learning_rate": 7.4446666666666675e-06, | |
| "loss": 2.3883, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "eval_loss": 2.395174026489258, | |
| "eval_runtime": 31.7762, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.3305, | |
| "grad_norm": 0.5469101598299175, | |
| "learning_rate": 7.439111111111111e-06, | |
| "loss": 2.3766, | |
| "step": 16525 | |
| }, | |
| { | |
| "epoch": 0.331, | |
| "grad_norm": 0.5567200858341991, | |
| "learning_rate": 7.433555555555556e-06, | |
| "loss": 2.3939, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.3315, | |
| "grad_norm": 0.600536691861987, | |
| "learning_rate": 7.428000000000001e-06, | |
| "loss": 2.3822, | |
| "step": 16575 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": 0.5505048207350117, | |
| "learning_rate": 7.422444444444445e-06, | |
| "loss": 2.378, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "eval_loss": 2.39481520652771, | |
| "eval_runtime": 31.8394, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.3325, | |
| "grad_norm": 0.5492676702406505, | |
| "learning_rate": 7.416888888888889e-06, | |
| "loss": 2.3769, | |
| "step": 16625 | |
| }, | |
| { | |
| "epoch": 0.333, | |
| "grad_norm": 0.5492443037384863, | |
| "learning_rate": 7.411333333333334e-06, | |
| "loss": 2.3701, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.3335, | |
| "grad_norm": 0.5857568383624908, | |
| "learning_rate": 7.405777777777778e-06, | |
| "loss": 2.381, | |
| "step": 16675 | |
| }, | |
| { | |
| "epoch": 0.334, | |
| "grad_norm": 0.5647204860919086, | |
| "learning_rate": 7.400222222222223e-06, | |
| "loss": 2.3819, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.334, | |
| "eval_loss": 2.394426107406616, | |
| "eval_runtime": 31.892, | |
| "eval_samples_per_second": 3.198, | |
| "eval_steps_per_second": 1.599, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.3345, | |
| "grad_norm": 0.5730702201176824, | |
| "learning_rate": 7.394666666666668e-06, | |
| "loss": 2.3857, | |
| "step": 16725 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 0.5521969424083262, | |
| "learning_rate": 7.3891111111111115e-06, | |
| "loss": 2.363, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.3355, | |
| "grad_norm": 0.6057695700506919, | |
| "learning_rate": 7.383555555555556e-06, | |
| "loss": 2.3848, | |
| "step": 16775 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.5749986280132275, | |
| "learning_rate": 7.378e-06, | |
| "loss": 2.389, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "eval_loss": 2.3945508003234863, | |
| "eval_runtime": 31.7463, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.606, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.3365, | |
| "grad_norm": 0.5947076066210849, | |
| "learning_rate": 7.372444444444446e-06, | |
| "loss": 2.3865, | |
| "step": 16825 | |
| }, | |
| { | |
| "epoch": 0.337, | |
| "grad_norm": 0.564221658006085, | |
| "learning_rate": 7.366888888888889e-06, | |
| "loss": 2.3696, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 0.5702041520098122, | |
| "learning_rate": 7.3613333333333336e-06, | |
| "loss": 2.3872, | |
| "step": 16875 | |
| }, | |
| { | |
| "epoch": 0.338, | |
| "grad_norm": 0.5538661614565709, | |
| "learning_rate": 7.355777777777778e-06, | |
| "loss": 2.3828, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.338, | |
| "eval_loss": 2.3942644596099854, | |
| "eval_runtime": 31.8144, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.3385, | |
| "grad_norm": 0.5614412730199092, | |
| "learning_rate": 7.350222222222223e-06, | |
| "loss": 2.3898, | |
| "step": 16925 | |
| }, | |
| { | |
| "epoch": 0.339, | |
| "grad_norm": 0.5656638849693418, | |
| "learning_rate": 7.344666666666668e-06, | |
| "loss": 2.3639, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.3395, | |
| "grad_norm": 0.5587793192894792, | |
| "learning_rate": 7.339111111111111e-06, | |
| "loss": 2.3761, | |
| "step": 16975 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.5537041511919, | |
| "learning_rate": 7.3335555555555556e-06, | |
| "loss": 2.3785, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_loss": 2.394216775894165, | |
| "eval_runtime": 31.7287, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.607, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.3405, | |
| "grad_norm": 0.5625979440161315, | |
| "learning_rate": 7.328000000000001e-06, | |
| "loss": 2.3706, | |
| "step": 17025 | |
| }, | |
| { | |
| "epoch": 0.341, | |
| "grad_norm": 0.5578934058534382, | |
| "learning_rate": 7.322444444444445e-06, | |
| "loss": 2.3717, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.3415, | |
| "grad_norm": 0.5600783145650656, | |
| "learning_rate": 7.31688888888889e-06, | |
| "loss": 2.3549, | |
| "step": 17075 | |
| }, | |
| { | |
| "epoch": 0.342, | |
| "grad_norm": 0.5443562716925451, | |
| "learning_rate": 7.311333333333334e-06, | |
| "loss": 2.3818, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.342, | |
| "eval_loss": 2.3939199447631836, | |
| "eval_runtime": 31.7183, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.3425, | |
| "grad_norm": 0.6040551095214175, | |
| "learning_rate": 7.3057777777777784e-06, | |
| "loss": 2.3856, | |
| "step": 17125 | |
| }, | |
| { | |
| "epoch": 0.343, | |
| "grad_norm": 0.5800600768624563, | |
| "learning_rate": 7.300222222222223e-06, | |
| "loss": 2.3812, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.3435, | |
| "grad_norm": 0.606456873691792, | |
| "learning_rate": 7.294666666666668e-06, | |
| "loss": 2.3823, | |
| "step": 17175 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.5820033666001653, | |
| "learning_rate": 7.289111111111112e-06, | |
| "loss": 2.3772, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "eval_loss": 2.39414644241333, | |
| "eval_runtime": 31.4591, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.3445, | |
| "grad_norm": 0.592691728166079, | |
| "learning_rate": 7.283555555555556e-06, | |
| "loss": 2.3757, | |
| "step": 17225 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 0.5475066044517582, | |
| "learning_rate": 7.2780000000000005e-06, | |
| "loss": 2.393, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.3455, | |
| "grad_norm": 0.5412153350606916, | |
| "learning_rate": 7.272444444444446e-06, | |
| "loss": 2.3775, | |
| "step": 17275 | |
| }, | |
| { | |
| "epoch": 0.346, | |
| "grad_norm": 0.5703055910606494, | |
| "learning_rate": 7.26688888888889e-06, | |
| "loss": 2.3919, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.346, | |
| "eval_loss": 2.393954277038574, | |
| "eval_runtime": 31.4832, | |
| "eval_samples_per_second": 3.24, | |
| "eval_steps_per_second": 1.62, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.3465, | |
| "grad_norm": 0.5720004911842855, | |
| "learning_rate": 7.261333333333334e-06, | |
| "loss": 2.3744, | |
| "step": 17325 | |
| }, | |
| { | |
| "epoch": 0.347, | |
| "grad_norm": 0.5651936652229611, | |
| "learning_rate": 7.255777777777778e-06, | |
| "loss": 2.3766, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.3475, | |
| "grad_norm": 0.552954097582646, | |
| "learning_rate": 7.250222222222223e-06, | |
| "loss": 2.38, | |
| "step": 17375 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": 0.5753937605402671, | |
| "learning_rate": 7.244666666666668e-06, | |
| "loss": 2.3825, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "eval_loss": 2.3936057090759277, | |
| "eval_runtime": 31.5155, | |
| "eval_samples_per_second": 3.237, | |
| "eval_steps_per_second": 1.618, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.3485, | |
| "grad_norm": 0.5982429265702776, | |
| "learning_rate": 7.239111111111111e-06, | |
| "loss": 2.3748, | |
| "step": 17425 | |
| }, | |
| { | |
| "epoch": 0.349, | |
| "grad_norm": 0.5707105076014326, | |
| "learning_rate": 7.233555555555556e-06, | |
| "loss": 2.3871, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.3495, | |
| "grad_norm": 0.5749982454192974, | |
| "learning_rate": 7.228000000000001e-06, | |
| "loss": 2.3722, | |
| "step": 17475 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.5667678087541999, | |
| "learning_rate": 7.222444444444445e-06, | |
| "loss": 2.3897, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_loss": 2.3934316635131836, | |
| "eval_runtime": 31.5133, | |
| "eval_samples_per_second": 3.237, | |
| "eval_steps_per_second": 1.618, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.3505, | |
| "grad_norm": 0.551269238238286, | |
| "learning_rate": 7.21688888888889e-06, | |
| "loss": 2.3759, | |
| "step": 17525 | |
| }, | |
| { | |
| "epoch": 0.351, | |
| "grad_norm": 0.5683477126287287, | |
| "learning_rate": 7.211333333333333e-06, | |
| "loss": 2.3751, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.3515, | |
| "grad_norm": 0.5534527601932518, | |
| "learning_rate": 7.2057777777777785e-06, | |
| "loss": 2.3749, | |
| "step": 17575 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.5444580304379504, | |
| "learning_rate": 7.200222222222223e-06, | |
| "loss": 2.3839, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "eval_loss": 2.3928964138031006, | |
| "eval_runtime": 31.79, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.3525, | |
| "grad_norm": 0.5683011717419817, | |
| "learning_rate": 7.194666666666667e-06, | |
| "loss": 2.3697, | |
| "step": 17625 | |
| }, | |
| { | |
| "epoch": 0.353, | |
| "grad_norm": 0.5597200154635523, | |
| "learning_rate": 7.189111111111111e-06, | |
| "loss": 2.3758, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.3535, | |
| "grad_norm": 0.5389975543023572, | |
| "learning_rate": 7.183555555555556e-06, | |
| "loss": 2.3748, | |
| "step": 17675 | |
| }, | |
| { | |
| "epoch": 0.354, | |
| "grad_norm": 0.5766556300730846, | |
| "learning_rate": 7.1780000000000006e-06, | |
| "loss": 2.3863, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.354, | |
| "eval_loss": 2.3929381370544434, | |
| "eval_runtime": 31.4662, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.3545, | |
| "grad_norm": 0.5422601731930108, | |
| "learning_rate": 7.172444444444445e-06, | |
| "loss": 2.3795, | |
| "step": 17725 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 0.587749563771833, | |
| "learning_rate": 7.16688888888889e-06, | |
| "loss": 2.3741, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.3555, | |
| "grad_norm": 0.5448174780243932, | |
| "learning_rate": 7.161333333333334e-06, | |
| "loss": 2.374, | |
| "step": 17775 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": 0.5487711297157323, | |
| "learning_rate": 7.155777777777778e-06, | |
| "loss": 2.3872, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "eval_loss": 2.3928709030151367, | |
| "eval_runtime": 31.7364, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.3565, | |
| "grad_norm": 0.5749112760792647, | |
| "learning_rate": 7.150222222222223e-06, | |
| "loss": 2.375, | |
| "step": 17825 | |
| }, | |
| { | |
| "epoch": 0.357, | |
| "grad_norm": 0.5657127084376901, | |
| "learning_rate": 7.144666666666668e-06, | |
| "loss": 2.3635, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.3575, | |
| "grad_norm": 0.5552559911086609, | |
| "learning_rate": 7.139111111111112e-06, | |
| "loss": 2.3791, | |
| "step": 17875 | |
| }, | |
| { | |
| "epoch": 0.358, | |
| "grad_norm": 0.5587079571658956, | |
| "learning_rate": 7.133555555555556e-06, | |
| "loss": 2.3792, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.358, | |
| "eval_loss": 2.39250111579895, | |
| "eval_runtime": 31.8377, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.3585, | |
| "grad_norm": 0.5476769108414363, | |
| "learning_rate": 7.128e-06, | |
| "loss": 2.3796, | |
| "step": 17925 | |
| }, | |
| { | |
| "epoch": 0.359, | |
| "grad_norm": 0.5519286017800472, | |
| "learning_rate": 7.1224444444444454e-06, | |
| "loss": 2.3689, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.3595, | |
| "grad_norm": 0.5690523665272621, | |
| "learning_rate": 7.11688888888889e-06, | |
| "loss": 2.3758, | |
| "step": 17975 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.575484852893059, | |
| "learning_rate": 7.111333333333333e-06, | |
| "loss": 2.3723, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_loss": 2.3920133113861084, | |
| "eval_runtime": 31.9286, | |
| "eval_samples_per_second": 3.195, | |
| "eval_steps_per_second": 1.597, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.3605, | |
| "grad_norm": 0.5779120077378331, | |
| "learning_rate": 7.105777777777778e-06, | |
| "loss": 2.3798, | |
| "step": 18025 | |
| }, | |
| { | |
| "epoch": 0.361, | |
| "grad_norm": 0.575309417070187, | |
| "learning_rate": 7.100222222222223e-06, | |
| "loss": 2.3875, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.3615, | |
| "grad_norm": 0.6000430306182747, | |
| "learning_rate": 7.0946666666666675e-06, | |
| "loss": 2.3727, | |
| "step": 18075 | |
| }, | |
| { | |
| "epoch": 0.362, | |
| "grad_norm": 0.5701734522791184, | |
| "learning_rate": 7.089111111111112e-06, | |
| "loss": 2.3793, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.362, | |
| "eval_loss": 2.392152786254883, | |
| "eval_runtime": 31.8363, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.3625, | |
| "grad_norm": 0.5731611332750656, | |
| "learning_rate": 7.083555555555555e-06, | |
| "loss": 2.3715, | |
| "step": 18125 | |
| }, | |
| { | |
| "epoch": 0.363, | |
| "grad_norm": 0.6114229583074544, | |
| "learning_rate": 7.078000000000001e-06, | |
| "loss": 2.383, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.3635, | |
| "grad_norm": 0.541007634609165, | |
| "learning_rate": 7.072444444444445e-06, | |
| "loss": 2.3686, | |
| "step": 18175 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": 0.5725748950012406, | |
| "learning_rate": 7.0668888888888895e-06, | |
| "loss": 2.3873, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "eval_loss": 2.392261505126953, | |
| "eval_runtime": 31.7706, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.3645, | |
| "grad_norm": 0.5593670656564304, | |
| "learning_rate": 7.061333333333333e-06, | |
| "loss": 2.3804, | |
| "step": 18225 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 0.6009795583649221, | |
| "learning_rate": 7.055777777777778e-06, | |
| "loss": 2.3795, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.3655, | |
| "grad_norm": 0.5664495345544722, | |
| "learning_rate": 7.050222222222223e-06, | |
| "loss": 2.3631, | |
| "step": 18275 | |
| }, | |
| { | |
| "epoch": 0.366, | |
| "grad_norm": 0.6104006309418994, | |
| "learning_rate": 7.044666666666667e-06, | |
| "loss": 2.3748, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.366, | |
| "eval_loss": 2.392148971557617, | |
| "eval_runtime": 31.734, | |
| "eval_samples_per_second": 3.214, | |
| "eval_steps_per_second": 1.607, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.3665, | |
| "grad_norm": 0.5506059883330837, | |
| "learning_rate": 7.039111111111112e-06, | |
| "loss": 2.3714, | |
| "step": 18325 | |
| }, | |
| { | |
| "epoch": 0.367, | |
| "grad_norm": 0.5621509156408089, | |
| "learning_rate": 7.033555555555556e-06, | |
| "loss": 2.368, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.3675, | |
| "grad_norm": 0.5587181787810226, | |
| "learning_rate": 7.028e-06, | |
| "loss": 2.3791, | |
| "step": 18375 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.5677798724220077, | |
| "learning_rate": 7.022444444444445e-06, | |
| "loss": 2.384, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "eval_loss": 2.391704559326172, | |
| "eval_runtime": 31.7798, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.3685, | |
| "grad_norm": 0.5905061339542746, | |
| "learning_rate": 7.01688888888889e-06, | |
| "loss": 2.3881, | |
| "step": 18425 | |
| }, | |
| { | |
| "epoch": 0.369, | |
| "grad_norm": 0.554978244766298, | |
| "learning_rate": 7.011333333333334e-06, | |
| "loss": 2.3683, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.3695, | |
| "grad_norm": 0.5517801842410981, | |
| "learning_rate": 7.005777777777778e-06, | |
| "loss": 2.3835, | |
| "step": 18475 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.5501181046318251, | |
| "learning_rate": 7.000222222222222e-06, | |
| "loss": 2.374, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_loss": 2.3915836811065674, | |
| "eval_runtime": 31.7662, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.3705, | |
| "grad_norm": 0.576826996404141, | |
| "learning_rate": 6.9946666666666676e-06, | |
| "loss": 2.3819, | |
| "step": 18525 | |
| }, | |
| { | |
| "epoch": 0.371, | |
| "grad_norm": 0.5739797151959755, | |
| "learning_rate": 6.989111111111112e-06, | |
| "loss": 2.3794, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.3715, | |
| "grad_norm": 0.5511012262440002, | |
| "learning_rate": 6.9835555555555555e-06, | |
| "loss": 2.3894, | |
| "step": 18575 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": 0.5958849979817049, | |
| "learning_rate": 6.978e-06, | |
| "loss": 2.3674, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "eval_loss": 2.391352415084839, | |
| "eval_runtime": 31.7756, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.3725, | |
| "grad_norm": 0.5595892595435197, | |
| "learning_rate": 6.972444444444445e-06, | |
| "loss": 2.3835, | |
| "step": 18625 | |
| }, | |
| { | |
| "epoch": 0.373, | |
| "grad_norm": 0.5946746403488841, | |
| "learning_rate": 6.96688888888889e-06, | |
| "loss": 2.3716, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.3735, | |
| "grad_norm": 0.5613740876716816, | |
| "learning_rate": 6.961333333333334e-06, | |
| "loss": 2.3843, | |
| "step": 18675 | |
| }, | |
| { | |
| "epoch": 0.374, | |
| "grad_norm": 0.58419422677193, | |
| "learning_rate": 6.9557777777777776e-06, | |
| "loss": 2.3883, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.374, | |
| "eval_loss": 2.391383409500122, | |
| "eval_runtime": 31.7182, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.3745, | |
| "grad_norm": 0.5508427755524951, | |
| "learning_rate": 6.950222222222223e-06, | |
| "loss": 2.3749, | |
| "step": 18725 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.5686856026931271, | |
| "learning_rate": 6.944666666666667e-06, | |
| "loss": 2.38, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.3755, | |
| "grad_norm": 0.5531747783480245, | |
| "learning_rate": 6.939111111111112e-06, | |
| "loss": 2.3718, | |
| "step": 18775 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 0.5800045444885175, | |
| "learning_rate": 6.933555555555556e-06, | |
| "loss": 2.3703, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "eval_loss": 2.391113042831421, | |
| "eval_runtime": 31.7446, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.607, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.3765, | |
| "grad_norm": 0.5451395919825731, | |
| "learning_rate": 6.928e-06, | |
| "loss": 2.3746, | |
| "step": 18825 | |
| }, | |
| { | |
| "epoch": 0.377, | |
| "grad_norm": 0.5619738492106079, | |
| "learning_rate": 6.922444444444445e-06, | |
| "loss": 2.3815, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.3775, | |
| "grad_norm": 0.5811440137998495, | |
| "learning_rate": 6.91688888888889e-06, | |
| "loss": 2.3655, | |
| "step": 18875 | |
| }, | |
| { | |
| "epoch": 0.378, | |
| "grad_norm": 0.5528301840539304, | |
| "learning_rate": 6.9113333333333345e-06, | |
| "loss": 2.3721, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.378, | |
| "eval_loss": 2.3908257484436035, | |
| "eval_runtime": 31.6268, | |
| "eval_samples_per_second": 3.225, | |
| "eval_steps_per_second": 1.613, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.3785, | |
| "grad_norm": 0.5791069800351532, | |
| "learning_rate": 6.905777777777778e-06, | |
| "loss": 2.3798, | |
| "step": 18925 | |
| }, | |
| { | |
| "epoch": 0.379, | |
| "grad_norm": 0.5692008495737035, | |
| "learning_rate": 6.9002222222222224e-06, | |
| "loss": 2.3723, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.3795, | |
| "grad_norm": 0.5614405054433378, | |
| "learning_rate": 6.894666666666668e-06, | |
| "loss": 2.3739, | |
| "step": 18975 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.5641420025760586, | |
| "learning_rate": 6.889111111111112e-06, | |
| "loss": 2.3728, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "eval_loss": 2.390749454498291, | |
| "eval_runtime": 31.8098, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.3805, | |
| "grad_norm": 0.5526396554433541, | |
| "learning_rate": 6.8835555555555565e-06, | |
| "loss": 2.3779, | |
| "step": 19025 | |
| }, | |
| { | |
| "epoch": 0.381, | |
| "grad_norm": 0.574490460414078, | |
| "learning_rate": 6.878e-06, | |
| "loss": 2.3727, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.3815, | |
| "grad_norm": 0.5611671894801677, | |
| "learning_rate": 6.872444444444445e-06, | |
| "loss": 2.379, | |
| "step": 19075 | |
| }, | |
| { | |
| "epoch": 0.382, | |
| "grad_norm": 0.5434475778092571, | |
| "learning_rate": 6.86688888888889e-06, | |
| "loss": 2.3788, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.382, | |
| "eval_loss": 2.390854597091675, | |
| "eval_runtime": 31.4727, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.62, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.3825, | |
| "grad_norm": 0.5438441040943751, | |
| "learning_rate": 6.861333333333334e-06, | |
| "loss": 2.3849, | |
| "step": 19125 | |
| }, | |
| { | |
| "epoch": 0.383, | |
| "grad_norm": 0.5617582167520553, | |
| "learning_rate": 6.855777777777778e-06, | |
| "loss": 2.3778, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.3835, | |
| "grad_norm": 0.5734148354957039, | |
| "learning_rate": 6.850222222222223e-06, | |
| "loss": 2.3749, | |
| "step": 19175 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.5567016447555824, | |
| "learning_rate": 6.844666666666667e-06, | |
| "loss": 2.3786, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "eval_loss": 2.390947103500366, | |
| "eval_runtime": 31.472, | |
| "eval_samples_per_second": 3.241, | |
| "eval_steps_per_second": 1.62, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.3845, | |
| "grad_norm": 0.5630941651558155, | |
| "learning_rate": 6.839111111111112e-06, | |
| "loss": 2.371, | |
| "step": 19225 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 0.5472891744821744, | |
| "learning_rate": 6.833555555555557e-06, | |
| "loss": 2.371, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.3855, | |
| "grad_norm": 0.563854124925733, | |
| "learning_rate": 6.8280000000000005e-06, | |
| "loss": 2.3802, | |
| "step": 19275 | |
| }, | |
| { | |
| "epoch": 0.386, | |
| "grad_norm": 0.5535188682099162, | |
| "learning_rate": 6.822444444444445e-06, | |
| "loss": 2.3668, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.386, | |
| "eval_loss": 2.3904383182525635, | |
| "eval_runtime": 31.5109, | |
| "eval_samples_per_second": 3.237, | |
| "eval_steps_per_second": 1.618, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.3865, | |
| "grad_norm": 0.5847689751509554, | |
| "learning_rate": 6.816888888888889e-06, | |
| "loss": 2.3723, | |
| "step": 19325 | |
| }, | |
| { | |
| "epoch": 0.387, | |
| "grad_norm": 0.5477508463021717, | |
| "learning_rate": 6.811333333333335e-06, | |
| "loss": 2.3748, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.3875, | |
| "grad_norm": 0.5530662776524751, | |
| "learning_rate": 6.805777777777778e-06, | |
| "loss": 2.372, | |
| "step": 19375 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 0.5627088332087185, | |
| "learning_rate": 6.8002222222222225e-06, | |
| "loss": 2.3649, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "eval_loss": 2.3902432918548584, | |
| "eval_runtime": 31.5016, | |
| "eval_samples_per_second": 3.238, | |
| "eval_steps_per_second": 1.619, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.3885, | |
| "grad_norm": 0.5917805991329846, | |
| "learning_rate": 6.794666666666667e-06, | |
| "loss": 2.389, | |
| "step": 19425 | |
| }, | |
| { | |
| "epoch": 0.389, | |
| "grad_norm": 0.5637153841856668, | |
| "learning_rate": 6.789111111111112e-06, | |
| "loss": 2.381, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.3895, | |
| "grad_norm": 0.5638546592221216, | |
| "learning_rate": 6.783555555555557e-06, | |
| "loss": 2.3674, | |
| "step": 19475 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.5442599823902955, | |
| "learning_rate": 6.778e-06, | |
| "loss": 2.3684, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "eval_loss": 2.3898606300354004, | |
| "eval_runtime": 31.4637, | |
| "eval_samples_per_second": 3.242, | |
| "eval_steps_per_second": 1.621, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.3905, | |
| "grad_norm": 0.582280869057288, | |
| "learning_rate": 6.7724444444444446e-06, | |
| "loss": 2.3691, | |
| "step": 19525 | |
| }, | |
| { | |
| "epoch": 0.391, | |
| "grad_norm": 0.5427829071455205, | |
| "learning_rate": 6.76688888888889e-06, | |
| "loss": 2.372, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.3915, | |
| "grad_norm": 0.5690660297920415, | |
| "learning_rate": 6.761333333333334e-06, | |
| "loss": 2.3696, | |
| "step": 19575 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.5887280660795969, | |
| "learning_rate": 6.755777777777779e-06, | |
| "loss": 2.3647, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "eval_loss": 2.389928102493286, | |
| "eval_runtime": 31.425, | |
| "eval_samples_per_second": 3.246, | |
| "eval_steps_per_second": 1.623, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.3925, | |
| "grad_norm": 0.5706193677763675, | |
| "learning_rate": 6.750222222222222e-06, | |
| "loss": 2.3693, | |
| "step": 19625 | |
| }, | |
| { | |
| "epoch": 0.393, | |
| "grad_norm": 0.5446782496969111, | |
| "learning_rate": 6.7446666666666674e-06, | |
| "loss": 2.3808, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.3935, | |
| "grad_norm": 0.5571942248079983, | |
| "learning_rate": 6.739111111111112e-06, | |
| "loss": 2.3825, | |
| "step": 19675 | |
| }, | |
| { | |
| "epoch": 0.394, | |
| "grad_norm": 0.5452923856402259, | |
| "learning_rate": 6.733555555555556e-06, | |
| "loss": 2.3689, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.394, | |
| "eval_loss": 2.3896048069000244, | |
| "eval_runtime": 31.5836, | |
| "eval_samples_per_second": 3.23, | |
| "eval_steps_per_second": 1.615, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.3945, | |
| "grad_norm": 0.5828792681612529, | |
| "learning_rate": 6.728e-06, | |
| "loss": 2.3733, | |
| "step": 19725 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 0.5615201455315739, | |
| "learning_rate": 6.722444444444445e-06, | |
| "loss": 2.3689, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.3955, | |
| "grad_norm": 0.5585669738111114, | |
| "learning_rate": 6.7168888888888894e-06, | |
| "loss": 2.3873, | |
| "step": 19775 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 0.5412795214285975, | |
| "learning_rate": 6.711333333333334e-06, | |
| "loss": 2.3786, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "eval_loss": 2.3894851207733154, | |
| "eval_runtime": 31.4877, | |
| "eval_samples_per_second": 3.239, | |
| "eval_steps_per_second": 1.62, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.3965, | |
| "grad_norm": 0.5778930227780084, | |
| "learning_rate": 6.705777777777779e-06, | |
| "loss": 2.3766, | |
| "step": 19825 | |
| }, | |
| { | |
| "epoch": 0.397, | |
| "grad_norm": 0.5682987690385847, | |
| "learning_rate": 6.700222222222223e-06, | |
| "loss": 2.3783, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.3975, | |
| "grad_norm": 0.5763865594632764, | |
| "learning_rate": 6.694666666666667e-06, | |
| "loss": 2.3738, | |
| "step": 19875 | |
| }, | |
| { | |
| "epoch": 0.398, | |
| "grad_norm": 0.5514756259491804, | |
| "learning_rate": 6.6891111111111115e-06, | |
| "loss": 2.3764, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.398, | |
| "eval_loss": 2.388927698135376, | |
| "eval_runtime": 31.7775, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.3985, | |
| "grad_norm": 0.5577240438533453, | |
| "learning_rate": 6.683555555555557e-06, | |
| "loss": 2.374, | |
| "step": 19925 | |
| }, | |
| { | |
| "epoch": 0.399, | |
| "grad_norm": 0.553314104963858, | |
| "learning_rate": 6.678e-06, | |
| "loss": 2.3726, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.3995, | |
| "grad_norm": 0.5615070159418603, | |
| "learning_rate": 6.672444444444445e-06, | |
| "loss": 2.3683, | |
| "step": 19975 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.5595654854755111, | |
| "learning_rate": 6.666888888888889e-06, | |
| "loss": 2.3632, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 2.389249801635742, | |
| "eval_runtime": 31.7934, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4005, | |
| "grad_norm": 0.5697829378233469, | |
| "learning_rate": 6.661333333333334e-06, | |
| "loss": 2.3675, | |
| "step": 20025 | |
| }, | |
| { | |
| "epoch": 0.401, | |
| "grad_norm": 0.5582897347067457, | |
| "learning_rate": 6.655777777777779e-06, | |
| "loss": 2.3672, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.4015, | |
| "grad_norm": 0.5926925535950422, | |
| "learning_rate": 6.650222222222222e-06, | |
| "loss": 2.3733, | |
| "step": 20075 | |
| }, | |
| { | |
| "epoch": 0.402, | |
| "grad_norm": 0.544270592824537, | |
| "learning_rate": 6.644666666666667e-06, | |
| "loss": 2.3803, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.402, | |
| "eval_loss": 2.389204502105713, | |
| "eval_runtime": 31.8367, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.4025, | |
| "grad_norm": 0.5530370407597024, | |
| "learning_rate": 6.639111111111112e-06, | |
| "loss": 2.3633, | |
| "step": 20125 | |
| }, | |
| { | |
| "epoch": 0.403, | |
| "grad_norm": 0.5731039592674091, | |
| "learning_rate": 6.633555555555556e-06, | |
| "loss": 2.3642, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.4035, | |
| "grad_norm": 0.5599029138977244, | |
| "learning_rate": 6.628e-06, | |
| "loss": 2.378, | |
| "step": 20175 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 0.5833746985921849, | |
| "learning_rate": 6.622444444444444e-06, | |
| "loss": 2.3797, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "eval_loss": 2.388874053955078, | |
| "eval_runtime": 31.8821, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 1.6, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.4045, | |
| "grad_norm": 0.5758811776953918, | |
| "learning_rate": 6.6168888888888896e-06, | |
| "loss": 2.3759, | |
| "step": 20225 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 0.559073322750905, | |
| "learning_rate": 6.611333333333334e-06, | |
| "loss": 2.3743, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.4055, | |
| "grad_norm": 0.5638862668814341, | |
| "learning_rate": 6.605777777777778e-06, | |
| "loss": 2.3726, | |
| "step": 20275 | |
| }, | |
| { | |
| "epoch": 0.406, | |
| "grad_norm": 0.5611977328077278, | |
| "learning_rate": 6.600222222222222e-06, | |
| "loss": 2.3704, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.406, | |
| "eval_loss": 2.3888099193573, | |
| "eval_runtime": 31.7076, | |
| "eval_samples_per_second": 3.217, | |
| "eval_steps_per_second": 1.608, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.4065, | |
| "grad_norm": 0.5664333139784736, | |
| "learning_rate": 6.594666666666667e-06, | |
| "loss": 2.3644, | |
| "step": 20325 | |
| }, | |
| { | |
| "epoch": 0.407, | |
| "grad_norm": 0.5549238936705829, | |
| "learning_rate": 6.5891111111111116e-06, | |
| "loss": 2.3594, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.4075, | |
| "grad_norm": 0.56940110218198, | |
| "learning_rate": 6.583555555555556e-06, | |
| "loss": 2.3743, | |
| "step": 20375 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.5757908141952881, | |
| "learning_rate": 6.578000000000001e-06, | |
| "loss": 2.3774, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "eval_loss": 2.3890221118927, | |
| "eval_runtime": 31.8193, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.4085, | |
| "grad_norm": 0.6023338293027314, | |
| "learning_rate": 6.572444444444445e-06, | |
| "loss": 2.3774, | |
| "step": 20425 | |
| }, | |
| { | |
| "epoch": 0.409, | |
| "grad_norm": 0.5398042018053211, | |
| "learning_rate": 6.566888888888889e-06, | |
| "loss": 2.3785, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.4095, | |
| "grad_norm": 0.5961544515028506, | |
| "learning_rate": 6.561333333333334e-06, | |
| "loss": 2.3867, | |
| "step": 20475 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.5517605161130648, | |
| "learning_rate": 6.555777777777779e-06, | |
| "loss": 2.3713, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "eval_loss": 2.38859224319458, | |
| "eval_runtime": 31.8577, | |
| "eval_samples_per_second": 3.202, | |
| "eval_steps_per_second": 1.601, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.4105, | |
| "grad_norm": 0.5753260144360031, | |
| "learning_rate": 6.550222222222222e-06, | |
| "loss": 2.3653, | |
| "step": 20525 | |
| }, | |
| { | |
| "epoch": 0.411, | |
| "grad_norm": 0.6404542212883029, | |
| "learning_rate": 6.544666666666667e-06, | |
| "loss": 2.3869, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.4115, | |
| "grad_norm": 0.5777253920326619, | |
| "learning_rate": 6.539111111111112e-06, | |
| "loss": 2.3813, | |
| "step": 20575 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 0.5698546516216307, | |
| "learning_rate": 6.5335555555555565e-06, | |
| "loss": 2.3775, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "eval_loss": 2.388434648513794, | |
| "eval_runtime": 31.8295, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.602, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.4125, | |
| "grad_norm": 0.5842535685269022, | |
| "learning_rate": 6.528000000000001e-06, | |
| "loss": 2.3896, | |
| "step": 20625 | |
| }, | |
| { | |
| "epoch": 0.413, | |
| "grad_norm": 0.5595088265556925, | |
| "learning_rate": 6.522444444444444e-06, | |
| "loss": 2.3878, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.4135, | |
| "grad_norm": 0.5751254243123975, | |
| "learning_rate": 6.51688888888889e-06, | |
| "loss": 2.367, | |
| "step": 20675 | |
| }, | |
| { | |
| "epoch": 0.414, | |
| "grad_norm": 0.5394876201865446, | |
| "learning_rate": 6.511333333333334e-06, | |
| "loss": 2.3776, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.414, | |
| "eval_loss": 2.3883957862854004, | |
| "eval_runtime": 31.8095, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.603, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.4145, | |
| "grad_norm": 0.5601399673585632, | |
| "learning_rate": 6.5057777777777785e-06, | |
| "loss": 2.3679, | |
| "step": 20725 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 0.5715098373270459, | |
| "learning_rate": 6.500222222222222e-06, | |
| "loss": 2.3811, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.4155, | |
| "grad_norm": 0.5517830411358287, | |
| "learning_rate": 6.494666666666667e-06, | |
| "loss": 2.3723, | |
| "step": 20775 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.5736440167807991, | |
| "learning_rate": 6.489111111111112e-06, | |
| "loss": 2.3804, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "eval_loss": 2.388143539428711, | |
| "eval_runtime": 31.9362, | |
| "eval_samples_per_second": 3.194, | |
| "eval_steps_per_second": 1.597, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.4165, | |
| "grad_norm": 0.5772877970336647, | |
| "learning_rate": 6.483555555555556e-06, | |
| "loss": 2.3721, | |
| "step": 20825 | |
| }, | |
| { | |
| "epoch": 0.417, | |
| "grad_norm": 0.5746556720939705, | |
| "learning_rate": 6.478000000000001e-06, | |
| "loss": 2.3662, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.4175, | |
| "grad_norm": 0.5605696940354651, | |
| "learning_rate": 6.472444444444445e-06, | |
| "loss": 2.3783, | |
| "step": 20875 | |
| }, | |
| { | |
| "epoch": 0.418, | |
| "grad_norm": 0.5474840165552274, | |
| "learning_rate": 6.466888888888889e-06, | |
| "loss": 2.3799, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.418, | |
| "eval_loss": 2.388044595718384, | |
| "eval_runtime": 31.8313, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.4185, | |
| "grad_norm": 0.5663680125421368, | |
| "learning_rate": 6.461333333333334e-06, | |
| "loss": 2.3843, | |
| "step": 20925 | |
| }, | |
| { | |
| "epoch": 0.419, | |
| "grad_norm": 0.5531423851896319, | |
| "learning_rate": 6.455777777777779e-06, | |
| "loss": 2.3661, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.4195, | |
| "grad_norm": 0.5644562314935403, | |
| "learning_rate": 6.450222222222223e-06, | |
| "loss": 2.3762, | |
| "step": 20975 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.5653831391780122, | |
| "learning_rate": 6.444666666666667e-06, | |
| "loss": 2.3588, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "eval_loss": 2.388213872909546, | |
| "eval_runtime": 31.7864, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.4205, | |
| "grad_norm": 0.5447308357523696, | |
| "learning_rate": 6.439111111111111e-06, | |
| "loss": 2.3803, | |
| "step": 21025 | |
| }, | |
| { | |
| "epoch": 0.421, | |
| "grad_norm": 0.5426314550064573, | |
| "learning_rate": 6.4335555555555566e-06, | |
| "loss": 2.3798, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.4215, | |
| "grad_norm": 0.5623213994558643, | |
| "learning_rate": 6.428000000000001e-06, | |
| "loss": 2.3855, | |
| "step": 21075 | |
| }, | |
| { | |
| "epoch": 0.422, | |
| "grad_norm": 0.551782200199429, | |
| "learning_rate": 6.4224444444444445e-06, | |
| "loss": 2.3744, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.422, | |
| "eval_loss": 2.3879234790802, | |
| "eval_runtime": 31.7247, | |
| "eval_samples_per_second": 3.215, | |
| "eval_steps_per_second": 1.608, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.4225, | |
| "grad_norm": 0.527718965025146, | |
| "learning_rate": 6.416888888888889e-06, | |
| "loss": 2.3629, | |
| "step": 21125 | |
| }, | |
| { | |
| "epoch": 0.423, | |
| "grad_norm": 0.5608708238117702, | |
| "learning_rate": 6.411333333333334e-06, | |
| "loss": 2.3775, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.4235, | |
| "grad_norm": 0.5448339479028284, | |
| "learning_rate": 6.405777777777779e-06, | |
| "loss": 2.379, | |
| "step": 21175 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.5418336159854089, | |
| "learning_rate": 6.400222222222223e-06, | |
| "loss": 2.3771, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "eval_loss": 2.3878672122955322, | |
| "eval_runtime": 31.8891, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 1.599, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.4245, | |
| "grad_norm": 0.5765916975285049, | |
| "learning_rate": 6.3946666666666665e-06, | |
| "loss": 2.3838, | |
| "step": 21225 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.5482787584221817, | |
| "learning_rate": 6.389111111111112e-06, | |
| "loss": 2.3751, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.4255, | |
| "grad_norm": 0.5592623692636863, | |
| "learning_rate": 6.383555555555556e-06, | |
| "loss": 2.3714, | |
| "step": 21275 | |
| }, | |
| { | |
| "epoch": 0.426, | |
| "grad_norm": 0.5502456266750644, | |
| "learning_rate": 6.378000000000001e-06, | |
| "loss": 2.3687, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.426, | |
| "eval_loss": 2.387702226638794, | |
| "eval_runtime": 31.8474, | |
| "eval_samples_per_second": 3.203, | |
| "eval_steps_per_second": 1.601, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.4265, | |
| "grad_norm": 0.5508844144432443, | |
| "learning_rate": 6.372444444444444e-06, | |
| "loss": 2.3705, | |
| "step": 21325 | |
| }, | |
| { | |
| "epoch": 0.427, | |
| "grad_norm": 0.5551955771008479, | |
| "learning_rate": 6.366888888888889e-06, | |
| "loss": 2.3616, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.4275, | |
| "grad_norm": 0.5482174863813819, | |
| "learning_rate": 6.361333333333334e-06, | |
| "loss": 2.3679, | |
| "step": 21375 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 0.540793837360148, | |
| "learning_rate": 6.355777777777778e-06, | |
| "loss": 2.3724, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "eval_loss": 2.3876450061798096, | |
| "eval_runtime": 32.2051, | |
| "eval_samples_per_second": 3.167, | |
| "eval_steps_per_second": 1.584, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.4285, | |
| "grad_norm": 0.5478812262209652, | |
| "learning_rate": 6.3502222222222235e-06, | |
| "loss": 2.3639, | |
| "step": 21425 | |
| }, | |
| { | |
| "epoch": 0.429, | |
| "grad_norm": 0.5598419449976438, | |
| "learning_rate": 6.344666666666667e-06, | |
| "loss": 2.3686, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.4295, | |
| "grad_norm": 0.5650989625187698, | |
| "learning_rate": 6.339111111111111e-06, | |
| "loss": 2.3755, | |
| "step": 21475 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.5521104434834965, | |
| "learning_rate": 6.333555555555556e-06, | |
| "loss": 2.3819, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "eval_loss": 2.386732578277588, | |
| "eval_runtime": 32.423, | |
| "eval_samples_per_second": 3.146, | |
| "eval_steps_per_second": 1.573, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.4305, | |
| "grad_norm": 0.5718504697288973, | |
| "learning_rate": 6.328000000000001e-06, | |
| "loss": 2.3768, | |
| "step": 21525 | |
| }, | |
| { | |
| "epoch": 0.431, | |
| "grad_norm": 0.5647383482527034, | |
| "learning_rate": 6.3224444444444455e-06, | |
| "loss": 2.3634, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.4315, | |
| "grad_norm": 0.5740444089490578, | |
| "learning_rate": 6.316888888888889e-06, | |
| "loss": 2.3683, | |
| "step": 21575 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.5468815860778439, | |
| "learning_rate": 6.3113333333333334e-06, | |
| "loss": 2.3775, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "eval_loss": 2.386624813079834, | |
| "eval_runtime": 32.2361, | |
| "eval_samples_per_second": 3.164, | |
| "eval_steps_per_second": 1.582, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.4325, | |
| "grad_norm": 0.5491782166979611, | |
| "learning_rate": 6.305777777777779e-06, | |
| "loss": 2.3678, | |
| "step": 21625 | |
| }, | |
| { | |
| "epoch": 0.433, | |
| "grad_norm": 0.5493956319744467, | |
| "learning_rate": 6.300222222222223e-06, | |
| "loss": 2.3632, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.4335, | |
| "grad_norm": 0.5517199994093782, | |
| "learning_rate": 6.294666666666667e-06, | |
| "loss": 2.3719, | |
| "step": 21675 | |
| }, | |
| { | |
| "epoch": 0.434, | |
| "grad_norm": 0.5480082798934808, | |
| "learning_rate": 6.289111111111111e-06, | |
| "loss": 2.3705, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.434, | |
| "eval_loss": 2.386605978012085, | |
| "eval_runtime": 31.811, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.4345, | |
| "grad_norm": 0.5988374708555845, | |
| "learning_rate": 6.283555555555556e-06, | |
| "loss": 2.3736, | |
| "step": 21725 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 0.5394989364015422, | |
| "learning_rate": 6.278000000000001e-06, | |
| "loss": 2.38, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.4355, | |
| "grad_norm": 0.5660475248416822, | |
| "learning_rate": 6.272444444444445e-06, | |
| "loss": 2.3712, | |
| "step": 21775 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 0.5824076374736812, | |
| "learning_rate": 6.266888888888889e-06, | |
| "loss": 2.3781, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "eval_loss": 2.3868014812469482, | |
| "eval_runtime": 32.0011, | |
| "eval_samples_per_second": 3.187, | |
| "eval_steps_per_second": 1.594, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.4365, | |
| "grad_norm": 0.5604649354431509, | |
| "learning_rate": 6.261333333333334e-06, | |
| "loss": 2.3673, | |
| "step": 21825 | |
| }, | |
| { | |
| "epoch": 0.437, | |
| "grad_norm": 0.5581917280058185, | |
| "learning_rate": 6.255777777777778e-06, | |
| "loss": 2.3575, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 0.5682187519985219, | |
| "learning_rate": 6.250222222222223e-06, | |
| "loss": 2.3752, | |
| "step": 21875 | |
| }, | |
| { | |
| "epoch": 0.438, | |
| "grad_norm": 0.5343819916754123, | |
| "learning_rate": 6.244666666666666e-06, | |
| "loss": 2.3688, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.438, | |
| "eval_loss": 2.3865694999694824, | |
| "eval_runtime": 31.8681, | |
| "eval_samples_per_second": 3.201, | |
| "eval_steps_per_second": 1.6, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.4385, | |
| "grad_norm": 0.6084740129821103, | |
| "learning_rate": 6.2391111111111115e-06, | |
| "loss": 2.3611, | |
| "step": 21925 | |
| }, | |
| { | |
| "epoch": 0.439, | |
| "grad_norm": 0.5550908983577711, | |
| "learning_rate": 6.233555555555556e-06, | |
| "loss": 2.364, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.4395, | |
| "grad_norm": 0.5605896822575689, | |
| "learning_rate": 6.228e-06, | |
| "loss": 2.3875, | |
| "step": 21975 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.5679795530728957, | |
| "learning_rate": 6.222444444444446e-06, | |
| "loss": 2.3637, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_loss": 2.3865110874176025, | |
| "eval_runtime": 31.8116, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.4405, | |
| "grad_norm": 0.5533397760322247, | |
| "learning_rate": 6.216888888888889e-06, | |
| "loss": 2.371, | |
| "step": 22025 | |
| }, | |
| { | |
| "epoch": 0.441, | |
| "grad_norm": 0.5551275205002794, | |
| "learning_rate": 6.2113333333333336e-06, | |
| "loss": 2.3684, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 0.4415, | |
| "grad_norm": 0.5520948023453888, | |
| "learning_rate": 6.205777777777778e-06, | |
| "loss": 2.3602, | |
| "step": 22075 | |
| }, | |
| { | |
| "epoch": 0.442, | |
| "grad_norm": 0.5679529169964138, | |
| "learning_rate": 6.200222222222223e-06, | |
| "loss": 2.3867, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.442, | |
| "eval_loss": 2.3863022327423096, | |
| "eval_runtime": 32.0036, | |
| "eval_samples_per_second": 3.187, | |
| "eval_steps_per_second": 1.594, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.4425, | |
| "grad_norm": 0.5619895216629556, | |
| "learning_rate": 6.194666666666668e-06, | |
| "loss": 2.3701, | |
| "step": 22125 | |
| }, | |
| { | |
| "epoch": 0.443, | |
| "grad_norm": 0.5515875809771505, | |
| "learning_rate": 6.189111111111111e-06, | |
| "loss": 2.3734, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 0.4435, | |
| "grad_norm": 0.5686425996531567, | |
| "learning_rate": 6.1835555555555556e-06, | |
| "loss": 2.3698, | |
| "step": 22175 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 0.5580871882801617, | |
| "learning_rate": 6.178000000000001e-06, | |
| "loss": 2.3676, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "eval_loss": 2.3865246772766113, | |
| "eval_runtime": 31.7174, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.4445, | |
| "grad_norm": 0.5784261034385078, | |
| "learning_rate": 6.172444444444445e-06, | |
| "loss": 2.3723, | |
| "step": 22225 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 0.5570688655308026, | |
| "learning_rate": 6.166888888888889e-06, | |
| "loss": 2.3709, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 0.4455, | |
| "grad_norm": 0.5716930839552549, | |
| "learning_rate": 6.161333333333334e-06, | |
| "loss": 2.3734, | |
| "step": 22275 | |
| }, | |
| { | |
| "epoch": 0.446, | |
| "grad_norm": 0.5550340902020618, | |
| "learning_rate": 6.1557777777777784e-06, | |
| "loss": 2.3648, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.446, | |
| "eval_loss": 2.38633131980896, | |
| "eval_runtime": 31.7943, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.4465, | |
| "grad_norm": 0.5719936248106342, | |
| "learning_rate": 6.150222222222223e-06, | |
| "loss": 2.3751, | |
| "step": 22325 | |
| }, | |
| { | |
| "epoch": 0.447, | |
| "grad_norm": 0.5616671760742846, | |
| "learning_rate": 6.144666666666668e-06, | |
| "loss": 2.3748, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 0.4475, | |
| "grad_norm": 0.5785985644213604, | |
| "learning_rate": 6.139111111111112e-06, | |
| "loss": 2.3837, | |
| "step": 22375 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.5645620599147937, | |
| "learning_rate": 6.133555555555556e-06, | |
| "loss": 2.3745, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "eval_loss": 2.3862569332122803, | |
| "eval_runtime": 31.9593, | |
| "eval_samples_per_second": 3.192, | |
| "eval_steps_per_second": 1.596, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.4485, | |
| "grad_norm": 0.5469950240628229, | |
| "learning_rate": 6.1280000000000005e-06, | |
| "loss": 2.3642, | |
| "step": 22425 | |
| }, | |
| { | |
| "epoch": 0.449, | |
| "grad_norm": 0.5324393599981698, | |
| "learning_rate": 6.122444444444446e-06, | |
| "loss": 2.379, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 0.4495, | |
| "grad_norm": 0.5519962387254249, | |
| "learning_rate": 6.116888888888889e-06, | |
| "loss": 2.3635, | |
| "step": 22475 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5588336399127953, | |
| "learning_rate": 6.111333333333334e-06, | |
| "loss": 2.3718, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "eval_loss": 2.385950803756714, | |
| "eval_runtime": 31.7208, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.4505, | |
| "grad_norm": 0.5923640418917652, | |
| "learning_rate": 6.105777777777778e-06, | |
| "loss": 2.3719, | |
| "step": 22525 | |
| }, | |
| { | |
| "epoch": 0.451, | |
| "grad_norm": 0.5653562982992056, | |
| "learning_rate": 6.100222222222223e-06, | |
| "loss": 2.3808, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 0.4515, | |
| "grad_norm": 0.5636846873459127, | |
| "learning_rate": 6.094666666666668e-06, | |
| "loss": 2.3641, | |
| "step": 22575 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 0.5850003926588586, | |
| "learning_rate": 6.089111111111111e-06, | |
| "loss": 2.3572, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "eval_loss": 2.386296033859253, | |
| "eval_runtime": 31.8709, | |
| "eval_samples_per_second": 3.2, | |
| "eval_steps_per_second": 1.6, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.4525, | |
| "grad_norm": 0.5334735362781007, | |
| "learning_rate": 6.083555555555556e-06, | |
| "loss": 2.3732, | |
| "step": 22625 | |
| }, | |
| { | |
| "epoch": 0.453, | |
| "grad_norm": 0.5809776122118506, | |
| "learning_rate": 6.078000000000001e-06, | |
| "loss": 2.3842, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 0.4535, | |
| "grad_norm": 0.5438625993671827, | |
| "learning_rate": 6.072444444444445e-06, | |
| "loss": 2.3802, | |
| "step": 22675 | |
| }, | |
| { | |
| "epoch": 0.454, | |
| "grad_norm": 0.5581266930595516, | |
| "learning_rate": 6.06688888888889e-06, | |
| "loss": 2.3757, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.454, | |
| "eval_loss": 2.3853445053100586, | |
| "eval_runtime": 31.9465, | |
| "eval_samples_per_second": 3.193, | |
| "eval_steps_per_second": 1.596, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.4545, | |
| "grad_norm": 0.5665471911134969, | |
| "learning_rate": 6.061333333333333e-06, | |
| "loss": 2.3632, | |
| "step": 22725 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 0.5602817372745607, | |
| "learning_rate": 6.0557777777777785e-06, | |
| "loss": 2.3759, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 0.4555, | |
| "grad_norm": 0.5546395592927382, | |
| "learning_rate": 6.050222222222223e-06, | |
| "loss": 2.3654, | |
| "step": 22775 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.5466059675730089, | |
| "learning_rate": 6.044666666666667e-06, | |
| "loss": 2.3747, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "eval_loss": 2.3854382038116455, | |
| "eval_runtime": 31.8135, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.4565, | |
| "grad_norm": 0.556576922176953, | |
| "learning_rate": 6.039111111111111e-06, | |
| "loss": 2.3752, | |
| "step": 22825 | |
| }, | |
| { | |
| "epoch": 0.457, | |
| "grad_norm": 0.5587160453347744, | |
| "learning_rate": 6.033555555555556e-06, | |
| "loss": 2.3753, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 0.4575, | |
| "grad_norm": 0.5581750567947692, | |
| "learning_rate": 6.0280000000000006e-06, | |
| "loss": 2.3744, | |
| "step": 22875 | |
| }, | |
| { | |
| "epoch": 0.458, | |
| "grad_norm": 0.5665211201226871, | |
| "learning_rate": 6.022444444444445e-06, | |
| "loss": 2.3707, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.458, | |
| "eval_loss": 2.3854050636291504, | |
| "eval_runtime": 31.8453, | |
| "eval_samples_per_second": 3.203, | |
| "eval_steps_per_second": 1.601, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.4585, | |
| "grad_norm": 0.559138638343371, | |
| "learning_rate": 6.01688888888889e-06, | |
| "loss": 2.3771, | |
| "step": 22925 | |
| }, | |
| { | |
| "epoch": 0.459, | |
| "grad_norm": 0.5765629867304476, | |
| "learning_rate": 6.011333333333334e-06, | |
| "loss": 2.3751, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 0.4595, | |
| "grad_norm": 0.5697804508664757, | |
| "learning_rate": 6.005777777777778e-06, | |
| "loss": 2.3837, | |
| "step": 22975 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.5813773268685459, | |
| "learning_rate": 6.000222222222223e-06, | |
| "loss": 2.37, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "eval_loss": 2.385390520095825, | |
| "eval_runtime": 31.767, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.605, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.4605, | |
| "grad_norm": 0.5644614073323889, | |
| "learning_rate": 5.994666666666668e-06, | |
| "loss": 2.3627, | |
| "step": 23025 | |
| }, | |
| { | |
| "epoch": 0.461, | |
| "grad_norm": 0.561196100799294, | |
| "learning_rate": 5.989111111111111e-06, | |
| "loss": 2.373, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 0.4615, | |
| "grad_norm": 0.5988172465498709, | |
| "learning_rate": 5.983555555555556e-06, | |
| "loss": 2.3625, | |
| "step": 23075 | |
| }, | |
| { | |
| "epoch": 0.462, | |
| "grad_norm": 0.5561927981892911, | |
| "learning_rate": 5.978e-06, | |
| "loss": 2.366, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.462, | |
| "eval_loss": 2.3851592540740967, | |
| "eval_runtime": 31.9972, | |
| "eval_samples_per_second": 3.188, | |
| "eval_steps_per_second": 1.594, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.4625, | |
| "grad_norm": 0.5473375939412587, | |
| "learning_rate": 5.9724444444444454e-06, | |
| "loss": 2.3577, | |
| "step": 23125 | |
| }, | |
| { | |
| "epoch": 0.463, | |
| "grad_norm": 0.5422432723666715, | |
| "learning_rate": 5.96688888888889e-06, | |
| "loss": 2.3724, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 0.4635, | |
| "grad_norm": 0.5459369802725026, | |
| "learning_rate": 5.961333333333333e-06, | |
| "loss": 2.3693, | |
| "step": 23175 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.5602391995824985, | |
| "learning_rate": 5.955777777777778e-06, | |
| "loss": 2.3662, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "eval_loss": 2.384812593460083, | |
| "eval_runtime": 31.7736, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.4645, | |
| "grad_norm": 0.5382771454200044, | |
| "learning_rate": 5.950222222222223e-06, | |
| "loss": 2.373, | |
| "step": 23225 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 0.5616408548500356, | |
| "learning_rate": 5.9446666666666675e-06, | |
| "loss": 2.3744, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 0.4655, | |
| "grad_norm": 0.5626270768454595, | |
| "learning_rate": 5.939111111111111e-06, | |
| "loss": 2.3745, | |
| "step": 23275 | |
| }, | |
| { | |
| "epoch": 0.466, | |
| "grad_norm": 0.5771198592247021, | |
| "learning_rate": 5.933555555555555e-06, | |
| "loss": 2.3712, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.466, | |
| "eval_loss": 2.385037660598755, | |
| "eval_runtime": 31.6688, | |
| "eval_samples_per_second": 3.221, | |
| "eval_steps_per_second": 1.61, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.4665, | |
| "grad_norm": 0.553677767303205, | |
| "learning_rate": 5.928000000000001e-06, | |
| "loss": 2.3688, | |
| "step": 23325 | |
| }, | |
| { | |
| "epoch": 0.467, | |
| "grad_norm": 0.5761122434148291, | |
| "learning_rate": 5.922444444444445e-06, | |
| "loss": 2.3697, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 0.4675, | |
| "grad_norm": 0.5776134096430138, | |
| "learning_rate": 5.9168888888888895e-06, | |
| "loss": 2.3696, | |
| "step": 23375 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 0.5410943763458229, | |
| "learning_rate": 5.911333333333333e-06, | |
| "loss": 2.3748, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "eval_loss": 2.3850579261779785, | |
| "eval_runtime": 31.7506, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.606, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.4685, | |
| "grad_norm": 0.5496846088073756, | |
| "learning_rate": 5.905777777777778e-06, | |
| "loss": 2.3631, | |
| "step": 23425 | |
| }, | |
| { | |
| "epoch": 0.469, | |
| "grad_norm": 0.5489837887647091, | |
| "learning_rate": 5.900222222222223e-06, | |
| "loss": 2.3752, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 0.4695, | |
| "grad_norm": 0.5595321821458019, | |
| "learning_rate": 5.894666666666667e-06, | |
| "loss": 2.3681, | |
| "step": 23475 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.5441176871533538, | |
| "learning_rate": 5.889111111111112e-06, | |
| "loss": 2.3689, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "eval_loss": 2.3847615718841553, | |
| "eval_runtime": 31.7515, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.4705, | |
| "grad_norm": 0.5591005943894303, | |
| "learning_rate": 5.883555555555556e-06, | |
| "loss": 2.3687, | |
| "step": 23525 | |
| }, | |
| { | |
| "epoch": 0.471, | |
| "grad_norm": 0.5569068986313633, | |
| "learning_rate": 5.878e-06, | |
| "loss": 2.3579, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 0.4715, | |
| "grad_norm": 0.5544550604142251, | |
| "learning_rate": 5.872444444444445e-06, | |
| "loss": 2.3654, | |
| "step": 23575 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.5682698532685105, | |
| "learning_rate": 5.86688888888889e-06, | |
| "loss": 2.3686, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "eval_loss": 2.384906053543091, | |
| "eval_runtime": 31.7623, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.606, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.4725, | |
| "grad_norm": 0.5754081011772445, | |
| "learning_rate": 5.8613333333333335e-06, | |
| "loss": 2.3629, | |
| "step": 23625 | |
| }, | |
| { | |
| "epoch": 0.473, | |
| "grad_norm": 0.605492062724259, | |
| "learning_rate": 5.855777777777778e-06, | |
| "loss": 2.3702, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 0.4735, | |
| "grad_norm": 0.5407520724247802, | |
| "learning_rate": 5.850222222222222e-06, | |
| "loss": 2.3652, | |
| "step": 23675 | |
| }, | |
| { | |
| "epoch": 0.474, | |
| "grad_norm": 0.5531865604429913, | |
| "learning_rate": 5.8446666666666676e-06, | |
| "loss": 2.3724, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.474, | |
| "eval_loss": 2.3844547271728516, | |
| "eval_runtime": 31.833, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.4745, | |
| "grad_norm": 0.573840223481603, | |
| "learning_rate": 5.839111111111112e-06, | |
| "loss": 2.365, | |
| "step": 23725 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 0.545580569851831, | |
| "learning_rate": 5.8335555555555555e-06, | |
| "loss": 2.3813, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 0.4755, | |
| "grad_norm": 0.551471960312376, | |
| "learning_rate": 5.828e-06, | |
| "loss": 2.3617, | |
| "step": 23775 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 0.5953130526303944, | |
| "learning_rate": 5.822444444444445e-06, | |
| "loss": 2.3781, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "eval_loss": 2.38433575630188, | |
| "eval_runtime": 31.8506, | |
| "eval_samples_per_second": 3.202, | |
| "eval_steps_per_second": 1.601, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.4765, | |
| "grad_norm": 0.5604797565202618, | |
| "learning_rate": 5.81688888888889e-06, | |
| "loss": 2.3716, | |
| "step": 23825 | |
| }, | |
| { | |
| "epoch": 0.477, | |
| "grad_norm": 0.554661200228578, | |
| "learning_rate": 5.811333333333333e-06, | |
| "loss": 2.3724, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 0.4775, | |
| "grad_norm": 0.5534736868914567, | |
| "learning_rate": 5.8057777777777775e-06, | |
| "loss": 2.3754, | |
| "step": 23875 | |
| }, | |
| { | |
| "epoch": 0.478, | |
| "grad_norm": 0.541434243018937, | |
| "learning_rate": 5.800222222222223e-06, | |
| "loss": 2.3612, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.478, | |
| "eval_loss": 2.3843014240264893, | |
| "eval_runtime": 31.7803, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.4785, | |
| "grad_norm": 0.5557683143124796, | |
| "learning_rate": 5.794666666666667e-06, | |
| "loss": 2.3639, | |
| "step": 23925 | |
| }, | |
| { | |
| "epoch": 0.479, | |
| "grad_norm": 0.5799527873689908, | |
| "learning_rate": 5.789111111111112e-06, | |
| "loss": 2.373, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 0.4795, | |
| "grad_norm": 0.590904770982699, | |
| "learning_rate": 5.783555555555556e-06, | |
| "loss": 2.3778, | |
| "step": 23975 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.5561040991296016, | |
| "learning_rate": 5.778e-06, | |
| "loss": 2.3552, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 2.3842599391937256, | |
| "eval_runtime": 31.7209, | |
| "eval_samples_per_second": 3.216, | |
| "eval_steps_per_second": 1.608, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.4805, | |
| "grad_norm": 0.5640470742370431, | |
| "learning_rate": 5.772444444444445e-06, | |
| "loss": 2.3622, | |
| "step": 24025 | |
| }, | |
| { | |
| "epoch": 0.481, | |
| "grad_norm": 0.5463055265939479, | |
| "learning_rate": 5.76688888888889e-06, | |
| "loss": 2.3609, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 0.4815, | |
| "grad_norm": 0.566766243472923, | |
| "learning_rate": 5.7613333333333345e-06, | |
| "loss": 2.3824, | |
| "step": 24075 | |
| }, | |
| { | |
| "epoch": 0.482, | |
| "grad_norm": 0.5584478304684121, | |
| "learning_rate": 5.755777777777778e-06, | |
| "loss": 2.3744, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.482, | |
| "eval_loss": 2.384092330932617, | |
| "eval_runtime": 31.7835, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.605, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.4825, | |
| "grad_norm": 0.5731740442874064, | |
| "learning_rate": 5.7502222222222224e-06, | |
| "loss": 2.3733, | |
| "step": 24125 | |
| }, | |
| { | |
| "epoch": 0.483, | |
| "grad_norm": 0.5552901331066319, | |
| "learning_rate": 5.744666666666668e-06, | |
| "loss": 2.3755, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 0.4835, | |
| "grad_norm": 0.5535450397337369, | |
| "learning_rate": 5.739111111111112e-06, | |
| "loss": 2.3777, | |
| "step": 24175 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 0.5622658531288893, | |
| "learning_rate": 5.733555555555556e-06, | |
| "loss": 2.3671, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "eval_loss": 2.3840036392211914, | |
| "eval_runtime": 31.7615, | |
| "eval_samples_per_second": 3.211, | |
| "eval_steps_per_second": 1.606, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.4845, | |
| "grad_norm": 0.5526779804173192, | |
| "learning_rate": 5.728e-06, | |
| "loss": 2.374, | |
| "step": 24225 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 0.5383978006357063, | |
| "learning_rate": 5.722444444444445e-06, | |
| "loss": 2.3664, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 0.4855, | |
| "grad_norm": 0.5542389650019858, | |
| "learning_rate": 5.71688888888889e-06, | |
| "loss": 2.3692, | |
| "step": 24275 | |
| }, | |
| { | |
| "epoch": 0.486, | |
| "grad_norm": 0.5542459781042757, | |
| "learning_rate": 5.711333333333334e-06, | |
| "loss": 2.379, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.486, | |
| "eval_loss": 2.3838605880737305, | |
| "eval_runtime": 31.8313, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.4865, | |
| "grad_norm": 0.5371257785961498, | |
| "learning_rate": 5.705777777777778e-06, | |
| "loss": 2.3759, | |
| "step": 24325 | |
| }, | |
| { | |
| "epoch": 0.487, | |
| "grad_norm": 0.5334074315105899, | |
| "learning_rate": 5.700222222222223e-06, | |
| "loss": 2.3842, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 0.4875, | |
| "grad_norm": 0.5712028005119992, | |
| "learning_rate": 5.694666666666667e-06, | |
| "loss": 2.373, | |
| "step": 24375 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.5527635817323101, | |
| "learning_rate": 5.689111111111112e-06, | |
| "loss": 2.3632, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "eval_loss": 2.383908987045288, | |
| "eval_runtime": 31.8006, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.604, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.4885, | |
| "grad_norm": 0.5497988709199122, | |
| "learning_rate": 5.683555555555555e-06, | |
| "loss": 2.3674, | |
| "step": 24425 | |
| }, | |
| { | |
| "epoch": 0.489, | |
| "grad_norm": 0.5478963614360626, | |
| "learning_rate": 5.6780000000000005e-06, | |
| "loss": 2.3795, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 0.4895, | |
| "grad_norm": 0.5418443665589167, | |
| "learning_rate": 5.672444444444445e-06, | |
| "loss": 2.3769, | |
| "step": 24475 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.5637739038034214, | |
| "learning_rate": 5.666888888888889e-06, | |
| "loss": 2.3754, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "eval_loss": 2.3835647106170654, | |
| "eval_runtime": 31.695, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.4905, | |
| "grad_norm": 0.5352738455560374, | |
| "learning_rate": 5.661333333333335e-06, | |
| "loss": 2.3665, | |
| "step": 24525 | |
| }, | |
| { | |
| "epoch": 0.491, | |
| "grad_norm": 0.5593898219847685, | |
| "learning_rate": 5.655777777777778e-06, | |
| "loss": 2.3621, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 0.4915, | |
| "grad_norm": 0.5340153226573613, | |
| "learning_rate": 5.6502222222222225e-06, | |
| "loss": 2.3704, | |
| "step": 24575 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 0.5434269177198789, | |
| "learning_rate": 5.644666666666667e-06, | |
| "loss": 2.3707, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "eval_loss": 2.38376522064209, | |
| "eval_runtime": 31.8117, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.4925, | |
| "grad_norm": 0.5555073289213541, | |
| "learning_rate": 5.639111111111112e-06, | |
| "loss": 2.3702, | |
| "step": 24625 | |
| }, | |
| { | |
| "epoch": 0.493, | |
| "grad_norm": 0.5608796205061338, | |
| "learning_rate": 5.633555555555557e-06, | |
| "loss": 2.373, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 0.4935, | |
| "grad_norm": 0.5639681025688454, | |
| "learning_rate": 5.628e-06, | |
| "loss": 2.3641, | |
| "step": 24675 | |
| }, | |
| { | |
| "epoch": 0.494, | |
| "grad_norm": 0.5610119210421548, | |
| "learning_rate": 5.6224444444444446e-06, | |
| "loss": 2.372, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.494, | |
| "eval_loss": 2.383573293685913, | |
| "eval_runtime": 31.6948, | |
| "eval_samples_per_second": 3.218, | |
| "eval_steps_per_second": 1.609, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.4945, | |
| "grad_norm": 0.5442392815853518, | |
| "learning_rate": 5.61688888888889e-06, | |
| "loss": 2.3651, | |
| "step": 24725 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 0.5562532962787945, | |
| "learning_rate": 5.611333333333334e-06, | |
| "loss": 2.3705, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 0.4955, | |
| "grad_norm": 0.5488206873990799, | |
| "learning_rate": 5.605777777777778e-06, | |
| "loss": 2.3623, | |
| "step": 24775 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.5653453728755813, | |
| "learning_rate": 5.600222222222222e-06, | |
| "loss": 2.3746, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "eval_loss": 2.383600950241089, | |
| "eval_runtime": 31.8215, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.603, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.4965, | |
| "grad_norm": 0.5714575887868236, | |
| "learning_rate": 5.5946666666666674e-06, | |
| "loss": 2.3698, | |
| "step": 24825 | |
| }, | |
| { | |
| "epoch": 0.497, | |
| "grad_norm": 0.5479503311373944, | |
| "learning_rate": 5.589111111111112e-06, | |
| "loss": 2.3753, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 0.4975, | |
| "grad_norm": 0.5465196721627547, | |
| "learning_rate": 5.583555555555556e-06, | |
| "loss": 2.3627, | |
| "step": 24875 | |
| }, | |
| { | |
| "epoch": 0.498, | |
| "grad_norm": 0.5545182382115218, | |
| "learning_rate": 5.578e-06, | |
| "loss": 2.3623, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.498, | |
| "eval_loss": 2.383317470550537, | |
| "eval_runtime": 31.8409, | |
| "eval_samples_per_second": 3.203, | |
| "eval_steps_per_second": 1.602, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.4985, | |
| "grad_norm": 0.5624766646317664, | |
| "learning_rate": 5.572444444444445e-06, | |
| "loss": 2.3659, | |
| "step": 24925 | |
| }, | |
| { | |
| "epoch": 0.499, | |
| "grad_norm": 0.5642199082921324, | |
| "learning_rate": 5.5668888888888894e-06, | |
| "loss": 2.3684, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 0.4995, | |
| "grad_norm": 0.5917431910025611, | |
| "learning_rate": 5.561333333333334e-06, | |
| "loss": 2.3723, | |
| "step": 24975 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5530201275821488, | |
| "learning_rate": 5.555777777777777e-06, | |
| "loss": 2.3685, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 2.3832170963287354, | |
| "eval_runtime": 31.7959, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.5005, | |
| "grad_norm": 0.5509816083841773, | |
| "learning_rate": 5.550222222222223e-06, | |
| "loss": 2.3559, | |
| "step": 25025 | |
| }, | |
| { | |
| "epoch": 0.501, | |
| "grad_norm": 0.5547472529206742, | |
| "learning_rate": 5.544666666666667e-06, | |
| "loss": 2.3648, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 0.5015, | |
| "grad_norm": 0.546260980184131, | |
| "learning_rate": 5.5391111111111115e-06, | |
| "loss": 2.3701, | |
| "step": 25075 | |
| }, | |
| { | |
| "epoch": 0.502, | |
| "grad_norm": 0.5481216862316385, | |
| "learning_rate": 5.533555555555557e-06, | |
| "loss": 2.3798, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.502, | |
| "eval_loss": 2.38305926322937, | |
| "eval_runtime": 32.0473, | |
| "eval_samples_per_second": 3.183, | |
| "eval_steps_per_second": 1.591, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.5025, | |
| "grad_norm": 0.5670640165543723, | |
| "learning_rate": 5.528e-06, | |
| "loss": 2.3622, | |
| "step": 25125 | |
| }, | |
| { | |
| "epoch": 0.503, | |
| "grad_norm": 0.5463137917421312, | |
| "learning_rate": 5.522444444444445e-06, | |
| "loss": 2.3719, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 0.5035, | |
| "grad_norm": 0.5400999701410277, | |
| "learning_rate": 5.516888888888889e-06, | |
| "loss": 2.3616, | |
| "step": 25175 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 0.5802126499364532, | |
| "learning_rate": 5.511333333333334e-06, | |
| "loss": 2.3721, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "eval_loss": 2.3829147815704346, | |
| "eval_runtime": 31.7438, | |
| "eval_samples_per_second": 3.213, | |
| "eval_steps_per_second": 1.607, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.5045, | |
| "grad_norm": 0.5435607747773122, | |
| "learning_rate": 5.505777777777779e-06, | |
| "loss": 2.3603, | |
| "step": 25225 | |
| }, | |
| { | |
| "epoch": 0.505, | |
| "grad_norm": 0.5453890322127348, | |
| "learning_rate": 5.500222222222222e-06, | |
| "loss": 2.3636, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 0.5055, | |
| "grad_norm": 0.5477131217196112, | |
| "learning_rate": 5.494666666666667e-06, | |
| "loss": 2.3697, | |
| "step": 25275 | |
| }, | |
| { | |
| "epoch": 0.506, | |
| "grad_norm": 0.5621665226631756, | |
| "learning_rate": 5.489111111111112e-06, | |
| "loss": 2.3687, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.506, | |
| "eval_loss": 2.3831355571746826, | |
| "eval_runtime": 31.7979, | |
| "eval_samples_per_second": 3.208, | |
| "eval_steps_per_second": 1.604, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.5065, | |
| "grad_norm": 0.5622191727496813, | |
| "learning_rate": 5.483555555555556e-06, | |
| "loss": 2.368, | |
| "step": 25325 | |
| }, | |
| { | |
| "epoch": 0.507, | |
| "grad_norm": 0.5375310388584507, | |
| "learning_rate": 5.478e-06, | |
| "loss": 2.3617, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 0.5075, | |
| "grad_norm": 0.5421092937376346, | |
| "learning_rate": 5.472444444444444e-06, | |
| "loss": 2.3759, | |
| "step": 25375 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": 0.5726686989658507, | |
| "learning_rate": 5.4668888888888896e-06, | |
| "loss": 2.37, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "eval_loss": 2.383046865463257, | |
| "eval_runtime": 31.8165, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.5085, | |
| "grad_norm": 0.536904504012326, | |
| "learning_rate": 5.461333333333334e-06, | |
| "loss": 2.3683, | |
| "step": 25425 | |
| }, | |
| { | |
| "epoch": 0.509, | |
| "grad_norm": 0.5792290465322086, | |
| "learning_rate": 5.455777777777778e-06, | |
| "loss": 2.3641, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 0.5095, | |
| "grad_norm": 0.5667490944788528, | |
| "learning_rate": 5.450222222222222e-06, | |
| "loss": 2.3673, | |
| "step": 25475 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.5581091402617585, | |
| "learning_rate": 5.444666666666667e-06, | |
| "loss": 2.374, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "eval_loss": 2.3831074237823486, | |
| "eval_runtime": 31.8462, | |
| "eval_samples_per_second": 3.203, | |
| "eval_steps_per_second": 1.601, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.5105, | |
| "grad_norm": 0.5629059983127724, | |
| "learning_rate": 5.4391111111111116e-06, | |
| "loss": 2.376, | |
| "step": 25525 | |
| }, | |
| { | |
| "epoch": 0.511, | |
| "grad_norm": 0.5600711744363054, | |
| "learning_rate": 5.433555555555556e-06, | |
| "loss": 2.3702, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 0.5115, | |
| "grad_norm": 0.5500784026204207, | |
| "learning_rate": 5.4279999999999995e-06, | |
| "loss": 2.3704, | |
| "step": 25575 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.553377338742942, | |
| "learning_rate": 5.422444444444445e-06, | |
| "loss": 2.3644, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "eval_loss": 2.3826544284820557, | |
| "eval_runtime": 31.7739, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.5125, | |
| "grad_norm": 0.5861763037221558, | |
| "learning_rate": 5.416888888888889e-06, | |
| "loss": 2.3658, | |
| "step": 25625 | |
| }, | |
| { | |
| "epoch": 0.513, | |
| "grad_norm": 0.5538084648071333, | |
| "learning_rate": 5.411333333333334e-06, | |
| "loss": 2.3693, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 0.5135, | |
| "grad_norm": 0.5699472071254841, | |
| "learning_rate": 5.405777777777779e-06, | |
| "loss": 2.3707, | |
| "step": 25675 | |
| }, | |
| { | |
| "epoch": 0.514, | |
| "grad_norm": 0.5440880568370218, | |
| "learning_rate": 5.400222222222222e-06, | |
| "loss": 2.3664, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.514, | |
| "eval_loss": 2.382906675338745, | |
| "eval_runtime": 31.7874, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.5145, | |
| "grad_norm": 0.551256815387497, | |
| "learning_rate": 5.394666666666667e-06, | |
| "loss": 2.3608, | |
| "step": 25725 | |
| }, | |
| { | |
| "epoch": 0.515, | |
| "grad_norm": 0.552653919875225, | |
| "learning_rate": 5.389111111111112e-06, | |
| "loss": 2.3648, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 0.5155, | |
| "grad_norm": 0.5489775829628063, | |
| "learning_rate": 5.3835555555555565e-06, | |
| "loss": 2.368, | |
| "step": 25775 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": 0.545224524462321, | |
| "learning_rate": 5.378e-06, | |
| "loss": 2.37, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "eval_loss": 2.382946491241455, | |
| "eval_runtime": 31.8142, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.5165, | |
| "grad_norm": 0.6177434912819645, | |
| "learning_rate": 5.372444444444444e-06, | |
| "loss": 2.3576, | |
| "step": 25825 | |
| }, | |
| { | |
| "epoch": 0.517, | |
| "grad_norm": 0.5731672053410489, | |
| "learning_rate": 5.36688888888889e-06, | |
| "loss": 2.3641, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 0.5175, | |
| "grad_norm": 0.547417736306074, | |
| "learning_rate": 5.361333333333334e-06, | |
| "loss": 2.3669, | |
| "step": 25875 | |
| }, | |
| { | |
| "epoch": 0.518, | |
| "grad_norm": 0.5666721324439973, | |
| "learning_rate": 5.3557777777777785e-06, | |
| "loss": 2.3633, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.518, | |
| "eval_loss": 2.3824901580810547, | |
| "eval_runtime": 31.8236, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 1.603, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.5185, | |
| "grad_norm": 0.5493694553264233, | |
| "learning_rate": 5.350222222222222e-06, | |
| "loss": 2.3676, | |
| "step": 25925 | |
| }, | |
| { | |
| "epoch": 0.519, | |
| "grad_norm": 0.5581911332398992, | |
| "learning_rate": 5.344666666666667e-06, | |
| "loss": 2.3665, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 0.5195, | |
| "grad_norm": 0.5523156791576098, | |
| "learning_rate": 5.339111111111112e-06, | |
| "loss": 2.3634, | |
| "step": 25975 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.5394984851015033, | |
| "learning_rate": 5.333555555555556e-06, | |
| "loss": 2.3693, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_loss": 2.3825063705444336, | |
| "eval_runtime": 31.7579, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.5205, | |
| "grad_norm": 0.5425846904290578, | |
| "learning_rate": 5.328000000000001e-06, | |
| "loss": 2.3675, | |
| "step": 26025 | |
| }, | |
| { | |
| "epoch": 0.521, | |
| "grad_norm": 0.5621800567569987, | |
| "learning_rate": 5.322444444444445e-06, | |
| "loss": 2.3759, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 0.5215, | |
| "grad_norm": 0.5544103291449336, | |
| "learning_rate": 5.316888888888889e-06, | |
| "loss": 2.3576, | |
| "step": 26075 | |
| }, | |
| { | |
| "epoch": 0.522, | |
| "grad_norm": 0.550125457461572, | |
| "learning_rate": 5.311333333333334e-06, | |
| "loss": 2.3567, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.522, | |
| "eval_loss": 2.382749319076538, | |
| "eval_runtime": 31.8184, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.5225, | |
| "grad_norm": 0.5441956885780074, | |
| "learning_rate": 5.305777777777779e-06, | |
| "loss": 2.3562, | |
| "step": 26125 | |
| }, | |
| { | |
| "epoch": 0.523, | |
| "grad_norm": 0.5677266247403775, | |
| "learning_rate": 5.3002222222222225e-06, | |
| "loss": 2.3666, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 0.5235, | |
| "grad_norm": 0.5396975563673215, | |
| "learning_rate": 5.294666666666667e-06, | |
| "loss": 2.351, | |
| "step": 26175 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": 0.5374437057610971, | |
| "learning_rate": 5.289111111111111e-06, | |
| "loss": 2.3625, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "eval_loss": 2.3822991847991943, | |
| "eval_runtime": 31.8822, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 1.6, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.5245, | |
| "grad_norm": 0.5627076715491244, | |
| "learning_rate": 5.2835555555555566e-06, | |
| "loss": 2.3699, | |
| "step": 26225 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.5430691314388109, | |
| "learning_rate": 5.278000000000001e-06, | |
| "loss": 2.3648, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 0.5255, | |
| "grad_norm": 0.5319128139639624, | |
| "learning_rate": 5.2724444444444445e-06, | |
| "loss": 2.3722, | |
| "step": 26275 | |
| }, | |
| { | |
| "epoch": 0.526, | |
| "grad_norm": 0.5560009569047116, | |
| "learning_rate": 5.266888888888889e-06, | |
| "loss": 2.3763, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.526, | |
| "eval_loss": 2.3822247982025146, | |
| "eval_runtime": 31.7558, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.5265, | |
| "grad_norm": 0.5586923319248112, | |
| "learning_rate": 5.261333333333334e-06, | |
| "loss": 2.366, | |
| "step": 26325 | |
| }, | |
| { | |
| "epoch": 0.527, | |
| "grad_norm": 0.5621950392943218, | |
| "learning_rate": 5.255777777777779e-06, | |
| "loss": 2.3713, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 0.5275, | |
| "grad_norm": 0.5630783729958978, | |
| "learning_rate": 5.250222222222222e-06, | |
| "loss": 2.3508, | |
| "step": 26375 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.5543463911581646, | |
| "learning_rate": 5.2446666666666665e-06, | |
| "loss": 2.3588, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "eval_loss": 2.3820412158966064, | |
| "eval_runtime": 31.7735, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.5285, | |
| "grad_norm": 0.5521701819516005, | |
| "learning_rate": 5.239111111111112e-06, | |
| "loss": 2.3798, | |
| "step": 26425 | |
| }, | |
| { | |
| "epoch": 0.529, | |
| "grad_norm": 0.5697290541696707, | |
| "learning_rate": 5.233555555555556e-06, | |
| "loss": 2.3705, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 0.5295, | |
| "grad_norm": 0.5456656767494042, | |
| "learning_rate": 5.228000000000001e-06, | |
| "loss": 2.3603, | |
| "step": 26475 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.544157308823069, | |
| "learning_rate": 5.222444444444444e-06, | |
| "loss": 2.3598, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "eval_loss": 2.3819408416748047, | |
| "eval_runtime": 31.804, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.604, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.5305, | |
| "grad_norm": 0.5399718074412095, | |
| "learning_rate": 5.216888888888889e-06, | |
| "loss": 2.3765, | |
| "step": 26525 | |
| }, | |
| { | |
| "epoch": 0.531, | |
| "grad_norm": 0.542440216852853, | |
| "learning_rate": 5.211333333333334e-06, | |
| "loss": 2.3758, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 0.5315, | |
| "grad_norm": 0.5648571300651365, | |
| "learning_rate": 5.205777777777778e-06, | |
| "loss": 2.3685, | |
| "step": 26575 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": 0.573442767423831, | |
| "learning_rate": 5.2002222222222235e-06, | |
| "loss": 2.3556, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "eval_loss": 2.382056951522827, | |
| "eval_runtime": 31.8038, | |
| "eval_samples_per_second": 3.207, | |
| "eval_steps_per_second": 1.604, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.5325, | |
| "grad_norm": 0.6056414806190663, | |
| "learning_rate": 5.194666666666667e-06, | |
| "loss": 2.3595, | |
| "step": 26625 | |
| }, | |
| { | |
| "epoch": 0.533, | |
| "grad_norm": 0.5481757619700885, | |
| "learning_rate": 5.189111111111111e-06, | |
| "loss": 2.3727, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 0.5335, | |
| "grad_norm": 0.5610562792027696, | |
| "learning_rate": 5.183555555555556e-06, | |
| "loss": 2.3673, | |
| "step": 26675 | |
| }, | |
| { | |
| "epoch": 0.534, | |
| "grad_norm": 0.5702347426339772, | |
| "learning_rate": 5.178000000000001e-06, | |
| "loss": 2.3622, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.534, | |
| "eval_loss": 2.381828546524048, | |
| "eval_runtime": 31.992, | |
| "eval_samples_per_second": 3.188, | |
| "eval_steps_per_second": 1.594, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.5345, | |
| "grad_norm": 0.5565593579595437, | |
| "learning_rate": 5.172444444444445e-06, | |
| "loss": 2.3651, | |
| "step": 26725 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 0.5398272748687973, | |
| "learning_rate": 5.166888888888889e-06, | |
| "loss": 2.3703, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 0.5355, | |
| "grad_norm": 0.5611538131409728, | |
| "learning_rate": 5.1613333333333334e-06, | |
| "loss": 2.3778, | |
| "step": 26775 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.5436520053621182, | |
| "learning_rate": 5.155777777777779e-06, | |
| "loss": 2.3561, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "eval_loss": 2.381396532058716, | |
| "eval_runtime": 31.772, | |
| "eval_samples_per_second": 3.21, | |
| "eval_steps_per_second": 1.605, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.5365, | |
| "grad_norm": 0.5574841239488896, | |
| "learning_rate": 5.150222222222223e-06, | |
| "loss": 2.3607, | |
| "step": 26825 | |
| }, | |
| { | |
| "epoch": 0.537, | |
| "grad_norm": 0.5459267231396281, | |
| "learning_rate": 5.144666666666667e-06, | |
| "loss": 2.3652, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 0.5375, | |
| "grad_norm": 0.5764624554311072, | |
| "learning_rate": 5.139111111111111e-06, | |
| "loss": 2.3748, | |
| "step": 26875 | |
| }, | |
| { | |
| "epoch": 0.538, | |
| "grad_norm": 0.5452582655691465, | |
| "learning_rate": 5.133555555555556e-06, | |
| "loss": 2.3751, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.538, | |
| "eval_loss": 2.3815813064575195, | |
| "eval_runtime": 31.833, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.5385, | |
| "grad_norm": 0.5591974032204698, | |
| "learning_rate": 5.128000000000001e-06, | |
| "loss": 2.3595, | |
| "step": 26925 | |
| }, | |
| { | |
| "epoch": 0.539, | |
| "grad_norm": 0.5910956937930101, | |
| "learning_rate": 5.122444444444444e-06, | |
| "loss": 2.3712, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 0.5395, | |
| "grad_norm": 0.5532516136915937, | |
| "learning_rate": 5.116888888888889e-06, | |
| "loss": 2.3673, | |
| "step": 26975 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.5654498740726267, | |
| "learning_rate": 5.111333333333334e-06, | |
| "loss": 2.3667, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "eval_loss": 2.3814122676849365, | |
| "eval_runtime": 31.7588, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.5405, | |
| "grad_norm": 0.5639894142193489, | |
| "learning_rate": 5.105777777777778e-06, | |
| "loss": 2.3604, | |
| "step": 27025 | |
| }, | |
| { | |
| "epoch": 0.541, | |
| "grad_norm": 0.5650474829629732, | |
| "learning_rate": 5.100222222222223e-06, | |
| "loss": 2.3615, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 0.5415, | |
| "grad_norm": 0.5549449402784257, | |
| "learning_rate": 5.094666666666666e-06, | |
| "loss": 2.3679, | |
| "step": 27075 | |
| }, | |
| { | |
| "epoch": 0.542, | |
| "grad_norm": 0.5615002192664388, | |
| "learning_rate": 5.0891111111111115e-06, | |
| "loss": 2.3634, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.542, | |
| "eval_loss": 2.381121873855591, | |
| "eval_runtime": 31.7586, | |
| "eval_samples_per_second": 3.212, | |
| "eval_steps_per_second": 1.606, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.5425, | |
| "grad_norm": 0.5403095468370492, | |
| "learning_rate": 5.083555555555556e-06, | |
| "loss": 2.3665, | |
| "step": 27125 | |
| }, | |
| { | |
| "epoch": 0.543, | |
| "grad_norm": 0.5421716749680758, | |
| "learning_rate": 5.078e-06, | |
| "loss": 2.369, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 0.5435, | |
| "grad_norm": 0.5590064616229682, | |
| "learning_rate": 5.072444444444446e-06, | |
| "loss": 2.3594, | |
| "step": 27175 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.5444799207706167, | |
| "learning_rate": 5.066888888888889e-06, | |
| "loss": 2.3582, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "eval_loss": 2.3811404705047607, | |
| "eval_runtime": 31.8368, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.5445, | |
| "grad_norm": 0.5694522608963828, | |
| "learning_rate": 5.0613333333333336e-06, | |
| "loss": 2.3651, | |
| "step": 27225 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 0.5357232316900923, | |
| "learning_rate": 5.055777777777778e-06, | |
| "loss": 2.3595, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 0.5455, | |
| "grad_norm": 0.5449200504756736, | |
| "learning_rate": 5.050222222222223e-06, | |
| "loss": 2.3563, | |
| "step": 27275 | |
| }, | |
| { | |
| "epoch": 0.546, | |
| "grad_norm": 0.5669179572699722, | |
| "learning_rate": 5.044666666666667e-06, | |
| "loss": 2.3705, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.546, | |
| "eval_loss": 2.3810057640075684, | |
| "eval_runtime": 31.7869, | |
| "eval_samples_per_second": 3.209, | |
| "eval_steps_per_second": 1.604, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.5465, | |
| "grad_norm": 0.5536644347581473, | |
| "learning_rate": 5.039111111111111e-06, | |
| "loss": 2.3658, | |
| "step": 27325 | |
| }, | |
| { | |
| "epoch": 0.547, | |
| "grad_norm": 0.5774297317851765, | |
| "learning_rate": 5.0335555555555556e-06, | |
| "loss": 2.3553, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 0.5475, | |
| "grad_norm": 0.567395549600367, | |
| "learning_rate": 5.028000000000001e-06, | |
| "loss": 2.3694, | |
| "step": 27375 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": 0.5501789999743681, | |
| "learning_rate": 5.022444444444445e-06, | |
| "loss": 2.3643, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "eval_loss": 2.3811025619506836, | |
| "eval_runtime": 31.9197, | |
| "eval_samples_per_second": 3.196, | |
| "eval_steps_per_second": 1.598, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.5485, | |
| "grad_norm": 0.5719215133111718, | |
| "learning_rate": 5.016888888888889e-06, | |
| "loss": 2.365, | |
| "step": 27425 | |
| }, | |
| { | |
| "epoch": 0.549, | |
| "grad_norm": 0.5899241097551456, | |
| "learning_rate": 5.011333333333333e-06, | |
| "loss": 2.3774, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 0.5495, | |
| "grad_norm": 0.5731413292155066, | |
| "learning_rate": 5.0057777777777784e-06, | |
| "loss": 2.3706, | |
| "step": 27475 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.5425656065958468, | |
| "learning_rate": 5.000222222222223e-06, | |
| "loss": 2.3566, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "eval_loss": 2.380763292312622, | |
| "eval_runtime": 31.8162, | |
| "eval_samples_per_second": 3.206, | |
| "eval_steps_per_second": 1.603, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.5505, | |
| "grad_norm": 0.5601626399029922, | |
| "learning_rate": 4.994666666666667e-06, | |
| "loss": 2.3762, | |
| "step": 27525 | |
| }, | |
| { | |
| "epoch": 0.551, | |
| "grad_norm": 0.5715204135637444, | |
| "learning_rate": 4.989111111111112e-06, | |
| "loss": 2.363, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 0.5515, | |
| "grad_norm": 0.547533853702179, | |
| "learning_rate": 4.983555555555556e-06, | |
| "loss": 2.3659, | |
| "step": 27575 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.5817399132816639, | |
| "learning_rate": 4.9780000000000005e-06, | |
| "loss": 2.3693, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "eval_loss": 2.3807787895202637, | |
| "eval_runtime": 31.8396, | |
| "eval_samples_per_second": 3.204, | |
| "eval_steps_per_second": 1.602, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.5525, | |
| "grad_norm": 0.544660595894246, | |
| "learning_rate": 4.972444444444445e-06, | |
| "loss": 2.3661, | |
| "step": 27625 | |
| }, | |
| { | |
| "epoch": 0.553, | |
| "grad_norm": 0.5813863819688693, | |
| "learning_rate": 4.966888888888889e-06, | |
| "loss": 2.365, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 0.5535, | |
| "grad_norm": 0.555794514365692, | |
| "learning_rate": 4.961333333333334e-06, | |
| "loss": 2.3724, | |
| "step": 27675 | |
| }, | |
| { | |
| "epoch": 0.554, | |
| "grad_norm": 0.5549771654031, | |
| "learning_rate": 4.955777777777778e-06, | |
| "loss": 2.3712, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.554, | |
| "eval_loss": 2.380859613418579, | |
| "eval_runtime": 32.035, | |
| "eval_samples_per_second": 3.184, | |
| "eval_steps_per_second": 1.592, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.5545, | |
| "grad_norm": 0.5660580874490311, | |
| "learning_rate": 4.9502222222222225e-06, | |
| "loss": 2.3626, | |
| "step": 27725 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 0.5408935222204184, | |
| "learning_rate": 4.944666666666667e-06, | |
| "loss": 2.3546, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 0.5555, | |
| "grad_norm": 0.5574539497290301, | |
| "learning_rate": 4.939111111111112e-06, | |
| "loss": 2.3503, | |
| "step": 27775 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": 0.5733587459238179, | |
| "learning_rate": 4.933555555555556e-06, | |
| "loss": 2.3787, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "eval_loss": 2.380819082260132, | |
| "eval_runtime": 31.8731, | |
| "eval_samples_per_second": 3.2, | |
| "eval_steps_per_second": 1.6, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.5565, | |
| "grad_norm": 0.5469010479471977, | |
| "learning_rate": 4.928000000000001e-06, | |
| "loss": 2.3728, | |
| "step": 27825 | |
| }, | |
| { | |
| "epoch": 0.557, | |
| "grad_norm": 0.5575923461377743, | |
| "learning_rate": 4.9224444444444445e-06, | |
| "loss": 2.3587, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 0.5575, | |
| "grad_norm": 0.5484615569385746, | |
| "learning_rate": 4.91688888888889e-06, | |
| "loss": 2.3554, | |
| "step": 27875 | |
| }, | |
| { | |
| "epoch": 0.558, | |
| "grad_norm": 0.5700580906470195, | |
| "learning_rate": 4.911333333333333e-06, | |
| "loss": 2.3591, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.558, | |
| "eval_loss": 2.380748748779297, | |
| "eval_runtime": 31.8799, | |
| "eval_samples_per_second": 3.2, | |
| "eval_steps_per_second": 1.6, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.5585, | |
| "grad_norm": 0.5644741625244013, | |
| "learning_rate": 4.9057777777777785e-06, | |
| "loss": 2.3573, | |
| "step": 27925 | |
| }, | |
| { | |
| "epoch": 0.559, | |
| "grad_norm": 0.5518750142742082, | |
| "learning_rate": 4.900222222222223e-06, | |
| "loss": 2.3722, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 0.5595, | |
| "grad_norm": 0.5570570164343176, | |
| "learning_rate": 4.894666666666667e-06, | |
| "loss": 2.3644, | |
| "step": 27975 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5454507656456767, | |
| "learning_rate": 4.889111111111112e-06, | |
| "loss": 2.3545, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 2.380680799484253, | |
| "eval_runtime": 31.8506, | |
| "eval_samples_per_second": 3.202, | |
| "eval_steps_per_second": 1.601, | |
| "step": 28000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 50000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.91296643531617e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |