{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.255076102306606,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002510591558135886,
      "grad_norm": 2.53125,
      "learning_rate": 1.9964e-05,
      "loss": 1.2975,
      "step": 10
    },
    {
      "epoch": 0.005021183116271772,
      "grad_norm": 1.859375,
      "learning_rate": 1.9924e-05,
      "loss": 0.6792,
      "step": 20
    },
    {
      "epoch": 0.007531774674407657,
      "grad_norm": 1.921875,
      "learning_rate": 1.9884e-05,
      "loss": 0.6571,
      "step": 30
    },
    {
      "epoch": 0.010042366232543544,
      "grad_norm": 1.8046875,
      "learning_rate": 1.9844000000000002e-05,
      "loss": 0.6431,
      "step": 40
    },
    {
      "epoch": 0.012552957790679428,
      "grad_norm": 1.8984375,
      "learning_rate": 1.9804000000000002e-05,
      "loss": 0.6739,
      "step": 50
    },
    {
      "epoch": 0.015063549348815314,
      "grad_norm": 1.6953125,
      "learning_rate": 1.9764000000000003e-05,
      "loss": 0.6491,
      "step": 60
    },
    {
      "epoch": 0.0175741409069512,
      "grad_norm": 1.6796875,
      "learning_rate": 1.9724e-05,
      "loss": 0.5911,
      "step": 70
    },
    {
      "epoch": 0.020084732465087088,
      "grad_norm": 2.0,
      "learning_rate": 1.9684e-05,
      "loss": 0.6117,
      "step": 80
    },
    {
      "epoch": 0.022595324023222972,
      "grad_norm": 1.734375,
      "learning_rate": 1.9644e-05,
      "loss": 0.5892,
      "step": 90
    },
    {
      "epoch": 0.025105915581358856,
      "grad_norm": 2.125,
      "learning_rate": 1.9604e-05,
      "loss": 0.5893,
      "step": 100
    },
    {
      "epoch": 0.027616507139494744,
      "grad_norm": 1.953125,
      "learning_rate": 1.9564e-05,
      "loss": 0.6412,
      "step": 110
    },
    {
      "epoch": 0.03012709869763063,
      "grad_norm": 1.796875,
      "learning_rate": 1.9524e-05,
      "loss": 0.5514,
      "step": 120
    },
    {
      "epoch": 0.032637690255766516,
      "grad_norm": 1.7421875,
      "learning_rate": 1.9484000000000002e-05,
      "loss": 0.6029,
      "step": 130
    },
    {
      "epoch": 0.0351482818139024,
      "grad_norm": 1.78125,
      "learning_rate": 1.9444000000000002e-05,
      "loss": 0.5723,
      "step": 140
    },
    {
      "epoch": 0.037658873372038285,
      "grad_norm": 1.9453125,
      "learning_rate": 1.9404e-05,
      "loss": 0.5721,
      "step": 150
    },
    {
      "epoch": 0.040169464930174176,
      "grad_norm": 2.0625,
      "learning_rate": 1.9364e-05,
      "loss": 0.5911,
      "step": 160
    },
    {
      "epoch": 0.04268005648831006,
      "grad_norm": 1.7265625,
      "learning_rate": 1.9324e-05,
      "loss": 0.5532,
      "step": 170
    },
    {
      "epoch": 0.045190648046445944,
      "grad_norm": 1.796875,
      "learning_rate": 1.9284e-05,
      "loss": 0.5836,
      "step": 180
    },
    {
      "epoch": 0.04770123960458183,
      "grad_norm": 1.890625,
      "learning_rate": 1.9244000000000004e-05,
      "loss": 0.5294,
      "step": 190
    },
    {
      "epoch": 0.05021183116271771,
      "grad_norm": 1.8125,
      "learning_rate": 1.9204e-05,
      "loss": 0.5675,
      "step": 200
    },
    {
      "epoch": 0.052722422720853604,
      "grad_norm": 1.421875,
      "learning_rate": 1.9164e-05,
      "loss": 0.4689,
      "step": 210
    },
    {
      "epoch": 0.05523301427898949,
      "grad_norm": 1.6484375,
      "learning_rate": 1.9124000000000002e-05,
      "loss": 0.5765,
      "step": 220
    },
    {
      "epoch": 0.05774360583712537,
      "grad_norm": 2.078125,
      "learning_rate": 1.9084000000000002e-05,
      "loss": 0.5369,
      "step": 230
    },
    {
      "epoch": 0.06025419739526126,
      "grad_norm": 1.65625,
      "learning_rate": 1.9044000000000003e-05,
      "loss": 0.4895,
      "step": 240
    },
    {
      "epoch": 0.06276478895339714,
      "grad_norm": 2.234375,
      "learning_rate": 1.9004000000000003e-05,
      "loss": 0.5191,
      "step": 250
    },
    {
      "epoch": 0.06527538051153303,
      "grad_norm": 1.90625,
      "learning_rate": 1.8964000000000003e-05,
      "loss": 0.5655,
      "step": 260
    },
    {
      "epoch": 0.06778597206966891,
      "grad_norm": 1.9921875,
      "learning_rate": 1.8924000000000004e-05,
      "loss": 0.5454,
      "step": 270
    },
    {
      "epoch": 0.0702965636278048,
      "grad_norm": 1.9765625,
      "learning_rate": 1.8884e-05,
      "loss": 0.5656,
      "step": 280
    },
    {
      "epoch": 0.07280715518594069,
      "grad_norm": 1.7890625,
      "learning_rate": 1.8844e-05,
      "loss": 0.481,
      "step": 290
    },
    {
      "epoch": 0.07531774674407657,
      "grad_norm": 1.6328125,
      "learning_rate": 1.8804e-05,
      "loss": 0.5365,
      "step": 300
    },
    {
      "epoch": 0.07782833830221246,
      "grad_norm": 1.7578125,
      "learning_rate": 1.8764000000000002e-05,
      "loss": 0.5271,
      "step": 310
    },
    {
      "epoch": 0.08033892986034835,
      "grad_norm": 1.65625,
      "learning_rate": 1.8724000000000002e-05,
      "loss": 0.5145,
      "step": 320
    },
    {
      "epoch": 0.08284952141848423,
      "grad_norm": 1.90625,
      "learning_rate": 1.8684000000000003e-05,
      "loss": 0.5129,
      "step": 330
    },
    {
      "epoch": 0.08536011297662012,
      "grad_norm": 1.6484375,
      "learning_rate": 1.8644000000000003e-05,
      "loss": 0.5539,
      "step": 340
    },
    {
      "epoch": 0.087870704534756,
      "grad_norm": 1.9453125,
      "learning_rate": 1.8604000000000003e-05,
      "loss": 0.5051,
      "step": 350
    },
    {
      "epoch": 0.09038129609289189,
      "grad_norm": 2.09375,
      "learning_rate": 1.8564e-05,
      "loss": 0.5034,
      "step": 360
    },
    {
      "epoch": 0.09289188765102778,
      "grad_norm": 1.625,
      "learning_rate": 1.8524e-05,
      "loss": 0.4867,
      "step": 370
    },
    {
      "epoch": 0.09540247920916366,
      "grad_norm": 1.71875,
      "learning_rate": 1.8484e-05,
      "loss": 0.5237,
      "step": 380
    },
    {
      "epoch": 0.09791307076729955,
      "grad_norm": 2.109375,
      "learning_rate": 1.8444e-05,
      "loss": 0.5305,
      "step": 390
    },
    {
      "epoch": 0.10042366232543543,
      "grad_norm": 1.859375,
      "learning_rate": 1.8404000000000002e-05,
      "loss": 0.498,
      "step": 400
    },
    {
      "epoch": 0.10293425388357132,
      "grad_norm": 1.921875,
      "learning_rate": 1.8364000000000002e-05,
      "loss": 0.512,
      "step": 410
    },
    {
      "epoch": 0.10544484544170721,
      "grad_norm": 2.046875,
      "learning_rate": 1.8324000000000003e-05,
      "loss": 0.4874,
      "step": 420
    },
    {
      "epoch": 0.10795543699984309,
      "grad_norm": 2.03125,
      "learning_rate": 1.8284000000000003e-05,
      "loss": 0.5123,
      "step": 430
    },
    {
      "epoch": 0.11046602855797898,
      "grad_norm": 1.78125,
      "learning_rate": 1.8244e-05,
      "loss": 0.4691,
      "step": 440
    },
    {
      "epoch": 0.11297662011611485,
      "grad_norm": 1.640625,
      "learning_rate": 1.8204e-05,
      "loss": 0.4485,
      "step": 450
    },
    {
      "epoch": 0.11548721167425074,
      "grad_norm": 1.7890625,
      "learning_rate": 1.8164e-05,
      "loss": 0.4647,
      "step": 460
    },
    {
      "epoch": 0.11799780323238664,
      "grad_norm": 1.7265625,
      "learning_rate": 1.8124e-05,
      "loss": 0.4518,
      "step": 470
    },
    {
      "epoch": 0.12050839479052251,
      "grad_norm": 2.34375,
      "learning_rate": 1.8084e-05,
      "loss": 0.4848,
      "step": 480
    },
    {
      "epoch": 0.1230189863486584,
      "grad_norm": 1.828125,
      "learning_rate": 1.8044000000000002e-05,
      "loss": 0.4624,
      "step": 490
    },
    {
      "epoch": 0.12552957790679428,
      "grad_norm": 2.3125,
      "learning_rate": 1.8004000000000002e-05,
      "loss": 0.489,
      "step": 500
    },
    {
      "epoch": 0.1280401694649302,
      "grad_norm": 1.828125,
      "learning_rate": 1.7964000000000003e-05,
      "loss": 0.4556,
      "step": 510
    },
    {
      "epoch": 0.13055076102306606,
      "grad_norm": 2.15625,
      "learning_rate": 1.7924e-05,
      "loss": 0.4726,
      "step": 520
    },
    {
      "epoch": 0.13306135258120194,
      "grad_norm": 2.03125,
      "learning_rate": 1.7884e-05,
      "loss": 0.4686,
      "step": 530
    },
    {
      "epoch": 0.13557194413933782,
      "grad_norm": 1.96875,
      "learning_rate": 1.7844e-05,
      "loss": 0.4228,
      "step": 540
    },
    {
      "epoch": 0.13808253569747372,
      "grad_norm": 2.125,
      "learning_rate": 1.7804e-05,
      "loss": 0.4523,
      "step": 550
    },
    {
      "epoch": 0.1405931272556096,
      "grad_norm": 1.9921875,
      "learning_rate": 1.7764e-05,
      "loss": 0.4235,
      "step": 560
    },
    {
      "epoch": 0.14310371881374548,
      "grad_norm": 1.875,
      "learning_rate": 1.7724000000000002e-05,
      "loss": 0.4474,
      "step": 570
    },
    {
      "epoch": 0.14561431037188138,
      "grad_norm": 2.0625,
      "learning_rate": 1.7684000000000002e-05,
      "loss": 0.4576,
      "step": 580
    },
    {
      "epoch": 0.14812490193001726,
      "grad_norm": 2.296875,
      "learning_rate": 1.7644000000000003e-05,
      "loss": 0.4345,
      "step": 590
    },
    {
      "epoch": 0.15063549348815314,
      "grad_norm": 2.0,
      "learning_rate": 1.7604e-05,
      "loss": 0.4124,
      "step": 600
    },
    {
      "epoch": 0.15314608504628904,
      "grad_norm": 1.9453125,
      "learning_rate": 1.7564e-05,
      "loss": 0.4457,
      "step": 610
    },
    {
      "epoch": 0.15565667660442492,
      "grad_norm": 2.125,
      "learning_rate": 1.7524e-05,
      "loss": 0.4569,
      "step": 620
    },
    {
      "epoch": 0.1581672681625608,
      "grad_norm": 1.8203125,
      "learning_rate": 1.7484e-05,
      "loss": 0.4065,
      "step": 630
    },
    {
      "epoch": 0.1606778597206967,
      "grad_norm": 1.9921875,
      "learning_rate": 1.7444e-05,
      "loss": 0.4231,
      "step": 640
    },
    {
      "epoch": 0.16318845127883258,
      "grad_norm": 2.328125,
      "learning_rate": 1.7404e-05,
      "loss": 0.4306,
      "step": 650
    },
    {
      "epoch": 0.16569904283696846,
      "grad_norm": 1.8359375,
      "learning_rate": 1.7364000000000002e-05,
      "loss": 0.4261,
      "step": 660
    },
    {
      "epoch": 0.16820963439510433,
      "grad_norm": 1.9140625,
      "learning_rate": 1.7324000000000002e-05,
      "loss": 0.3939,
      "step": 670
    },
    {
      "epoch": 0.17072022595324024,
      "grad_norm": 2.34375,
      "learning_rate": 1.7284e-05,
      "loss": 0.3979,
      "step": 680
    },
    {
      "epoch": 0.17323081751137612,
      "grad_norm": 2.0,
      "learning_rate": 1.7244e-05,
      "loss": 0.419,
      "step": 690
    },
    {
      "epoch": 0.175741409069512,
      "grad_norm": 2.078125,
      "learning_rate": 1.7204e-05,
      "loss": 0.4181,
      "step": 700
    },
    {
      "epoch": 0.1782520006276479,
      "grad_norm": 2.0625,
      "learning_rate": 1.7164e-05,
      "loss": 0.4345,
      "step": 710
    },
    {
      "epoch": 0.18076259218578378,
      "grad_norm": 2.078125,
      "learning_rate": 1.7124e-05,
      "loss": 0.436,
      "step": 720
    },
    {
      "epoch": 0.18327318374391965,
      "grad_norm": 2.15625,
      "learning_rate": 1.7084e-05,
      "loss": 0.3904,
      "step": 730
    },
    {
      "epoch": 0.18578377530205556,
      "grad_norm": 1.7265625,
      "learning_rate": 1.7044e-05,
      "loss": 0.3941,
      "step": 740
    },
    {
      "epoch": 0.18829436686019144,
      "grad_norm": 1.9453125,
      "learning_rate": 1.7004000000000002e-05,
      "loss": 0.403,
      "step": 750
    },
    {
      "epoch": 0.19080495841832731,
      "grad_norm": 2.09375,
      "learning_rate": 1.6964e-05,
      "loss": 0.397,
      "step": 760
    },
    {
      "epoch": 0.1933155499764632,
      "grad_norm": 2.078125,
      "learning_rate": 1.6924e-05,
      "loss": 0.3869,
      "step": 770
    },
    {
      "epoch": 0.1958261415345991,
      "grad_norm": 2.046875,
      "learning_rate": 1.6884e-05,
      "loss": 0.428,
      "step": 780
    },
    {
      "epoch": 0.19833673309273497,
      "grad_norm": 2.375,
      "learning_rate": 1.6844e-05,
      "loss": 0.4024,
      "step": 790
    },
    {
      "epoch": 0.20084732465087085,
      "grad_norm": 2.03125,
      "learning_rate": 1.6804e-05,
      "loss": 0.4061,
      "step": 800
    },
    {
      "epoch": 0.20335791620900676,
      "grad_norm": 2.171875,
      "learning_rate": 1.6764e-05,
      "loss": 0.3882,
      "step": 810
    },
    {
      "epoch": 0.20586850776714263,
      "grad_norm": 2.140625,
      "learning_rate": 1.6724e-05,
      "loss": 0.3973,
      "step": 820
    },
    {
      "epoch": 0.2083790993252785,
      "grad_norm": 2.203125,
      "learning_rate": 1.6684e-05,
      "loss": 0.3871,
      "step": 830
    },
    {
      "epoch": 0.21088969088341442,
      "grad_norm": 2.09375,
      "learning_rate": 1.6644000000000002e-05,
      "loss": 0.3666,
      "step": 840
    },
    {
      "epoch": 0.2134002824415503,
      "grad_norm": 2.046875,
      "learning_rate": 1.6604000000000002e-05,
      "loss": 0.3853,
      "step": 850
    },
    {
      "epoch": 0.21591087399968617,
      "grad_norm": 2.078125,
      "learning_rate": 1.6564000000000003e-05,
      "loss": 0.3772,
      "step": 860
    },
    {
      "epoch": 0.21842146555782208,
      "grad_norm": 2.3125,
      "learning_rate": 1.6524000000000003e-05,
      "loss": 0.3926,
      "step": 870
    },
    {
      "epoch": 0.22093205711595795,
      "grad_norm": 2.0625,
      "learning_rate": 1.6484000000000003e-05,
      "loss": 0.3824,
      "step": 880
    },
    {
      "epoch": 0.22344264867409383,
      "grad_norm": 2.078125,
      "learning_rate": 1.6444000000000004e-05,
      "loss": 0.3777,
      "step": 890
    },
    {
      "epoch": 0.2259532402322297,
      "grad_norm": 1.8984375,
      "learning_rate": 1.6404e-05,
      "loss": 0.3886,
      "step": 900
    },
    {
      "epoch": 0.2284638317903656,
      "grad_norm": 2.25,
      "learning_rate": 1.6364e-05,
      "loss": 0.4085,
      "step": 910
    },
    {
      "epoch": 0.2309744233485015,
      "grad_norm": 2.125,
      "learning_rate": 1.6324e-05,
      "loss": 0.3692,
      "step": 920
    },
    {
      "epoch": 0.23348501490663737,
      "grad_norm": 2.125,
      "learning_rate": 1.6284000000000002e-05,
      "loss": 0.3528,
      "step": 930
    },
    {
      "epoch": 0.23599560646477327,
      "grad_norm": 2.0625,
      "learning_rate": 1.6244000000000002e-05,
      "loss": 0.3708,
      "step": 940
    },
    {
      "epoch": 0.23850619802290915,
      "grad_norm": 1.9453125,
      "learning_rate": 1.6204000000000003e-05,
      "loss": 0.3625,
      "step": 950
    },
    {
      "epoch": 0.24101678958104503,
      "grad_norm": 2.140625,
      "learning_rate": 1.6164000000000003e-05,
      "loss": 0.3596,
      "step": 960
    },
    {
      "epoch": 0.24352738113918093,
      "grad_norm": 1.7265625,
      "learning_rate": 1.6124000000000004e-05,
      "loss": 0.3576,
      "step": 970
    },
    {
      "epoch": 0.2460379726973168,
      "grad_norm": 1.6015625,
      "learning_rate": 1.6084e-05,
      "loss": 0.3481,
      "step": 980
    },
    {
      "epoch": 0.2485485642554527,
      "grad_norm": 2.296875,
      "learning_rate": 1.6044e-05,
      "loss": 0.3899,
      "step": 990
    },
    {
      "epoch": 0.25105915581358856,
      "grad_norm": 2.0,
      "learning_rate": 1.6004e-05,
      "loss": 0.3838,
      "step": 1000
    },
    {
      "epoch": 0.25356974737172444,
      "grad_norm": 1.78125,
      "learning_rate": 1.5964e-05,
      "loss": 0.3522,
      "step": 1010
    },
    {
      "epoch": 0.2560803389298604,
      "grad_norm": 2.0625,
      "learning_rate": 1.5924000000000002e-05,
      "loss": 0.3277,
      "step": 1020
    },
    {
      "epoch": 0.25859093048799625,
      "grad_norm": 2.109375,
      "learning_rate": 1.5884000000000002e-05,
      "loss": 0.3532,
      "step": 1030
    },
    {
      "epoch": 0.26110152204613213,
      "grad_norm": 2.125,
      "learning_rate": 1.5844000000000003e-05,
      "loss": 0.3558,
      "step": 1040
    },
    {
      "epoch": 0.263612113604268,
      "grad_norm": 2.375,
      "learning_rate": 1.5804000000000003e-05,
      "loss": 0.3617,
      "step": 1050
    },
    {
      "epoch": 0.2661227051624039,
      "grad_norm": 2.421875,
      "learning_rate": 1.5764e-05,
      "loss": 0.3607,
      "step": 1060
    },
    {
      "epoch": 0.26863329672053976,
      "grad_norm": 2.140625,
      "learning_rate": 1.5724e-05,
      "loss": 0.3948,
      "step": 1070
    },
    {
      "epoch": 0.27114388827867564,
      "grad_norm": 1.8203125,
      "learning_rate": 1.5684e-05,
      "loss": 0.3776,
      "step": 1080
    },
    {
      "epoch": 0.27365447983681157,
      "grad_norm": 2.15625,
      "learning_rate": 1.5644e-05,
      "loss": 0.3324,
      "step": 1090
    },
    {
      "epoch": 0.27616507139494745,
      "grad_norm": 2.125,
      "learning_rate": 1.5604000000000002e-05,
      "loss": 0.3536,
      "step": 1100
    },
    {
      "epoch": 0.2786756629530833,
      "grad_norm": 2.34375,
      "learning_rate": 1.5564000000000002e-05,
      "loss": 0.3591,
      "step": 1110
    },
    {
      "epoch": 0.2811862545112192,
      "grad_norm": 2.015625,
      "learning_rate": 1.5524000000000002e-05,
      "loss": 0.3598,
      "step": 1120
    },
    {
      "epoch": 0.2836968460693551,
      "grad_norm": 1.734375,
      "learning_rate": 1.5484000000000003e-05,
      "loss": 0.323,
      "step": 1130
    },
    {
      "epoch": 0.28620743762749096,
      "grad_norm": 1.421875,
      "learning_rate": 1.5444e-05,
      "loss": 0.3496,
      "step": 1140
    },
    {
      "epoch": 0.2887180291856269,
      "grad_norm": 2.1875,
      "learning_rate": 1.5404e-05,
      "loss": 0.3442,
      "step": 1150
    },
    {
      "epoch": 0.29122862074376277,
      "grad_norm": 2.09375,
      "learning_rate": 1.5364e-05,
      "loss": 0.3392,
      "step": 1160
    },
    {
      "epoch": 0.29373921230189864,
      "grad_norm": 1.9296875,
      "learning_rate": 1.5324e-05,
      "loss": 0.3689,
      "step": 1170
    },
    {
      "epoch": 0.2962498038600345,
      "grad_norm": 2.296875,
      "learning_rate": 1.5284e-05,
      "loss": 0.347,
      "step": 1180
    },
    {
      "epoch": 0.2987603954181704,
      "grad_norm": 1.96875,
      "learning_rate": 1.5244000000000002e-05,
      "loss": 0.3487,
      "step": 1190
    },
    {
      "epoch": 0.3012709869763063,
      "grad_norm": 2.46875,
      "learning_rate": 1.5204e-05,
      "loss": 0.3182,
      "step": 1200
    },
    {
      "epoch": 0.30378157853444215,
      "grad_norm": 2.109375,
      "learning_rate": 1.5164e-05,
      "loss": 0.3298,
      "step": 1210
    },
    {
      "epoch": 0.3062921700925781,
      "grad_norm": 2.40625,
      "learning_rate": 1.5124000000000001e-05,
      "loss": 0.3299,
      "step": 1220
    },
    {
      "epoch": 0.30880276165071396,
      "grad_norm": 2.53125,
      "learning_rate": 1.5084000000000002e-05,
      "loss": 0.3685,
      "step": 1230
    },
    {
      "epoch": 0.31131335320884984,
      "grad_norm": 2.09375,
      "learning_rate": 1.5044e-05,
      "loss": 0.3321,
      "step": 1240
    },
    {
      "epoch": 0.3138239447669857,
      "grad_norm": 1.9453125,
      "learning_rate": 1.5004e-05,
      "loss": 0.3397,
      "step": 1250
    },
    {
      "epoch": 0.3163345363251216,
      "grad_norm": 1.8359375,
      "learning_rate": 1.4964000000000001e-05,
      "loss": 0.3296,
      "step": 1260
    },
    {
      "epoch": 0.3188451278832575,
      "grad_norm": 1.875,
      "learning_rate": 1.4924000000000001e-05,
      "loss": 0.3122,
      "step": 1270
    },
    {
      "epoch": 0.3213557194413934,
      "grad_norm": 2.0,
      "learning_rate": 1.4884e-05,
      "loss": 0.3325,
      "step": 1280
    },
    {
      "epoch": 0.3238663109995293,
      "grad_norm": 1.9453125,
      "learning_rate": 1.4844e-05,
      "loss": 0.343,
      "step": 1290
    },
    {
      "epoch": 0.32637690255766516,
      "grad_norm": 2.21875,
      "learning_rate": 1.4804000000000001e-05,
      "loss": 0.332,
      "step": 1300
    },
    {
      "epoch": 0.32888749411580104,
      "grad_norm": 2.234375,
      "learning_rate": 1.4764000000000001e-05,
      "loss": 0.3303,
      "step": 1310
    },
    {
      "epoch": 0.3313980856739369,
      "grad_norm": 2.25,
      "learning_rate": 1.4724e-05,
      "loss": 0.3302,
      "step": 1320
    },
    {
      "epoch": 0.3339086772320728,
      "grad_norm": 1.6796875,
      "learning_rate": 1.4684e-05,
      "loss": 0.3304,
      "step": 1330
    },
    {
      "epoch": 0.33641926879020867,
      "grad_norm": 1.7734375,
      "learning_rate": 1.4644e-05,
      "loss": 0.3247,
      "step": 1340
    },
    {
      "epoch": 0.3389298603483446,
      "grad_norm": 1.9375,
      "learning_rate": 1.4604000000000001e-05,
      "loss": 0.324,
      "step": 1350
    },
    {
      "epoch": 0.3414404519064805,
      "grad_norm": 1.5390625,
      "learning_rate": 1.4564e-05,
      "loss": 0.3353,
      "step": 1360
    },
    {
      "epoch": 0.34395104346461636,
      "grad_norm": 2.03125,
      "learning_rate": 1.4524e-05,
      "loss": 0.321,
      "step": 1370
    },
    {
      "epoch": 0.34646163502275223,
      "grad_norm": 2.5625,
      "learning_rate": 1.4484e-05,
      "loss": 0.3133,
      "step": 1380
    },
    {
      "epoch": 0.3489722265808881,
      "grad_norm": 2.34375,
      "learning_rate": 1.4444000000000001e-05,
      "loss": 0.3516,
      "step": 1390
    },
    {
      "epoch": 0.351482818139024,
      "grad_norm": 1.6015625,
      "learning_rate": 1.4404e-05,
      "loss": 0.3093,
      "step": 1400
    },
    {
      "epoch": 0.35399340969715987,
      "grad_norm": 1.4765625,
      "learning_rate": 1.4364e-05,
      "loss": 0.3171,
      "step": 1410
    },
    {
      "epoch": 0.3565040012552958,
      "grad_norm": 2.0,
      "learning_rate": 1.4324e-05,
      "loss": 0.3321,
      "step": 1420
    },
    {
      "epoch": 0.3590145928134317,
      "grad_norm": 2.0625,
      "learning_rate": 1.4284e-05,
      "loss": 0.3111,
      "step": 1430
    },
    {
      "epoch": 0.36152518437156755,
      "grad_norm": 1.9375,
      "learning_rate": 1.4244000000000003e-05,
      "loss": 0.3403,
      "step": 1440
    },
    {
      "epoch": 0.36403577592970343,
      "grad_norm": 1.859375,
      "learning_rate": 1.4204000000000002e-05,
      "loss": 0.3174,
      "step": 1450
    },
    {
      "epoch": 0.3665463674878393,
      "grad_norm": 2.03125,
      "learning_rate": 1.4164000000000002e-05,
      "loss": 0.3332,
      "step": 1460
    },
    {
      "epoch": 0.3690569590459752,
      "grad_norm": 1.7734375,
      "learning_rate": 1.4124000000000002e-05,
      "loss": 0.3139,
      "step": 1470
    },
    {
      "epoch": 0.3715675506041111,
      "grad_norm": 1.9375,
      "learning_rate": 1.4084000000000003e-05,
      "loss": 0.328,
      "step": 1480
    },
    {
      "epoch": 0.374078142162247,
      "grad_norm": 2.015625,
      "learning_rate": 1.4044000000000001e-05,
      "loss": 0.3301,
      "step": 1490
    },
    {
      "epoch": 0.3765887337203829,
      "grad_norm": 2.6875,
      "learning_rate": 1.4004000000000002e-05,
      "loss": 0.3218,
      "step": 1500
    },
    {
      "epoch": 0.37909932527851875,
      "grad_norm": 2.453125,
      "learning_rate": 1.3964000000000002e-05,
      "loss": 0.3371,
      "step": 1510
    },
    {
      "epoch": 0.38160991683665463,
      "grad_norm": 1.515625,
      "learning_rate": 1.3924000000000003e-05,
      "loss": 0.3162,
      "step": 1520
    },
    {
      "epoch": 0.3841205083947905,
      "grad_norm": 1.9765625,
      "learning_rate": 1.3884000000000001e-05,
      "loss": 0.3092,
      "step": 1530
    },
    {
      "epoch": 0.3866310999529264,
      "grad_norm": 2.1875,
      "learning_rate": 1.3844000000000002e-05,
      "loss": 0.3224,
      "step": 1540
    },
    {
      "epoch": 0.3891416915110623,
      "grad_norm": 2.390625,
      "learning_rate": 1.3804000000000002e-05,
      "loss": 0.3027,
      "step": 1550
    },
    {
      "epoch": 0.3916522830691982,
      "grad_norm": 2.375,
      "learning_rate": 1.3764000000000002e-05,
      "loss": 0.3303,
      "step": 1560
    },
    {
      "epoch": 0.39416287462733407,
      "grad_norm": 1.828125,
      "learning_rate": 1.3724000000000001e-05,
      "loss": 0.3317,
      "step": 1570
    },
    {
      "epoch": 0.39667346618546995,
      "grad_norm": 1.9140625,
      "learning_rate": 1.3684000000000001e-05,
      "loss": 0.3195,
      "step": 1580
    },
    {
      "epoch": 0.3991840577436058,
      "grad_norm": 2.03125,
      "learning_rate": 1.3644000000000002e-05,
      "loss": 0.3185,
      "step": 1590
    },
    {
      "epoch": 0.4016946493017417,
      "grad_norm": 1.75,
      "learning_rate": 1.3604000000000002e-05,
      "loss": 0.2818,
      "step": 1600
    },
    {
      "epoch": 0.40420524085987763,
      "grad_norm": 1.8515625,
      "learning_rate": 1.3564000000000001e-05,
      "loss": 0.2996,
      "step": 1610
    },
    {
      "epoch": 0.4067158324180135,
      "grad_norm": 1.7578125,
      "learning_rate": 1.3524000000000001e-05,
      "loss": 0.3167,
      "step": 1620
    },
    {
      "epoch": 0.4092264239761494,
      "grad_norm": 2.25,
      "learning_rate": 1.3484000000000002e-05,
      "loss": 0.3257,
      "step": 1630
    },
    {
      "epoch": 0.41173701553428527,
      "grad_norm": 1.671875,
      "learning_rate": 1.3444000000000002e-05,
      "loss": 0.3145,
      "step": 1640
    },
    {
      "epoch": 0.41424760709242114,
      "grad_norm": 1.59375,
      "learning_rate": 1.3404e-05,
      "loss": 0.2936,
      "step": 1650
    },
    {
      "epoch": 0.416758198650557,
      "grad_norm": 1.953125,
      "learning_rate": 1.3364000000000001e-05,
      "loss": 0.3158,
      "step": 1660
    },
    {
      "epoch": 0.4192687902086929,
      "grad_norm": 1.53125,
      "learning_rate": 1.3324000000000002e-05,
      "loss": 0.3233,
      "step": 1670
    },
    {
      "epoch": 0.42177938176682883,
      "grad_norm": 2.28125,
      "learning_rate": 1.3284000000000002e-05,
      "loss": 0.2975,
      "step": 1680
    },
    {
      "epoch": 0.4242899733249647,
      "grad_norm": 2.046875,
      "learning_rate": 1.3244e-05,
      "loss": 0.3058,
      "step": 1690
    },
    {
      "epoch": 0.4268005648831006,
      "grad_norm": 2.15625,
      "learning_rate": 1.3204000000000001e-05,
      "loss": 0.3107,
      "step": 1700
    },
    {
      "epoch": 0.42931115644123646,
      "grad_norm": 1.8203125,
      "learning_rate": 1.3164000000000001e-05,
      "loss": 0.3096,
      "step": 1710
    },
    {
      "epoch": 0.43182174799937234,
      "grad_norm": 1.3828125,
      "learning_rate": 1.3124000000000002e-05,
      "loss": 0.3052,
      "step": 1720
    },
    {
      "epoch": 0.4343323395575082,
      "grad_norm": 1.8515625,
      "learning_rate": 1.3084e-05,
      "loss": 0.3029,
      "step": 1730
    },
    {
      "epoch": 0.43684293111564415,
      "grad_norm": 1.8203125,
      "learning_rate": 1.3044e-05,
      "loss": 0.3147,
      "step": 1740
    },
    {
      "epoch": 0.43935352267378003,
      "grad_norm": 1.984375,
      "learning_rate": 1.3004000000000001e-05,
      "loss": 0.3053,
      "step": 1750
    },
    {
      "epoch": 0.4418641142319159,
      "grad_norm": 1.953125,
      "learning_rate": 1.2964000000000002e-05,
      "loss": 0.2913,
      "step": 1760
    },
    {
      "epoch": 0.4443747057900518,
      "grad_norm": 1.8515625,
      "learning_rate": 1.2924e-05,
      "loss": 0.2916,
      "step": 1770
    },
    {
      "epoch": 0.44688529734818766,
      "grad_norm": 2.046875,
      "learning_rate": 1.2884e-05,
      "loss": 0.3159,
      "step": 1780
    },
    {
      "epoch": 0.44939588890632354,
      "grad_norm": 1.90625,
      "learning_rate": 1.2844000000000001e-05,
      "loss": 0.2987,
      "step": 1790
    },
    {
      "epoch": 0.4519064804644594,
      "grad_norm": 1.953125,
      "learning_rate": 1.2804000000000001e-05,
      "loss": 0.3136,
      "step": 1800
    },
    {
      "epoch": 0.45441707202259535,
      "grad_norm": 1.6015625,
      "learning_rate": 1.2764e-05,
      "loss": 0.3038,
      "step": 1810
    },
    {
      "epoch": 0.4569276635807312,
      "grad_norm": 2.0,
      "learning_rate": 1.2724e-05,
      "loss": 0.2916,
      "step": 1820
    },
    {
      "epoch": 0.4594382551388671,
      "grad_norm": 1.9765625,
      "learning_rate": 1.2684000000000001e-05,
      "loss": 0.3113,
      "step": 1830
    },
    {
      "epoch": 0.461948846697003,
      "grad_norm": 1.5625,
      "learning_rate": 1.2644000000000001e-05,
      "loss": 0.2728,
      "step": 1840
    },
    {
      "epoch": 0.46445943825513886,
      "grad_norm": 1.9296875,
      "learning_rate": 1.2604e-05,
      "loss": 0.2868,
      "step": 1850
    },
    {
      "epoch": 0.46697002981327473,
      "grad_norm": 2.140625,
      "learning_rate": 1.2564e-05,
      "loss": 0.289,
      "step": 1860
    },
    {
      "epoch": 0.4694806213714106,
      "grad_norm": 1.4921875,
      "learning_rate": 1.2524e-05,
      "loss": 0.2881,
      "step": 1870
    },
    {
      "epoch": 0.47199121292954654,
      "grad_norm": 1.8984375,
      "learning_rate": 1.2484000000000001e-05,
      "loss": 0.2803,
      "step": 1880
    },
    {
      "epoch": 0.4745018044876824,
      "grad_norm": 1.9375,
      "learning_rate": 1.2444e-05,
      "loss": 0.2785,
      "step": 1890
    },
    {
      "epoch": 0.4770123960458183,
      "grad_norm": 1.71875,
      "learning_rate": 1.2404e-05,
      "loss": 0.2976,
      "step": 1900
    },
    {
      "epoch": 0.4795229876039542,
      "grad_norm": 1.9296875,
      "learning_rate": 1.2364e-05,
      "loss": 0.2737,
      "step": 1910
    },
    {
      "epoch": 0.48203357916209005,
      "grad_norm": 1.7421875,
      "learning_rate": 1.2324000000000001e-05,
      "loss": 0.3237,
      "step": 1920
    },
    {
      "epoch": 0.48454417072022593,
      "grad_norm": 1.859375,
      "learning_rate": 1.2284e-05,
      "loss": 0.297,
      "step": 1930
    },
    {
      "epoch": 0.48705476227836186,
      "grad_norm": 2.046875,
      "learning_rate": 1.2244e-05,
      "loss": 0.2745,
      "step": 1940
    },
    {
      "epoch": 0.48956535383649774,
      "grad_norm": 2.1875,
      "learning_rate": 1.2204e-05,
      "loss": 0.3059,
      "step": 1950
    },
    {
      "epoch": 0.4920759453946336,
      "grad_norm": 2.359375,
      "learning_rate": 1.2164e-05,
      "loss": 0.2876,
      "step": 1960
    },
    {
      "epoch": 0.4945865369527695,
      "grad_norm": 2.6875,
      "learning_rate": 1.2124e-05,
      "loss": 0.2801,
      "step": 1970
    },
    {
      "epoch": 0.4970971285109054,
      "grad_norm": 1.921875,
      "learning_rate": 1.2084e-05,
      "loss": 0.2971,
      "step": 1980
    },
    {
      "epoch": 0.49960772006904125,
      "grad_norm": 1.6484375,
      "learning_rate": 1.2044e-05,
      "loss": 0.2979,
      "step": 1990
    },
    {
      "epoch": 0.5021183116271771,
      "grad_norm": 1.9296875,
      "learning_rate": 1.2004e-05,
      "loss": 0.306,
      "step": 2000
    },
    {
      "epoch": 0.5046289031853131,
      "grad_norm": 1.5859375,
      "learning_rate": 1.1964e-05,
      "loss": 0.2975,
      "step": 2010
    },
    {
      "epoch": 0.5071394947434489,
      "grad_norm": 1.9296875,
      "learning_rate": 1.1924e-05,
      "loss": 0.2771,
      "step": 2020
    },
    {
      "epoch": 0.5096500863015848,
      "grad_norm": 2.265625,
      "learning_rate": 1.1884e-05,
      "loss": 0.2903,
      "step": 2030
    },
    {
      "epoch": 0.5121606778597207,
      "grad_norm": 1.7109375,
      "learning_rate": 1.1844e-05,
      "loss": 0.2808,
      "step": 2040
    },
    {
      "epoch": 0.5146712694178566,
      "grad_norm": 1.7890625,
      "learning_rate": 1.1803999999999999e-05,
      "loss": 0.2856,
      "step": 2050
    },
    {
      "epoch": 0.5171818609759925,
      "grad_norm": 1.6796875,
      "learning_rate": 1.1764e-05,
      "loss": 0.2868,
      "step": 2060
    },
    {
      "epoch": 0.5196924525341283,
      "grad_norm": 1.7109375,
      "learning_rate": 1.1724000000000002e-05,
      "loss": 0.2973,
      "step": 2070
    },
    {
      "epoch": 0.5222030440922643,
      "grad_norm": 1.8828125,
      "learning_rate": 1.1684000000000002e-05,
      "loss": 0.3097,
      "step": 2080
    },
    {
      "epoch": 0.5247136356504001,
      "grad_norm": 1.6953125,
      "learning_rate": 1.1644000000000002e-05,
      "loss": 0.2731,
      "step": 2090
    },
    {
      "epoch": 0.527224227208536,
      "grad_norm": 2.0625,
      "learning_rate": 1.1604000000000003e-05,
      "loss": 0.2849,
      "step": 2100
    },
    {
      "epoch": 0.5297348187666719,
      "grad_norm": 2.203125,
      "learning_rate": 1.1564000000000001e-05,
      "loss": 0.2949,
      "step": 2110
    },
    {
      "epoch": 0.5322454103248078,
      "grad_norm": 2.203125,
      "learning_rate": 1.1524000000000002e-05,
      "loss": 0.3049,
      "step": 2120
    },
    {
      "epoch": 0.5347560018829437,
      "grad_norm": 1.7421875,
      "learning_rate": 1.1484000000000002e-05,
      "loss": 0.2851,
      "step": 2130
    },
    {
      "epoch": 0.5372665934410795,
      "grad_norm": 1.84375,
      "learning_rate": 1.1444000000000003e-05,
      "loss": 0.2919,
      "step": 2140
    },
    {
      "epoch": 0.5397771849992155,
      "grad_norm": 1.765625,
      "learning_rate": 1.1404000000000001e-05,
      "loss": 0.2967,
      "step": 2150
    },
    {
      "epoch": 0.5422877765573513,
      "grad_norm": 1.5546875,
      "learning_rate": 1.1364000000000002e-05,
      "loss": 0.2801,
      "step": 2160
    },
    {
      "epoch": 0.5447983681154872,
      "grad_norm": 2.0,
      "learning_rate": 1.1324000000000002e-05,
      "loss": 0.3012,
      "step": 2170
    },
    {
      "epoch": 0.5473089596736231,
      "grad_norm": 1.8984375,
      "learning_rate": 1.1284000000000002e-05,
      "loss": 0.2925,
      "step": 2180
    },
    {
      "epoch": 0.549819551231759,
      "grad_norm": 1.609375,
      "learning_rate": 1.1244000000000001e-05,
      "loss": 0.2954,
      "step": 2190
    },
    {
      "epoch": 0.5523301427898949,
      "grad_norm": 1.734375,
      "learning_rate": 1.1204000000000001e-05,
      "loss": 0.2799,
      "step": 2200
    },
    {
      "epoch": 0.5548407343480307,
      "grad_norm": 1.515625,
      "learning_rate": 1.1164000000000002e-05,
      "loss": 0.2704,
      "step": 2210
    },
    {
      "epoch": 0.5573513259061667,
      "grad_norm": 1.6953125,
      "learning_rate": 1.1124000000000002e-05,
      "loss": 0.2876,
      "step": 2220
    },
    {
      "epoch": 0.5598619174643025,
      "grad_norm": 1.7421875,
      "learning_rate": 1.1084000000000001e-05,
      "loss": 0.2633,
      "step": 2230
    },
    {
      "epoch": 0.5623725090224384,
      "grad_norm": 1.796875,
      "learning_rate": 1.1044000000000001e-05,
      "loss": 0.2867,
      "step": 2240
    },
    {
      "epoch": 0.5648831005805743,
      "grad_norm": 1.4609375,
      "learning_rate": 1.1004000000000002e-05,
      "loss": 0.2702,
      "step": 2250
    },
    {
      "epoch": 0.5673936921387102,
      "grad_norm": 1.828125,
      "learning_rate": 1.0964000000000002e-05,
      "loss": 0.2852,
      "step": 2260
    },
    {
      "epoch": 0.5699042836968461,
      "grad_norm": 1.703125,
      "learning_rate": 1.0924e-05,
      "loss": 0.2746,
      "step": 2270
    },
    {
      "epoch": 0.5724148752549819,
      "grad_norm": 1.90625,
      "learning_rate": 1.0884000000000001e-05,
      "loss": 0.2932,
      "step": 2280
    },
    {
      "epoch": 0.5749254668131178,
      "grad_norm": 1.84375,
      "learning_rate": 1.0844000000000002e-05,
      "loss": 0.2903,
      "step": 2290
    },
    {
      "epoch": 0.5774360583712538,
      "grad_norm": 2.15625,
      "learning_rate": 1.0804000000000002e-05,
      "loss": 0.2851,
      "step": 2300
    },
    {
      "epoch": 0.5799466499293896,
      "grad_norm": 2.0625,
      "learning_rate": 1.0764e-05,
      "loss": 0.2923,
      "step": 2310
    },
    {
      "epoch": 0.5824572414875255,
      "grad_norm": 2.109375,
      "learning_rate": 1.0724000000000001e-05,
      "loss": 0.3003,
      "step": 2320
    },
    {
      "epoch": 0.5849678330456614,
      "grad_norm": 2.046875,
      "learning_rate": 1.0684000000000001e-05,
      "loss": 0.291,
      "step": 2330
    },
    {
      "epoch": 0.5874784246037973,
      "grad_norm": 1.8828125,
      "learning_rate": 1.0644000000000002e-05,
      "loss": 0.2787,
      "step": 2340
    },
    {
      "epoch": 0.5899890161619331,
      "grad_norm": 1.8203125,
      "learning_rate": 1.0604e-05,
      "loss": 0.2764,
      "step": 2350
    },
    {
      "epoch": 0.592499607720069,
      "grad_norm": 1.65625,
      "learning_rate": 1.0564e-05,
      "loss": 0.2842,
      "step": 2360
    },
    {
      "epoch": 0.595010199278205,
      "grad_norm": 1.78125,
      "learning_rate": 1.0524000000000001e-05,
      "loss": 0.2922,
      "step": 2370
    },
    {
      "epoch": 0.5975207908363408,
      "grad_norm": 1.5625,
      "learning_rate": 1.0484000000000002e-05,
      "loss": 0.283,
      "step": 2380
    },
    {
      "epoch": 0.6000313823944767,
      "grad_norm": 1.5703125,
      "learning_rate": 1.0444e-05,
      "loss": 0.2796,
      "step": 2390
    },
    {
      "epoch": 0.6025419739526126,
      "grad_norm": 1.96875,
      "learning_rate": 1.0404e-05,
      "loss": 0.2828,
      "step": 2400
    },
    {
      "epoch": 0.6050525655107485,
      "grad_norm": 1.5703125,
      "learning_rate": 1.0364000000000001e-05,
      "loss": 0.2866,
      "step": 2410
    },
    {
      "epoch": 0.6075631570688843,
      "grad_norm": 2.109375,
      "learning_rate": 1.0324000000000001e-05,
      "loss": 0.2836,
      "step": 2420
    },
    {
      "epoch": 0.6100737486270202,
      "grad_norm": 1.875,
      "learning_rate": 1.0284e-05,
      "loss": 0.3037,
      "step": 2430
    },
    {
      "epoch": 0.6125843401851562,
      "grad_norm": 2.203125,
      "learning_rate": 1.0244e-05,
      "loss": 0.2774,
      "step": 2440
    },
    {
      "epoch": 0.615094931743292,
      "grad_norm": 1.8671875,
      "learning_rate": 1.0204000000000001e-05,
      "loss": 0.2922,
      "step": 2450
    },
    {
      "epoch": 0.6176055233014279,
      "grad_norm": 1.7109375,
      "learning_rate": 1.0164000000000001e-05,
      "loss": 0.283,
      "step": 2460
    },
    {
      "epoch": 0.6201161148595637,
      "grad_norm": 1.9453125,
      "learning_rate": 1.0124e-05,
      "loss": 0.2725,
      "step": 2470
    },
    {
      "epoch": 0.6226267064176997,
      "grad_norm": 1.6875,
      "learning_rate": 1.0084e-05,
      "loss": 0.2749,
      "step": 2480
    },
    {
      "epoch": 0.6251372979758355,
      "grad_norm": 1.703125,
      "learning_rate": 1.0044e-05,
      "loss": 0.2823,
      "step": 2490
    },
    {
      "epoch": 0.6276478895339714,
      "grad_norm": 1.9296875,
      "learning_rate": 1.0004000000000001e-05,
      "loss": 0.281,
      "step": 2500
    },
    {
      "epoch": 0.6301584810921074,
      "grad_norm": 1.9140625,
      "learning_rate": 9.964e-06,
      "loss": 0.2856,
      "step": 2510
    },
    {
      "epoch": 0.6326690726502432,
      "grad_norm": 1.4921875,
      "learning_rate": 9.924e-06,
      "loss": 0.28,
      "step": 2520
    },
    {
      "epoch": 0.6351796642083791,
      "grad_norm": 1.53125,
      "learning_rate": 9.884e-06,
      "loss": 0.267,
      "step": 2530
    },
    {
      "epoch": 0.637690255766515,
      "grad_norm": 1.5703125,
      "learning_rate": 9.844000000000001e-06,
      "loss": 0.2563,
      "step": 2540
    },
    {
      "epoch": 0.6402008473246509,
      "grad_norm": 1.46875,
      "learning_rate": 9.804000000000001e-06,
      "loss": 0.2911,
      "step": 2550
    },
    {
      "epoch": 0.6427114388827868,
      "grad_norm": 2.46875,
      "learning_rate": 9.764000000000002e-06,
      "loss": 0.2897,
      "step": 2560
    },
    {
      "epoch": 0.6452220304409226,
      "grad_norm": 1.90625,
      "learning_rate": 9.724e-06,
      "loss": 0.2975,
      "step": 2570
    },
    {
      "epoch": 0.6477326219990586,
      "grad_norm": 1.890625,
      "learning_rate": 9.684e-06,
      "loss": 0.2722,
      "step": 2580
    },
    {
      "epoch": 0.6502432135571944,
      "grad_norm": 1.6015625,
      "learning_rate": 9.644000000000001e-06,
      "loss": 0.2787,
      "step": 2590
    },
    {
      "epoch": 0.6527538051153303,
      "grad_norm": 2.046875,
      "learning_rate": 9.604000000000002e-06,
      "loss": 0.2822,
      "step": 2600
    },
    {
      "epoch": 0.6552643966734661,
      "grad_norm": 1.9296875,
      "learning_rate": 9.564e-06,
      "loss": 0.2903,
      "step": 2610
    },
    {
      "epoch": 0.6577749882316021,
      "grad_norm": 1.78125,
      "learning_rate": 9.524e-06,
      "loss": 0.2693,
      "step": 2620
    },
    {
      "epoch": 0.660285579789738,
      "grad_norm": 1.46875,
      "learning_rate": 9.484000000000001e-06,
      "loss": 0.2768,
      "step": 2630
    },
    {
      "epoch": 0.6627961713478738,
      "grad_norm": 1.734375,
      "learning_rate": 9.444000000000001e-06,
      "loss": 0.2707,
      "step": 2640
    },
    {
      "epoch": 0.6653067629060098,
      "grad_norm": 1.5703125,
      "learning_rate": 9.404e-06,
      "loss": 0.2791,
      "step": 2650
    },
    {
      "epoch": 0.6678173544641456,
      "grad_norm": 1.703125,
      "learning_rate": 9.364e-06,
      "loss": 0.2924,
      "step": 2660
    },
    {
      "epoch": 0.6703279460222815,
      "grad_norm": 1.9296875,
      "learning_rate": 9.324000000000001e-06,
      "loss": 0.2875,
      "step": 2670
    },
    {
      "epoch": 0.6728385375804173,
      "grad_norm": 1.78125,
      "learning_rate": 9.284000000000001e-06,
      "loss": 0.2602,
      "step": 2680
    },
    {
      "epoch": 0.6753491291385533,
      "grad_norm": 1.6875,
      "learning_rate": 9.244e-06,
      "loss": 0.2924,
      "step": 2690
    },
    {
      "epoch": 0.6778597206966892,
      "grad_norm": 1.578125,
      "learning_rate": 9.204e-06,
      "loss": 0.2841,
      "step": 2700
    },
    {
      "epoch": 0.680370312254825,
      "grad_norm": 1.390625,
      "learning_rate": 9.164e-06,
      "loss": 0.2745,
      "step": 2710
    },
    {
      "epoch": 0.682880903812961,
      "grad_norm": 2.28125,
      "learning_rate": 9.124000000000001e-06,
      "loss": 0.2876,
      "step": 2720
    },
    {
      "epoch": 0.6853914953710968,
      "grad_norm": 1.546875,
      "learning_rate": 9.084e-06,
      "loss": 0.2549,
      "step": 2730
    },
    {
      "epoch": 0.6879020869292327,
      "grad_norm": 1.3515625,
      "learning_rate": 9.044e-06,
      "loss": 0.2512,
      "step": 2740
    },
    {
      "epoch": 0.6904126784873685,
      "grad_norm": 2.203125,
      "learning_rate": 9.004e-06,
      "loss": 0.2686,
      "step": 2750
    },
    {
      "epoch": 0.6929232700455045,
      "grad_norm": 1.6484375,
      "learning_rate": 8.964000000000001e-06,
      "loss": 0.2714,
      "step": 2760
    },
    {
      "epoch": 0.6954338616036404,
      "grad_norm": 1.6796875,
      "learning_rate": 8.924e-06,
      "loss": 0.275,
      "step": 2770
    },
    {
      "epoch": 0.6979444531617762,
      "grad_norm": 1.8359375,
      "learning_rate": 8.884e-06,
      "loss": 0.2631,
      "step": 2780
    },
    {
      "epoch": 0.7004550447199122,
      "grad_norm": 1.3828125,
      "learning_rate": 8.844e-06,
      "loss": 0.2639,
      "step": 2790
    },
    {
      "epoch": 0.702965636278048,
      "grad_norm": 1.59375,
      "learning_rate": 8.804e-06,
      "loss": 0.2755,
      "step": 2800
    },
    {
      "epoch": 0.7054762278361839,
      "grad_norm": 1.8828125,
      "learning_rate": 8.764e-06,
      "loss": 0.282,
      "step": 2810
    },
    {
      "epoch": 0.7079868193943197,
      "grad_norm": 1.9140625,
      "learning_rate": 8.724e-06,
      "loss": 0.2672,
      "step": 2820
    },
    {
      "epoch": 0.7104974109524557,
      "grad_norm": 1.5859375,
      "learning_rate": 8.684e-06,
      "loss": 0.2663,
      "step": 2830
    },
    {
      "epoch": 0.7130080025105916,
      "grad_norm": 1.75,
      "learning_rate": 8.644e-06,
      "loss": 0.263,
      "step": 2840
    },
    {
      "epoch": 0.7155185940687274,
      "grad_norm": 1.9765625,
      "learning_rate": 8.604000000000001e-06,
      "loss": 0.276,
      "step": 2850
    },
    {
      "epoch": 0.7180291856268634,
      "grad_norm": 1.5078125,
      "learning_rate": 8.564000000000001e-06,
      "loss": 0.2622,
      "step": 2860
    },
    {
      "epoch": 0.7205397771849992,
      "grad_norm": 1.890625,
      "learning_rate": 8.524000000000002e-06,
      "loss": 0.2574,
      "step": 2870
    },
    {
      "epoch": 0.7230503687431351,
      "grad_norm": 1.65625,
      "learning_rate": 8.484e-06,
      "loss": 0.2586,
      "step": 2880
    },
    {
      "epoch": 0.725560960301271,
      "grad_norm": 2.234375,
      "learning_rate": 8.444e-06,
      "loss": 0.2694,
      "step": 2890
    },
    {
      "epoch": 0.7280715518594069,
      "grad_norm": 1.6953125,
      "learning_rate": 8.404000000000001e-06,
      "loss": 0.2817,
      "step": 2900
    },
    {
      "epoch": 0.7305821434175428,
      "grad_norm": 1.828125,
      "learning_rate": 8.364000000000002e-06,
      "loss": 0.2767,
      "step": 2910
    },
    {
      "epoch": 0.7330927349756786,
      "grad_norm": 1.40625,
      "learning_rate": 8.324e-06,
      "loss": 0.2547,
      "step": 2920
    },
    {
      "epoch": 0.7356033265338145,
      "grad_norm": 1.65625,
      "learning_rate": 8.284e-06,
      "loss": 0.2724,
      "step": 2930
    },
    {
      "epoch": 0.7381139180919504,
      "grad_norm": 2.078125,
      "learning_rate": 8.244000000000001e-06,
      "loss": 0.2972,
      "step": 2940
    },
    {
      "epoch": 0.7406245096500863,
      "grad_norm": 1.484375,
      "learning_rate": 8.204000000000001e-06,
      "loss": 0.2601,
      "step": 2950
    },
    {
      "epoch": 0.7431351012082222,
      "grad_norm": 2.015625,
      "learning_rate": 8.164e-06,
      "loss": 0.2552,
      "step": 2960
    },
    {
      "epoch": 0.7456456927663581,
      "grad_norm": 2.234375,
      "learning_rate": 8.124e-06,
      "loss": 0.274,
      "step": 2970
    },
    {
      "epoch": 0.748156284324494,
      "grad_norm": 1.8125,
      "learning_rate": 8.084000000000001e-06,
      "loss": 0.272,
      "step": 2980
    },
    {
      "epoch": 0.7506668758826298,
      "grad_norm": 1.6953125,
      "learning_rate": 8.044000000000001e-06,
      "loss": 0.2826,
      "step": 2990
    },
    {
      "epoch": 0.7531774674407657,
      "grad_norm": 1.7109375,
      "learning_rate": 8.004e-06,
      "loss": 0.2799,
      "step": 3000
    },
    {
      "epoch": 0.7556880589989016,
      "grad_norm": 1.640625,
      "learning_rate": 7.964e-06,
      "loss": 0.2919,
      "step": 3010
    },
    {
      "epoch": 0.7581986505570375,
      "grad_norm": 1.6953125,
      "learning_rate": 7.924e-06,
      "loss": 0.2634,
      "step": 3020
    },
    {
      "epoch": 0.7607092421151734,
      "grad_norm": 1.8515625,
      "learning_rate": 7.884000000000001e-06,
      "loss": 0.259,
      "step": 3030
    },
    {
      "epoch": 0.7632198336733093,
      "grad_norm": 1.7109375,
      "learning_rate": 7.844e-06,
      "loss": 0.2687,
      "step": 3040
    },
    {
      "epoch": 0.7657304252314452,
      "grad_norm": 1.6015625,
      "learning_rate": 7.804e-06,
      "loss": 0.2644,
      "step": 3050
    },
    {
      "epoch": 0.768241016789581,
      "grad_norm": 1.7421875,
      "learning_rate": 7.764e-06,
      "loss": 0.2541,
      "step": 3060
    },
    {
      "epoch": 0.7707516083477169,
      "grad_norm": 1.3203125,
      "learning_rate": 7.724000000000001e-06,
      "loss": 0.2555,
      "step": 3070
    },
    {
      "epoch": 0.7732621999058528,
      "grad_norm": 1.4609375,
      "learning_rate": 7.684e-06,
      "loss": 0.2745,
      "step": 3080
    },
    {
      "epoch": 0.7757727914639887,
      "grad_norm": 1.5546875,
      "learning_rate": 7.644e-06,
      "loss": 0.2924,
      "step": 3090
    },
    {
      "epoch": 0.7782833830221246,
      "grad_norm": 1.78125,
      "learning_rate": 7.604e-06,
      "loss": 0.2859,
      "step": 3100
    },
    {
      "epoch": 0.7807939745802605,
      "grad_norm": 1.75,
      "learning_rate": 7.564e-06,
      "loss": 0.2752,
      "step": 3110
    },
    {
      "epoch": 0.7833045661383964,
      "grad_norm": 1.6171875,
      "learning_rate": 7.524e-06,
      "loss": 0.2611,
      "step": 3120
    },
    {
      "epoch": 0.7858151576965322,
      "grad_norm": 1.578125,
      "learning_rate": 7.484e-06,
      "loss": 0.256,
      "step": 3130
    },
    {
      "epoch": 0.7883257492546681,
      "grad_norm": 1.578125,
      "learning_rate": 7.444e-06,
      "loss": 0.2707,
      "step": 3140
    },
    {
      "epoch": 0.790836340812804,
      "grad_norm": 1.765625,
      "learning_rate": 7.404e-06,
      "loss": 0.2711,
      "step": 3150
    },
    {
      "epoch": 0.7933469323709399,
      "grad_norm": 1.6484375,
      "learning_rate": 7.364000000000001e-06,
      "loss": 0.2588,
      "step": 3160
    },
    {
      "epoch": 0.7958575239290758,
      "grad_norm": 1.7890625,
      "learning_rate": 7.324000000000001e-06,
      "loss": 0.2618,
      "step": 3170
    },
    {
      "epoch": 0.7983681154872116,
      "grad_norm": 1.7734375,
      "learning_rate": 7.284000000000001e-06,
      "loss": 0.2755,
      "step": 3180
    },
    {
      "epoch": 0.8008787070453476,
      "grad_norm": 1.5703125,
      "learning_rate": 7.244000000000001e-06,
      "loss": 0.2727,
      "step": 3190
    },
    {
      "epoch": 0.8033892986034834,
      "grad_norm": 1.453125,
      "learning_rate": 7.204000000000001e-06,
      "loss": 0.2584,
      "step": 3200
    },
    {
      "epoch": 0.8058998901616193,
      "grad_norm": 1.7265625,
      "learning_rate": 7.164000000000001e-06,
      "loss": 0.2767,
      "step": 3210
    },
    {
      "epoch": 0.8084104817197553,
      "grad_norm": 1.5703125,
      "learning_rate": 7.124000000000001e-06,
      "loss": 0.2735,
      "step": 3220
    },
    {
      "epoch": 0.8109210732778911,
      "grad_norm": 1.8203125,
      "learning_rate": 7.084000000000001e-06,
      "loss": 0.277,
      "step": 3230
    },
    {
      "epoch": 0.813431664836027,
      "grad_norm": 1.8125,
      "learning_rate": 7.044000000000001e-06,
      "loss": 0.2743,
      "step": 3240
    },
    {
      "epoch": 0.8159422563941628,
      "grad_norm": 1.953125,
      "learning_rate": 7.004000000000001e-06,
      "loss": 0.2804,
      "step": 3250
    },
    {
      "epoch": 0.8184528479522988,
      "grad_norm": 1.71875,
      "learning_rate": 6.964000000000001e-06,
      "loss": 0.27,
      "step": 3260
    },
    {
      "epoch": 0.8209634395104346,
      "grad_norm": 1.578125,
      "learning_rate": 6.924000000000001e-06,
      "loss": 0.2672,
      "step": 3270
    },
    {
      "epoch": 0.8234740310685705,
      "grad_norm": 1.671875,
      "learning_rate": 6.8840000000000005e-06,
      "loss": 0.2787,
      "step": 3280
    },
    {
      "epoch": 0.8259846226267065,
      "grad_norm": 2.34375,
      "learning_rate": 6.844000000000001e-06,
      "loss": 0.2624,
      "step": 3290
    },
    {
      "epoch": 0.8284952141848423,
      "grad_norm": 1.9765625,
      "learning_rate": 6.804e-06,
      "loss": 0.2655,
      "step": 3300
    },
    {
      "epoch": 0.8310058057429782,
      "grad_norm": 1.65625,
      "learning_rate": 6.764000000000001e-06,
      "loss": 0.2677,
      "step": 3310
    },
    {
      "epoch": 0.833516397301114,
      "grad_norm": 2.015625,
      "learning_rate": 6.724e-06,
      "loss": 0.2655,
      "step": 3320
    },
    {
      "epoch": 0.83602698885925,
      "grad_norm": 1.5703125,
      "learning_rate": 6.684000000000001e-06,
      "loss": 0.27,
      "step": 3330
    },
    {
      "epoch": 0.8385375804173858,
      "grad_norm": 1.6015625,
      "learning_rate": 6.644e-06,
      "loss": 0.2503,
      "step": 3340
    },
    {
      "epoch": 0.8410481719755217,
      "grad_norm": 2.15625,
      "learning_rate": 6.604000000000001e-06,
      "loss": 0.2773,
      "step": 3350
    },
    {
      "epoch": 0.8435587635336577,
      "grad_norm": 1.609375,
      "learning_rate": 6.564e-06,
      "loss": 0.274,
      "step": 3360
    },
    {
      "epoch": 0.8460693550917935,
      "grad_norm": 1.578125,
      "learning_rate": 6.5240000000000006e-06,
      "loss": 0.2587,
      "step": 3370
    },
    {
      "epoch": 0.8485799466499294,
      "grad_norm": 1.828125,
      "learning_rate": 6.484e-06,
      "loss": 0.2699,
      "step": 3380
    },
    {
      "epoch": 0.8510905382080652,
      "grad_norm": 1.8125,
      "learning_rate": 6.4440000000000005e-06,
      "loss": 0.255,
      "step": 3390
    },
    {
      "epoch": 0.8536011297662012,
      "grad_norm": 1.65625,
      "learning_rate": 6.404e-06,
      "loss": 0.2532,
      "step": 3400
    },
    {
      "epoch": 0.856111721324337,
      "grad_norm": 1.6171875,
      "learning_rate": 6.364e-06,
      "loss": 0.2745,
      "step": 3410
    },
    {
      "epoch": 0.8586223128824729,
      "grad_norm": 1.7421875,
      "learning_rate": 6.324e-06,
      "loss": 0.2713,
      "step": 3420
    },
    {
      "epoch": 0.8611329044406089,
      "grad_norm": 1.6875,
      "learning_rate": 6.284e-06,
      "loss": 0.271,
      "step": 3430
    },
    {
      "epoch": 0.8636434959987447,
      "grad_norm": 1.4375,
      "learning_rate": 6.244e-06,
      "loss": 0.2485,
      "step": 3440
    },
    {
      "epoch": 0.8661540875568806,
      "grad_norm": 1.3203125,
      "learning_rate": 6.204e-06,
      "loss": 0.2648,
      "step": 3450
    },
    {
      "epoch": 0.8686646791150164,
      "grad_norm": 1.8203125,
      "learning_rate": 6.164e-06,
      "loss": 0.2683,
      "step": 3460
    },
    {
      "epoch": 0.8711752706731524,
      "grad_norm": 1.5703125,
      "learning_rate": 6.124000000000001e-06,
      "loss": 0.261,
      "step": 3470
    },
    {
      "epoch": 0.8736858622312883,
      "grad_norm": 1.4296875,
      "learning_rate": 6.084000000000001e-06,
      "loss": 0.2541,
      "step": 3480
    },
    {
      "epoch": 0.8761964537894241,
      "grad_norm": 1.859375,
      "learning_rate": 6.044000000000001e-06,
      "loss": 0.2607,
      "step": 3490
    },
    {
      "epoch": 0.8787070453475601,
      "grad_norm": 2.71875,
      "learning_rate": 6.004000000000001e-06,
      "loss": 0.2516,
      "step": 3500
    },
    {
      "epoch": 0.8812176369056959,
      "grad_norm": 1.578125,
      "learning_rate": 5.964000000000001e-06,
      "loss": 0.2678,
      "step": 3510
    },
    {
      "epoch": 0.8837282284638318,
      "grad_norm": 1.65625,
      "learning_rate": 5.924000000000001e-06,
      "loss": 0.2628,
      "step": 3520
    },
    {
      "epoch": 0.8862388200219676,
      "grad_norm": 1.5546875,
      "learning_rate": 5.884000000000001e-06,
      "loss": 0.2557,
      "step": 3530
    },
    {
      "epoch": 0.8887494115801036,
      "grad_norm": 1.8828125,
      "learning_rate": 5.844000000000001e-06,
      "loss": 0.2663,
      "step": 3540
    },
    {
      "epoch": 0.8912600031382395,
      "grad_norm": 2.125,
      "learning_rate": 5.804000000000001e-06,
      "loss": 0.2789,
      "step": 3550
    },
    {
      "epoch": 0.8937705946963753,
      "grad_norm": 1.78125,
      "learning_rate": 5.764000000000001e-06,
      "loss": 0.2646,
      "step": 3560
    },
    {
      "epoch": 0.8962811862545113,
      "grad_norm": 1.796875,
      "learning_rate": 5.724000000000001e-06,
      "loss": 0.2627,
      "step": 3570
    },
    {
      "epoch": 0.8987917778126471,
      "grad_norm": 1.609375,
      "learning_rate": 5.684000000000001e-06,
      "loss": 0.281,
      "step": 3580
    },
    {
      "epoch": 0.901302369370783,
      "grad_norm": 1.8515625,
      "learning_rate": 5.6440000000000005e-06,
      "loss": 0.2588,
      "step": 3590
    },
    {
      "epoch": 0.9038129609289188,
      "grad_norm": 1.9140625,
      "learning_rate": 5.604000000000001e-06,
      "loss": 0.267,
      "step": 3600
    },
    {
      "epoch": 0.9063235524870548,
      "grad_norm": 1.75,
      "learning_rate": 5.5640000000000004e-06,
      "loss": 0.2546,
      "step": 3610
    },
    {
      "epoch": 0.9088341440451907,
      "grad_norm": 1.484375,
      "learning_rate": 5.524000000000001e-06,
      "loss": 0.2607,
      "step": 3620
    },
    {
      "epoch": 0.9113447356033265,
      "grad_norm": 1.5703125,
      "learning_rate": 5.484e-06,
      "loss": 0.2804,
      "step": 3630
    },
    {
      "epoch": 0.9138553271614624,
      "grad_norm": 1.6328125,
      "learning_rate": 5.444000000000001e-06,
      "loss": 0.2595,
      "step": 3640
    },
    {
      "epoch": 0.9163659187195983,
      "grad_norm": 1.84375,
      "learning_rate": 5.404e-06,
      "loss": 0.2697,
      "step": 3650
    },
    {
      "epoch": 0.9188765102777342,
      "grad_norm": 1.5703125,
      "learning_rate": 5.364000000000001e-06,
      "loss": 0.2725,
      "step": 3660
    },
    {
      "epoch": 0.92138710183587,
      "grad_norm": 1.3984375,
      "learning_rate": 5.324e-06,
      "loss": 0.2613,
      "step": 3670
    },
    {
      "epoch": 0.923897693394006,
      "grad_norm": 1.8046875,
      "learning_rate": 5.2840000000000006e-06,
      "loss": 0.2708,
      "step": 3680
    },
    {
      "epoch": 0.9264082849521419,
      "grad_norm": 1.578125,
      "learning_rate": 5.244e-06,
      "loss": 0.2685,
      "step": 3690
    },
    {
      "epoch": 0.9289188765102777,
      "grad_norm": 1.8828125,
      "learning_rate": 5.2040000000000005e-06,
      "loss": 0.272,
      "step": 3700
    },
    {
      "epoch": 0.9314294680684136,
      "grad_norm": 1.6875,
| "learning_rate": 5.164e-06, | |
| "loss": 0.2801, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.9339400596265495, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.124e-06, | |
| "loss": 0.257, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.9364506511846854, | |
| "grad_norm": 1.5, | |
| "learning_rate": 5.084e-06, | |
| "loss": 0.2511, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.9389612427428212, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 5.044e-06, | |
| "loss": 0.2645, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.9414718343009572, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 5.004e-06, | |
| "loss": 0.2711, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9439824258590931, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 4.964e-06, | |
| "loss": 0.2547, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.9464930174172289, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 4.924000000000001e-06, | |
| "loss": 0.2728, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.9490036089753648, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 4.884e-06, | |
| "loss": 0.2666, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.9515142005335007, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 4.8440000000000005e-06, | |
| "loss": 0.2509, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.9540247920916366, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 4.804e-06, | |
| "loss": 0.2643, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.9565353836497725, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 4.7640000000000005e-06, | |
| "loss": 0.2568, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.9590459752079084, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 4.724e-06, | |
| "loss": 0.2673, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.9615565667660443, | |
| "grad_norm": 1.625, | |
| "learning_rate": 4.684e-06, | |
| "loss": 0.2622, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.9640671583241801, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 4.644e-06, | |
| "loss": 0.2656, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.966577749882316, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 4.604e-06, | |
| "loss": 0.2581, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.9690883414404519, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 4.564e-06, | |
| "loss": 0.2529, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.9715989329985878, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 4.524e-06, | |
| "loss": 0.284, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.9741095245567237, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.484000000000001e-06, | |
| "loss": 0.2596, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.9766201161148595, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 4.444e-06, | |
| "loss": 0.2759, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.9791307076729955, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 4.4040000000000005e-06, | |
| "loss": 0.2563, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.9816412992311313, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.364e-06, | |
| "loss": 0.25, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.9841518907892672, | |
| "grad_norm": 1.875, | |
| "learning_rate": 4.3240000000000004e-06, | |
| "loss": 0.2747, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.9866624823474031, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 4.284e-06, | |
| "loss": 0.2691, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.989173073905539, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.244e-06, | |
| "loss": 0.2431, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.9916836654636749, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 4.204e-06, | |
| "loss": 0.2422, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.9941942570218107, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 4.164e-06, | |
| "loss": 0.2554, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.9967048485799467, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 4.124e-06, | |
| "loss": 0.2659, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.9992154401380825, | |
| "grad_norm": 1.875, | |
| "learning_rate": 4.084e-06, | |
| "loss": 0.2563, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.0015063549348815, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.044e-06, | |
| "loss": 0.2382, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.0040169464930173, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 4.004e-06, | |
| "loss": 0.2385, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0065275380511534, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 3.964e-06, | |
| "loss": 0.2354, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.0090381296092892, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 3.924000000000001e-06, | |
| "loss": 0.2446, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.011548721167425, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 3.884e-06, | |
| "loss": 0.2355, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.014059312725561, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 3.844000000000001e-06, | |
| "loss": 0.2358, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.0165699042836969, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 3.8040000000000003e-06, | |
| "loss": 0.2387, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.0190804958418327, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 3.7640000000000003e-06, | |
| "loss": 0.2342, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.0215910873999685, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 3.7240000000000003e-06, | |
| "loss": 0.2381, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.0241016789581046, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.6840000000000002e-06, | |
| "loss": 0.2392, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.0266122705162404, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 3.644e-06, | |
| "loss": 0.2335, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.0291228620743762, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 3.604e-06, | |
| "loss": 0.2544, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.0316334536325122, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 3.564e-06, | |
| "loss": 0.2254, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.034144045190648, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 3.524e-06, | |
| "loss": 0.2443, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.0366546367487839, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 3.484e-06, | |
| "loss": 0.2492, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.0391652283069197, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 3.444e-06, | |
| "loss": 0.2285, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.0416758198650558, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 3.404e-06, | |
| "loss": 0.2309, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.0441864114231916, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 3.364e-06, | |
| "loss": 0.2477, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.0466970029813274, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 3.324e-06, | |
| "loss": 0.2395, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.0492075945394634, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 3.2840000000000007e-06, | |
| "loss": 0.2266, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.0517181860975993, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.2440000000000006e-06, | |
| "loss": 0.2382, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.054228777655735, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 3.2040000000000006e-06, | |
| "loss": 0.2315, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.056739369213871, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 3.1640000000000005e-06, | |
| "loss": 0.2399, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.059249960772007, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 3.1240000000000005e-06, | |
| "loss": 0.2287, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.0617605523301428, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 3.0840000000000005e-06, | |
| "loss": 0.2399, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.0642711438882786, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 3.0440000000000004e-06, | |
| "loss": 0.2334, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.0667817354464146, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3.0040000000000004e-06, | |
| "loss": 0.2324, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.0692923270045505, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.9640000000000003e-06, | |
| "loss": 0.231, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.0718029185626863, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2.9240000000000003e-06, | |
| "loss": 0.2465, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.074313510120822, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.8840000000000003e-06, | |
| "loss": 0.2465, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.0768241016789581, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 2.8440000000000002e-06, | |
| "loss": 0.2514, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.079334693237094, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 2.804e-06, | |
| "loss": 0.2318, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.0818452847952298, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 2.764e-06, | |
| "loss": 0.2366, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.0843558763533658, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 2.724e-06, | |
| "loss": 0.2328, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.0868664679115017, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 2.6840000000000005e-06, | |
| "loss": 0.2566, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.0893770594696375, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 2.6440000000000004e-06, | |
| "loss": 0.2243, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.0918876510277735, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.6040000000000004e-06, | |
| "loss": 0.2234, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.0943982425859093, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 2.5640000000000004e-06, | |
| "loss": 0.2287, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.0969088341440452, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.5240000000000003e-06, | |
| "loss": 0.2365, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.099419425702181, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 2.4840000000000003e-06, | |
| "loss": 0.2299, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.101930017260317, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.4440000000000002e-06, | |
| "loss": 0.24, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.1044406088184529, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.404e-06, | |
| "loss": 0.2292, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.1069512003765887, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.364e-06, | |
| "loss": 0.2356, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.1094617919347247, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 2.324e-06, | |
| "loss": 0.2395, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.1119723834928605, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.284e-06, | |
| "loss": 0.2534, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.1144829750509964, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.244e-06, | |
| "loss": 0.2294, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.1169935666091322, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.2040000000000004e-06, | |
| "loss": 0.2428, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.1195041581672682, | |
| "grad_norm": 1.5, | |
| "learning_rate": 2.1640000000000004e-06, | |
| "loss": 0.2486, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.122014749725404, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 2.1240000000000003e-06, | |
| "loss": 0.2428, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.1245253412835399, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.0840000000000003e-06, | |
| "loss": 0.2438, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.127035932841676, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.0440000000000003e-06, | |
| "loss": 0.2278, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.1295465243998117, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2.004e-06, | |
| "loss": 0.2306, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1320571159579476, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.964e-06, | |
| "loss": 0.24, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.1345677075160836, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.924e-06, | |
| "loss": 0.2492, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.1370782990742194, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.8840000000000003e-06, | |
| "loss": 0.251, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.1395888906323552, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.8440000000000003e-06, | |
| "loss": 0.238, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.142099482190491, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.8040000000000002e-06, | |
| "loss": 0.2424, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.144610073748627, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 1.7640000000000002e-06, | |
| "loss": 0.2433, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.147120665306763, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.7240000000000001e-06, | |
| "loss": 0.2496, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.1496312568648988, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.684e-06, | |
| "loss": 0.2571, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.1521418484230348, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.644e-06, | |
| "loss": 0.2407, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.1546524399811706, | |
| "grad_norm": 1.375, | |
| "learning_rate": 1.604e-06, | |
| "loss": 0.2283, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.1571630315393064, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.5640000000000002e-06, | |
| "loss": 0.2366, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.1596736230974423, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.5240000000000001e-06, | |
| "loss": 0.2286, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.1621842146555783, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.4840000000000001e-06, | |
| "loss": 0.2388, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.1646948062137141, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.444e-06, | |
| "loss": 0.2334, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.16720539777185, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 1.404e-06, | |
| "loss": 0.2291, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.169715989329986, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.364e-06, | |
| "loss": 0.237, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.1722265808881218, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.324e-06, | |
| "loss": 0.2413, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.1747371724462576, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.284e-06, | |
| "loss": 0.2466, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.1772477640043935, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.244e-06, | |
| "loss": 0.2375, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.1797583555625295, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.204e-06, | |
| "loss": 0.2333, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.1822689471206653, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.1640000000000002e-06, | |
| "loss": 0.222, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.1847795386788011, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.1240000000000002e-06, | |
| "loss": 0.238, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.1872901302369372, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.0840000000000001e-06, | |
| "loss": 0.2399, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.189800721795073, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.044e-06, | |
| "loss": 0.2276, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.1923113133532088, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.004e-06, | |
| "loss": 0.2247, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.1948219049113447, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 9.64e-07, | |
| "loss": 0.2379, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.1973324964694807, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 9.240000000000001e-07, | |
| "loss": 0.233, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.1998430880276165, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 8.840000000000001e-07, | |
| "loss": 0.233, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.2023536795857523, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 8.440000000000001e-07, | |
| "loss": 0.2334, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.2048642711438884, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 8.04e-07, | |
| "loss": 0.2459, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.2073748627020242, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 7.64e-07, | |
| "loss": 0.2286, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.20988545426016, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 7.240000000000001e-07, | |
| "loss": 0.2372, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.2123960458182959, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 6.84e-07, | |
| "loss": 0.2399, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.214906637376432, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 6.44e-07, | |
| "loss": 0.2309, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.2174172289345677, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 6.040000000000001e-07, | |
| "loss": 0.2481, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.2199278204927035, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.64e-07, | |
| "loss": 0.234, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.2224384120508396, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 5.240000000000001e-07, | |
| "loss": 0.2331, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.2249490036089754, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.84e-07, | |
| "loss": 0.2528, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.2274595951671112, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 4.4400000000000006e-07, | |
| "loss": 0.2312, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.229970186725247, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 4.04e-07, | |
| "loss": 0.241, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.232480778283383, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 3.6400000000000003e-07, | |
| "loss": 0.2346, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.234991369841519, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 3.24e-07, | |
| "loss": 0.2492, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.2375019613996547, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.8400000000000005e-07, | |
| "loss": 0.2437, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.2400125529577908, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.44e-07, | |
| "loss": 0.2566, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.2425231445159266, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.0400000000000003e-07, | |
| "loss": 0.2319, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.2450337360740624, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.6400000000000004e-07, | |
| "loss": 0.2338, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.2475443276321982, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.24e-07, | |
| "loss": 0.2302, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.2500549191903343, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 8.4e-08, | |
| "loss": 0.2347, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.2525655107484701, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 4.4000000000000004e-08, | |
| "loss": 0.2331, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.255076102306606, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 4e-09, | |
| "loss": 0.2249, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.5481985197050757e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
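
This appears to be a Hugging Face `trainer_state.json` as saved alongside a checkpoint: `log_history` holds one record per `logging_steps` interval (here every 10 of the 5000 `max_steps`, ending at a loss of 0.2249 with the learning rate linearly decayed to 4e-09), and the trailer fields record the trainer's stopping state. As a minimal sketch of how such a log can be read back for analysis — the checkpoint path below is a hypothetical placeholder, not taken from this run:

```python
import json

# Load the trainer state saved alongside a checkpoint.
# NOTE: the path is a hypothetical placeholder; point it at the
# checkpoint directory that actually contains this file.
with open("checkpoint-5000/trainer_state.json") as f:
    state = json.load(f)

# Each log_history record here carries step, epoch, loss,
# grad_norm, and learning_rate; keep only records with a loss.
losses = [(rec["step"], rec["loss"]) for rec in state["log_history"] if "loss" in rec]

first_step, first_loss = losses[0]
last_step, last_loss = losses[-1]
print(f"steps {first_step}..{last_step}: loss {first_loss:.4f} -> {last_loss:.4f}")

# Coarse convergence check: mean loss over the final 10% of training.
cutoff = state["max_steps"] * 0.9
tail = [loss for step, loss in losses if step > cutoff]
print(f"mean loss over final 10% of steps: {sum(tail) / len(tail):.4f}")
```

Since `should_training_stop` is true and the schedule has decayed to its floor at step 5000, the run completed all of its configured steps; no evaluation records appear in `log_history`, so only the training-loss trajectory is recoverable from this file.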