| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 8.182620867879832, | |
| "eval_steps": 500, | |
| "global_step": 9500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008614191881124151, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 1.9982000000000003e-05, | |
| "loss": 1.3429, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.017228383762248303, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.9962000000000003e-05, | |
| "loss": 0.7212, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.025842575643372456, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.9942e-05, | |
| "loss": 0.6892, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.034456767524496605, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.9922e-05, | |
| "loss": 0.6611, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04307095940562076, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.9902e-05, | |
| "loss": 0.6514, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05168515128674491, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1.9882e-05, | |
| "loss": 0.6437, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.060299343167869064, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.9862e-05, | |
| "loss": 0.6161, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.06891353504899321, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.9842e-05, | |
| "loss": 0.6083, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07752772693011736, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.9822e-05, | |
| "loss": 0.5967, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08614191881124152, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.9802e-05, | |
| "loss": 0.5773, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09475611069236567, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 1.9782e-05, | |
| "loss": 0.5722, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10337030257348982, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.9762e-05, | |
| "loss": 0.5528, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11198449445461398, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.9742000000000002e-05, | |
| "loss": 0.5952, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12059868633573813, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.9722000000000002e-05, | |
| "loss": 0.5309, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12921287821686228, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.9702000000000002e-05, | |
| "loss": 0.5353, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13782707009798642, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.9682000000000002e-05, | |
| "loss": 0.5447, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1464412619791106, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 1.9662000000000003e-05, | |
| "loss": 0.4998, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.15505545386023473, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.9642000000000003e-05, | |
| "loss": 0.5191, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1636696457413589, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1.9622e-05, | |
| "loss": 0.5358, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17228383762248303, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.9602e-05, | |
| "loss": 0.4914, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1808980295036072, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.9582e-05, | |
| "loss": 0.4943, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.18951222138473134, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.9562e-05, | |
| "loss": 0.4731, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1981264132658555, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.9542e-05, | |
| "loss": 0.497, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.20674060514697964, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.9522e-05, | |
| "loss": 0.4746, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2153547970281038, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.9502e-05, | |
| "loss": 0.4763, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22396898890922795, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.9482e-05, | |
| "loss": 0.4759, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23258318079035212, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.9462e-05, | |
| "loss": 0.5039, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24119737267147626, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.9442e-05, | |
| "loss": 0.4799, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2498115645526004, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.9422e-05, | |
| "loss": 0.4446, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.25842575643372456, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.9402e-05, | |
| "loss": 0.4727, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26703994831484873, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.9382000000000002e-05, | |
| "loss": 0.4189, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.27565414019597284, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.9362000000000002e-05, | |
| "loss": 0.4409, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.284268332077097, | |
| "grad_norm": 3.25, | |
| "learning_rate": 1.9342000000000002e-05, | |
| "loss": 0.4656, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.2928825239582212, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.9322000000000002e-05, | |
| "loss": 0.4713, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.30149671583934534, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.9302e-05, | |
| "loss": 0.4282, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.31011090772046945, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.9282e-05, | |
| "loss": 0.4565, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3187250996015936, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.9262e-05, | |
| "loss": 0.4346, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.3273392914827178, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 1.9242e-05, | |
| "loss": 0.426, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.33595348336384195, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.9222e-05, | |
| "loss": 0.42, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.34456767524496607, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.9202e-05, | |
| "loss": 0.4317, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.35318186712609023, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.9182e-05, | |
| "loss": 0.4311, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3617960590072144, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.9162e-05, | |
| "loss": 0.4056, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.3704102508883385, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.9142e-05, | |
| "loss": 0.4029, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.3790244427694627, | |
| "grad_norm": 3.125, | |
| "learning_rate": 1.9122e-05, | |
| "loss": 0.4337, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.38763863465058684, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.9102e-05, | |
| "loss": 0.4381, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.396252826531711, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.9082e-05, | |
| "loss": 0.4174, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4048670184128351, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.9062e-05, | |
| "loss": 0.3928, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4134812102939593, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.9042e-05, | |
| "loss": 0.4051, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.42209540217508346, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.9022000000000002e-05, | |
| "loss": 0.3992, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4307095940562076, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.9002000000000002e-05, | |
| "loss": 0.4194, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.43932378593733173, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.8982000000000002e-05, | |
| "loss": 0.3951, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4479379778184559, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.8962000000000002e-05, | |
| "loss": 0.3918, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.45655216969958007, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.8942000000000003e-05, | |
| "loss": 0.3854, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.46516636158070424, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8922000000000003e-05, | |
| "loss": 0.3836, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.47378055346182835, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.8902000000000003e-05, | |
| "loss": 0.3824, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4823947453429525, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.8882000000000003e-05, | |
| "loss": 0.3913, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4910089372240767, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.8862000000000003e-05, | |
| "loss": 0.3834, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.4996231291052008, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.8842000000000004e-05, | |
| "loss": 0.3848, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.508237320986325, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.8822000000000004e-05, | |
| "loss": 0.3845, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5168515128674491, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.8802000000000004e-05, | |
| "loss": 0.3836, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5254657047485732, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.8782e-05, | |
| "loss": 0.3799, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.5340798966296975, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.8762e-05, | |
| "loss": 0.3715, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5426940885108216, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8742e-05, | |
| "loss": 0.3825, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5513082803919457, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.8722e-05, | |
| "loss": 0.364, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5599224722730699, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.8702e-05, | |
| "loss": 0.3765, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.568536664154194, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.8682000000000002e-05, | |
| "loss": 0.3748, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5771508560353182, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.8662000000000002e-05, | |
| "loss": 0.3751, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5857650479164423, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.8642000000000002e-05, | |
| "loss": 0.3778, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5943792397975665, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.8622000000000002e-05, | |
| "loss": 0.3798, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6029934316786907, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.8602000000000002e-05, | |
| "loss": 0.3682, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6116076235598148, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.8582000000000003e-05, | |
| "loss": 0.3652, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6202218154409389, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.8562000000000003e-05, | |
| "loss": 0.3658, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6288360073220631, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.8542000000000003e-05, | |
| "loss": 0.389, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6374501992031872, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.8522000000000003e-05, | |
| "loss": 0.375, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6460643910843114, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.8502000000000003e-05, | |
| "loss": 0.3617, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6546785829654356, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.8482000000000004e-05, | |
| "loss": 0.3777, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6632927748465597, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.8462000000000004e-05, | |
| "loss": 0.3599, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6719069667276839, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.8442e-05, | |
| "loss": 0.3495, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.680521158608808, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.8422e-05, | |
| "loss": 0.3461, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.6891353504899321, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.8402e-05, | |
| "loss": 0.3661, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6977495423710564, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.8382e-05, | |
| "loss": 0.3594, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7063637342521805, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.8362e-05, | |
| "loss": 0.3512, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7149779261333046, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8342e-05, | |
| "loss": 0.3616, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.7235921180144288, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.8322000000000002e-05, | |
| "loss": 0.3575, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7322063098955529, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.8302000000000002e-05, | |
| "loss": 0.3712, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.740820501776677, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.8282000000000002e-05, | |
| "loss": 0.3724, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7494346936578012, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.8262000000000002e-05, | |
| "loss": 0.3524, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7580488855389254, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.8242000000000003e-05, | |
| "loss": 0.3543, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7666630774200496, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.8222000000000003e-05, | |
| "loss": 0.3697, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7752772693011737, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.8202000000000003e-05, | |
| "loss": 0.3583, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7838914611822978, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.8182000000000003e-05, | |
| "loss": 0.3753, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.792505653063422, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.8162000000000003e-05, | |
| "loss": 0.3581, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.8011198449445461, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.8142000000000004e-05, | |
| "loss": 0.3534, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8097340368256702, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.8122e-05, | |
| "loss": 0.3654, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.8183482287067945, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.8102e-05, | |
| "loss": 0.3663, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8269624205879186, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.8082e-05, | |
| "loss": 0.3599, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8355766124690428, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.8062e-05, | |
| "loss": 0.3511, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.8441908043501669, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8042e-05, | |
| "loss": 0.3615, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.852804996231291, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.8022e-05, | |
| "loss": 0.3523, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8614191881124152, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.8002e-05, | |
| "loss": 0.3591, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8700333799935394, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.7982e-05, | |
| "loss": 0.3567, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.8786475718746635, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.7962000000000002e-05, | |
| "loss": 0.3568, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8872617637557877, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.7942000000000002e-05, | |
| "loss": 0.3492, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8958759556369118, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.7922000000000002e-05, | |
| "loss": 0.3386, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.9044901475180359, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.7902000000000002e-05, | |
| "loss": 0.3496, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.9131043393991601, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.7882000000000003e-05, | |
| "loss": 0.3278, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.9217185312802842, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.7862000000000003e-05, | |
| "loss": 0.3343, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.9303327231614085, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.7842000000000003e-05, | |
| "loss": 0.3389, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9389469150425326, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.7822000000000003e-05, | |
| "loss": 0.351, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9475611069236567, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.7802e-05, | |
| "loss": 0.3625, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9561752988047809, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.7782e-05, | |
| "loss": 0.3597, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.964789490685905, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.7762e-05, | |
| "loss": 0.3631, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9734036825670291, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.7742e-05, | |
| "loss": 0.3378, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9820178744481534, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.7722e-05, | |
| "loss": 0.3461, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9906320663292775, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.7702e-05, | |
| "loss": 0.3691, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.9992462582104016, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.7682e-05, | |
| "loss": 0.3332, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.0077527726930118, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.7662e-05, | |
| "loss": 0.2853, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.0163669645741358, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.7642e-05, | |
| "loss": 0.284, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.02498115645526, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.7622000000000002e-05, | |
| "loss": 0.2768, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.0335953483363842, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.7602000000000002e-05, | |
| "loss": 0.2734, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0422095402175084, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.7582000000000002e-05, | |
| "loss": 0.2697, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.0508237320986324, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.7562000000000002e-05, | |
| "loss": 0.2799, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.0594379239797567, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.7542000000000002e-05, | |
| "loss": 0.2766, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.068052115860881, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.7522000000000003e-05, | |
| "loss": 0.2742, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.076666307742005, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.7502000000000003e-05, | |
| "loss": 0.2882, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.0852804996231291, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.7482e-05, | |
| "loss": 0.2882, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0938946915042533, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.7462e-05, | |
| "loss": 0.2828, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.1025088833853773, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.7442e-05, | |
| "loss": 0.2732, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.1111230752665016, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.7422e-05, | |
| "loss": 0.2798, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.1197372671476258, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.7402e-05, | |
| "loss": 0.293, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1283514590287498, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.7382e-05, | |
| "loss": 0.2971, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.136965650909874, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.7362e-05, | |
| "loss": 0.2673, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.1455798427909982, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.7342e-05, | |
| "loss": 0.2875, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.1541940346721222, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.7322e-05, | |
| "loss": 0.2723, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.1628082265532464, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.7302e-05, | |
| "loss": 0.3001, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.1714224184343707, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.7282e-05, | |
| "loss": 0.2878, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.180036610315495, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.7262000000000002e-05, | |
| "loss": 0.2663, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.188650802196619, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.7242000000000002e-05, | |
| "loss": 0.3061, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.1972649940777431, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.7222000000000002e-05, | |
| "loss": 0.3015, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.2058791859588673, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.7202000000000002e-05, | |
| "loss": 0.2825, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2144933778399913, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.7182000000000003e-05, | |
| "loss": 0.2818, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.2231075697211156, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.7162e-05, | |
| "loss": 0.2755, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.2317217616022398, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.7142e-05, | |
| "loss": 0.2897, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.2403359534833638, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.7122e-05, | |
| "loss": 0.2816, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.248950145364488, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.7102e-05, | |
| "loss": 0.2798, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.2575643372456122, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.7082e-05, | |
| "loss": 0.2799, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.2661785291267362, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.7062e-05, | |
| "loss": 0.2843, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.2747927210078605, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.7042e-05, | |
| "loss": 0.2876, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.2834069128889847, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.7022e-05, | |
| "loss": 0.2912, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.2920211047701087, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.7002e-05, | |
| "loss": 0.2811, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.300635296651233, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.6982e-05, | |
| "loss": 0.2944, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.3092494885323571, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.6962e-05, | |
| "loss": 0.2796, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.3178636804134811, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.6942e-05, | |
| "loss": 0.2857, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.3264778722946053, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.6922e-05, | |
| "loss": 0.278, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.3350920641757296, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.6902000000000002e-05, | |
| "loss": 0.2711, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.3437062560568536, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.6882000000000002e-05, | |
| "loss": 0.2894, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.3523204479379778, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.6862000000000002e-05, | |
| "loss": 0.2834, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.360934639819102, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.6842e-05, | |
| "loss": 0.2748, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.369548831700226, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.6822e-05, | |
| "loss": 0.2904, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.3781630235813502, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.6802e-05, | |
| "loss": 0.2816, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3867772154624745, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.6782e-05, | |
| "loss": 0.2993, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.3953914073435985, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.6762e-05, | |
| "loss": 0.2747, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.4040055992247227, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.6742e-05, | |
| "loss": 0.2814, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.412619791105847, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.6722e-05, | |
| "loss": 0.2777, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.421233982986971, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.6702e-05, | |
| "loss": 0.2789, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.4298481748680951, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.6682e-05, | |
| "loss": 0.2876, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.4384623667492193, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.6662e-05, | |
| "loss": 0.2855, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.4470765586303436, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.6642e-05, | |
| "loss": 0.2823, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.4556907505114676, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.6622e-05, | |
| "loss": 0.2759, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.4643049423925918, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.6602e-05, | |
| "loss": 0.2804, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.472919134273716, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.6582e-05, | |
| "loss": 0.2772, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.4815333261548402, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.6562e-05, | |
| "loss": 0.2834, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.4901475180359642, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.6542000000000002e-05, | |
| "loss": 0.2788, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.4987617099170885, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.6522e-05, | |
| "loss": 0.2796, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.5073759017982127, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.6502e-05, | |
| "loss": 0.2764, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.5159900936793367, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.6482000000000002e-05, | |
| "loss": 0.2893, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.524604285560461, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.6462000000000003e-05, | |
| "loss": 0.2868, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.5332184774415851, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.6442000000000003e-05, | |
| "loss": 0.2801, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.5418326693227091, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.6422000000000003e-05, | |
| "loss": 0.2845, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.5504468612038333, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.6402000000000003e-05, | |
| "loss": 0.2814, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.5590610530849576, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.6382000000000003e-05, | |
| "loss": 0.2707, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.5676752449660816, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.6362000000000004e-05, | |
| "loss": 0.2809, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.5762894368472058, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.6342000000000004e-05, | |
| "loss": 0.2875, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.58490362872833, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.6322e-05, | |
| "loss": 0.292, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.593517820609454, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.6302e-05, | |
| "loss": 0.2954, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.6021320124905782, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6282e-05, | |
| "loss": 0.2823, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.6107462043717025, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.6262e-05, | |
| "loss": 0.2822, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.6193603962528265, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.6242e-05, | |
| "loss": 0.2776, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.6279745881339507, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.6222e-05, | |
| "loss": 0.2798, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.636588780015075, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.6202000000000002e-05, | |
| "loss": 0.2873, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.645202971896199, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.6182000000000002e-05, | |
| "loss": 0.2783, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.6538171637773231, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6162000000000002e-05, | |
| "loss": 0.2847, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.6624313556584474, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.6142000000000002e-05, | |
| "loss": 0.2917, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.6710455475395714, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6122000000000003e-05, | |
| "loss": 0.2682, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.6796597394206956, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.6102000000000003e-05, | |
| "loss": 0.2837, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.6882739313018198, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.6082000000000003e-05, | |
| "loss": 0.2852, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.6968881231829438, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.6062000000000003e-05, | |
| "loss": 0.2896, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.705502315064068, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.6042000000000003e-05, | |
| "loss": 0.2827, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.7141165069451922, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.6022000000000003e-05, | |
| "loss": 0.2725, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.7227306988263162, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.6002000000000004e-05, | |
| "loss": 0.2835, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.7313448907074405, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.5982e-05, | |
| "loss": 0.2779, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.7399590825885647, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.5962e-05, | |
| "loss": 0.2856, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.7485732744696887, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.5942e-05, | |
| "loss": 0.2835, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.757187466350813, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.5922e-05, | |
| "loss": 0.2777, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.7658016582319371, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.5902e-05, | |
| "loss": 0.2787, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.7744158501130611, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.5882e-05, | |
| "loss": 0.2842, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.7830300419941856, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.5862e-05, | |
| "loss": 0.2654, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.7916442338753096, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.5842000000000002e-05, | |
| "loss": 0.2701, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.8002584257564336, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.5822000000000002e-05, | |
| "loss": 0.2709, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.808872617637558, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.5802000000000002e-05, | |
| "loss": 0.2781, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.817486809518682, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.5782000000000002e-05, | |
| "loss": 0.2823, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.826101001399806, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.5762000000000003e-05, | |
| "loss": 0.2747, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.8347151932809305, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.5742000000000003e-05, | |
| "loss": 0.2731, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.8433293851620545, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.5722000000000003e-05, | |
| "loss": 0.2738, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.8519435770431787, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.5702000000000003e-05, | |
| "loss": 0.2887, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.860557768924303, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.5682000000000003e-05, | |
| "loss": 0.2841, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.869171960805427, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.5662e-05, | |
| "loss": 0.2916, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.8777861526865511, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.5642e-05, | |
| "loss": 0.2747, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.8864003445676754, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.5622e-05, | |
| "loss": 0.2769, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.8950145364487994, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.5602e-05, | |
| "loss": 0.2798, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.9036287283299236, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.5582e-05, | |
| "loss": 0.2791, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.9122429202110478, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.5562e-05, | |
| "loss": 0.2721, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.9208571120921718, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.5542e-05, | |
| "loss": 0.2778, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.929471303973296, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.5522e-05, | |
| "loss": 0.2792, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.9380854958544202, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.5502e-05, | |
| "loss": 0.2819, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.9466996877355442, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.5482000000000002e-05, | |
| "loss": 0.2691, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.9553138796166685, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.5462000000000002e-05, | |
| "loss": 0.2731, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.9639280714977927, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.5442000000000002e-05, | |
| "loss": 0.2709, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.9725422633789167, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.5422000000000002e-05, | |
| "loss": 0.288, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.981156455260041, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.5402000000000003e-05, | |
| "loss": 0.2807, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.9897706471411651, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.5382000000000003e-05, | |
| "loss": 0.2769, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.9983848390222891, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.5362000000000003e-05, | |
| "loss": 0.2855, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.006891353504899, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.5342e-05, | |
| "loss": 0.2429, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.0155055453860236, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.5322e-05, | |
| "loss": 0.2166, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.0241197372671476, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.5302e-05, | |
| "loss": 0.2022, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.0327339291482716, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.5282e-05, | |
| "loss": 0.1995, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.041348121029396, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.5262e-05, | |
| "loss": 0.1997, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.04996231291052, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.5242e-05, | |
| "loss": 0.2031, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.0585765047916444, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.5222000000000001e-05, | |
| "loss": 0.21, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.0671906966727684, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.5202000000000001e-05, | |
| "loss": 0.2159, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.0758048885538924, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.5182000000000001e-05, | |
| "loss": 0.1967, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.084419080435017, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.5162000000000002e-05, | |
| "loss": 0.2168, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.093033272316141, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.5142000000000002e-05, | |
| "loss": 0.2028, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.101647464197265, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.5122000000000002e-05, | |
| "loss": 0.1958, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.1102616560783893, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.5102e-05, | |
| "loss": 0.2105, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.1188758479595133, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.5082e-05, | |
| "loss": 0.2027, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.1274900398406373, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.5062e-05, | |
| "loss": 0.207, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.136104231721762, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.5042000000000001e-05, | |
| "loss": 0.2125, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.144718423602886, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.5022000000000001e-05, | |
| "loss": 0.2052, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.15333261548401, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.5002000000000001e-05, | |
| "loss": 0.2057, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.1619468073651342, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.4982000000000002e-05, | |
| "loss": 0.2159, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.1705609992462582, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.4962000000000002e-05, | |
| "loss": 0.2088, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.1791751911273822, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.4942e-05, | |
| "loss": 0.2128, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.1877893830085067, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.4922e-05, | |
| "loss": 0.2143, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.1964035748896307, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.4902e-05, | |
| "loss": 0.2098, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.2050177667707547, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.4882e-05, | |
| "loss": 0.2027, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.213631958651879, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.4862000000000001e-05, | |
| "loss": 0.2082, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.222246150533003, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.4842000000000001e-05, | |
| "loss": 0.2132, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.230860342414127, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.4822000000000001e-05, | |
| "loss": 0.2008, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.2394745342952516, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.4802000000000002e-05, | |
| "loss": 0.207, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.2480887261763756, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.4782e-05, | |
| "loss": 0.2129, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.2567029180574996, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.4762e-05, | |
| "loss": 0.2085, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.265317109938624, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4742e-05, | |
| "loss": 0.216, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.273931301819748, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.4722e-05, | |
| "loss": 0.2133, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.282545493700872, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.4702000000000001e-05, | |
| "loss": 0.2032, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.2911596855819965, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.4682000000000001e-05, | |
| "loss": 0.2248, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.2997738774631205, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.4662000000000001e-05, | |
| "loss": 0.2056, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.3083880693442445, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.4642000000000001e-05, | |
| "loss": 0.2115, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.317002261225369, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4622e-05, | |
| "loss": 0.1984, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.325616453106493, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4602e-05, | |
| "loss": 0.2122, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.334230644987617, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.4582e-05, | |
| "loss": 0.2094, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.3428448368687413, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.4562e-05, | |
| "loss": 0.2225, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.3514590287498653, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.4542e-05, | |
| "loss": 0.2131, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.36007322063099, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.4522000000000001e-05, | |
| "loss": 0.2118, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.368687412512114, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.4502000000000001e-05, | |
| "loss": 0.2148, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.377301604393238, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.4482000000000001e-05, | |
| "loss": 0.214, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.385915796274362, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.4462e-05, | |
| "loss": 0.2028, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.3945299881554862, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.4442e-05, | |
| "loss": 0.2073, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.4031441800366102, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.4422e-05, | |
| "loss": 0.2145, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.4117583719177347, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.4402e-05, | |
| "loss": 0.2069, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.4203725637988587, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.4382e-05, | |
| "loss": 0.2216, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.4289867556799827, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.4362e-05, | |
| "loss": 0.2117, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.4376009475611067, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.4342000000000001e-05, | |
| "loss": 0.219, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.446215139442231, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.4322000000000001e-05, | |
| "loss": 0.2061, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.454829331323355, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.4302e-05, | |
| "loss": 0.2182, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.4634435232044796, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.4282e-05, | |
| "loss": 0.2135, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.4720577150856036, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.4262e-05, | |
| "loss": 0.215, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.4806719069667276, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.4242e-05, | |
| "loss": 0.2105, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.489286098847852, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.4222e-05, | |
| "loss": 0.2067, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.497900290728976, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.4202e-05, | |
| "loss": 0.2145, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.5065144826101, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.4182e-05, | |
| "loss": 0.2191, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.5151286744912245, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.4162000000000001e-05, | |
| "loss": 0.2068, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.5237428663723485, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.4142e-05, | |
| "loss": 0.2025, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.5323570582534725, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4122e-05, | |
| "loss": 0.2089, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.540971250134597, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4102e-05, | |
| "loss": 0.2167, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.549585442015721, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.4082e-05, | |
| "loss": 0.2049, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.558199633896845, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.4062e-05, | |
| "loss": 0.2034, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.5668138257779693, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.4042e-05, | |
| "loss": 0.2095, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.5754280176590933, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.4022e-05, | |
| "loss": 0.2047, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.5840422095402173, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.4002e-05, | |
| "loss": 0.2055, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.592656401421342, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3982000000000003e-05, | |
| "loss": 0.211, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.601270593302466, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.3962000000000003e-05, | |
| "loss": 0.2009, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.6098847851835902, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.3942000000000001e-05, | |
| "loss": 0.2173, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.6184989770647142, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.3922000000000002e-05, | |
| "loss": 0.2102, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.6271131689458382, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.3902000000000002e-05, | |
| "loss": 0.2102, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.6357273608269622, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.3882000000000002e-05, | |
| "loss": 0.213, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.6443415527080867, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.3862000000000002e-05, | |
| "loss": 0.2117, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.6529557445892107, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.3842000000000002e-05, | |
| "loss": 0.2142, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.661569936470335, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3822000000000003e-05, | |
| "loss": 0.202, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.670184128351459, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.3802000000000003e-05, | |
| "loss": 0.2133, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.678798320232583, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.3782000000000001e-05, | |
| "loss": 0.2086, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.687412512113707, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.3762000000000001e-05, | |
| "loss": 0.2032, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.6960267039948316, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.3742000000000002e-05, | |
| "loss": 0.2157, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.7046408958759556, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.3722000000000002e-05, | |
| "loss": 0.2139, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.71325508775708, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.3702000000000002e-05, | |
| "loss": 0.2177, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.721869279638204, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.3682000000000002e-05, | |
| "loss": 0.2082, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.730483471519328, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3662000000000002e-05, | |
| "loss": 0.2093, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.739097663400452, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.3642000000000003e-05, | |
| "loss": 0.2054, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.7477118552815765, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.3622000000000003e-05, | |
| "loss": 0.2007, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.7563260471627005, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.3602000000000001e-05, | |
| "loss": 0.2109, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.764940239043825, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3582000000000001e-05, | |
| "loss": 0.2106, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.773554430924949, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.3562000000000002e-05, | |
| "loss": 0.2101, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.782168622806073, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3542000000000002e-05, | |
| "loss": 0.216, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.790782814687197, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3522000000000002e-05, | |
| "loss": 0.2103, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.7993970065683214, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.3502000000000002e-05, | |
| "loss": 0.2083, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.8080111984494454, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3482000000000002e-05, | |
| "loss": 0.2124, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.81662539033057, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.3462000000000003e-05, | |
| "loss": 0.2114, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.825239582211694, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.3442000000000001e-05, | |
| "loss": 0.2122, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.833853774092818, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3422000000000001e-05, | |
| "loss": 0.217, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.842467965973942, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.3402000000000001e-05, | |
| "loss": 0.2137, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.8510821578550662, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.3382000000000002e-05, | |
| "loss": 0.2171, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.8596963497361902, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.3362000000000002e-05, | |
| "loss": 0.21, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.8683105416173147, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3342000000000002e-05, | |
| "loss": 0.2083, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.8769247334984387, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.3322000000000002e-05, | |
| "loss": 0.2178, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.8855389253795627, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.3302000000000002e-05, | |
| "loss": 0.2086, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.894153117260687, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.3282000000000001e-05, | |
| "loss": 0.2144, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.902767309141811, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.3262000000000001e-05, | |
| "loss": 0.2067, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.911381501022935, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.3242000000000001e-05, | |
| "loss": 0.2104, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.9199956929040596, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.3222000000000001e-05, | |
| "loss": 0.2135, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.9286098847851836, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.3202000000000002e-05, | |
| "loss": 0.2095, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.9372240766663076, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.3182000000000002e-05, | |
| "loss": 0.2147, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.945838268547432, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.3162000000000002e-05, | |
| "loss": 0.2146, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.954452460428556, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.3142000000000002e-05, | |
| "loss": 0.2137, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.9630666523096805, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.3122e-05, | |
| "loss": 0.2111, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.9716808441908045, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.3102000000000001e-05, | |
| "loss": 0.2194, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.9802950360719285, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.3082000000000001e-05, | |
| "loss": 0.21, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.9889092279530525, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3062000000000001e-05, | |
| "loss": 0.2004, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.997523419834177, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.3042000000000002e-05, | |
| "loss": 0.212, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 3.006029934316787, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.3022000000000002e-05, | |
| "loss": 0.1743, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 3.014644126197911, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.3002000000000002e-05, | |
| "loss": 0.1524, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.0232583180790353, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 1.2982000000000002e-05, | |
| "loss": 0.1476, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 3.0318725099601593, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.2962e-05, | |
| "loss": 0.1408, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 3.0404867018412833, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.2942e-05, | |
| "loss": 0.1495, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 3.049100893722408, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.2922000000000001e-05, | |
| "loss": 0.1457, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 3.057715085603532, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.2902000000000001e-05, | |
| "loss": 0.1483, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 3.066329277484656, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2882000000000001e-05, | |
| "loss": 0.1451, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 3.0749434693657802, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.2862000000000002e-05, | |
| "loss": 0.1378, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 3.0835576612469042, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.2842000000000002e-05, | |
| "loss": 0.1539, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 3.0921718531280282, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.2822000000000002e-05, | |
| "loss": 0.1503, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 3.1007860450091527, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.2802e-05, | |
| "loss": 0.147, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.1094002368902767, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.2782e-05, | |
| "loss": 0.1447, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 3.1180144287714007, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.2762e-05, | |
| "loss": 0.1568, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 3.126628620652525, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.2742000000000001e-05, | |
| "loss": 0.1397, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 3.135242812533649, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.2722000000000001e-05, | |
| "loss": 0.1347, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 3.143857004414773, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.2702000000000001e-05, | |
| "loss": 0.1487, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 3.1524711962958976, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.2682000000000002e-05, | |
| "loss": 0.1458, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 3.1610853881770216, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.2662000000000002e-05, | |
| "loss": 0.139, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 3.169699580058146, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.2642e-05, | |
| "loss": 0.141, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 3.17831377193927, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.2622e-05, | |
| "loss": 0.1355, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 3.186927963820394, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.2602e-05, | |
| "loss": 0.149, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.1955421557015184, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.2582e-05, | |
| "loss": 0.1485, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 3.2041563475826425, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.2562000000000001e-05, | |
| "loss": 0.1475, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 3.2127705394637665, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.2542000000000001e-05, | |
| "loss": 0.1469, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 3.221384731344891, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.2522000000000001e-05, | |
| "loss": 0.1448, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 3.229998923226015, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.2502000000000002e-05, | |
| "loss": 0.1454, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.238613115107139, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2482e-05, | |
| "loss": 0.1437, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 3.2472273069882633, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.2462e-05, | |
| "loss": 0.1521, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 3.2558414988693873, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.2442e-05, | |
| "loss": 0.1462, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 3.2644556907505113, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.2422e-05, | |
| "loss": 0.1528, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 3.273069882631636, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.2402000000000001e-05, | |
| "loss": 0.1438, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.28168407451276, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.2382000000000001e-05, | |
| "loss": 0.1481, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 3.290298266393884, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.2362000000000001e-05, | |
| "loss": 0.1521, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 3.2989124582750082, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.2342000000000001e-05, | |
| "loss": 0.1453, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 3.3075266501561322, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.2322e-05, | |
| "loss": 0.1453, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 3.3161408420372562, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.2302e-05, | |
| "loss": 0.1512, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 3.3247550339183807, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.2282e-05, | |
| "loss": 0.1484, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 3.3333692257995047, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.2262e-05, | |
| "loss": 0.1374, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 3.3419834176806287, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.2242e-05, | |
| "loss": 0.149, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 3.350597609561753, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.2222000000000001e-05, | |
| "loss": 0.151, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 3.359211801442877, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.2202000000000001e-05, | |
| "loss": 0.143, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.367825993324001, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.2182000000000001e-05, | |
| "loss": 0.1484, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 3.3764401852051256, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2162e-05, | |
| "loss": 0.1532, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 3.3850543770862496, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.2142e-05, | |
| "loss": 0.1472, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 3.3936685689673736, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.2122e-05, | |
| "loss": 0.1448, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 3.402282760848498, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.2102e-05, | |
| "loss": 0.1548, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 3.410896952729622, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.2082e-05, | |
| "loss": 0.1372, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 3.419511144610746, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 1.2062e-05, | |
| "loss": 0.1466, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 3.4281253364918705, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.2042000000000001e-05, | |
| "loss": 0.1406, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 3.4367395283729945, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1.2022000000000001e-05, | |
| "loss": 0.1516, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 3.4453537202541185, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.2002e-05, | |
| "loss": 0.1491, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.453967912135243, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.1982e-05, | |
| "loss": 0.146, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 3.462582104016367, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.1962e-05, | |
| "loss": 0.1438, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 3.4711962958974913, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.1942e-05, | |
| "loss": 0.1372, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 3.4798104877786153, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1922e-05, | |
| "loss": 0.1501, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 3.4884246796597393, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.1902e-05, | |
| "loss": 0.1515, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.4970388715408633, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.1882e-05, | |
| "loss": 0.151, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 3.505653063421988, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.1862000000000001e-05, | |
| "loss": 0.1468, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 3.514267255303112, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.1842e-05, | |
| "loss": 0.1477, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 3.5228814471842362, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.1822e-05, | |
| "loss": 0.1516, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 3.5314956390653602, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.1802e-05, | |
| "loss": 0.145, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.5401098309464842, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.1782e-05, | |
| "loss": 0.1434, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 3.5487240228276082, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.1762e-05, | |
| "loss": 0.1424, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.5573382147087327, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.1742e-05, | |
| "loss": 0.1433, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 3.5659524065898567, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.1722e-05, | |
| "loss": 0.1511, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.574566598470981, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.1702e-05, | |
| "loss": 0.149, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 3.583180790352105, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.1682e-05, | |
| "loss": 0.151, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 3.591794982233229, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.1662e-05, | |
| "loss": 0.1412, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 3.600409174114353, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1642e-05, | |
| "loss": 0.1461, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 3.6090233659954776, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.1622e-05, | |
| "loss": 0.1481, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 3.6176375578766016, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.1602e-05, | |
| "loss": 0.1535, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.626251749757726, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.1582e-05, | |
| "loss": 0.1579, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 3.63486594163885, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.1562e-05, | |
| "loss": 0.1528, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 3.643480133519974, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.1542e-05, | |
| "loss": 0.1436, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 3.6520943254010985, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1.1521999999999999e-05, | |
| "loss": 0.1488, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 3.6607085172822225, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.1502e-05, | |
| "loss": 0.1404, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 3.6693227091633465, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.1482000000000001e-05, | |
| "loss": 0.1584, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 3.677936901044471, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.1462000000000001e-05, | |
| "loss": 0.1429, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 3.686551092925595, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1.1442000000000002e-05, | |
| "loss": 0.1478, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 3.695165284806719, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.1422000000000002e-05, | |
| "loss": 0.1495, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 3.7037794766878434, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.1402000000000002e-05, | |
| "loss": 0.1487, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.7123936685689674, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.1382000000000002e-05, | |
| "loss": 0.1515, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 3.721007860450092, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.1362000000000002e-05, | |
| "loss": 0.1524, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.729622052331216, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.1342000000000003e-05, | |
| "loss": 0.1506, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 3.73823624421234, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.1322000000000001e-05, | |
| "loss": 0.1448, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 3.746850436093464, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.1302000000000001e-05, | |
| "loss": 0.1444, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.7554646279745882, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.1282000000000001e-05, | |
| "loss": 0.1483, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 3.7640788198557122, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.1262000000000002e-05, | |
| "loss": 0.1524, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 3.7726930117368367, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.1242000000000002e-05, | |
| "loss": 0.1553, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 3.7813072036179607, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.1222000000000002e-05, | |
| "loss": 0.156, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 3.7899213954990847, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.1202000000000002e-05, | |
| "loss": 0.1543, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.7985355873802087, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.1182000000000002e-05, | |
| "loss": 0.1521, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.807149779261333, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.1162000000000003e-05, | |
| "loss": 0.1515, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 3.815763971142457, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.1142000000000001e-05, | |
| "loss": 0.1527, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 3.8243781630235816, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.1122000000000001e-05, | |
| "loss": 0.1543, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 3.8329923549047056, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.1102000000000001e-05, | |
| "loss": 0.1461, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.8416065467858296, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.1082000000000002e-05, | |
| "loss": 0.1448, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 3.8502207386669536, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.1062000000000002e-05, | |
| "loss": 0.1462, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 3.858834930548078, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.1042000000000002e-05, | |
| "loss": 0.146, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 3.867449122429202, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.1022000000000002e-05, | |
| "loss": 0.1416, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 3.8760633143103265, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1002000000000002e-05, | |
| "loss": 0.156, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.8846775061914505, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1.0982000000000001e-05, | |
| "loss": 0.1462, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 3.8932916980725745, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.0962000000000001e-05, | |
| "loss": 0.1447, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 3.9019058899536985, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1.0942000000000001e-05, | |
| "loss": 0.1467, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 3.910520081834823, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.0922000000000001e-05, | |
| "loss": 0.1452, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 3.919134273715947, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.0902000000000002e-05, | |
| "loss": 0.1519, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.9277484655970714, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0882000000000002e-05, | |
| "loss": 0.1485, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 3.9363626574781954, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 1.0862000000000002e-05, | |
| "loss": 0.1473, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 3.9449768493593194, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1.0842000000000002e-05, | |
| "loss": 0.1462, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 3.9535910412404434, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.0822e-05, | |
| "loss": 0.1472, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.962205233121568, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.0802000000000001e-05, | |
| "loss": 0.1497, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.970819425002692, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.0782000000000001e-05, | |
| "loss": 0.1474, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 3.9794336168838162, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.0762000000000001e-05, | |
| "loss": 0.1477, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 3.9880478087649402, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.0742000000000002e-05, | |
| "loss": 0.1508, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 3.9966620006460643, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.0722000000000002e-05, | |
| "loss": 0.1449, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 4.005168515128674, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.0702000000000002e-05, | |
| "loss": 0.1182, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 4.013782707009798, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.0682000000000002e-05, | |
| "loss": 0.1047, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 4.022396898890923, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.0662e-05, | |
| "loss": 0.0972, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 4.031011090772047, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.0642e-05, | |
| "loss": 0.099, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 4.039625282653171, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.0622000000000001e-05, | |
| "loss": 0.0977, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 4.048239474534295, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.0602000000000001e-05, | |
| "loss": 0.1003, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 4.056853666415419, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.0582000000000001e-05, | |
| "loss": 0.0964, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 4.065467858296543, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.0562000000000002e-05, | |
| "loss": 0.0969, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 4.074082050177668, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.0542000000000002e-05, | |
| "loss": 0.1009, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 4.082696242058792, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.0522000000000002e-05, | |
| "loss": 0.1039, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 4.091310433939916, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.0502e-05, | |
| "loss": 0.102, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 4.09992462582104, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.0482e-05, | |
| "loss": 0.0998, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 4.108538817702164, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.0462e-05, | |
| "loss": 0.1001, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 4.117153009583289, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.0442000000000001e-05, | |
| "loss": 0.0929, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 4.125767201464413, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1.0422000000000001e-05, | |
| "loss": 0.0971, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 4.134381393345537, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.0402000000000001e-05, | |
| "loss": 0.0997, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 4.142995585226661, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.0382000000000002e-05, | |
| "loss": 0.1048, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 4.151609777107785, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.0362000000000002e-05, | |
| "loss": 0.1024, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 4.160223968988909, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 1.0342e-05, | |
| "loss": 0.1015, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 4.168838160870034, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.0322e-05, | |
| "loss": 0.1015, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 4.177452352751158, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.0302e-05, | |
| "loss": 0.0962, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 4.186066544632282, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.0282e-05, | |
| "loss": 0.1004, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 4.194680736513406, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.0262000000000001e-05, | |
| "loss": 0.0913, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 4.20329492839453, | |
| "grad_norm": 4.0, | |
| "learning_rate": 1.0242000000000001e-05, | |
| "loss": 0.0974, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 4.211909120275654, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.0222000000000001e-05, | |
| "loss": 0.1002, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 4.220523312156779, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.0202000000000002e-05, | |
| "loss": 0.1032, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 4.229137504037903, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1.0182e-05, | |
| "loss": 0.0937, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 4.237751695919027, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 1.0162e-05, | |
| "loss": 0.0974, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 4.246365887800151, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.0142e-05, | |
| "loss": 0.1001, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 4.254980079681275, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.0122e-05, | |
| "loss": 0.1008, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 4.263594271562399, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.0102000000000001e-05, | |
| "loss": 0.0994, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 4.272208463443524, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.0082000000000001e-05, | |
| "loss": 0.1013, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 4.280822655324648, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1.0062000000000001e-05, | |
| "loss": 0.0959, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 4.289436847205772, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1.0042000000000001e-05, | |
| "loss": 0.0992, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 4.298051039086896, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.0022e-05, | |
| "loss": 0.0962, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 4.30666523096802, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.0002e-05, | |
| "loss": 0.1062, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.315279422849144, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 9.982e-06, | |
| "loss": 0.0995, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 4.3238936147302685, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 9.962e-06, | |
| "loss": 0.1022, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 4.3325078066113925, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.942e-06, | |
| "loss": 0.0996, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 4.3411219984925165, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 9.922000000000001e-06, | |
| "loss": 0.0993, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 4.3497361903736405, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 9.902000000000001e-06, | |
| "loss": 0.1029, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 4.3583503822547645, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 9.882000000000001e-06, | |
| "loss": 0.1003, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 4.3669645741358885, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 9.862e-06, | |
| "loss": 0.0979, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 4.375578766017013, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 9.842e-06, | |
| "loss": 0.1003, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 4.384192957898137, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 9.822e-06, | |
| "loss": 0.0981, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 4.392807149779261, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 9.802e-06, | |
| "loss": 0.098, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 4.401421341660385, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 9.782e-06, | |
| "loss": 0.1007, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 4.410035533541509, | |
| "grad_norm": 2.75, | |
| "learning_rate": 9.762e-06, | |
| "loss": 0.0968, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 4.418649725422634, | |
| "grad_norm": 2.75, | |
| "learning_rate": 9.742000000000001e-06, | |
| "loss": 0.1031, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 4.427263917303758, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 9.722000000000001e-06, | |
| "loss": 0.0996, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 4.435878109184882, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 9.702e-06, | |
| "loss": 0.0982, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 4.444492301066006, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 9.682e-06, | |
| "loss": 0.0991, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 4.45310649294713, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 9.662e-06, | |
| "loss": 0.0999, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 4.461720684828254, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 9.642e-06, | |
| "loss": 0.1042, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 4.470334876709378, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 9.622000000000002e-06, | |
| "loss": 0.1016, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 4.478949068590503, | |
| "grad_norm": 2.5, | |
| "learning_rate": 9.602e-06, | |
| "loss": 0.0992, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 4.487563260471627, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 9.582e-06, | |
| "loss": 0.0986, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 4.496177452352751, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 9.562000000000001e-06, | |
| "loss": 0.0995, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 4.504791644233875, | |
| "grad_norm": 3.25, | |
| "learning_rate": 9.542000000000001e-06, | |
| "loss": 0.1039, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 4.513405836114999, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 9.522000000000001e-06, | |
| "loss": 0.1017, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 4.522020027996124, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 9.502000000000002e-06, | |
| "loss": 0.1018, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 4.530634219877248, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 9.482000000000002e-06, | |
| "loss": 0.1021, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 4.539248411758372, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 9.462000000000002e-06, | |
| "loss": 0.1007, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 4.547862603639496, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.442e-06, | |
| "loss": 0.0974, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 4.55647679552062, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 9.422e-06, | |
| "loss": 0.0994, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 4.565090987401744, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 9.402e-06, | |
| "loss": 0.1027, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 4.573705179282869, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 9.382000000000001e-06, | |
| "loss": 0.1081, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 4.582319371163993, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 9.362000000000001e-06, | |
| "loss": 0.0977, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 4.590933563045117, | |
| "grad_norm": 2.75, | |
| "learning_rate": 9.342000000000001e-06, | |
| "loss": 0.1057, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 4.599547754926241, | |
| "grad_norm": 3.0, | |
| "learning_rate": 9.322000000000002e-06, | |
| "loss": 0.1008, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 4.608161946807365, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 9.302000000000002e-06, | |
| "loss": 0.102, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 4.616776138688489, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 9.282e-06, | |
| "loss": 0.106, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 4.625390330569614, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 9.262e-06, | |
| "loss": 0.1025, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 4.634004522450738, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 9.242e-06, | |
| "loss": 0.0968, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 4.642618714331862, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.222e-06, | |
| "loss": 0.0951, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 4.651232906212986, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 9.202000000000001e-06, | |
| "loss": 0.1027, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.65984709809411, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 9.182000000000001e-06, | |
| "loss": 0.1014, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 4.668461289975234, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 9.162000000000001e-06, | |
| "loss": 0.1017, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 4.677075481856359, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 9.142000000000002e-06, | |
| "loss": 0.1019, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 4.685689673737483, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 9.122e-06, | |
| "loss": 0.1034, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 4.694303865618607, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 9.102e-06, | |
| "loss": 0.098, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 4.702918057499731, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 9.082e-06, | |
| "loss": 0.0971, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 4.711532249380855, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 9.062e-06, | |
| "loss": 0.0989, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 4.72014644126198, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 9.042e-06, | |
| "loss": 0.0987, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 4.728760633143104, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 9.022000000000001e-06, | |
| "loss": 0.0983, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 4.737374825024228, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 9.002000000000001e-06, | |
| "loss": 0.102, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.745989016905352, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 8.982000000000001e-06, | |
| "loss": 0.1022, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 4.754603208786476, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 8.962e-06, | |
| "loss": 0.0997, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 4.7632174006676, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8.942e-06, | |
| "loss": 0.1032, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 4.771831592548724, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 8.922e-06, | |
| "loss": 0.0995, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 4.7804457844298485, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 8.902e-06, | |
| "loss": 0.1053, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 4.7890599763109725, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 8.882e-06, | |
| "loss": 0.0929, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 4.7976741681920965, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 8.862000000000001e-06, | |
| "loss": 0.1015, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 4.8062883600732205, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 8.842000000000001e-06, | |
| "loss": 0.0987, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 4.8149025519543445, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 8.822000000000001e-06, | |
| "loss": 0.1039, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 4.823516743835469, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 8.802e-06, | |
| "loss": 0.102, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.832130935716593, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 8.782e-06, | |
| "loss": 0.1002, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 4.840745127597717, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 8.762e-06, | |
| "loss": 0.1023, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 4.849359319478841, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 8.742e-06, | |
| "loss": 0.1007, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 4.857973511359965, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 8.722e-06, | |
| "loss": 0.1036, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 4.866587703241089, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 8.702e-06, | |
| "loss": 0.1044, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 4.875201895122213, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 8.682000000000001e-06, | |
| "loss": 0.0943, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 4.883816087003338, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8.662000000000001e-06, | |
| "loss": 0.0982, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 4.892430278884462, | |
| "grad_norm": 3.125, | |
| "learning_rate": 8.642e-06, | |
| "loss": 0.1033, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 4.901044470765586, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.622e-06, | |
| "loss": 0.1034, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 4.90965866264671, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 8.602e-06, | |
| "loss": 0.0999, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.918272854527834, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 8.582e-06, | |
| "loss": 0.1048, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 4.926887046408959, | |
| "grad_norm": 3.75, | |
| "learning_rate": 8.562e-06, | |
| "loss": 0.0956, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 4.935501238290083, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8.542e-06, | |
| "loss": 0.1041, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 4.944115430171207, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 8.522e-06, | |
| "loss": 0.1029, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 4.952729622052331, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8.502000000000001e-06, | |
| "loss": 0.1009, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 4.961343813933455, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 8.482e-06, | |
| "loss": 0.1065, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 4.969958005814579, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 8.462e-06, | |
| "loss": 0.0968, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 4.978572197695704, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 8.442e-06, | |
| "loss": 0.1061, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 4.987186389576828, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 8.422e-06, | |
| "loss": 0.0998, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 4.995800581457952, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 8.402e-06, | |
| "loss": 0.1004, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 5.004307095940562, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 8.382e-06, | |
| "loss": 0.0826, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 5.012921287821686, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 8.362e-06, | |
| "loss": 0.0723, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 5.02153547970281, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 8.342e-06, | |
| "loss": 0.0668, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 5.030149671583935, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 8.322000000000001e-06, | |
| "loss": 0.072, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 5.038763863465059, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 8.302000000000001e-06, | |
| "loss": 0.0673, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 5.047378055346183, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 8.282000000000001e-06, | |
| "loss": 0.0656, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 5.055992247227307, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 8.262000000000002e-06, | |
| "loss": 0.0721, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 5.064606439108431, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 8.242000000000002e-06, | |
| "loss": 0.0737, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 5.073220630989555, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 8.222000000000002e-06, | |
| "loss": 0.068, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 5.08183482287068, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 8.202e-06, | |
| "loss": 0.0682, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 5.090449014751804, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 8.182e-06, | |
| "loss": 0.0677, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 5.099063206632928, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 8.162e-06, | |
| "loss": 0.0686, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 5.107677398514052, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.142000000000001e-06, | |
| "loss": 0.0688, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 5.116291590395176, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 8.122000000000001e-06, | |
| "loss": 0.072, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 5.1249057822763, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 8.102000000000001e-06, | |
| "loss": 0.0707, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 5.133519974157425, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 8.082000000000002e-06, | |
| "loss": 0.0733, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 5.142134166038549, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 8.062000000000002e-06, | |
| "loss": 0.0726, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 5.150748357919673, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 8.042e-06, | |
| "loss": 0.069, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 5.159362549800797, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 8.022e-06, | |
| "loss": 0.0699, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 5.167976741681921, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 8.002e-06, | |
| "loss": 0.0704, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 5.176590933563045, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 7.982e-06, | |
| "loss": 0.0676, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 5.18520512544417, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 7.962000000000001e-06, | |
| "loss": 0.0674, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 5.193819317325294, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 7.942000000000001e-06, | |
| "loss": 0.0721, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 5.202433509206418, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 7.922000000000001e-06, | |
| "loss": 0.0727, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 5.211047701087542, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 7.902000000000002e-06, | |
| "loss": 0.071, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 5.219661892968666, | |
| "grad_norm": 3.5, | |
| "learning_rate": 7.882e-06, | |
| "loss": 0.0682, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 5.2282760848497905, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 7.862e-06, | |
| "loss": 0.0708, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 5.2368902767309145, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 7.842e-06, | |
| "loss": 0.0695, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 5.2455044686120385, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 7.822e-06, | |
| "loss": 0.0731, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 5.2541186604931625, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 7.802000000000001e-06, | |
| "loss": 0.0695, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 5.2627328523742865, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 7.782000000000001e-06, | |
| "loss": 0.0765, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 5.2713470442554105, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 7.762000000000001e-06, | |
| "loss": 0.0702, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 5.279961236136535, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 7.742000000000001e-06, | |
| "loss": 0.0698, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 5.288575428017659, | |
| "grad_norm": 2.75, | |
| "learning_rate": 7.722e-06, | |
| "loss": 0.0724, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 5.297189619898783, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 7.702e-06, | |
| "loss": 0.0708, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 5.305803811779907, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 7.682e-06, | |
| "loss": 0.0703, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 5.314418003661031, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 7.662e-06, | |
| "loss": 0.0704, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 5.323032195542155, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 7.642e-06, | |
| "loss": 0.0674, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 5.33164638742328, | |
| "grad_norm": 4.5, | |
| "learning_rate": 7.622000000000001e-06, | |
| "loss": 0.077, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 5.340260579304404, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 7.602e-06, | |
| "loss": 0.0722, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 5.348874771185528, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 7.582e-06, | |
| "loss": 0.0728, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 5.357488963066652, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 7.562000000000001e-06, | |
| "loss": 0.0709, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 5.366103154947776, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 7.542000000000001e-06, | |
| "loss": 0.069, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 5.3747173468289, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 7.522e-06, | |
| "loss": 0.0717, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 5.383331538710025, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 7.502e-06, | |
| "loss": 0.0677, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 5.391945730591149, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 7.4820000000000005e-06, | |
| "loss": 0.0706, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 5.400559922472273, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 7.462000000000001e-06, | |
| "loss": 0.0753, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 5.409174114353397, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 7.442e-06, | |
| "loss": 0.0698, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 5.417788306234521, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 7.422e-06, | |
| "loss": 0.0733, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 5.426402498115645, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 7.4020000000000005e-06, | |
| "loss": 0.0672, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 5.43501668999677, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 7.382000000000001e-06, | |
| "loss": 0.0701, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 5.443630881877894, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 7.362e-06, | |
| "loss": 0.0677, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 5.452245073759018, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 7.342e-06, | |
| "loss": 0.0715, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 5.460859265640142, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 7.322e-06, | |
| "loss": 0.0713, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 5.469473457521266, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 7.3020000000000006e-06, | |
| "loss": 0.0697, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 5.47808764940239, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 7.282e-06, | |
| "loss": 0.0686, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 5.486701841283515, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 7.262e-06, | |
| "loss": 0.0685, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 5.495316033164639, | |
| "grad_norm": 2.625, | |
| "learning_rate": 7.242e-06, | |
| "loss": 0.0697, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 5.503930225045763, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 7.2220000000000005e-06, | |
| "loss": 0.0684, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 5.512544416926887, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 7.202e-06, | |
| "loss": 0.0674, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 5.521158608808011, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 7.182e-06, | |
| "loss": 0.0728, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 5.529772800689136, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 7.162e-06, | |
| "loss": 0.0706, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 5.53838699257026, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 7.142e-06, | |
| "loss": 0.0725, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 5.547001184451384, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.1220000000000014e-06, | |
| "loss": 0.07, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 5.555615376332508, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 7.102000000000001e-06, | |
| "loss": 0.0682, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 5.564229568213632, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 7.082000000000001e-06, | |
| "loss": 0.0715, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 5.572843760094756, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 7.062000000000001e-06, | |
| "loss": 0.0715, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 5.58145795197588, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 7.042000000000001e-06, | |
| "loss": 0.0702, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 5.590072143857005, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 7.022000000000001e-06, | |
| "loss": 0.0729, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 5.598686335738129, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 7.002000000000001e-06, | |
| "loss": 0.0734, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.607300527619253, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 6.982000000000001e-06, | |
| "loss": 0.0785, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 5.615914719500377, | |
| "grad_norm": 2.875, | |
| "learning_rate": 6.962000000000001e-06, | |
| "loss": 0.0727, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 5.624528911381501, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 6.942000000000001e-06, | |
| "loss": 0.0703, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 5.633143103262626, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 6.922000000000001e-06, | |
| "loss": 0.0698, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 5.64175729514375, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 6.902000000000001e-06, | |
| "loss": 0.0667, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 5.650371487024874, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 6.882000000000001e-06, | |
| "loss": 0.0707, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 5.658985678905998, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 6.8620000000000005e-06, | |
| "loss": 0.0696, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 5.667599870787122, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 6.842000000000001e-06, | |
| "loss": 0.0726, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 5.676214062668246, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 6.822000000000001e-06, | |
| "loss": 0.0737, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 5.6848282545493705, | |
| "grad_norm": 2.875, | |
| "learning_rate": 6.802000000000001e-06, | |
| "loss": 0.0752, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 5.6934424464304945, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 6.7820000000000005e-06, | |
| "loss": 0.0711, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 5.7020566383116185, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 6.762000000000001e-06, | |
| "loss": 0.0715, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 5.7106708301927425, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 6.742000000000001e-06, | |
| "loss": 0.0715, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 5.7192850220738665, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 6.722000000000001e-06, | |
| "loss": 0.0706, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 5.7278992139549905, | |
| "grad_norm": 2.875, | |
| "learning_rate": 6.702e-06, | |
| "loss": 0.0671, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 5.736513405836115, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 6.6820000000000006e-06, | |
| "loss": 0.0754, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 5.745127597717239, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 6.662000000000001e-06, | |
| "loss": 0.0702, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 5.753741789598363, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 6.642000000000001e-06, | |
| "loss": 0.071, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 5.762355981479487, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 6.622e-06, | |
| "loss": 0.0724, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 5.770970173360611, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 6.6020000000000005e-06, | |
| "loss": 0.0739, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 5.779584365241735, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 6.582000000000001e-06, | |
| "loss": 0.0713, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 5.78819855712286, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 6.562000000000001e-06, | |
| "loss": 0.0691, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 5.796812749003984, | |
| "grad_norm": 2.75, | |
| "learning_rate": 6.542e-06, | |
| "loss": 0.0713, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 5.805426940885108, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 6.522e-06, | |
| "loss": 0.0685, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 5.814041132766232, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 6.502000000000001e-06, | |
| "loss": 0.0728, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 5.822655324647356, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 6.482000000000001e-06, | |
| "loss": 0.0722, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 5.831269516528481, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 6.462e-06, | |
| "loss": 0.0687, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 5.839883708409605, | |
| "grad_norm": 3.0, | |
| "learning_rate": 6.442e-06, | |
| "loss": 0.0682, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 5.848497900290729, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 6.4220000000000005e-06, | |
| "loss": 0.0674, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 5.857112092171853, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 6.402000000000001e-06, | |
| "loss": 0.0743, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 5.865726284052977, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 6.382e-06, | |
| "loss": 0.071, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 5.874340475934101, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 6.362e-06, | |
| "loss": 0.0688, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 5.882954667815225, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 6.3420000000000004e-06, | |
| "loss": 0.0696, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 5.89156885969635, | |
| "grad_norm": 2.875, | |
| "learning_rate": 6.322000000000001e-06, | |
| "loss": 0.0698, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 5.900183051577474, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 6.302e-06, | |
| "loss": 0.0688, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 5.908797243458598, | |
| "grad_norm": 3.0, | |
| "learning_rate": 6.282e-06, | |
| "loss": 0.0704, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 5.917411435339722, | |
| "grad_norm": 3.0, | |
| "learning_rate": 6.262e-06, | |
| "loss": 0.0742, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 5.926025627220846, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 6.2420000000000005e-06, | |
| "loss": 0.0656, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 5.934639819101971, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 6.222e-06, | |
| "loss": 0.072, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 5.943254010983095, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 6.202e-06, | |
| "loss": 0.0668, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.951868202864219, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 6.182e-06, | |
| "loss": 0.0706, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 5.960482394745343, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 6.1620000000000005e-06, | |
| "loss": 0.0685, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 5.969096586626467, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 6.142e-06, | |
| "loss": 0.074, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 5.977710778507591, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 6.122e-06, | |
| "loss": 0.0713, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 5.986324970388715, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 6.102e-06, | |
| "loss": 0.0719, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 5.99493916226984, | |
| "grad_norm": 3.0, | |
| "learning_rate": 6.082e-06, | |
| "loss": 0.0704, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 6.00344567675245, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.062e-06, | |
| "loss": 0.0591, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 6.012059868633574, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 6.042e-06, | |
| "loss": 0.0506, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 6.020674060514698, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 6.022e-06, | |
| "loss": 0.055, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 6.029288252395822, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.002e-06, | |
| "loss": 0.0564, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 6.037902444276947, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.982e-06, | |
| "loss": 0.0567, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 6.046516636158071, | |
| "grad_norm": 2.5, | |
| "learning_rate": 5.962e-06, | |
| "loss": 0.056, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 6.055130828039195, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 5.942e-06, | |
| "loss": 0.0564, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 6.063745019920319, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.922e-06, | |
| "loss": 0.0511, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 6.072359211801443, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.9019999999999996e-06, | |
| "loss": 0.0535, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 6.080973403682567, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.882e-06, | |
| "loss": 0.0558, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 6.089587595563692, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.862000000000001e-06, | |
| "loss": 0.0526, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 6.098201787444816, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.842000000000001e-06, | |
| "loss": 0.0541, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 6.10681597932594, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 5.822000000000001e-06, | |
| "loss": 0.0552, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 6.115430171207064, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.802000000000001e-06, | |
| "loss": 0.0551, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 6.124044363088188, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.782000000000001e-06, | |
| "loss": 0.0528, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 6.132658554969312, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.762000000000001e-06, | |
| "loss": 0.0528, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 6.1412727468504364, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.742000000000001e-06, | |
| "loss": 0.0538, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 6.1498869387315604, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.722000000000001e-06, | |
| "loss": 0.0521, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 6.1585011306126844, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.702000000000001e-06, | |
| "loss": 0.0557, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 6.1671153224938084, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.682000000000001e-06, | |
| "loss": 0.0536, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 6.1757295143749324, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 5.662000000000001e-06, | |
| "loss": 0.0526, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 6.1843437062560564, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 5.642000000000001e-06, | |
| "loss": 0.0547, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 6.192957898137181, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.6220000000000006e-06, | |
| "loss": 0.0508, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 6.201572090018305, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.602000000000001e-06, | |
| "loss": 0.0541, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 6.210186281899429, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 5.582000000000001e-06, | |
| "loss": 0.0547, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 6.218800473780553, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 5.562000000000001e-06, | |
| "loss": 0.056, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 6.227414665661677, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.5420000000000005e-06, | |
| "loss": 0.0538, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 6.236028857542801, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.522000000000001e-06, | |
| "loss": 0.0491, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 6.244643049423926, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.502000000000001e-06, | |
| "loss": 0.0532, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 6.25325724130505, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.482000000000001e-06, | |
| "loss": 0.0496, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 6.261871433186174, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.462e-06, | |
| "loss": 0.0587, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 6.270485625067298, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.442000000000001e-06, | |
| "loss": 0.055, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 6.279099816948422, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.422000000000001e-06, | |
| "loss": 0.0559, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 6.287714008829546, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.402000000000001e-06, | |
| "loss": 0.0572, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 6.296328200710671, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.382e-06, | |
| "loss": 0.0516, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 6.304942392591795, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.3620000000000005e-06, | |
| "loss": 0.05, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 6.313556584472919, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.342000000000001e-06, | |
| "loss": 0.0534, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 6.322170776354043, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.322000000000001e-06, | |
| "loss": 0.056, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 6.330784968235167, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.302e-06, | |
| "loss": 0.0542, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 6.339399160116292, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 5.282e-06, | |
| "loss": 0.0573, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 6.348013351997416, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 5.262000000000001e-06, | |
| "loss": 0.0518, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 6.35662754387854, | |
| "grad_norm": 2.5, | |
| "learning_rate": 5.242000000000001e-06, | |
| "loss": 0.0567, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 6.365241735759664, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.222e-06, | |
| "loss": 0.0563, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 6.373855927640788, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.202e-06, | |
| "loss": 0.0534, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 6.382470119521912, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.1820000000000005e-06, | |
| "loss": 0.0515, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 6.391084311403037, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 5.162000000000001e-06, | |
| "loss": 0.0568, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 6.399698503284161, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.142e-06, | |
| "loss": 0.0527, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 6.408312695165285, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.122e-06, | |
| "loss": 0.0528, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 6.416926887046409, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 5.1020000000000004e-06, | |
| "loss": 0.0553, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 6.425541078927533, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.082000000000001e-06, | |
| "loss": 0.0531, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 6.434155270808657, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.062e-06, | |
| "loss": 0.05, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 6.442769462689782, | |
| "grad_norm": 3.25, | |
| "learning_rate": 5.042e-06, | |
| "loss": 0.0562, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 6.451383654570906, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 5.022e-06, | |
| "loss": 0.0531, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 6.45999784645203, | |
| "grad_norm": 2.625, | |
| "learning_rate": 5.0020000000000006e-06, | |
| "loss": 0.0545, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.468612038333154, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.982e-06, | |
| "loss": 0.0528, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 6.477226230214278, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 4.962e-06, | |
| "loss": 0.0519, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 6.485840422095402, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.942e-06, | |
| "loss": 0.0548, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 6.494454613976527, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 4.9220000000000005e-06, | |
| "loss": 0.0534, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 6.503068805857651, | |
| "grad_norm": 2.625, | |
| "learning_rate": 4.902000000000001e-06, | |
| "loss": 0.0516, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 6.511682997738775, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 4.882000000000001e-06, | |
| "loss": 0.055, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 6.520297189619899, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.862e-06, | |
| "loss": 0.0508, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 6.528911381501023, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 4.842e-06, | |
| "loss": 0.0557, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 6.537525573382148, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.822000000000001e-06, | |
| "loss": 0.0564, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 6.546139765263272, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.802000000000001e-06, | |
| "loss": 0.0519, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 6.554753957144396, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.782e-06, | |
| "loss": 0.056, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 6.56336814902552, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.762e-06, | |
| "loss": 0.0554, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 6.571982340906644, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 4.7420000000000005e-06, | |
| "loss": 0.0538, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 6.580596532787768, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 4.722000000000001e-06, | |
| "loss": 0.0527, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 6.589210724668892, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.702e-06, | |
| "loss": 0.0558, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 6.5978249165500165, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.682e-06, | |
| "loss": 0.0506, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 6.6064391084311405, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 4.6620000000000004e-06, | |
| "loss": 0.055, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 6.6150533003122645, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.642000000000001e-06, | |
| "loss": 0.0535, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 6.6236674921933885, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.622e-06, | |
| "loss": 0.0564, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 6.6322816840745125, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.602e-06, | |
| "loss": 0.0567, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 6.640895875955637, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 4.582e-06, | |
| "loss": 0.0557, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 6.649510067836761, | |
| "grad_norm": 2.625, | |
| "learning_rate": 4.5620000000000005e-06, | |
| "loss": 0.0533, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 6.658124259717885, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.542e-06, | |
| "loss": 0.0548, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 6.666738451599009, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 4.522e-06, | |
| "loss": 0.0543, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 6.675352643480133, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.502e-06, | |
| "loss": 0.0498, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 6.683966835361257, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 4.4820000000000005e-06, | |
| "loss": 0.0559, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 6.692581027242381, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.462e-06, | |
| "loss": 0.0556, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 6.701195219123506, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 4.442e-06, | |
| "loss": 0.051, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 6.70980941100463, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.422e-06, | |
| "loss": 0.0555, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 6.718423602885754, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 4.402e-06, | |
| "loss": 0.0522, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 6.727037794766878, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.382e-06, | |
| "loss": 0.0555, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 6.735651986648002, | |
| "grad_norm": 4.125, | |
| "learning_rate": 4.362e-06, | |
| "loss": 0.0519, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 6.744266178529127, | |
| "grad_norm": 3.25, | |
| "learning_rate": 4.342e-06, | |
| "loss": 0.0538, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 6.752880370410251, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 4.322e-06, | |
| "loss": 0.056, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 6.761494562291375, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 4.3020000000000005e-06, | |
| "loss": 0.0538, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 6.770108754172499, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.282000000000001e-06, | |
| "loss": 0.0513, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 6.778722946053623, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.262000000000001e-06, | |
| "loss": 0.0548, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 6.787337137934747, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 4.242e-06, | |
| "loss": 0.0538, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 6.795951329815872, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.222e-06, | |
| "loss": 0.0537, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 6.804565521696996, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 4.202000000000001e-06, | |
| "loss": 0.0582, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 6.81317971357812, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 4.182000000000001e-06, | |
| "loss": 0.0577, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 6.821793905459244, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.162e-06, | |
| "loss": 0.0564, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 6.830408097340368, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 4.142e-06, | |
| "loss": 0.0551, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 6.839022289221492, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 4.1220000000000005e-06, | |
| "loss": 0.0501, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 6.847636481102617, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 4.102000000000001e-06, | |
| "loss": 0.0498, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 6.856250672983741, | |
| "grad_norm": 3.125, | |
| "learning_rate": 4.082e-06, | |
| "loss": 0.0512, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 6.864864864864865, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.062e-06, | |
| "loss": 0.052, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 6.873479056745989, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 4.0420000000000004e-06, | |
| "loss": 0.0563, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 6.882093248627113, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 4.022000000000001e-06, | |
| "loss": 0.0526, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 6.890707440508237, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 4.002e-06, | |
| "loss": 0.0556, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.899321632389362, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 3.982e-06, | |
| "loss": 0.0553, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 6.907935824270486, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 3.962e-06, | |
| "loss": 0.0538, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 6.91655001615161, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.9420000000000005e-06, | |
| "loss": 0.056, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 6.925164208032734, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.922e-06, | |
| "loss": 0.0553, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 6.933778399913858, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 3.902e-06, | |
| "loss": 0.054, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 6.942392591794983, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.882e-06, | |
| "loss": 0.056, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 6.951006783676107, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 3.8620000000000005e-06, | |
| "loss": 0.055, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 6.959620975557231, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 3.842e-06, | |
| "loss": 0.0542, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 6.968235167438355, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.822e-06, | |
| "loss": 0.052, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 6.976849359319479, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 3.802e-06, | |
| "loss": 0.0528, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 6.985463551200603, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.782e-06, | |
| "loss": 0.0551, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 6.994077743081727, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.762e-06, | |
| "loss": 0.0545, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 7.0025842575643376, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.742e-06, | |
| "loss": 0.0536, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 7.0111984494454616, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 3.722e-06, | |
| "loss": 0.0441, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 7.019812641326586, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.702e-06, | |
| "loss": 0.0494, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 7.02842683320771, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.6820000000000005e-06, | |
| "loss": 0.0473, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 7.037041025088834, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.6620000000000007e-06, | |
| "loss": 0.0443, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 7.0456552169699584, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.6420000000000005e-06, | |
| "loss": 0.0454, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 7.0542694088510824, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.6220000000000006e-06, | |
| "loss": 0.0465, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 7.0628836007322064, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.6020000000000004e-06, | |
| "loss": 0.0463, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 7.0714977926133304, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.5820000000000006e-06, | |
| "loss": 0.0454, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 7.0801119844944544, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.5620000000000004e-06, | |
| "loss": 0.0453, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 7.0887261763755784, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 3.5420000000000006e-06, | |
| "loss": 0.0459, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 7.097340368256703, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 3.5220000000000003e-06, | |
| "loss": 0.0462, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 7.105954560137827, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 3.5020000000000005e-06, | |
| "loss": 0.0486, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 7.114568752018951, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.4820000000000003e-06, | |
| "loss": 0.0436, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 7.123182943900075, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.4620000000000005e-06, | |
| "loss": 0.0462, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 7.131797135781199, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 3.4420000000000002e-06, | |
| "loss": 0.045, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 7.140411327662323, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.4220000000000004e-06, | |
| "loss": 0.0461, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 7.149025519543448, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.402e-06, | |
| "loss": 0.0453, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 7.157639711424572, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.3820000000000004e-06, | |
| "loss": 0.045, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 7.166253903305696, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.362e-06, | |
| "loss": 0.0483, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 7.17486809518682, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.3420000000000004e-06, | |
| "loss": 0.0452, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 7.183482287067944, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.322e-06, | |
| "loss": 0.0485, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 7.192096478949068, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.3020000000000003e-06, | |
| "loss": 0.0485, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 7.200710670830193, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.282e-06, | |
| "loss": 0.0461, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 7.209324862711317, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.2620000000000003e-06, | |
| "loss": 0.0498, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 7.217939054592441, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 3.242e-06, | |
| "loss": 0.0469, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 7.226553246473565, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 3.2220000000000002e-06, | |
| "loss": 0.0428, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 7.235167438354689, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.202e-06, | |
| "loss": 0.0449, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 7.243781630235813, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.182e-06, | |
| "loss": 0.0441, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 7.252395822116938, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.162e-06, | |
| "loss": 0.0452, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 7.261010013998062, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 3.142e-06, | |
| "loss": 0.0458, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 7.269624205879186, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.122e-06, | |
| "loss": 0.0421, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 7.27823839776031, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.102e-06, | |
| "loss": 0.0443, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 7.286852589641434, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 3.082e-06, | |
| "loss": 0.0446, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 7.295466781522558, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 3.0620000000000005e-06, | |
| "loss": 0.0449, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 7.304080973403683, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.0420000000000007e-06, | |
| "loss": 0.0413, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 7.312695165284807, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.0220000000000005e-06, | |
| "loss": 0.0456, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 7.321309357165931, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.0020000000000006e-06, | |
| "loss": 0.0459, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 7.329923549047055, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.9820000000000004e-06, | |
| "loss": 0.0477, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 7.338537740928179, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.9620000000000006e-06, | |
| "loss": 0.0462, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 7.347151932809304, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.9420000000000004e-06, | |
| "loss": 0.0428, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 7.355766124690428, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.9220000000000006e-06, | |
| "loss": 0.0438, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 7.364380316571552, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 2.9020000000000003e-06, | |
| "loss": 0.0451, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 7.372994508452676, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.8820000000000005e-06, | |
| "loss": 0.0468, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 7.3816087003338, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 2.8620000000000003e-06, | |
| "loss": 0.0462, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 7.390222892214924, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 2.8420000000000005e-06, | |
| "loss": 0.0465, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 7.398837084096048, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.8220000000000003e-06, | |
| "loss": 0.0471, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 7.407451275977173, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.8020000000000004e-06, | |
| "loss": 0.0473, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 7.416065467858297, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.7820000000000002e-06, | |
| "loss": 0.0486, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 7.424679659739421, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.7620000000000004e-06, | |
| "loss": 0.0457, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 7.433293851620545, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.742e-06, | |
| "loss": 0.0481, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 7.441908043501669, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 2.7220000000000004e-06, | |
| "loss": 0.0452, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 7.450522235382794, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.702e-06, | |
| "loss": 0.0428, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 7.459136427263918, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.6820000000000003e-06, | |
| "loss": 0.0508, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 7.467750619145042, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 2.662e-06, | |
| "loss": 0.0443, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 7.476364811026166, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.6420000000000003e-06, | |
| "loss": 0.047, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 7.48497900290729, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 2.622e-06, | |
| "loss": 0.0457, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 7.493593194788414, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 2.6020000000000002e-06, | |
| "loss": 0.0467, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 7.502207386669538, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.582e-06, | |
| "loss": 0.0468, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 7.5108215785506625, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.562e-06, | |
| "loss": 0.0447, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 7.5194357704317865, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.542e-06, | |
| "loss": 0.0449, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 7.5280499623129105, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.522e-06, | |
| "loss": 0.0466, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 7.5366641541940345, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.502e-06, | |
| "loss": 0.0429, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 7.5452783460751585, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 2.482e-06, | |
| "loss": 0.0474, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 7.553892537956283, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 2.4620000000000003e-06, | |
| "loss": 0.0469, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 7.562506729837407, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.442e-06, | |
| "loss": 0.0467, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 7.571120921718531, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 2.4220000000000003e-06, | |
| "loss": 0.0497, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 7.579735113599655, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 2.402e-06, | |
| "loss": 0.045, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 7.588349305480779, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 2.3820000000000002e-06, | |
| "loss": 0.0472, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 7.596963497361903, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 2.362e-06, | |
| "loss": 0.0495, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 7.605577689243028, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.342e-06, | |
| "loss": 0.0441, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 7.614191881124152, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.322e-06, | |
| "loss": 0.0466, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 7.622806073005276, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.302e-06, | |
| "loss": 0.0447, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 7.6314202648864, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.282e-06, | |
| "loss": 0.0469, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 7.640034456767524, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.262e-06, | |
| "loss": 0.0475, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 7.648648648648649, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.2420000000000003e-06, | |
| "loss": 0.0457, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 7.657262840529773, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.222e-06, | |
| "loss": 0.0437, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 7.665877032410897, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.2020000000000003e-06, | |
| "loss": 0.0467, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 7.674491224292021, | |
| "grad_norm": 2.75, | |
| "learning_rate": 2.182e-06, | |
| "loss": 0.0472, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 7.683105416173145, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 2.1620000000000002e-06, | |
| "loss": 0.0477, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 7.691719608054269, | |
| "grad_norm": 2.875, | |
| "learning_rate": 2.142e-06, | |
| "loss": 0.0469, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 7.700333799935393, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 2.122e-06, | |
| "loss": 0.0481, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 7.708947991816518, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 2.102e-06, | |
| "loss": 0.0464, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 7.717562183697642, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.082e-06, | |
| "loss": 0.0469, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 7.726176375578766, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.062e-06, | |
| "loss": 0.045, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 7.73479056745989, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.042e-06, | |
| "loss": 0.0434, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 7.743404759341014, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 2.022e-06, | |
| "loss": 0.0437, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 7.752018951222139, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.002e-06, | |
| "loss": 0.0469, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.760633143103263, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.982e-06, | |
| "loss": 0.0458, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 7.769247334984387, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.9620000000000004e-06, | |
| "loss": 0.049, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 7.777861526865511, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.942e-06, | |
| "loss": 0.0472, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 7.786475718746635, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.9220000000000004e-06, | |
| "loss": 0.0474, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 7.795089910627759, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.9020000000000002e-06, | |
| "loss": 0.0456, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 7.803704102508883, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.8820000000000001e-06, | |
| "loss": 0.0427, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 7.812318294390008, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.8620000000000001e-06, | |
| "loss": 0.0441, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 7.820932486271132, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.8420000000000001e-06, | |
| "loss": 0.0435, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 7.829546678152256, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.822e-06, | |
| "loss": 0.0489, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 7.83816087003338, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.802e-06, | |
| "loss": 0.0443, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 7.846775061914504, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.782e-06, | |
| "loss": 0.045, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 7.855389253795629, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1.762e-06, | |
| "loss": 0.0422, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 7.864003445676753, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.742e-06, | |
| "loss": 0.0454, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 7.872617637557877, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.722e-06, | |
| "loss": 0.0441, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 7.881231829439001, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.702e-06, | |
| "loss": 0.046, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 7.889846021320125, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.682e-06, | |
| "loss": 0.0467, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 7.898460213201249, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.662e-06, | |
| "loss": 0.0486, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 7.907074405082374, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.6420000000000003e-06, | |
| "loss": 0.0475, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 7.915688596963498, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.6220000000000003e-06, | |
| "loss": 0.0476, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 7.924302788844622, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.6020000000000003e-06, | |
| "loss": 0.0425, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 7.932916980725746, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.5820000000000003e-06, | |
| "loss": 0.0447, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 7.94153117260687, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.5620000000000002e-06, | |
| "loss": 0.0484, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 7.950145364487994, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.5420000000000002e-06, | |
| "loss": 0.0455, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 7.9587595563691185, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.5220000000000002e-06, | |
| "loss": 0.0462, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 7.9673737482502425, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.5020000000000002e-06, | |
| "loss": 0.045, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 7.9759879401313665, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.4820000000000002e-06, | |
| "loss": 0.0447, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 7.9846021320124905, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.4620000000000001e-06, | |
| "loss": 0.0472, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 7.9932163238936145, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.4420000000000001e-06, | |
| "loss": 0.047, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 8.001722838376224, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.4220000000000001e-06, | |
| "loss": 0.0444, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 8.010337030257348, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.402e-06, | |
| "loss": 0.0444, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 8.018951222138472, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.382e-06, | |
| "loss": 0.0397, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 8.027565414019596, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.362e-06, | |
| "loss": 0.0425, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 8.036179605900722, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.3420000000000002e-06, | |
| "loss": 0.043, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 8.044793797781846, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.3220000000000002e-06, | |
| "loss": 0.0451, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 8.05340798966297, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.3020000000000002e-06, | |
| "loss": 0.0442, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 8.062022181544094, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.2820000000000002e-06, | |
| "loss": 0.0428, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 8.070636373425218, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.2620000000000002e-06, | |
| "loss": 0.0441, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 8.079250565306342, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.2420000000000001e-06, | |
| "loss": 0.045, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 8.087864757187466, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.2220000000000001e-06, | |
| "loss": 0.0443, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 8.09647894906859, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.202e-06, | |
| "loss": 0.0422, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 8.105093140949714, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.182e-06, | |
| "loss": 0.0417, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 8.113707332830838, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.162e-06, | |
| "loss": 0.0432, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 8.122321524711962, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.142e-06, | |
| "loss": 0.0419, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 8.130935716593086, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.122e-06, | |
| "loss": 0.0416, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 8.139549908474212, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1020000000000002e-06, | |
| "loss": 0.0422, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 8.148164100355336, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.0820000000000002e-06, | |
| "loss": 0.0455, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 8.15677829223646, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.0620000000000002e-06, | |
| "loss": 0.0455, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 8.165392484117584, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0420000000000001e-06, | |
| "loss": 0.0441, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 8.174006675998708, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.0220000000000001e-06, | |
| "loss": 0.0433, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 8.182620867879832, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.002e-06, | |
| "loss": 0.045, | |
| "step": 9500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.698799670922543e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |