{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 194.0,
      "learning_rate": 7.2e-07,
      "loss": 1.6508,
      "step": 10
    },
    {
      "epoch": 0.008,
      "grad_norm": 166.0,
      "learning_rate": 1.52e-06,
      "loss": 1.6883,
      "step": 20
    },
    {
      "epoch": 0.012,
      "grad_norm": 161.0,
      "learning_rate": 2.3200000000000002e-06,
      "loss": 1.4875,
      "step": 30
    },
    {
      "epoch": 0.016,
      "grad_norm": 219.0,
      "learning_rate": 3.12e-06,
      "loss": 1.6227,
      "step": 40
    },
    {
      "epoch": 0.02,
      "grad_norm": 116.0,
      "learning_rate": 3.920000000000001e-06,
      "loss": 1.4516,
      "step": 50
    },
    {
      "epoch": 0.024,
      "grad_norm": 23.375,
      "learning_rate": 4.7200000000000005e-06,
      "loss": 1.3695,
      "step": 60
    },
    {
      "epoch": 0.028,
      "grad_norm": 8.8125,
      "learning_rate": 5.5200000000000005e-06,
      "loss": 1.1266,
      "step": 70
    },
    {
      "epoch": 0.032,
      "grad_norm": 27.125,
      "learning_rate": 6.3200000000000005e-06,
      "loss": 0.993,
      "step": 80
    },
    {
      "epoch": 0.036,
      "grad_norm": 11.875,
      "learning_rate": 7.1200000000000004e-06,
      "loss": 0.9371,
      "step": 90
    },
    {
      "epoch": 0.04,
      "grad_norm": 234.0,
      "learning_rate": 7.92e-06,
      "loss": 0.9625,
      "step": 100
    },
    {
      "epoch": 0.044,
      "grad_norm": 4.25,
      "learning_rate": 8.720000000000001e-06,
      "loss": 0.8336,
      "step": 110
    },
    {
      "epoch": 0.048,
      "grad_norm": 4.0,
      "learning_rate": 9.52e-06,
      "loss": 0.7688,
      "step": 120
    },
    {
      "epoch": 0.052,
      "grad_norm": 3.890625,
      "learning_rate": 1.0320000000000001e-05,
      "loss": 0.8051,
      "step": 130
    },
    {
      "epoch": 0.056,
      "grad_norm": 3.015625,
      "learning_rate": 1.1120000000000002e-05,
      "loss": 0.7707,
      "step": 140
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.890625,
      "learning_rate": 1.1920000000000001e-05,
      "loss": 0.7824,
      "step": 150
    },
    {
      "epoch": 0.064,
      "grad_norm": 3.4375,
      "learning_rate": 1.2720000000000002e-05,
      "loss": 0.7668,
      "step": 160
    },
    {
      "epoch": 0.068,
      "grad_norm": 2.953125,
      "learning_rate": 1.3520000000000003e-05,
      "loss": 0.6629,
      "step": 170
    },
    {
      "epoch": 0.072,
      "grad_norm": 4.1875,
      "learning_rate": 1.432e-05,
      "loss": 0.7113,
      "step": 180
    },
    {
      "epoch": 0.076,
      "grad_norm": 3.03125,
      "learning_rate": 1.5120000000000001e-05,
      "loss": 0.6758,
      "step": 190
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.21875,
      "learning_rate": 1.5920000000000003e-05,
      "loss": 0.6562,
      "step": 200
    },
    {
      "epoch": 0.084,
      "grad_norm": 2.40625,
      "learning_rate": 1.672e-05,
      "loss": 0.6422,
      "step": 210
    },
    {
      "epoch": 0.088,
      "grad_norm": 2.4375,
      "learning_rate": 1.752e-05,
      "loss": 0.6508,
      "step": 220
    },
    {
      "epoch": 0.092,
      "grad_norm": 3.09375,
      "learning_rate": 1.832e-05,
      "loss": 0.6668,
      "step": 230
    },
    {
      "epoch": 0.096,
      "grad_norm": 2.28125,
      "learning_rate": 1.912e-05,
      "loss": 0.6273,
      "step": 240
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.671875,
      "learning_rate": 1.9920000000000002e-05,
      "loss": 0.6539,
      "step": 250
    },
    {
      "epoch": 0.104,
      "grad_norm": 2.875,
      "learning_rate": 1.9999210442038164e-05,
      "loss": 0.657,
      "step": 260
    },
    {
      "epoch": 0.108,
      "grad_norm": 2.421875,
      "learning_rate": 1.9996481265944146e-05,
      "loss": 0.6375,
      "step": 270
    },
    {
      "epoch": 0.112,
      "grad_norm": 2.265625,
      "learning_rate": 1.9991803256020393e-05,
      "loss": 0.6051,
      "step": 280
    },
    {
      "epoch": 0.116,
      "grad_norm": 2.3125,
      "learning_rate": 1.99851773242542e-05,
      "loss": 0.591,
      "step": 290
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.96875,
      "learning_rate": 1.99766047623841e-05,
      "loss": 0.5902,
      "step": 300
    },
    {
      "epoch": 0.124,
      "grad_norm": 2.375,
      "learning_rate": 1.996608724164801e-05,
      "loss": 0.6107,
      "step": 310
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.4375,
      "learning_rate": 1.995362681245744e-05,
      "loss": 0.5801,
      "step": 320
    },
    {
      "epoch": 0.132,
      "grad_norm": 2.421875,
      "learning_rate": 1.9939225903997748e-05,
      "loss": 0.6066,
      "step": 330
    },
    {
      "epoch": 0.136,
      "grad_norm": 2.265625,
      "learning_rate": 1.992288732375458e-05,
      "loss": 0.5309,
      "step": 340
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.109375,
      "learning_rate": 1.9904614256966514e-05,
      "loss": 0.5473,
      "step": 350
    },
    {
      "epoch": 0.144,
      "grad_norm": 2.015625,
      "learning_rate": 1.9884410266004134e-05,
      "loss": 0.5816,
      "step": 360
    },
    {
      "epoch": 0.148,
      "grad_norm": 3.0625,
      "learning_rate": 1.986227928967551e-05,
      "loss": 0.5543,
      "step": 370
    },
    {
      "epoch": 0.152,
      "grad_norm": 1.9296875,
      "learning_rate": 1.983822564245833e-05,
      "loss": 0.5283,
      "step": 380
    },
    {
      "epoch": 0.156,
      "grad_norm": 2.421875,
      "learning_rate": 1.981225401365877e-05,
      "loss": 0.5176,
      "step": 390
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.7890625,
      "learning_rate": 1.9784369466497333e-05,
      "loss": 0.5184,
      "step": 400
    },
    {
      "epoch": 0.164,
      "grad_norm": 2.0625,
      "learning_rate": 1.9754577437121733e-05,
      "loss": 0.5652,
      "step": 410
    },
    {
      "epoch": 0.168,
      "grad_norm": 3.125,
      "learning_rate": 1.9722883733547128e-05,
      "loss": 0.5262,
      "step": 420
    },
    {
      "epoch": 0.172,
      "grad_norm": 1.953125,
      "learning_rate": 1.968929453452383e-05,
      "loss": 0.5352,
      "step": 430
    },
    {
      "epoch": 0.176,
      "grad_norm": 1.6015625,
      "learning_rate": 1.965381638833274e-05,
      "loss": 0.4859,
      "step": 440
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.78125,
      "learning_rate": 1.9616456211508756e-05,
      "loss": 0.5305,
      "step": 450
    },
    {
      "epoch": 0.184,
      "grad_norm": 1.859375,
      "learning_rate": 1.9577221287492368e-05,
      "loss": 0.5123,
      "step": 460
    },
    {
      "epoch": 0.188,
      "grad_norm": 1.8828125,
      "learning_rate": 1.9536119265209763e-05,
      "loss": 0.5275,
      "step": 470
    },
    {
      "epoch": 0.192,
      "grad_norm": 1.9765625,
      "learning_rate": 1.9493158157581617e-05,
      "loss": 0.5264,
      "step": 480
    },
    {
      "epoch": 0.196,
      "grad_norm": 1.875,
      "learning_rate": 1.9448346339960984e-05,
      "loss": 0.5463,
      "step": 490
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9401692548500504e-05,
      "loss": 0.4748,
      "step": 500
    },
    {
      "epoch": 0.204,
      "grad_norm": 1.875,
      "learning_rate": 1.935320587844926e-05,
      "loss": 0.5252,
      "step": 510
    },
    {
      "epoch": 0.208,
      "grad_norm": 1.890625,
      "learning_rate": 1.9302895782379648e-05,
      "loss": 0.4865,
      "step": 520
    },
    {
      "epoch": 0.212,
      "grad_norm": 1.65625,
      "learning_rate": 1.925077206834458e-05,
      "loss": 0.4857,
      "step": 530
    },
    {
      "epoch": 0.216,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9196844897965393e-05,
      "loss": 0.4979,
      "step": 540
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.7734375,
      "learning_rate": 1.914112478445079e-05,
      "loss": 0.4949,
      "step": 550
    },
    {
      "epoch": 0.224,
      "grad_norm": 1.65625,
      "learning_rate": 1.9083622590547313e-05,
      "loss": 0.4988,
      "step": 560
    },
    {
      "epoch": 0.228,
      "grad_norm": 1.9296875,
      "learning_rate": 1.9024349526421596e-05,
      "loss": 0.5363,
      "step": 570
    },
    {
      "epoch": 0.232,
      "grad_norm": 2.078125,
      "learning_rate": 1.896331714747493e-05,
      "loss": 0.5225,
      "step": 580
    },
    {
      "epoch": 0.236,
      "grad_norm": 1.9140625,
      "learning_rate": 1.8900537352090523e-05,
      "loss": 0.5123,
      "step": 590
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.59375,
      "learning_rate": 1.8836022379313884e-05,
      "loss": 0.4934,
      "step": 600
    },
    {
      "epoch": 0.244,
      "grad_norm": 1.765625,
      "learning_rate": 1.8769784806466768e-05,
      "loss": 0.4902,
      "step": 610
    },
    {
      "epoch": 0.248,
      "grad_norm": 1.6875,
      "learning_rate": 1.870183754669526e-05,
      "loss": 0.4932,
      "step": 620
    },
    {
      "epoch": 0.252,
      "grad_norm": 1.9453125,
      "learning_rate": 1.863219384645227e-05,
      "loss": 0.4648,
      "step": 630
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.109375,
      "learning_rate": 1.8560867282915164e-05,
      "loss": 0.5135,
      "step": 640
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.859375,
      "learning_rate": 1.848787176133882e-05,
      "loss": 0.4836,
      "step": 650
    },
    {
      "epoch": 0.264,
      "grad_norm": 2.734375,
      "learning_rate": 1.8413221512344805e-05,
      "loss": 0.5049,
      "step": 660
    },
    {
      "epoch": 0.268,
      "grad_norm": 2.171875,
      "learning_rate": 1.8336931089147076e-05,
      "loss": 0.5375,
      "step": 670
    },
    {
      "epoch": 0.272,
      "grad_norm": 1.7109375,
      "learning_rate": 1.8259015364714786e-05,
      "loss": 0.5139,
      "step": 680
    },
    {
      "epoch": 0.276,
      "grad_norm": 1.578125,
      "learning_rate": 1.8179489528872808e-05,
      "loss": 0.5223,
      "step": 690
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6484375,
      "learning_rate": 1.80983690853404e-05,
      "loss": 0.5131,
      "step": 700
    },
    {
      "epoch": 0.284,
      "grad_norm": 1.9921875,
      "learning_rate": 1.8015669848708768e-05,
      "loss": 0.468,
      "step": 710
    },
    {
      "epoch": 0.288,
      "grad_norm": 1.8828125,
      "learning_rate": 1.793140794135795e-05,
      "loss": 0.4797,
      "step": 720
    },
    {
      "epoch": 0.292,
      "grad_norm": 2.015625,
      "learning_rate": 1.7845599790313735e-05,
      "loss": 0.4699,
      "step": 730
    },
    {
      "epoch": 0.296,
      "grad_norm": 1.6171875,
      "learning_rate": 1.7758262124045195e-05,
      "loss": 0.5324,
      "step": 740
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.03125,
      "learning_rate": 1.7669411969203417e-05,
      "loss": 0.4785,
      "step": 750
    },
    {
      "epoch": 0.304,
      "grad_norm": 1.71875,
      "learning_rate": 1.7579066647302134e-05,
      "loss": 0.4814,
      "step": 760
    },
    {
      "epoch": 0.308,
      "grad_norm": 1.6171875,
      "learning_rate": 1.7487243771340862e-05,
      "loss": 0.4537,
      "step": 770
    },
    {
      "epoch": 0.312,
      "grad_norm": 4.3125,
      "learning_rate": 1.7393961242371203e-05,
      "loss": 0.4742,
      "step": 780
    },
    {
      "epoch": 0.316,
      "grad_norm": 2.125,
      "learning_rate": 1.7299237246007018e-05,
      "loss": 0.4926,
      "step": 790
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.75,
      "learning_rate": 1.720309024887907e-05,
      "loss": 0.4881,
      "step": 800
    },
    {
      "epoch": 0.324,
      "grad_norm": 1.8984375,
      "learning_rate": 1.710553899503496e-05,
      "loss": 0.4857,
      "step": 810
    },
    {
      "epoch": 0.328,
      "grad_norm": 1.7578125,
      "learning_rate": 1.700660250228492e-05,
      "loss": 0.5352,
      "step": 820
    },
    {
      "epoch": 0.332,
      "grad_norm": 1.5390625,
      "learning_rate": 1.690630005849423e-05,
      "loss": 0.4445,
      "step": 830
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.890625,
      "learning_rate": 1.6804651217823055e-05,
      "loss": 0.5189,
      "step": 840
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.546875,
      "learning_rate": 1.6701675796914284e-05,
      "loss": 0.4311,
      "step": 850
    },
    {
      "epoch": 0.344,
      "grad_norm": 1.7109375,
      "learning_rate": 1.6597393871030264e-05,
      "loss": 0.4873,
      "step": 860
    },
    {
      "epoch": 0.348,
      "grad_norm": 1.6875,
      "learning_rate": 1.649182577013906e-05,
      "loss": 0.4633,
      "step": 870
    },
    {
      "epoch": 0.352,
      "grad_norm": 1.5625,
      "learning_rate": 1.6384992074951124e-05,
      "loss": 0.4436,
      "step": 880
    },
    {
      "epoch": 0.356,
      "grad_norm": 1.65625,
      "learning_rate": 1.6276913612907005e-05,
      "loss": 0.4871,
      "step": 890
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.7421875,
      "learning_rate": 1.6167611454117027e-05,
      "loss": 0.4613,
      "step": 900
    },
    {
      "epoch": 0.364,
      "grad_norm": 1.65625,
      "learning_rate": 1.6057106907253617e-05,
      "loss": 0.4684,
      "step": 910
    },
    {
      "epoch": 0.368,
      "grad_norm": 1.734375,
      "learning_rate": 1.5945421515397135e-05,
      "loss": 0.5123,
      "step": 920
    },
    {
      "epoch": 0.372,
      "grad_norm": 1.7890625,
      "learning_rate": 1.5832577051836016e-05,
      "loss": 0.5119,
      "step": 930
    },
    {
      "epoch": 0.376,
      "grad_norm": 1.4375,
      "learning_rate": 1.5718595515822027e-05,
      "loss": 0.4494,
      "step": 940
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.609375,
      "learning_rate": 1.5603499128281447e-05,
      "loss": 0.4803,
      "step": 950
    },
    {
      "epoch": 0.384,
      "grad_norm": 1.7265625,
      "learning_rate": 1.5487310327483087e-05,
      "loss": 0.5055,
      "step": 960
    },
    {
      "epoch": 0.388,
      "grad_norm": 1.546875,
      "learning_rate": 1.5370051764663872e-05,
      "loss": 0.4959,
      "step": 970
    },
    {
      "epoch": 0.392,
      "grad_norm": 1.8515625,
      "learning_rate": 1.5251746299612959e-05,
      "loss": 0.4793,
      "step": 980
    },
    {
      "epoch": 0.396,
      "grad_norm": 1.8203125,
      "learning_rate": 1.5132416996215171e-05,
      "loss": 0.516,
      "step": 990
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.578125,
      "learning_rate": 1.5012087117954643e-05,
      "loss": 0.4549,
      "step": 1000
    },
    {
      "epoch": 0.404,
      "grad_norm": 1.46875,
      "learning_rate": 1.4890780123379565e-05,
      "loss": 0.467,
      "step": 1010
    },
    {
      "epoch": 0.408,
      "grad_norm": 1.6953125,
      "learning_rate": 1.4768519661528879e-05,
      "loss": 0.492,
      "step": 1020
    },
    {
      "epoch": 0.412,
      "grad_norm": 1.6640625,
      "learning_rate": 1.464532956732188e-05,
      "loss": 0.4711,
      "step": 1030
    },
    {
      "epoch": 0.416,
      "grad_norm": 4.03125,
      "learning_rate": 1.4521233856911507e-05,
      "loss": 0.4842,
      "step": 1040
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.59375,
      "learning_rate": 1.43962567230024e-05,
      "loss": 0.4535,
      "step": 1050
    },
    {
      "epoch": 0.424,
      "grad_norm": 1.6171875,
      "learning_rate": 1.4270422530134433e-05,
      "loss": 0.4813,
      "step": 1060
    },
    {
      "epoch": 0.428,
      "grad_norm": 1.2578125,
      "learning_rate": 1.4143755809932843e-05,
      "loss": 0.4453,
      "step": 1070
    },
    {
      "epoch": 0.432,
      "grad_norm": 1.5625,
      "learning_rate": 1.4016281256325702e-05,
      "loss": 0.4578,
      "step": 1080
    },
    {
      "epoch": 0.436,
      "grad_norm": 1.953125,
      "learning_rate": 1.388802372072981e-05,
      "loss": 0.4559,
      "step": 1090
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.9296875,
      "learning_rate": 1.3759008207205869e-05,
      "loss": 0.5078,
      "step": 1100
    },
    {
      "epoch": 0.444,
      "grad_norm": 1.75,
      "learning_rate": 1.3629259867583864e-05,
      "loss": 0.4645,
      "step": 1110
    },
    {
      "epoch": 0.448,
      "grad_norm": 1.8046875,
      "learning_rate": 1.349880399655969e-05,
      "loss": 0.4395,
      "step": 1120
    },
    {
      "epoch": 0.452,
      "grad_norm": 1.4921875,
      "learning_rate": 1.3367666026763884e-05,
      "loss": 0.4641,
      "step": 1130
    },
    {
      "epoch": 0.456,
      "grad_norm": 1.625,
      "learning_rate": 1.3235871523803496e-05,
      "loss": 0.4324,
      "step": 1140
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.6796875,
      "learning_rate": 1.3103446181278015e-05,
      "loss": 0.4684,
      "step": 1150
    },
    {
      "epoch": 0.464,
      "grad_norm": 1.78125,
      "learning_rate": 1.297041581577035e-05,
      "loss": 0.4635,
      "step": 1160
    },
    {
      "epoch": 0.468,
      "grad_norm": 1.765625,
      "learning_rate": 1.2836806361813846e-05,
      "loss": 0.4705,
      "step": 1170
    },
    {
      "epoch": 0.472,
      "grad_norm": 1.53125,
      "learning_rate": 1.270264386683628e-05,
      "loss": 0.4299,
      "step": 1180
    },
    {
      "epoch": 0.476,
      "grad_norm": 1.6328125,
      "learning_rate": 1.256795448608188e-05,
      "loss": 0.4813,
      "step": 1190
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.6953125,
      "learning_rate": 1.2432764477512294e-05,
      "loss": 0.4609,
      "step": 1200
    },
    {
      "epoch": 0.484,
      "grad_norm": 1.8046875,
      "learning_rate": 1.2297100196687557e-05,
      "loss": 0.4916,
      "step": 1210
    },
    {
      "epoch": 0.488,
      "grad_norm": 1.5703125,
      "learning_rate": 1.2160988091628023e-05,
      "loss": 0.459,
      "step": 1220
    },
    {
      "epoch": 0.492,
      "grad_norm": 2.078125,
      "learning_rate": 1.202445469765826e-05,
      "loss": 0.4824,
      "step": 1230
    },
    {
      "epoch": 0.496,
      "grad_norm": 1.78125,
      "learning_rate": 1.1887526632233954e-05,
      "loss": 0.4771,
      "step": 1240
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.8359375,
      "learning_rate": 1.1750230589752763e-05,
      "loss": 0.4734,
      "step": 1250
    },
    {
      "epoch": 0.504,
      "grad_norm": 1.734375,
      "learning_rate": 1.1612593336350209e-05,
      "loss": 0.5332,
      "step": 1260
    },
    {
      "epoch": 0.508,
      "grad_norm": 1.40625,
      "learning_rate": 1.1474641704681551e-05,
      "loss": 0.4105,
      "step": 1270
    },
    {
      "epoch": 0.512,
      "grad_norm": 1.40625,
      "learning_rate": 1.1336402588690727e-05,
      "loss": 0.4398,
      "step": 1280
    },
    {
      "epoch": 0.516,
      "grad_norm": 1.59375,
      "learning_rate": 1.1197902938367297e-05,
      "loss": 0.4486,
      "step": 1290
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.375,
      "learning_rate": 1.105916975449252e-05,
      "loss": 0.4705,
      "step": 1300
    },
    {
      "epoch": 0.524,
      "grad_norm": 1.5859375,
      "learning_rate": 1.0920230083375474e-05,
      "loss": 0.4553,
      "step": 1310
    },
    {
      "epoch": 0.528,
      "grad_norm": 1.2734375,
      "learning_rate": 1.0781111011580336e-05,
      "loss": 0.4098,
      "step": 1320
    },
    {
      "epoch": 0.532,
      "grad_norm": 1.609375,
      "learning_rate": 1.0641839660645806e-05,
      "loss": 0.4551,
      "step": 1330
    },
    {
      "epoch": 0.536,
      "grad_norm": 1.6015625,
      "learning_rate": 1.0502443181797696e-05,
      "loss": 0.4047,
      "step": 1340
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.609375,
      "learning_rate": 1.036294875065576e-05,
      "loss": 0.4584,
      "step": 1350
    },
    {
      "epoch": 0.544,
      "grad_norm": 1.4921875,
      "learning_rate": 1.0223383561935738e-05,
      "loss": 0.4338,
      "step": 1360
    },
    {
      "epoch": 0.548,
      "grad_norm": 1.8125,
      "learning_rate": 1.0083774824147707e-05,
      "loss": 0.4584,
      "step": 1370
    },
    {
      "epoch": 0.552,
      "grad_norm": 1.5,
      "learning_rate": 9.944149754291719e-06,
      "loss": 0.442,
      "step": 1380
    },
    {
      "epoch": 0.556,
      "grad_norm": 1.296875,
      "learning_rate": 9.80453557255179e-06,
      "loss": 0.4059,
      "step": 1390
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.4375,
      "learning_rate": 9.664959496989286e-06,
      "loss": 0.4334,
      "step": 1400
    },
    {
      "epoch": 0.564,
      "grad_norm": 1.515625,
      "learning_rate": 9.525448738236691e-06,
      "loss": 0.4059,
      "step": 1410
    },
    {
      "epoch": 0.568,
      "grad_norm": 1.5078125,
      "learning_rate": 9.386030494192847e-06,
      "loss": 0.4322,
      "step": 1420
    },
    {
      "epoch": 0.572,
      "grad_norm": 1.546875,
      "learning_rate": 9.246731944720675e-06,
      "loss": 0.435,
      "step": 1430
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.5,
      "learning_rate": 9.107580246348395e-06,
      "loss": 0.4232,
      "step": 1440
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.6328125,
      "learning_rate": 8.968602526975329e-06,
      "loss": 0.4484,
      "step": 1450
    },
    {
      "epoch": 0.584,
      "grad_norm": 1.4609375,
      "learning_rate": 8.829825880583228e-06,
      "loss": 0.4379,
      "step": 1460
    },
    {
      "epoch": 0.588,
      "grad_norm": 1.7734375,
      "learning_rate": 8.69127736195428e-06,
      "loss": 0.4096,
      "step": 1470
    },
    {
      "epoch": 0.592,
      "grad_norm": 1.3984375,
      "learning_rate": 8.552983981396709e-06,
      "loss": 0.4352,
      "step": 1480
    },
    {
      "epoch": 0.596,
      "grad_norm": 1.546875,
      "learning_rate": 8.414972699479076e-06,
      "loss": 0.4191,
      "step": 1490
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.46875,
      "learning_rate": 8.277270421774234e-06,
      "loss": 0.4219,
      "step": 1500
    },
    {
      "epoch": 0.604,
      "grad_norm": 1.734375,
      "learning_rate": 8.139903993614069e-06,
      "loss": 0.4318,
      "step": 1510
    },
    {
      "epoch": 0.608,
      "grad_norm": 1.3359375,
      "learning_rate": 8.00290019485593e-06,
      "loss": 0.3777,
      "step": 1520
    },
    {
      "epoch": 0.612,
      "grad_norm": 1.6015625,
      "learning_rate": 7.866285734661842e-06,
      "loss": 0.4092,
      "step": 1530
    },
    {
      "epoch": 0.616,
      "grad_norm": 1.7109375,
      "learning_rate": 7.730087246291503e-06,
      "loss": 0.468,
      "step": 1540
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.71875,
      "learning_rate": 7.594331281910082e-06,
      "loss": 0.4207,
      "step": 1550
    },
    {
      "epoch": 0.624,
      "grad_norm": 1.6015625,
      "learning_rate": 7.4590443074118325e-06,
      "loss": 0.3861,
      "step": 1560
    },
    {
      "epoch": 0.628,
      "grad_norm": 1.8203125,
      "learning_rate": 7.324252697260475e-06,
      "loss": 0.3957,
      "step": 1570
    },
    {
      "epoch": 0.632,
      "grad_norm": 1.6875,
      "learning_rate": 7.189982729347491e-06,
      "loss": 0.4182,
      "step": 1580
    },
    {
      "epoch": 0.636,
      "grad_norm": 1.6328125,
      "learning_rate": 7.056260579869165e-06,
      "loss": 0.4166,
      "step": 1590
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.6484375,
      "learning_rate": 6.923112318223497e-06,
      "loss": 0.4355,
      "step": 1600
    },
    {
      "epoch": 0.644,
      "grad_norm": 1.5703125,
      "learning_rate": 6.790563901927907e-06,
      "loss": 0.385,
      "step": 1610
    },
    {
      "epoch": 0.648,
      "grad_norm": 1.640625,
      "learning_rate": 6.658641171558785e-06,
      "loss": 0.409,
      "step": 1620
    },
    {
      "epoch": 0.652,
      "grad_norm": 1.375,
      "learning_rate": 6.52736984571381e-06,
      "loss": 0.3865,
      "step": 1630
    },
    {
      "epoch": 0.656,
      "grad_norm": 1.6015625,
      "learning_rate": 6.396775515998055e-06,
      "loss": 0.3941,
      "step": 1640
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.4296875,
      "learning_rate": 6.2668836420348535e-06,
      "loss": 0.3771,
      "step": 1650
    },
    {
      "epoch": 0.664,
      "grad_norm": 1.7421875,
      "learning_rate": 6.137719546502401e-06,
      "loss": 0.4125,
      "step": 1660
    },
    {
      "epoch": 0.668,
      "grad_norm": 1.6796875,
      "learning_rate": 6.009308410197048e-06,
      "loss": 0.399,
      "step": 1670
    },
    {
      "epoch": 0.672,
      "grad_norm": 1.7421875,
      "learning_rate": 5.881675267124254e-06,
      "loss": 0.4168,
      "step": 1680
    },
    {
      "epoch": 0.676,
      "grad_norm": 1.5078125,
      "learning_rate": 5.754844999618144e-06,
      "loss": 0.3693,
      "step": 1690
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.6484375,
      "learning_rate": 5.628842333490674e-06,
      "loss": 0.4236,
      "step": 1700
    },
    {
      "epoch": 0.684,
      "grad_norm": 1.5234375,
      "learning_rate": 5.50369183321126e-06,
      "loss": 0.4057,
      "step": 1710
    },
    {
      "epoch": 0.688,
      "grad_norm": 1.703125,
      "learning_rate": 5.379417897117917e-06,
      "loss": 0.3937,
      "step": 1720
    },
    {
      "epoch": 0.692,
      "grad_norm": 1.78125,
      "learning_rate": 5.256044752660709e-06,
      "loss": 0.3998,
      "step": 1730
    },
    {
      "epoch": 0.696,
      "grad_norm": 1.8984375,
      "learning_rate": 5.133596451678603e-06,
      "loss": 0.4027,
      "step": 1740
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.625,
      "learning_rate": 5.012096865710494e-06,
      "loss": 0.4459,
      "step": 1750
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.5078125,
      "learning_rate": 4.891569681341403e-06,
      "loss": 0.3854,
      "step": 1760
    },
    {
      "epoch": 0.708,
      "grad_norm": 1.7734375,
      "learning_rate": 4.772038395584735e-06,
      "loss": 0.4125,
      "step": 1770
    },
    {
      "epoch": 0.712,
      "grad_norm": 1.3359375,
      "learning_rate": 4.6535263113014885e-06,
      "loss": 0.3865,
      "step": 1780
    },
    {
      "epoch": 0.716,
      "grad_norm": 1.578125,
      "learning_rate": 4.53605653265731e-06,
      "loss": 0.3973,
      "step": 1790
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.4609375,
      "learning_rate": 4.419651960618302e-06,
      "loss": 0.4137,
      "step": 1800
    },
    {
      "epoch": 0.724,
      "grad_norm": 1.6015625,
      "learning_rate": 4.304335288486426e-06,
      "loss": 0.4145,
      "step": 1810
    },
    {
      "epoch": 0.728,
      "grad_norm": 1.4296875,
      "learning_rate": 4.190128997475402e-06,
      "loss": 0.3926,
      "step": 1820
    },
    {
      "epoch": 0.732,
      "grad_norm": 1.71875,
      "learning_rate": 4.0770553523279535e-06,
      "loss": 0.3898,
      "step": 1830
    },
    {
      "epoch": 0.736,
      "grad_norm": 1.5625,
      "learning_rate": 3.965136396975235e-06,
      "loss": 0.4184,
      "step": 1840
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.953125,
      "learning_rate": 3.854393950239356e-06,
      "loss": 0.4225,
      "step": 1850
    },
    {
      "epoch": 0.744,
      "grad_norm": 1.75,
      "learning_rate": 3.7448496015797296e-06,
      "loss": 0.3936,
      "step": 1860
    },
    {
      "epoch": 0.748,
      "grad_norm": 1.5703125,
      "learning_rate": 3.636524706884181e-06,
      "loss": 0.4172,
      "step": 1870
    },
    {
      "epoch": 0.752,
      "grad_norm": 1.4765625,
      "learning_rate": 3.5294403843055604e-06,
      "loss": 0.3791,
      "step": 1880
    },
    {
      "epoch": 0.756,
      "grad_norm": 1.5625,
      "learning_rate": 3.4236175101447265e-06,
      "loss": 0.3812,
      "step": 1890
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.53125,
      "learning_rate": 3.3190767147806825e-06,
      "loss": 0.4004,
      "step": 1900
    },
    {
      "epoch": 0.764,
      "grad_norm": 1.5703125,
      "learning_rate": 3.2158383786486204e-06,
      "loss": 0.3898,
      "step": 1910
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.46875,
      "learning_rate": 3.113922628266718e-06,
      "loss": 0.4053,
      "step": 1920
    },
    {
      "epoch": 0.772,
      "grad_norm": 1.65625,
      "learning_rate": 3.013349332312451e-06,
      "loss": 0.4156,
      "step": 1930
    },
    {
      "epoch": 0.776,
      "grad_norm": 1.5859375,
      "learning_rate": 2.9141380977491373e-06,
      "loss": 0.3939,
      "step": 1940
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.359375,
      "learning_rate": 2.816308266003541e-06,
      "loss": 0.3773,
      "step": 1950
    },
    {
      "epoch": 0.784,
      "grad_norm": 1.578125,
      "learning_rate": 2.7198789091951903e-06,
      "loss": 0.374,
      "step": 1960
    },
    {
      "epoch": 0.788,
      "grad_norm": 1.453125,
      "learning_rate": 2.624868826418262e-06,
      "loss": 0.3885,
      "step": 1970
    },
    {
      "epoch": 0.792,
      "grad_norm": 1.4921875,
      "learning_rate": 2.5312965400766475e-06,
      "loss": 0.3891,
      "step": 1980
    },
    {
      "epoch": 0.796,
      "grad_norm": 1.6328125,
      "learning_rate": 2.4391802922729703e-06,
      "loss": 0.3861,
      "step": 1990
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.3125,
      "learning_rate": 2.3485380412522586e-06,
      "loss": 0.3674,
      "step": 2000
    },
    {
      "epoch": 0.804,
      "grad_norm": 1.6015625,
      "learning_rate": 2.259387457900948e-06,
      "loss": 0.391,
      "step": 2010
    },
    {
      "epoch": 0.808,
      "grad_norm": 1.6875,
      "learning_rate": 2.171745922301903e-06,
      "loss": 0.4186,
      "step": 2020
    },
    {
      "epoch": 0.812,
      "grad_norm": 1.53125,
      "learning_rate": 2.0856305203461436e-06,
      "loss": 0.3592,
      "step": 2030
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.4609375,
      "learning_rate": 2.0010580404019066e-06,
      "loss": 0.3977,
      "step": 2040
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.4453125,
      "learning_rate": 1.918044970041729e-06,
      "loss": 0.398,
      "step": 2050
    },
    {
      "epoch": 0.824,
      "grad_norm": 1.6171875,
      "learning_rate": 1.8366074928281608e-06,
      "loss": 0.3736,
      "step": 2060
    },
    {
      "epoch": 0.828,
      "grad_norm": 1.703125,
      "learning_rate": 1.7567614851587444e-06,
      "loss": 0.3521,
      "step": 2070
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.6484375,
      "learning_rate": 1.6785225131708749e-06,
      "loss": 0.3947,
      "step": 2080
    },
    {
      "epoch": 0.836,
      "grad_norm": 1.5234375,
      "learning_rate": 1.601905829707171e-06,
      "loss": 0.377,
      "step": 2090
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.375,
      "learning_rate": 1.526926371341878e-06,
      "loss": 0.3893,
      "step": 2100
    },
    {
      "epoch": 0.844,
      "grad_norm": 1.3984375,
      "learning_rate": 1.4535987554689712e-06,
      "loss": 0.4205,
      "step": 2110
    },
    {
      "epoch": 0.848,
      "grad_norm": 1.546875,
      "learning_rate": 1.381937277452451e-06,
      "loss": 0.4303,
      "step": 2120
    },
    {
      "epoch": 0.852,
      "grad_norm": 1.6953125,
      "learning_rate": 1.3119559078394462e-06,
      "loss": 0.4213,
      "step": 2130
    },
    {
      "epoch": 0.856,
      "grad_norm": 1.8828125,
      "learning_rate": 1.2436682896366282e-06,
      "loss": 0.391,
      "step": 2140
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.6015625,
      "learning_rate": 1.1770877356504684e-06,
      "loss": 0.3861,
      "step": 2150
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.640625,
      "learning_rate": 1.1122272258918864e-06,
      "loss": 0.4026,
      "step": 2160
    },
    {
      "epoch": 0.868,
      "grad_norm": 1.4765625,
      "learning_rate": 1.0490994050457748e-06,
      "loss": 0.3893,
      "step": 2170
    },
    {
      "epoch": 0.872,
      "grad_norm": 3.59375,
      "learning_rate": 9.877165800058874e-07,
      "loss": 0.4078,
      "step": 2180
    },
    {
      "epoch": 0.876,
      "grad_norm": 1.75,
      "learning_rate": 9.280907174755916e-07,
      "loss": 0.373,
      "step": 2190
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.6796875,
      "learning_rate": 8.702334416349279e-07,
      "loss": 0.4193,
      "step": 2200
    },
    {
      "epoch": 0.884,
      "grad_norm": 1.5234375,
      "learning_rate": 8.141560318744601e-07,
      "loss": 0.3754,
      "step": 2210
    },
    {
      "epoch": 0.888,
      "grad_norm": 1.6484375,
      "learning_rate": 7.598694205963331e-07,
      "loss": 0.3652,
      "step": 2220
    },
    {
      "epoch": 0.892,
      "grad_norm": 1.53125,
      "learning_rate": 7.073841910829771e-07,
      "loss": 0.3934,
      "step": 2230
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.5234375,
      "learning_rate": 6.567105754338798e-07,
      "loss": 0.4051,
      "step": 2240
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.859375,
      "learning_rate": 6.078584525708175e-07,
      "loss": 0.3695,
      "step": 2250
    },
    {
      "epoch": 0.904,
      "grad_norm": 1.484375,
      "learning_rate": 5.608373463119354e-07,
      "loss": 0.4188,
      "step": 2260
    },
    {
      "epoch": 0.908,
      "grad_norm": 1.703125,
      "learning_rate": 5.156564235150686e-07,
      "loss": 0.3867,
      "step": 2270
    },
    {
      "epoch": 0.912,
      "grad_norm": 1.5625,
      "learning_rate": 4.723244922906356e-07,
      "loss": 0.4287,
      "step": 2280
    },
    {
      "epoch": 0.916,
      "grad_norm": 1.4765625,
      "learning_rate": 4.308500002844862e-07,
      "loss": 0.368,
      "step": 2290
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.4140625,
      "learning_rate": 3.912410330310157e-07,
      "loss": 0.3799,
      "step": 2300
    },
    {
      "epoch": 0.924,
      "grad_norm": 1.4765625,
      "learning_rate": 3.5350531237686723e-07,
      "loss": 0.3721,
      "step": 2310
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.59375,
      "learning_rate": 3.1765019497555617e-07,
      "loss": 0.3943,
      "step": 2320
    },
    {
      "epoch": 0.932,
      "grad_norm": 1.7109375,
      "learning_rate": 2.836826708532603e-07,
      "loss": 0.376,
      "step": 2330
    },
    {
      "epoch": 0.936,
      "grad_norm": 1.8125,
      "learning_rate": 2.516093620461124e-07,
      "loss": 0.398,
      "step": 2340
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.8046875,
      "learning_rate": 2.214365213092118e-07,
      "loss": 0.3855,
      "step": 2350
    },
    {
      "epoch": 0.944,
      "grad_norm": 1.890625,
      "learning_rate": 1.9317003089764365e-07,
      "loss": 0.4172,
      "step": 2360
    },
    {
      "epoch": 0.948,
      "grad_norm": 1.4375,
      "learning_rate": 1.668154014197243e-07,
      "loss": 0.425,
      "step": 2370
    },
    {
      "epoch": 0.952,
      "grad_norm": 1.453125,
      "learning_rate": 1.4237777076268723e-07,
      "loss": 0.4049,
      "step": 2380
    },
    {
      "epoch": 0.956,
      "grad_norm": 1.59375,
      "learning_rate": 1.1986190309104861e-07,
      "loss": 0.3668,
      "step": 2390
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.8984375,
      "learning_rate": 9.9272187917826e-08,
      "loss": 0.4242,
      "step": 2400
    },
    {
      "epoch": 0.964,
      "grad_norm": 1.4296875,
      "learning_rate": 8.061263924878604e-08,
      "loss": 0.4172,
      "step": 2410
    },
    {
      "epoch": 0.968,
      "grad_norm": 1.7578125,
      "learning_rate": 6.388689479991606e-08,
      "loss": 0.3926,
      "step": 2420
    },
    {
      "epoch": 0.972,
      "grad_norm": 1.6328125,
      "learning_rate": 4.9098215288235776e-08,
      "loss": 0.451,
      "step": 2430
    },
    {
      "epoch": 0.976,
      "grad_norm": 1.734375,
      "learning_rate": 3.6249483796116924e-08,
      "loss": 0.4084,
      "step": 2440
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.671875,
      "learning_rate": 2.5343205209225062e-08,
      "loss": 0.4176,
      "step": 2450
    },
    {
      "epoch": 0.984,
      "grad_norm": 1.4921875,
      "learning_rate": 1.6381505728176872e-08,
      "loss": 0.3951,
      "step": 2460
    },
    {
      "epoch": 0.988,
      "grad_norm": 1.328125,
      "learning_rate": 9.366132454046162e-09,
      "loss": 0.4207,
      "step": 2470
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.8046875,
      "learning_rate": 4.298453047749674e-09,
      "loss": 0.409,
      "step": 2480
    },
    {
      "epoch": 0.996,
      "grad_norm": 1.6640625,
      "learning_rate": 1.1794554634314558e-09,
      "loss": 0.4086,
      "step": 2490
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.8203125,
      "learning_rate": 9.74775584916543e-12,
      "loss": 0.401,
      "step": 2500
    }
  ],
  "logging_steps": 10,
  "max_steps": 2500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 2500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3916038848512e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}