| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9973474801061006, | |
| "eval_steps": 500, | |
| "global_step": 2826, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010610079575596816, | |
| "grad_norm": 4.634474754333496, | |
| "learning_rate": 1.5901060070671379e-07, | |
| "loss": 0.741, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.021220159151193633, | |
| "grad_norm": 2.9002726078033447, | |
| "learning_rate": 3.356890459363958e-07, | |
| "loss": 0.5551, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03183023872679045, | |
| "grad_norm": 4.242003917694092, | |
| "learning_rate": 5.123674911660778e-07, | |
| "loss": 0.6185, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.042440318302387266, | |
| "grad_norm": 3.8156638145446777, | |
| "learning_rate": 6.890459363957598e-07, | |
| "loss": 0.6358, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05305039787798409, | |
| "grad_norm": 3.047624349594116, | |
| "learning_rate": 8.657243816254418e-07, | |
| "loss": 0.5922, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0636604774535809, | |
| "grad_norm": 2.2943954467773438, | |
| "learning_rate": 1.0424028268551239e-06, | |
| "loss": 0.6282, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07427055702917772, | |
| "grad_norm": 2.831937551498413, | |
| "learning_rate": 1.2190812720848057e-06, | |
| "loss": 0.5836, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08488063660477453, | |
| "grad_norm": 3.941297769546509, | |
| "learning_rate": 1.3957597173144876e-06, | |
| "loss": 0.5836, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09549071618037135, | |
| "grad_norm": 2.4598379135131836, | |
| "learning_rate": 1.5724381625441699e-06, | |
| "loss": 0.4983, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.10610079575596817, | |
| "grad_norm": 2.533829927444458, | |
| "learning_rate": 1.7491166077738517e-06, | |
| "loss": 0.6057, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11671087533156499, | |
| "grad_norm": 2.412334442138672, | |
| "learning_rate": 1.925795053003534e-06, | |
| "loss": 0.5135, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1273209549071618, | |
| "grad_norm": 2.7505877017974854, | |
| "learning_rate": 2.1024734982332157e-06, | |
| "loss": 0.4844, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.13793103448275862, | |
| "grad_norm": 2.701307535171509, | |
| "learning_rate": 2.279151943462898e-06, | |
| "loss": 0.5386, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.14854111405835543, | |
| "grad_norm": 2.8261961936950684, | |
| "learning_rate": 2.45583038869258e-06, | |
| "loss": 0.4774, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.15915119363395225, | |
| "grad_norm": 2.4490256309509277, | |
| "learning_rate": 2.6325088339222617e-06, | |
| "loss": 0.5035, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.16976127320954906, | |
| "grad_norm": 2.418158769607544, | |
| "learning_rate": 2.8091872791519436e-06, | |
| "loss": 0.4897, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.18037135278514588, | |
| "grad_norm": 3.5972161293029785, | |
| "learning_rate": 2.985865724381626e-06, | |
| "loss": 0.5196, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1909814323607427, | |
| "grad_norm": 2.814927577972412, | |
| "learning_rate": 3.162544169611308e-06, | |
| "loss": 0.4791, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.20159151193633953, | |
| "grad_norm": 2.6151270866394043, | |
| "learning_rate": 3.3392226148409896e-06, | |
| "loss": 0.5024, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.21220159151193635, | |
| "grad_norm": 2.8331387042999268, | |
| "learning_rate": 3.5159010600706715e-06, | |
| "loss": 0.5781, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22281167108753316, | |
| "grad_norm": 2.433027744293213, | |
| "learning_rate": 3.6925795053003538e-06, | |
| "loss": 0.4186, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.23342175066312998, | |
| "grad_norm": 2.671696186065674, | |
| "learning_rate": 3.869257950530036e-06, | |
| "loss": 0.4819, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2440318302387268, | |
| "grad_norm": 2.5337982177734375, | |
| "learning_rate": 4.045936395759718e-06, | |
| "loss": 0.547, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2546419098143236, | |
| "grad_norm": 2.2034990787506104, | |
| "learning_rate": 4.222614840989399e-06, | |
| "loss": 0.5603, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.26525198938992045, | |
| "grad_norm": 2.2893121242523193, | |
| "learning_rate": 4.399293286219082e-06, | |
| "loss": 0.4483, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.27586206896551724, | |
| "grad_norm": 1.8757219314575195, | |
| "learning_rate": 4.575971731448763e-06, | |
| "loss": 0.5178, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.2864721485411141, | |
| "grad_norm": 2.3748602867126465, | |
| "learning_rate": 4.752650176678445e-06, | |
| "loss": 0.5264, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.29708222811671087, | |
| "grad_norm": 3.0481033325195312, | |
| "learning_rate": 4.929328621908128e-06, | |
| "loss": 0.5124, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 2.682847023010254, | |
| "learning_rate": 4.99993132201408e-06, | |
| "loss": 0.4977, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3183023872679045, | |
| "grad_norm": 2.472842216491699, | |
| "learning_rate": 4.9995116368759e-06, | |
| "loss": 0.5005, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.32891246684350134, | |
| "grad_norm": 2.582815647125244, | |
| "learning_rate": 4.998710485009401e-06, | |
| "loss": 0.4857, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3395225464190981, | |
| "grad_norm": 2.3572824001312256, | |
| "learning_rate": 4.99752798868358e-06, | |
| "loss": 0.4637, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.35013262599469497, | |
| "grad_norm": 2.3432295322418213, | |
| "learning_rate": 4.99596432836689e-06, | |
| "loss": 0.4775, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.36074270557029176, | |
| "grad_norm": 2.7486777305603027, | |
| "learning_rate": 4.994019742699705e-06, | |
| "loss": 0.5779, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3713527851458886, | |
| "grad_norm": 2.3831562995910645, | |
| "learning_rate": 4.991694528457891e-06, | |
| "loss": 0.5057, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3819628647214854, | |
| "grad_norm": 2.5414721965789795, | |
| "learning_rate": 4.988989040507518e-06, | |
| "loss": 0.5313, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3925729442970822, | |
| "grad_norm": 2.4140472412109375, | |
| "learning_rate": 4.985903691750697e-06, | |
| "loss": 0.4441, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.40318302387267907, | |
| "grad_norm": 2.4907593727111816, | |
| "learning_rate": 4.982438953062572e-06, | |
| "loss": 0.4778, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.41379310344827586, | |
| "grad_norm": 2.579932928085327, | |
| "learning_rate": 4.978595353219449e-06, | |
| "loss": 0.4848, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4244031830238727, | |
| "grad_norm": 2.5512266159057617, | |
| "learning_rate": 4.974373478818098e-06, | |
| "loss": 0.4891, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4350132625994695, | |
| "grad_norm": 2.3293063640594482, | |
| "learning_rate": 4.969773974186235e-06, | |
| "loss": 0.4954, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.44562334217506633, | |
| "grad_norm": 2.6347479820251465, | |
| "learning_rate": 4.964797541284175e-06, | |
| "loss": 0.5353, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.4562334217506631, | |
| "grad_norm": 2.7719151973724365, | |
| "learning_rate": 4.959444939597712e-06, | |
| "loss": 0.5726, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.46684350132625996, | |
| "grad_norm": 2.1757211685180664, | |
| "learning_rate": 4.953716986022204e-06, | |
| "loss": 0.5642, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.47745358090185674, | |
| "grad_norm": 2.432244300842285, | |
| "learning_rate": 4.947614554737904e-06, | |
| "loss": 0.4429, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4880636604774536, | |
| "grad_norm": 1.972844123840332, | |
| "learning_rate": 4.941138577076538e-06, | |
| "loss": 0.4683, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4986737400530504, | |
| "grad_norm": 2.484992742538452, | |
| "learning_rate": 4.934290041379182e-06, | |
| "loss": 0.4385, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5092838196286472, | |
| "grad_norm": 2.0424418449401855, | |
| "learning_rate": 4.92706999284541e-06, | |
| "loss": 0.4935, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.519893899204244, | |
| "grad_norm": 2.3754308223724365, | |
| "learning_rate": 4.9194795333737925e-06, | |
| "loss": 0.4548, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5305039787798409, | |
| "grad_norm": 3.0801432132720947, | |
| "learning_rate": 4.911519821393718e-06, | |
| "loss": 0.5486, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5411140583554377, | |
| "grad_norm": 2.2712507247924805, | |
| "learning_rate": 4.9031920716886035e-06, | |
| "loss": 0.5121, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5517241379310345, | |
| "grad_norm": 2.0000548362731934, | |
| "learning_rate": 4.894497555210499e-06, | |
| "loss": 0.4495, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5623342175066313, | |
| "grad_norm": 2.590303897857666, | |
| "learning_rate": 4.8854375988861134e-06, | |
| "loss": 0.5028, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5729442970822282, | |
| "grad_norm": 2.377298355102539, | |
| "learning_rate": 4.87601358541431e-06, | |
| "loss": 0.5193, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.583554376657825, | |
| "grad_norm": 2.966008186340332, | |
| "learning_rate": 4.8662269530550825e-06, | |
| "loss": 0.545, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5941644562334217, | |
| "grad_norm": 2.250293254852295, | |
| "learning_rate": 4.856079195410046e-06, | |
| "loss": 0.5219, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6047745358090185, | |
| "grad_norm": 2.437361240386963, | |
| "learning_rate": 4.845571861194501e-06, | |
| "loss": 0.4725, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 2.435994863510132, | |
| "learning_rate": 4.834706554001065e-06, | |
| "loss": 0.4232, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6259946949602122, | |
| "grad_norm": 2.705902099609375, | |
| "learning_rate": 4.823484932054937e-06, | |
| "loss": 0.4834, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.636604774535809, | |
| "grad_norm": 2.1471517086029053, | |
| "learning_rate": 4.811908707960832e-06, | |
| "loss": 0.5302, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.6472148541114059, | |
| "grad_norm": 2.0760443210601807, | |
| "learning_rate": 4.799979648441602e-06, | |
| "loss": 0.494, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6578249336870027, | |
| "grad_norm": 2.334944009780884, | |
| "learning_rate": 4.787699574068611e-06, | |
| "loss": 0.487, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6684350132625995, | |
| "grad_norm": 2.3444855213165283, | |
| "learning_rate": 4.775070358983881e-06, | |
| "loss": 0.4911, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6790450928381963, | |
| "grad_norm": 2.127737045288086, | |
| "learning_rate": 4.7620939306140696e-06, | |
| "loss": 0.4744, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6896551724137931, | |
| "grad_norm": 2.2132568359375, | |
| "learning_rate": 4.748772269376312e-06, | |
| "loss": 0.4789, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7002652519893899, | |
| "grad_norm": 1.9452372789382935, | |
| "learning_rate": 4.735107408375977e-06, | |
| "loss": 0.488, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7108753315649867, | |
| "grad_norm": 2.7268893718719482, | |
| "learning_rate": 4.721101433096381e-06, | |
| "loss": 0.4462, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.7214854111405835, | |
| "grad_norm": 2.1095452308654785, | |
| "learning_rate": 4.706756481080511e-06, | |
| "loss": 0.5087, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7320954907161804, | |
| "grad_norm": 2.278555154800415, | |
| "learning_rate": 4.692074741604795e-06, | |
| "loss": 0.5304, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.7427055702917772, | |
| "grad_norm": 2.455960512161255, | |
| "learning_rate": 4.677058455344989e-06, | |
| "loss": 0.5177, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.753315649867374, | |
| "grad_norm": 2.1136856079101562, | |
| "learning_rate": 4.661709914034209e-06, | |
| "loss": 0.4841, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.7639257294429708, | |
| "grad_norm": 2.296614646911621, | |
| "learning_rate": 4.646031460113175e-06, | |
| "loss": 0.4544, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.7745358090185677, | |
| "grad_norm": 1.8733782768249512, | |
| "learning_rate": 4.630025486372715e-06, | |
| "loss": 0.4715, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7851458885941645, | |
| "grad_norm": 2.526837110519409, | |
| "learning_rate": 4.613694435588589e-06, | |
| "loss": 0.4824, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.7957559681697612, | |
| "grad_norm": 2.2026150226593018, | |
| "learning_rate": 4.597040800148679e-06, | |
| "loss": 0.4852, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.8063660477453581, | |
| "grad_norm": 2.214277744293213, | |
| "learning_rate": 4.580067121672607e-06, | |
| "loss": 0.4134, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.8169761273209549, | |
| "grad_norm": 2.623305559158325, | |
| "learning_rate": 4.562775990623847e-06, | |
| "loss": 0.4493, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.8275862068965517, | |
| "grad_norm": 2.9433794021606445, | |
| "learning_rate": 4.5451700459143735e-06, | |
| "loss": 0.5255, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.8381962864721485, | |
| "grad_norm": 2.143739938735962, | |
| "learning_rate": 4.527251974501923e-06, | |
| "loss": 0.4503, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.8488063660477454, | |
| "grad_norm": 2.1592986583709717, | |
| "learning_rate": 4.509024510979917e-06, | |
| "loss": 0.4636, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.8594164456233422, | |
| "grad_norm": 2.2622759342193604, | |
| "learning_rate": 4.4904904371601176e-06, | |
| "loss": 0.4685, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.870026525198939, | |
| "grad_norm": 2.3408522605895996, | |
| "learning_rate": 4.4716525816480816e-06, | |
| "loss": 0.5248, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.8806366047745358, | |
| "grad_norm": 2.5351459980010986, | |
| "learning_rate": 4.4525138194114644e-06, | |
| "loss": 0.4747, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8912466843501327, | |
| "grad_norm": 2.4038591384887695, | |
| "learning_rate": 4.4330770713412555e-06, | |
| "loss": 0.4198, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.9018567639257294, | |
| "grad_norm": 2.2719292640686035, | |
| "learning_rate": 4.413345303805996e-06, | |
| "loss": 0.4545, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.9124668435013262, | |
| "grad_norm": 3.1209301948547363, | |
| "learning_rate": 4.393321528199072e-06, | |
| "loss": 0.5003, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 2.414945125579834, | |
| "learning_rate": 4.373008800479118e-06, | |
| "loss": 0.472, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.9336870026525199, | |
| "grad_norm": 2.21144437789917, | |
| "learning_rate": 4.352410220703629e-06, | |
| "loss": 0.4661, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.9442970822281167, | |
| "grad_norm": 2.210827589035034, | |
| "learning_rate": 4.331528932555844e-06, | |
| "loss": 0.4614, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.9549071618037135, | |
| "grad_norm": 2.403038740158081, | |
| "learning_rate": 4.3103681228649626e-06, | |
| "loss": 0.4623, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.9655172413793104, | |
| "grad_norm": 2.588114023208618, | |
| "learning_rate": 4.288931021119788e-06, | |
| "loss": 0.4902, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.9761273209549072, | |
| "grad_norm": 2.288691997528076, | |
| "learning_rate": 4.267220898975848e-06, | |
| "loss": 0.5047, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.986737400530504, | |
| "grad_norm": 2.2487804889678955, | |
| "learning_rate": 4.245241069756092e-06, | |
| "loss": 0.5358, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.9973474801061007, | |
| "grad_norm": 2.5266008377075195, | |
| "learning_rate": 4.222994887945219e-06, | |
| "loss": 0.4928, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.0074270557029177, | |
| "grad_norm": 2.5962352752685547, | |
| "learning_rate": 4.20048574867773e-06, | |
| "loss": 0.3963, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.0180371352785147, | |
| "grad_norm": 2.707613229751587, | |
| "learning_rate": 4.1777170872197725e-06, | |
| "loss": 0.3125, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.0286472148541115, | |
| "grad_norm": 2.4237964153289795, | |
| "learning_rate": 4.1546923784448646e-06, | |
| "loss": 0.3457, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.0392572944297083, | |
| "grad_norm": 1.6531928777694702, | |
| "learning_rate": 4.1314151363035705e-06, | |
| "loss": 0.3029, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.049867374005305, | |
| "grad_norm": 2.1669981479644775, | |
| "learning_rate": 4.1078889132872145e-06, | |
| "loss": 0.3289, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.0604774535809018, | |
| "grad_norm": 2.445012092590332, | |
| "learning_rate": 4.084117299885712e-06, | |
| "loss": 0.3234, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.0710875331564986, | |
| "grad_norm": 2.0615527629852295, | |
| "learning_rate": 4.060103924039599e-06, | |
| "loss": 0.3139, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.0816976127320954, | |
| "grad_norm": 1.990400791168213, | |
| "learning_rate": 4.035852450586352e-06, | |
| "loss": 0.3144, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.0923076923076924, | |
| "grad_norm": 2.5510122776031494, | |
| "learning_rate": 4.011366580701073e-06, | |
| "loss": 0.323, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.1029177718832892, | |
| "grad_norm": 2.462083101272583, | |
| "learning_rate": 3.9866500513316274e-06, | |
| "loss": 0.3694, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.113527851458886, | |
| "grad_norm": 2.4385085105895996, | |
| "learning_rate": 3.961706634628323e-06, | |
| "loss": 0.3351, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.1241379310344828, | |
| "grad_norm": 1.7553578615188599, | |
| "learning_rate": 3.936540137368222e-06, | |
| "loss": 0.3459, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.1347480106100796, | |
| "grad_norm": 2.513950824737549, | |
| "learning_rate": 3.911154400374159e-06, | |
| "loss": 0.3186, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.1453580901856764, | |
| "grad_norm": 2.6273515224456787, | |
| "learning_rate": 3.885553297928573e-06, | |
| "loss": 0.3333, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.1559681697612731, | |
| "grad_norm": 2.4155592918395996, | |
| "learning_rate": 3.859740737182222e-06, | |
| "loss": 0.3137, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.16657824933687, | |
| "grad_norm": 2.719611644744873, | |
| "learning_rate": 3.833720657557894e-06, | |
| "loss": 0.3426, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.1771883289124667, | |
| "grad_norm": 2.5729358196258545, | |
| "learning_rate": 3.807497030149181e-06, | |
| "loss": 0.3709, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.1877984084880637, | |
| "grad_norm": 1.9626141786575317, | |
| "learning_rate": 3.7810738571144257e-06, | |
| "loss": 0.329, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.1984084880636605, | |
| "grad_norm": 2.601951837539673, | |
| "learning_rate": 3.7544551710659296e-06, | |
| "loss": 0.305, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.2090185676392573, | |
| "grad_norm": 2.4118540287017822, | |
| "learning_rate": 3.7276450344545024e-06, | |
| "loss": 0.3449, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.219628647214854, | |
| "grad_norm": 2.5080604553222656, | |
| "learning_rate": 3.7006475389494723e-06, | |
| "loss": 0.3403, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.2302387267904509, | |
| "grad_norm": 2.6882951259613037, | |
| "learning_rate": 3.6734668048142273e-06, | |
| "loss": 0.3342, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.2408488063660477, | |
| "grad_norm": 2.3755247592926025, | |
| "learning_rate": 3.646106980277394e-06, | |
| "loss": 0.3589, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.2514588859416444, | |
| "grad_norm": 2.4138166904449463, | |
| "learning_rate": 3.618572240899748e-06, | |
| "loss": 0.3447, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.2620689655172415, | |
| "grad_norm": 2.6930105686187744, | |
| "learning_rate": 3.5908667889369603e-06, | |
| "loss": 0.3787, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.2726790450928382, | |
| "grad_norm": 2.732795476913452, | |
| "learning_rate": 3.5629948526982563e-06, | |
| "loss": 0.3376, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.283289124668435, | |
| "grad_norm": 1.8468087911605835, | |
| "learning_rate": 3.534960685901111e-06, | |
| "loss": 0.3461, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.2938992042440318, | |
| "grad_norm": 2.3408284187316895, | |
| "learning_rate": 3.506768567022062e-06, | |
| "loss": 0.3396, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.3045092838196286, | |
| "grad_norm": 2.7420434951782227, | |
| "learning_rate": 3.478422798643737e-06, | |
| "loss": 0.3364, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.3151193633952254, | |
| "grad_norm": 2.634403705596924, | |
| "learning_rate": 3.4499277067982177e-06, | |
| "loss": 0.3126, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.3257294429708222, | |
| "grad_norm": 2.4217336177825928, | |
| "learning_rate": 3.421287640306809e-06, | |
| "loss": 0.3092, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.3363395225464192, | |
| "grad_norm": 1.7107937335968018, | |
| "learning_rate": 3.3925069701163406e-06, | |
| "loss": 0.3374, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.346949602122016, | |
| "grad_norm": 2.1515822410583496, | |
| "learning_rate": 3.363590088632085e-06, | |
| "loss": 0.3436, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.3575596816976128, | |
| "grad_norm": 2.0105717182159424, | |
| "learning_rate": 3.334541409047408e-06, | |
| "loss": 0.3283, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.3681697612732096, | |
| "grad_norm": 1.8952791690826416, | |
| "learning_rate": 3.3053653646702422e-06, | |
| "loss": 0.358, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.3787798408488063, | |
| "grad_norm": 1.8639928102493286, | |
| "learning_rate": 3.276066408246487e-06, | |
| "loss": 0.3084, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.3893899204244031, | |
| "grad_norm": 2.563251256942749, | |
| "learning_rate": 3.2466490112804484e-06, | |
| "loss": 0.3508, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.214616060256958, | |
| "learning_rate": 3.217117663352417e-06, | |
| "loss": 0.3215, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.410610079575597, | |
| "grad_norm": 1.793468952178955, | |
| "learning_rate": 3.187476871433478e-06, | |
| "loss": 0.3193, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.4212201591511937, | |
| "grad_norm": 2.204789638519287, | |
| "learning_rate": 3.1577311591976766e-06, | |
| "loss": 0.3019, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.4318302387267905, | |
| "grad_norm": 2.307568311691284, | |
| "learning_rate": 3.1278850663316307e-06, | |
| "loss": 0.3099, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.4424403183023873, | |
| "grad_norm": 2.485848903656006, | |
| "learning_rate": 3.0979431478416987e-06, | |
| "loss": 0.3085, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.453050397877984, | |
| "grad_norm": 1.953053593635559, | |
| "learning_rate": 3.067909973358811e-06, | |
| "loss": 0.3211, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.4636604774535809, | |
| "grad_norm": 2.2350101470947266, | |
| "learning_rate": 3.0377901264410673e-06, | |
| "loss": 0.3329, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.4742705570291776, | |
| "grad_norm": 2.542452335357666, | |
| "learning_rate": 3.0075882038742133e-06, | |
| "loss": 0.3376, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.4848806366047747, | |
| "grad_norm": 2.3203530311584473, | |
| "learning_rate": 2.9773088149700923e-06, | |
| "loss": 0.2896, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.4954907161803712, | |
| "grad_norm": 1.9708584547042847, | |
| "learning_rate": 2.9469565808631888e-06, | |
| "loss": 0.299, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.5061007957559682, | |
| "grad_norm": 2.63698148727417, | |
| "learning_rate": 2.9165361338053683e-06, | |
| "loss": 0.3484, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.516710875331565, | |
| "grad_norm": 2.091648578643799, | |
| "learning_rate": 2.886052116458918e-06, | |
| "loss": 0.3316, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.5273209549071618, | |
| "grad_norm": 1.955355167388916, | |
| "learning_rate": 2.8555091811880004e-06, | |
| "loss": 0.328, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.5379310344827586, | |
| "grad_norm": 1.6724951267242432, | |
| "learning_rate": 2.8249119893486252e-06, | |
| "loss": 0.3215, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.5485411140583554, | |
| "grad_norm": 2.1872570514678955, | |
| "learning_rate": 2.7942652105772516e-06, | |
| "loss": 0.3118, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.5591511936339524, | |
| "grad_norm": 3.0710208415985107, | |
| "learning_rate": 2.7635735220781214e-06, | |
| "loss": 0.2973, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.569761273209549, | |
| "grad_norm": 2.357663631439209, | |
| "learning_rate": 2.7328416079094412e-06, | |
| "loss": 0.3423, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.580371352785146, | |
| "grad_norm": 2.2559144496917725, | |
| "learning_rate": 2.7020741582685217e-06, | |
| "loss": 0.3211, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.5909814323607427, | |
| "grad_norm": 2.0730817317962646, | |
| "learning_rate": 2.6712758687759706e-06, | |
| "loss": 0.2733, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.6015915119363395, | |
| "grad_norm": 2.6119141578674316, | |
| "learning_rate": 2.6404514397590657e-06, | |
| "loss": 0.338, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.6122015915119363, | |
| "grad_norm": 2.315875768661499, | |
| "learning_rate": 2.6096055755344113e-06, | |
| "loss": 0.3124, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.622811671087533, | |
| "grad_norm": 2.2880892753601074, | |
| "learning_rate": 2.578742983689973e-06, | |
| "loss": 0.3538, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.6334217506631301, | |
| "grad_norm": 2.2615041732788086, | |
| "learning_rate": 2.547868374366631e-06, | |
| "loss": 0.3353, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.6440318302387267, | |
| "grad_norm": 1.9062315225601196, | |
| "learning_rate": 2.5169864595393295e-06, | |
| "loss": 0.302, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.6546419098143237, | |
| "grad_norm": 2.7016942501068115, | |
| "learning_rate": 2.4861019522979537e-06, | |
| "loss": 0.3124, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.6652519893899205, | |
| "grad_norm": 2.4618184566497803, | |
| "learning_rate": 2.455219566128034e-06, | |
| "loss": 0.3497, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.6758620689655173, | |
| "grad_norm": 2.8924951553344727, | |
| "learning_rate": 2.4243440141913905e-06, | |
| "loss": 0.3233, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.686472148541114, | |
| "grad_norm": 2.32255482673645, | |
| "learning_rate": 2.393480008606825e-06, | |
| "loss": 0.3067, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.6970822281167108, | |
| "grad_norm": 1.8984359502792358, | |
| "learning_rate": 2.3626322597309774e-06, | |
| "loss": 0.2893, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.7076923076923078, | |
| "grad_norm": 1.8360289335250854, | |
| "learning_rate": 2.331805475439445e-06, | |
| "loss": 0.2825, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.7183023872679044, | |
| "grad_norm": 2.331998109817505, | |
| "learning_rate": 2.3010043604082824e-06, | |
| "loss": 0.3379, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.7289124668435014, | |
| "grad_norm": 2.3304574489593506, | |
| "learning_rate": 2.2702336153959925e-06, | |
| "loss": 0.301, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.739522546419098, | |
| "grad_norm": 2.534090518951416, | |
| "learning_rate": 2.2394979365261134e-06, | |
| "loss": 0.404, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.750132625994695, | |
| "grad_norm": 2.273122549057007, | |
| "learning_rate": 2.208802014570507e-06, | |
| "loss": 0.3242, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.7607427055702918, | |
| "grad_norm": 1.8859643936157227, | |
| "learning_rate": 2.1781505342334775e-06, | |
| "loss": 0.3152, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.7713527851458886, | |
| "grad_norm": 2.567715644836426, | |
| "learning_rate": 2.147548173436805e-06, | |
| "loss": 0.3302, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.7819628647214856, | |
| "grad_norm": 2.7930519580841064, | |
| "learning_rate": 2.116999602605814e-06, | |
| "loss": 0.293, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.7925729442970821, | |
| "grad_norm": 2.646296262741089, | |
| "learning_rate": 2.086509483956594e-06, | |
| "loss": 0.2683, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.8031830238726791, | |
| "grad_norm": 2.3010053634643555, | |
| "learning_rate": 2.056082470784469e-06, | |
| "loss": 0.313, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.8137931034482757, | |
| "grad_norm": 2.3864669799804688, | |
| "learning_rate": 2.0257232067538213e-06, | |
| "loss": 0.262, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.8244031830238727, | |
| "grad_norm": 2.63028883934021, | |
| "learning_rate": 1.9954363251894007e-06, | |
| "loss": 0.3457, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.8350132625994695, | |
| "grad_norm": 2.0011484622955322, | |
| "learning_rate": 1.9652264483691933e-06, | |
| "loss": 0.2739, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.8456233421750663, | |
| "grad_norm": 2.6818690299987793, | |
| "learning_rate": 1.9350981868189944e-06, | |
| "loss": 0.3109, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.856233421750663, | |
| "grad_norm": 2.6978225708007812, | |
| "learning_rate": 1.9050561386087618e-06, | |
| "loss": 0.3269, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.8668435013262599, | |
| "grad_norm": 2.578031301498413, | |
| "learning_rate": 1.8751048886508711e-06, | |
| "loss": 0.3617, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.8774535809018569, | |
| "grad_norm": 2.5525052547454834, | |
| "learning_rate": 1.8452490080003888e-06, | |
| "loss": 0.3228, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.8880636604774534, | |
| "grad_norm": 2.1095635890960693, | |
| "learning_rate": 1.8154930531574521e-06, | |
| "loss": 0.2857, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.8986737400530505, | |
| "grad_norm": 2.3965845108032227, | |
| "learning_rate": 1.785841565371868e-06, | |
| "loss": 0.3622, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.9092838196286472, | |
| "grad_norm": 2.293715238571167, | |
| "learning_rate": 1.7562990699500482e-06, | |
| "loss": 0.3031, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.919893899204244, | |
| "grad_norm": 2.026015281677246, | |
| "learning_rate": 1.7268700755643708e-06, | |
| "loss": 0.3019, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.9305039787798408, | |
| "grad_norm": 1.7175791263580322, | |
| "learning_rate": 1.6975590735650812e-06, | |
| "loss": 0.3047, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.9411140583554376, | |
| "grad_norm": 2.0024490356445312, | |
| "learning_rate": 1.668370537294841e-06, | |
| "loss": 0.3048, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.9517241379310346, | |
| "grad_norm": 2.8226239681243896, | |
| "learning_rate": 1.6393089214060204e-06, | |
| "loss": 0.3205, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.9623342175066312, | |
| "grad_norm": 1.9452221393585205, | |
| "learning_rate": 1.6103786611808414e-06, | |
| "loss": 0.321, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.9729442970822282, | |
| "grad_norm": 2.304274320602417, | |
| "learning_rate": 1.5815841718544884e-06, | |
| "loss": 0.2954, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.983554376657825, | |
| "grad_norm": 2.502206802368164, | |
| "learning_rate": 1.5529298479412636e-06, | |
| "loss": 0.2945, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.9941644562334218, | |
| "grad_norm": 2.5796189308166504, | |
| "learning_rate": 1.524420062563912e-06, | |
| "loss": 0.3291, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.004244031830239, | |
| "grad_norm": 1.9198871850967407, | |
| "learning_rate": 1.4960591667862163e-06, | |
| "loss": 0.234, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.0148541114058354, | |
| "grad_norm": 1.7082706689834595, | |
| "learning_rate": 1.4678514889489464e-06, | |
| "loss": 0.1943, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.0254641909814324, | |
| "grad_norm": 1.8571817874908447, | |
| "learning_rate": 1.4398013340092864e-06, | |
| "loss": 0.1911, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.0360742705570294, | |
| "grad_norm": 2.454561233520508, | |
| "learning_rate": 1.4119129828838275e-06, | |
| "loss": 0.1895, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.046684350132626, | |
| "grad_norm": 2.3714683055877686, | |
| "learning_rate": 1.384190691795226e-06, | |
| "loss": 0.2177, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.057294429708223, | |
| "grad_norm": 2.1356313228607178, | |
| "learning_rate": 1.3566386916226373e-06, | |
| "loss": 0.2252, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.0679045092838195, | |
| "grad_norm": 2.446906089782715, | |
| "learning_rate": 1.3292611872560134e-06, | |
| "loss": 0.1982, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.0785145888594165, | |
| "grad_norm": 2.1040875911712646, | |
| "learning_rate": 1.302062356954365e-06, | |
| "loss": 0.1696, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.089124668435013, | |
| "grad_norm": 2.220742702484131, | |
| "learning_rate": 1.2750463517080922e-06, | |
| "loss": 0.1936, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.09973474801061, | |
| "grad_norm": 2.7784054279327393, | |
| "learning_rate": 1.2482172946054753e-06, | |
| "loss": 0.1604, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.110344827586207, | |
| "grad_norm": 2.0539498329162598, | |
| "learning_rate": 1.2215792802034187e-06, | |
| "loss": 0.2069, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.1209549071618037, | |
| "grad_norm": 1.8337138891220093, | |
| "learning_rate": 1.1951363739025618e-06, | |
| "loss": 0.1964, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.1315649867374007, | |
| "grad_norm": 1.7631642818450928, | |
| "learning_rate": 1.168892611326827e-06, | |
| "loss": 0.1871, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.1421750663129973, | |
| "grad_norm": 2.386589527130127, | |
| "learning_rate": 1.1428519977075136e-06, | |
| "loss": 0.2595, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.1527851458885943, | |
| "grad_norm": 2.553382635116577, | |
| "learning_rate": 1.1170185072720434e-06, | |
| "loss": 0.185, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.163395225464191, | |
| "grad_norm": 2.870973825454712, | |
| "learning_rate": 1.091396082637419e-06, | |
| "loss": 0.228, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.174005305039788, | |
| "grad_norm": 2.643745183944702, | |
| "learning_rate": 1.065988634208516e-06, | |
| "loss": 0.2098, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.184615384615385, | |
| "grad_norm": 2.369596481323242, | |
| "learning_rate": 1.0408000395812961e-06, | |
| "loss": 0.1982, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.1952254641909814, | |
| "grad_norm": 2.1093883514404297, | |
| "learning_rate": 1.0158341429510194e-06, | |
| "loss": 0.1844, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.2058355437665784, | |
| "grad_norm": 1.951935052871704, | |
| "learning_rate": 9.910947545255523e-07, | |
| "loss": 0.1654, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.216445623342175, | |
| "grad_norm": 2.230781078338623, | |
| "learning_rate": 9.665856499438744e-07, | |
| "loss": 0.2037, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.227055702917772, | |
| "grad_norm": 2.6240904331207275, | |
| "learning_rate": 9.423105696998491e-07, | |
| "loss": 0.2087, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.2376657824933686, | |
| "grad_norm": 1.712857723236084, | |
| "learning_rate": 9.182732185713633e-07, | |
| "loss": 0.2105, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.2482758620689656, | |
| "grad_norm": 2.036086082458496, | |
| "learning_rate": 8.94477265054918e-07, | |
| "loss": 0.2186, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.2588859416445626, | |
| "grad_norm": 2.3545398712158203, | |
| "learning_rate": 8.709263408057522e-07, | |
| "loss": 0.1879, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.269496021220159, | |
| "grad_norm": 1.9098992347717285, | |
| "learning_rate": 8.476240400835972e-07, | |
| "loss": 0.2177, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.280106100795756, | |
| "grad_norm": 2.107959270477295, | |
| "learning_rate": 8.245739192041311e-07, | |
| "loss": 0.165, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.2907161803713527, | |
| "grad_norm": 2.550719976425171, | |
| "learning_rate": 8.017794959962225e-07, | |
| "loss": 0.2018, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.3013262599469497, | |
| "grad_norm": 2.354701280593872, | |
| "learning_rate": 7.792442492650587e-07, | |
| "loss": 0.1955, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.3119363395225463, | |
| "grad_norm": 2.3547091484069824, | |
| "learning_rate": 7.569716182612177e-07, | |
| "loss": 0.1976, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.3225464190981433, | |
| "grad_norm": 1.4048022031784058, | |
| "learning_rate": 7.349650021557839e-07, | |
| "loss": 0.1685, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.33315649867374, | |
| "grad_norm": 2.568500280380249, | |
| "learning_rate": 7.132277595215773e-07, | |
| "loss": 0.1519, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.343766578249337, | |
| "grad_norm": 2.205993413925171, | |
| "learning_rate": 6.917632078205805e-07, | |
| "loss": 0.1573, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.3543766578249334, | |
| "grad_norm": 2.067505121231079, | |
| "learning_rate": 6.705746228976387e-07, | |
| "loss": 0.184, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.3649867374005304, | |
| "grad_norm": 2.4360201358795166, | |
| "learning_rate": 6.496652384805125e-07, | |
| "loss": 0.1968, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.3755968169761275, | |
| "grad_norm": 2.042179584503174, | |
| "learning_rate": 6.290382456863584e-07, | |
| "loss": 0.1846, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.386206896551724, | |
| "grad_norm": 2.849271535873413, | |
| "learning_rate": 6.086967925347075e-07, | |
| "loss": 0.1858, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.396816976127321, | |
| "grad_norm": 2.0765082836151123, | |
| "learning_rate": 5.88643983467033e-07, | |
| "loss": 0.1837, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.4074270557029176, | |
| "grad_norm": 1.9958840608596802, | |
| "learning_rate": 5.688828788729547e-07, | |
| "loss": 0.1659, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.4180371352785146, | |
| "grad_norm": 2.253602981567383, | |
| "learning_rate": 5.494164946231747e-07, | |
| "loss": 0.2095, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.428647214854111, | |
| "grad_norm": 1.5552992820739746, | |
| "learning_rate": 5.302478016092075e-07, | |
| "loss": 0.1862, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.439257294429708, | |
| "grad_norm": 2.721445322036743, | |
| "learning_rate": 5.113797252899728e-07, | |
| "loss": 0.2085, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.449867374005305, | |
| "grad_norm": 2.3488707542419434, | |
| "learning_rate": 4.928151452453184e-07, | |
| "loss": 0.1914, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.4604774535809018, | |
| "grad_norm": 2.49068021774292, | |
| "learning_rate": 4.745568947365542e-07, | |
| "loss": 0.1718, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.4710875331564988, | |
| "grad_norm": 1.4638549089431763, | |
| "learning_rate": 4.5660776027404654e-07, | |
| "loss": 0.1669, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.4816976127320953, | |
| "grad_norm": 2.288776159286499, | |
| "learning_rate": 4.389704811919507e-07, | |
| "loss": 0.1731, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.4923076923076923, | |
| "grad_norm": 2.385162115097046, | |
| "learning_rate": 4.216477492301455e-07, | |
| "loss": 0.1802, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.502917771883289, | |
| "grad_norm": 2.0100815296173096, | |
| "learning_rate": 4.0464220812342526e-07, | |
| "loss": 0.2232, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.513527851458886, | |
| "grad_norm": 1.8439091444015503, | |
| "learning_rate": 3.87956453198027e-07, | |
| "loss": 0.1432, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.524137931034483, | |
| "grad_norm": 2.3093338012695312, | |
| "learning_rate": 3.715930309755389e-07, | |
| "loss": 0.1834, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.5347480106100795, | |
| "grad_norm": 2.3250088691711426, | |
| "learning_rate": 3.5555443878425635e-07, | |
| "loss": 0.2123, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.5453580901856765, | |
| "grad_norm": 1.8003133535385132, | |
| "learning_rate": 3.398431243780531e-07, | |
| "loss": 0.2034, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.555968169761273, | |
| "grad_norm": 2.8948135375976562, | |
| "learning_rate": 3.2446148556281117e-07, | |
| "loss": 0.1778, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.56657824933687, | |
| "grad_norm": 1.8556360006332397, | |
| "learning_rate": 3.0941186983047543e-07, | |
| "loss": 0.1892, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.5771883289124666, | |
| "grad_norm": 2.771932363510132, | |
| "learning_rate": 2.9469657400078925e-07, | |
| "loss": 0.1935, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.5877984084880636, | |
| "grad_norm": 2.5325114727020264, | |
| "learning_rate": 2.8031784387076186e-07, | |
| "loss": 0.1858, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.5984084880636606, | |
| "grad_norm": 2.4069302082061768, | |
| "learning_rate": 2.6627787387191934e-07, | |
| "loss": 0.2118, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.609018567639257, | |
| "grad_norm": 2.053656816482544, | |
| "learning_rate": 2.5257880673540376e-07, | |
| "loss": 0.1929, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.6196286472148542, | |
| "grad_norm": 1.8820626735687256, | |
| "learning_rate": 2.392227331649527e-07, | |
| "loss": 0.1745, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.630238726790451, | |
| "grad_norm": 1.9418586492538452, | |
| "learning_rate": 2.2621169151782417e-07, | |
| "loss": 0.1823, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.640848806366048, | |
| "grad_norm": 2.519037961959839, | |
| "learning_rate": 2.1354766749371093e-07, | |
| "loss": 0.2037, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.6514588859416444, | |
| "grad_norm": 2.010211944580078, | |
| "learning_rate": 2.0123259383169031e-07, | |
| "loss": 0.2196, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.6620689655172414, | |
| "grad_norm": 1.9838532209396362, | |
| "learning_rate": 1.8926835001525257e-07, | |
| "loss": 0.1848, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.6726790450928384, | |
| "grad_norm": 2.3488149642944336, | |
| "learning_rate": 1.776567619854655e-07, | |
| "loss": 0.1823, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.683289124668435, | |
| "grad_norm": 2.839651584625244, | |
| "learning_rate": 1.6639960186230293e-07, | |
| "loss": 0.2039, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.693899204244032, | |
| "grad_norm": 2.050480842590332, | |
| "learning_rate": 1.5549858767419018e-07, | |
| "loss": 0.1796, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.7045092838196285, | |
| "grad_norm": 1.2738044261932373, | |
| "learning_rate": 1.449553830958053e-07, | |
| "loss": 0.1893, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.7151193633952255, | |
| "grad_norm": 1.8912787437438965, | |
| "learning_rate": 1.347715971941746e-07, | |
| "loss": 0.1947, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.725729442970822, | |
| "grad_norm": 1.8385730981826782, | |
| "learning_rate": 1.2494878418310234e-07, | |
| "loss": 0.1744, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.736339522546419, | |
| "grad_norm": 2.1071712970733643, | |
| "learning_rate": 1.1548844318597208e-07, | |
| "loss": 0.2351, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.746949602122016, | |
| "grad_norm": 2.054392099380493, | |
| "learning_rate": 1.0639201800695553e-07, | |
| "loss": 0.2245, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.7575596816976127, | |
| "grad_norm": 1.656562328338623, | |
| "learning_rate": 9.76608969106646e-08, | |
| "loss": 0.2014, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.7681697612732097, | |
| "grad_norm": 2.6887638568878174, | |
| "learning_rate": 8.929641241027937e-08, | |
| "loss": 0.1824, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.7787798408488062, | |
| "grad_norm": 2.4606659412384033, | |
| "learning_rate": 8.129984106418354e-08, | |
| "loss": 0.1706, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.7893899204244033, | |
| "grad_norm": 2.5548455715179443, | |
| "learning_rate": 7.3672403281142e-08, | |
| "loss": 0.2195, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.7952167987823486, | |
| "learning_rate": 6.641526313404534e-08, | |
| "loss": 0.1748, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.810610079575597, | |
| "grad_norm": 2.376830816268921, | |
| "learning_rate": 5.952952818225416e-08, | |
| "loss": 0.2061, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.821220159151194, | |
| "grad_norm": 1.7183632850646973, | |
| "learning_rate": 5.3016249302565436e-08, | |
| "loss": 0.1742, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.8318302387267904, | |
| "grad_norm": 2.11011004447937, | |
| "learning_rate": 4.6876420528833014e-08, | |
| "loss": 0.2082, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.8424403183023874, | |
| "grad_norm": 1.8799868822097778, | |
| "learning_rate": 4.111097890026089e-08, | |
| "loss": 0.1805, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.853050397877984, | |
| "grad_norm": 2.5171291828155518, | |
| "learning_rate": 3.5720804318395976e-08, | |
| "loss": 0.2058, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.863660477453581, | |
| "grad_norm": 2.142263650894165, | |
| "learning_rate": 3.0706719412839926e-08, | |
| "loss": 0.2027, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.8742705570291776, | |
| "grad_norm": 2.2124040126800537, | |
| "learning_rate": 2.6069489415703197e-08, | |
| "loss": 0.1941, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.8848806366047746, | |
| "grad_norm": 2.033259153366089, | |
| "learning_rate": 2.18098220448168e-08, | |
| "loss": 0.2029, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.8954907161803716, | |
| "grad_norm": 2.416912794113159, | |
| "learning_rate": 1.7928367395725066e-08, | |
| "loss": 0.2062, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.906100795755968, | |
| "grad_norm": 2.193751096725464, | |
| "learning_rate": 1.442571784246699e-08, | |
| "loss": 0.1873, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.916710875331565, | |
| "grad_norm": 1.5729731321334839, | |
| "learning_rate": 1.1302407947173522e-08, | |
| "loss": 0.1653, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.9273209549071617, | |
| "grad_norm": 1.7562044858932495, | |
| "learning_rate": 8.558914378481996e-09, | |
| "loss": 0.1743, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.9379310344827587, | |
| "grad_norm": 2.183967351913452, | |
| "learning_rate": 6.195655838790726e-09, | |
| "loss": 0.1821, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.9485411140583553, | |
| "grad_norm": 1.9312433004379272, | |
| "learning_rate": 4.212993000356491e-09, | |
| "loss": 0.1954, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.9591511936339523, | |
| "grad_norm": 2.2055087089538574, | |
| "learning_rate": 2.611228450250802e-09, | |
| "loss": 0.1925, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.9697612732095493, | |
| "grad_norm": 1.6606404781341553, | |
| "learning_rate": 1.3906066441798927e-09, | |
| "loss": 0.1805, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.980371352785146, | |
| "grad_norm": 2.594404458999634, | |
| "learning_rate": 5.513138691767839e-10, | |
| "loss": 0.2084, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.9909814323607424, | |
| "grad_norm": 2.007861375808716, | |
| "learning_rate": 9.347821517069477e-11, | |
| "loss": 0.2115, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.9973474801061006, | |
| "step": 2826, | |
| "total_flos": 1.0915292825780224e+17, | |
| "train_loss": 0.34044326600333263, | |
| "train_runtime": 16671.2674, | |
| "train_samples_per_second": 2.713, | |
| "train_steps_per_second": 0.17 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2826, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 943, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0915292825780224e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |