{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9973474801061006, "eval_steps": 500, "global_step": 2826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010610079575596816, "grad_norm": 4.634474754333496, "learning_rate": 1.5901060070671379e-07, "loss": 0.741, "step": 10 }, { "epoch": 0.021220159151193633, "grad_norm": 2.9002726078033447, "learning_rate": 3.356890459363958e-07, "loss": 0.5551, "step": 20 }, { "epoch": 0.03183023872679045, "grad_norm": 4.242003917694092, "learning_rate": 5.123674911660778e-07, "loss": 0.6185, "step": 30 }, { "epoch": 0.042440318302387266, "grad_norm": 3.8156638145446777, "learning_rate": 6.890459363957598e-07, "loss": 0.6358, "step": 40 }, { "epoch": 0.05305039787798409, "grad_norm": 3.047624349594116, "learning_rate": 8.657243816254418e-07, "loss": 0.5922, "step": 50 }, { "epoch": 0.0636604774535809, "grad_norm": 2.2943954467773438, "learning_rate": 1.0424028268551239e-06, "loss": 0.6282, "step": 60 }, { "epoch": 0.07427055702917772, "grad_norm": 2.831937551498413, "learning_rate": 1.2190812720848057e-06, "loss": 0.5836, "step": 70 }, { "epoch": 0.08488063660477453, "grad_norm": 3.941297769546509, "learning_rate": 1.3957597173144876e-06, "loss": 0.5836, "step": 80 }, { "epoch": 0.09549071618037135, "grad_norm": 2.4598379135131836, "learning_rate": 1.5724381625441699e-06, "loss": 0.4983, "step": 90 }, { "epoch": 0.10610079575596817, "grad_norm": 2.533829927444458, "learning_rate": 1.7491166077738517e-06, "loss": 0.6057, "step": 100 }, { "epoch": 0.11671087533156499, "grad_norm": 2.412334442138672, "learning_rate": 1.925795053003534e-06, "loss": 0.5135, "step": 110 }, { "epoch": 0.1273209549071618, "grad_norm": 2.7505877017974854, "learning_rate": 2.1024734982332157e-06, "loss": 0.4844, "step": 120 }, { "epoch": 0.13793103448275862, "grad_norm": 2.701307535171509, "learning_rate": 2.279151943462898e-06, "loss": 0.5386, "step": 130 }, { "epoch": 0.14854111405835543, "grad_norm": 2.8261961936950684, "learning_rate": 2.45583038869258e-06, "loss": 0.4774, "step": 140 }, { "epoch": 0.15915119363395225, "grad_norm": 2.4490256309509277, "learning_rate": 2.6325088339222617e-06, "loss": 0.5035, "step": 150 }, { "epoch": 0.16976127320954906, "grad_norm": 2.418158769607544, "learning_rate": 2.8091872791519436e-06, "loss": 0.4897, "step": 160 }, { "epoch": 0.18037135278514588, "grad_norm": 3.5972161293029785, "learning_rate": 2.985865724381626e-06, "loss": 0.5196, "step": 170 }, { "epoch": 0.1909814323607427, "grad_norm": 2.814927577972412, "learning_rate": 3.162544169611308e-06, "loss": 0.4791, "step": 180 }, { "epoch": 0.20159151193633953, "grad_norm": 2.6151270866394043, "learning_rate": 3.3392226148409896e-06, "loss": 0.5024, "step": 190 }, { "epoch": 0.21220159151193635, "grad_norm": 2.8331387042999268, "learning_rate": 3.5159010600706715e-06, "loss": 0.5781, "step": 200 }, { "epoch": 0.22281167108753316, "grad_norm": 2.433027744293213, "learning_rate": 3.6925795053003538e-06, "loss": 0.4186, "step": 210 }, { "epoch": 0.23342175066312998, "grad_norm": 2.671696186065674, "learning_rate": 3.869257950530036e-06, "loss": 0.4819, "step": 220 }, { "epoch": 0.2440318302387268, "grad_norm": 2.5337982177734375, "learning_rate": 4.045936395759718e-06, "loss": 0.547, "step": 230 }, { "epoch": 0.2546419098143236, "grad_norm": 2.2034990787506104, "learning_rate": 4.222614840989399e-06, "loss": 0.5603, "step": 240 }, { "epoch": 0.26525198938992045, "grad_norm": 2.2893121242523193, "learning_rate": 4.399293286219082e-06, "loss": 0.4483, "step": 250 }, { "epoch": 0.27586206896551724, "grad_norm": 1.8757219314575195, "learning_rate": 4.575971731448763e-06, "loss": 0.5178, "step": 260 }, { "epoch": 0.2864721485411141, "grad_norm": 2.3748602867126465, "learning_rate": 4.752650176678445e-06, "loss": 0.5264, "step": 270 }, { "epoch": 0.29708222811671087, "grad_norm": 3.0481033325195312, "learning_rate": 4.929328621908128e-06, "loss": 0.5124, "step": 280 }, { "epoch": 0.3076923076923077, "grad_norm": 2.682847023010254, "learning_rate": 4.99993132201408e-06, "loss": 0.4977, "step": 290 }, { "epoch": 0.3183023872679045, "grad_norm": 2.472842216491699, "learning_rate": 4.9995116368759e-06, "loss": 0.5005, "step": 300 }, { "epoch": 0.32891246684350134, "grad_norm": 2.582815647125244, "learning_rate": 4.998710485009401e-06, "loss": 0.4857, "step": 310 }, { "epoch": 0.3395225464190981, "grad_norm": 2.3572824001312256, "learning_rate": 4.99752798868358e-06, "loss": 0.4637, "step": 320 }, { "epoch": 0.35013262599469497, "grad_norm": 2.3432295322418213, "learning_rate": 4.99596432836689e-06, "loss": 0.4775, "step": 330 }, { "epoch": 0.36074270557029176, "grad_norm": 2.7486777305603027, "learning_rate": 4.994019742699705e-06, "loss": 0.5779, "step": 340 }, { "epoch": 0.3713527851458886, "grad_norm": 2.3831562995910645, "learning_rate": 4.991694528457891e-06, "loss": 0.5057, "step": 350 }, { "epoch": 0.3819628647214854, "grad_norm": 2.5414721965789795, "learning_rate": 4.988989040507518e-06, "loss": 0.5313, "step": 360 }, { "epoch": 0.3925729442970822, "grad_norm": 2.4140472412109375, "learning_rate": 4.985903691750697e-06, "loss": 0.4441, "step": 370 }, { "epoch": 0.40318302387267907, "grad_norm": 2.4907593727111816, "learning_rate": 4.982438953062572e-06, "loss": 0.4778, "step": 380 }, { "epoch": 0.41379310344827586, "grad_norm": 2.579932928085327, "learning_rate": 4.978595353219449e-06, "loss": 0.4848, "step": 390 }, { "epoch": 0.4244031830238727, "grad_norm": 2.5512266159057617, "learning_rate": 4.974373478818098e-06, "loss": 0.4891, "step": 400 }, { "epoch": 0.4350132625994695, "grad_norm": 2.3293063640594482, "learning_rate": 4.969773974186235e-06, "loss": 0.4954, "step": 410 }, { "epoch": 0.44562334217506633, "grad_norm": 2.6347479820251465, "learning_rate": 4.964797541284175e-06, "loss": 0.5353, "step": 420 }, { "epoch": 0.4562334217506631, "grad_norm": 2.7719151973724365, "learning_rate": 4.959444939597712e-06, "loss": 0.5726, "step": 430 }, { "epoch": 0.46684350132625996, "grad_norm": 2.1757211685180664, "learning_rate": 4.953716986022204e-06, "loss": 0.5642, "step": 440 }, { "epoch": 0.47745358090185674, "grad_norm": 2.432244300842285, "learning_rate": 4.947614554737904e-06, "loss": 0.4429, "step": 450 }, { "epoch": 0.4880636604774536, "grad_norm": 1.972844123840332, "learning_rate": 4.941138577076538e-06, "loss": 0.4683, "step": 460 }, { "epoch": 0.4986737400530504, "grad_norm": 2.484992742538452, "learning_rate": 4.934290041379182e-06, "loss": 0.4385, "step": 470 }, { "epoch": 0.5092838196286472, "grad_norm": 2.0424418449401855, "learning_rate": 4.92706999284541e-06, "loss": 0.4935, "step": 480 }, { "epoch": 0.519893899204244, "grad_norm": 2.3754308223724365, "learning_rate": 4.9194795333737925e-06, "loss": 0.4548, "step": 490 }, { "epoch": 0.5305039787798409, "grad_norm": 3.0801432132720947, "learning_rate": 4.911519821393718e-06, "loss": 0.5486, "step": 500 }, { "epoch": 0.5411140583554377, "grad_norm": 2.2712507247924805, "learning_rate": 4.9031920716886035e-06, "loss": 0.5121, "step": 510 }, { "epoch": 0.5517241379310345, "grad_norm": 2.0000548362731934, "learning_rate": 4.894497555210499e-06, "loss": 0.4495, "step": 520 }, { "epoch": 0.5623342175066313, "grad_norm": 2.590303897857666, "learning_rate": 4.8854375988861134e-06, "loss": 0.5028, "step": 530 }, { "epoch": 0.5729442970822282, "grad_norm": 2.377298355102539, "learning_rate": 4.87601358541431e-06, "loss": 0.5193, "step": 540 }, { "epoch": 0.583554376657825, "grad_norm": 2.966008186340332, "learning_rate": 4.8662269530550825e-06, "loss": 0.545, "step": 550 }, { "epoch": 0.5941644562334217, "grad_norm": 2.250293254852295, "learning_rate": 4.856079195410046e-06, "loss": 0.5219, "step": 560 }, { "epoch": 0.6047745358090185, "grad_norm": 2.437361240386963, "learning_rate": 4.845571861194501e-06, "loss": 0.4725, "step": 570 }, { "epoch": 0.6153846153846154, "grad_norm": 2.435994863510132, "learning_rate": 4.834706554001065e-06, "loss": 0.4232, "step": 580 }, { "epoch": 0.6259946949602122, "grad_norm": 2.705902099609375, "learning_rate": 4.823484932054937e-06, "loss": 0.4834, "step": 590 }, { "epoch": 0.636604774535809, "grad_norm": 2.1471517086029053, "learning_rate": 4.811908707960832e-06, "loss": 0.5302, "step": 600 }, { "epoch": 0.6472148541114059, "grad_norm": 2.0760443210601807, "learning_rate": 4.799979648441602e-06, "loss": 0.494, "step": 610 }, { "epoch": 0.6578249336870027, "grad_norm": 2.334944009780884, "learning_rate": 4.787699574068611e-06, "loss": 0.487, "step": 620 }, { "epoch": 0.6684350132625995, "grad_norm": 2.3444855213165283, "learning_rate": 4.775070358983881e-06, "loss": 0.4911, "step": 630 }, { "epoch": 0.6790450928381963, "grad_norm": 2.127737045288086, "learning_rate": 4.7620939306140696e-06, "loss": 0.4744, "step": 640 }, { "epoch": 0.6896551724137931, "grad_norm": 2.2132568359375, "learning_rate": 4.748772269376312e-06, "loss": 0.4789, "step": 650 }, { "epoch": 0.7002652519893899, "grad_norm": 1.9452372789382935, "learning_rate": 4.735107408375977e-06, "loss": 0.488, "step": 660 }, { "epoch": 0.7108753315649867, "grad_norm": 2.7268893718719482, "learning_rate": 4.721101433096381e-06, "loss": 0.4462, "step": 670 }, { "epoch": 0.7214854111405835, "grad_norm": 2.1095452308654785, "learning_rate": 4.706756481080511e-06, "loss": 0.5087, "step": 680 }, { "epoch": 0.7320954907161804, "grad_norm": 2.278555154800415, "learning_rate": 4.692074741604795e-06, "loss": 0.5304, "step": 690 }, { "epoch": 0.7427055702917772, "grad_norm": 2.455960512161255, "learning_rate": 4.677058455344989e-06, "loss": 0.5177, "step": 700 }, { "epoch": 0.753315649867374, "grad_norm": 2.1136856079101562, "learning_rate": 4.661709914034209e-06, "loss": 0.4841, "step": 710 }, { "epoch": 0.7639257294429708, "grad_norm": 2.296614646911621, "learning_rate": 4.646031460113175e-06, "loss": 0.4544, "step": 720 }, { "epoch": 0.7745358090185677, "grad_norm": 1.8733782768249512, "learning_rate": 4.630025486372715e-06, "loss": 0.4715, "step": 730 }, { "epoch": 0.7851458885941645, "grad_norm": 2.526837110519409, "learning_rate": 4.613694435588589e-06, "loss": 0.4824, "step": 740 }, { "epoch": 0.7957559681697612, "grad_norm": 2.2026150226593018, "learning_rate": 4.597040800148679e-06, "loss": 0.4852, "step": 750 }, { "epoch": 0.8063660477453581, "grad_norm": 2.214277744293213, "learning_rate": 4.580067121672607e-06, "loss": 0.4134, "step": 760 }, { "epoch": 0.8169761273209549, "grad_norm": 2.623305559158325, "learning_rate": 4.562775990623847e-06, "loss": 0.4493, "step": 770 }, { "epoch": 0.8275862068965517, "grad_norm": 2.9433794021606445, "learning_rate": 4.5451700459143735e-06, "loss": 0.5255, "step": 780 }, { "epoch": 0.8381962864721485, "grad_norm": 2.143739938735962, "learning_rate": 4.527251974501923e-06, "loss": 0.4503, "step": 790 }, { "epoch": 0.8488063660477454, "grad_norm": 2.1592986583709717, "learning_rate": 4.509024510979917e-06, "loss": 0.4636, "step": 800 }, { "epoch": 0.8594164456233422, "grad_norm": 2.2622759342193604, "learning_rate": 4.4904904371601176e-06, "loss": 0.4685, "step": 810 }, { "epoch": 0.870026525198939, "grad_norm": 2.3408522605895996, "learning_rate": 4.4716525816480816e-06, "loss": 0.5248, "step": 820 }, { "epoch": 0.8806366047745358, "grad_norm": 2.5351459980010986, "learning_rate": 4.4525138194114644e-06, "loss": 0.4747, "step": 830 }, { "epoch": 0.8912466843501327, "grad_norm": 2.4038591384887695, "learning_rate": 4.4330770713412555e-06, "loss": 0.4198, "step": 840 }, { "epoch": 0.9018567639257294, "grad_norm": 2.2719292640686035, "learning_rate": 4.413345303805996e-06, "loss": 0.4545, "step": 850 }, { "epoch": 0.9124668435013262, "grad_norm": 3.1209301948547363, "learning_rate": 4.393321528199072e-06, "loss": 0.5003, "step": 860 }, { "epoch": 0.9230769230769231, "grad_norm": 2.414945125579834, "learning_rate": 4.373008800479118e-06, "loss": 0.472, "step": 870 }, { "epoch": 0.9336870026525199, "grad_norm": 2.21144437789917, "learning_rate": 4.352410220703629e-06, "loss": 0.4661, "step": 880 }, { "epoch": 0.9442970822281167, "grad_norm": 2.210827589035034, "learning_rate": 4.331528932555844e-06, "loss": 0.4614, "step": 890 }, { "epoch": 0.9549071618037135, "grad_norm": 2.403038740158081, "learning_rate": 4.3103681228649626e-06, "loss": 0.4623, "step": 900 }, { "epoch": 0.9655172413793104, "grad_norm": 2.588114023208618, "learning_rate": 4.288931021119788e-06, "loss": 0.4902, "step": 910 }, { "epoch": 0.9761273209549072, "grad_norm": 2.288691997528076, "learning_rate": 4.267220898975848e-06, "loss": 0.5047, "step": 920 }, { "epoch": 0.986737400530504, "grad_norm": 2.2487804889678955, "learning_rate": 4.245241069756092e-06, "loss": 0.5358, "step": 930 }, { "epoch": 0.9973474801061007, "grad_norm": 2.5266008377075195, "learning_rate": 4.222994887945219e-06, "loss": 0.4928, "step": 940 }, { "epoch": 1.0074270557029177, "grad_norm": 2.5962352752685547, "learning_rate": 4.20048574867773e-06, "loss": 0.3963, "step": 950 }, { "epoch": 1.0180371352785147, "grad_norm": 2.707613229751587, "learning_rate": 4.1777170872197725e-06, "loss": 0.3125, "step": 960 }, { "epoch": 1.0286472148541115, "grad_norm": 2.4237964153289795, "learning_rate": 4.1546923784448646e-06, "loss": 0.3457, "step": 970 }, { "epoch": 1.0392572944297083, "grad_norm": 1.6531928777694702, "learning_rate": 4.1314151363035705e-06, "loss": 0.3029, "step": 980 }, { "epoch": 1.049867374005305, "grad_norm": 2.1669981479644775, "learning_rate": 4.1078889132872145e-06, "loss": 0.3289, "step": 990 }, { "epoch": 1.0604774535809018, "grad_norm": 2.445012092590332, "learning_rate": 4.084117299885712e-06, "loss": 0.3234, "step": 1000 }, { "epoch": 1.0710875331564986, "grad_norm": 2.0615527629852295, "learning_rate": 4.060103924039599e-06, "loss": 0.3139, "step": 1010 }, { "epoch": 1.0816976127320954, "grad_norm": 1.990400791168213, "learning_rate": 4.035852450586352e-06, "loss": 0.3144, "step": 1020 }, { "epoch": 1.0923076923076924, "grad_norm": 2.5510122776031494, "learning_rate": 4.011366580701073e-06, "loss": 0.323, "step": 1030 }, { "epoch": 1.1029177718832892, "grad_norm": 2.462083101272583, "learning_rate": 3.9866500513316274e-06, "loss": 0.3694, "step": 1040 }, { "epoch": 1.113527851458886, "grad_norm": 2.4385085105895996, "learning_rate": 3.961706634628323e-06, "loss": 0.3351, "step": 1050 }, { "epoch": 1.1241379310344828, "grad_norm": 1.7553578615188599, "learning_rate": 3.936540137368222e-06, "loss": 0.3459, "step": 1060 }, { "epoch": 1.1347480106100796, "grad_norm": 2.513950824737549, "learning_rate": 3.911154400374159e-06, "loss": 0.3186, "step": 1070 }, { "epoch": 1.1453580901856764, "grad_norm": 2.6273515224456787, "learning_rate": 3.885553297928573e-06, "loss": 0.3333, "step": 1080 }, { "epoch": 1.1559681697612731, "grad_norm": 2.4155592918395996, "learning_rate": 3.859740737182222e-06, "loss": 0.3137, "step": 1090 }, { "epoch": 1.16657824933687, "grad_norm": 2.719611644744873, "learning_rate": 3.833720657557894e-06, "loss": 0.3426, "step": 1100 }, { "epoch": 1.1771883289124667, "grad_norm": 2.5729358196258545, "learning_rate": 3.807497030149181e-06, "loss": 0.3709, "step": 1110 }, { "epoch": 1.1877984084880637, "grad_norm": 1.9626141786575317, "learning_rate": 3.7810738571144257e-06, "loss": 0.329, "step": 1120 }, { "epoch": 1.1984084880636605, "grad_norm": 2.601951837539673, "learning_rate": 3.7544551710659296e-06, "loss": 0.305, "step": 1130 }, { "epoch": 1.2090185676392573, "grad_norm": 2.4118540287017822, "learning_rate": 3.7276450344545024e-06, "loss": 0.3449, "step": 1140 }, { "epoch": 1.219628647214854, "grad_norm": 2.5080604553222656, "learning_rate": 3.7006475389494723e-06, "loss": 0.3403, "step": 1150 }, { "epoch": 1.2302387267904509, "grad_norm": 2.6882951259613037, "learning_rate": 3.6734668048142273e-06, "loss": 0.3342, "step": 1160 }, { "epoch": 1.2408488063660477, "grad_norm": 2.3755247592926025, "learning_rate": 3.646106980277394e-06, "loss": 0.3589, "step": 1170 }, { "epoch": 1.2514588859416444, "grad_norm": 2.4138166904449463, "learning_rate": 3.618572240899748e-06, "loss": 0.3447, "step": 1180 }, { "epoch": 1.2620689655172415, "grad_norm": 2.6930105686187744, "learning_rate": 3.5908667889369603e-06, "loss": 0.3787, "step": 1190 }, { "epoch": 1.2726790450928382, "grad_norm": 2.732795476913452, "learning_rate": 3.5629948526982563e-06, "loss": 0.3376, "step": 1200 }, { "epoch": 1.283289124668435, "grad_norm": 1.8468087911605835, "learning_rate": 3.534960685901111e-06, "loss": 0.3461, "step": 1210 }, { "epoch": 1.2938992042440318, "grad_norm": 2.3408284187316895, "learning_rate": 3.506768567022062e-06, "loss": 0.3396, "step": 1220 }, { "epoch": 1.3045092838196286, "grad_norm": 2.7420434951782227, "learning_rate": 3.478422798643737e-06, "loss": 0.3364, "step": 1230 }, { "epoch": 1.3151193633952254, "grad_norm": 2.634403705596924, "learning_rate": 3.4499277067982177e-06, "loss": 0.3126, "step": 1240 }, { "epoch": 1.3257294429708222, "grad_norm": 2.4217336177825928, "learning_rate": 3.421287640306809e-06, "loss": 0.3092, "step": 1250 }, { "epoch": 1.3363395225464192, "grad_norm": 1.7107937335968018, "learning_rate": 3.3925069701163406e-06, "loss": 0.3374, "step": 1260 }, { "epoch": 1.346949602122016, "grad_norm": 2.1515822410583496, "learning_rate": 3.363590088632085e-06, "loss": 0.3436, "step": 1270 }, { "epoch": 1.3575596816976128, "grad_norm": 2.0105717182159424, "learning_rate": 3.334541409047408e-06, "loss": 0.3283, "step": 1280 }, { "epoch": 1.3681697612732096, "grad_norm": 1.8952791690826416, "learning_rate": 3.3053653646702422e-06, "loss": 0.358, "step": 1290 }, { "epoch": 1.3787798408488063, "grad_norm": 1.8639928102493286, "learning_rate": 3.276066408246487e-06, "loss": 0.3084, "step": 1300 }, { "epoch": 1.3893899204244031, "grad_norm": 2.563251256942749, "learning_rate": 3.2466490112804484e-06, "loss": 0.3508, "step": 1310 }, { "epoch": 1.4, "grad_norm": 2.214616060256958, "learning_rate": 3.217117663352417e-06, "loss": 0.3215, "step": 1320 }, { "epoch": 1.410610079575597, "grad_norm": 1.793468952178955, "learning_rate": 3.187476871433478e-06, "loss": 0.3193, "step": 1330 }, { "epoch": 1.4212201591511937, "grad_norm": 2.204789638519287, "learning_rate": 3.1577311591976766e-06, "loss": 0.3019, "step": 1340 }, { "epoch": 1.4318302387267905, "grad_norm": 2.307568311691284, "learning_rate": 3.1278850663316307e-06, "loss": 0.3099, "step": 1350 }, { "epoch": 1.4424403183023873, "grad_norm": 2.485848903656006, "learning_rate": 3.0979431478416987e-06, "loss": 0.3085, "step": 1360 }, { "epoch": 1.453050397877984, "grad_norm": 1.953053593635559, "learning_rate": 3.067909973358811e-06, "loss": 0.3211, "step": 1370 }, { "epoch": 1.4636604774535809, "grad_norm": 2.2350101470947266, "learning_rate": 3.0377901264410673e-06, "loss": 0.3329, "step": 1380 }, { "epoch": 1.4742705570291776, "grad_norm": 2.542452335357666, "learning_rate": 3.0075882038742133e-06, "loss": 0.3376, "step": 1390 }, { "epoch": 1.4848806366047747, "grad_norm": 2.3203530311584473, "learning_rate": 2.9773088149700923e-06, "loss": 0.2896, "step": 1400 }, { "epoch": 1.4954907161803712, "grad_norm": 1.9708584547042847, "learning_rate": 2.9469565808631888e-06, "loss": 0.299, "step": 1410 }, { "epoch": 1.5061007957559682, "grad_norm": 2.63698148727417, "learning_rate": 2.9165361338053683e-06, "loss": 0.3484, "step": 1420 }, { "epoch": 1.516710875331565, "grad_norm": 2.091648578643799, "learning_rate": 2.886052116458918e-06, "loss": 0.3316, "step": 1430 }, { "epoch": 1.5273209549071618, "grad_norm": 1.955355167388916, "learning_rate": 2.8555091811880004e-06, "loss": 0.328, "step": 1440 }, { "epoch": 1.5379310344827586, "grad_norm": 1.6724951267242432, "learning_rate": 2.8249119893486252e-06, "loss": 0.3215, "step": 1450 }, { "epoch": 1.5485411140583554, "grad_norm": 2.1872570514678955, "learning_rate": 2.7942652105772516e-06, "loss": 0.3118, "step": 1460 }, { "epoch": 1.5591511936339524, "grad_norm": 3.0710208415985107, "learning_rate": 2.7635735220781214e-06, "loss": 0.2973, "step": 1470 }, { "epoch": 1.569761273209549, "grad_norm": 2.357663631439209, "learning_rate": 2.7328416079094412e-06, "loss": 0.3423, "step": 1480 }, { "epoch": 1.580371352785146, "grad_norm": 2.2559144496917725, "learning_rate": 2.7020741582685217e-06, "loss": 0.3211, "step": 1490 }, { "epoch": 1.5909814323607427, "grad_norm": 2.0730817317962646, "learning_rate": 2.6712758687759706e-06, "loss": 0.2733, "step": 1500 }, { "epoch": 1.6015915119363395, "grad_norm": 2.6119141578674316, "learning_rate": 2.6404514397590657e-06, "loss": 0.338, "step": 1510 }, { "epoch": 1.6122015915119363, "grad_norm": 2.315875768661499, "learning_rate": 2.6096055755344113e-06, "loss": 0.3124, "step": 1520 }, { "epoch": 1.622811671087533, "grad_norm": 2.2880892753601074, "learning_rate": 2.578742983689973e-06, "loss": 0.3538, "step": 1530 }, { "epoch": 1.6334217506631301, "grad_norm": 2.2615041732788086, "learning_rate": 2.547868374366631e-06, "loss": 0.3353, "step": 1540 }, { "epoch": 1.6440318302387267, "grad_norm": 1.9062315225601196, "learning_rate": 2.5169864595393295e-06, "loss": 0.302, "step": 1550 }, { "epoch": 1.6546419098143237, "grad_norm": 2.7016942501068115, "learning_rate": 2.4861019522979537e-06, "loss": 0.3124, "step": 1560 }, { "epoch": 1.6652519893899205, "grad_norm": 2.4618184566497803, "learning_rate": 2.455219566128034e-06, "loss": 0.3497, "step": 1570 }, { "epoch": 1.6758620689655173, "grad_norm": 2.8924951553344727, "learning_rate": 2.4243440141913905e-06, "loss": 0.3233, "step": 1580 }, { "epoch": 1.686472148541114, "grad_norm": 2.32255482673645, "learning_rate": 2.393480008606825e-06, "loss": 0.3067, "step": 1590 }, { "epoch": 1.6970822281167108, "grad_norm": 1.8984359502792358, "learning_rate": 2.3626322597309774e-06, "loss": 0.2893, "step": 1600 }, { "epoch": 1.7076923076923078, "grad_norm": 1.8360289335250854, "learning_rate": 2.331805475439445e-06, "loss": 0.2825, "step": 1610 }, { "epoch": 1.7183023872679044, "grad_norm": 2.331998109817505, "learning_rate": 2.3010043604082824e-06, "loss": 0.3379, "step": 1620 }, { "epoch": 1.7289124668435014, "grad_norm": 2.3304574489593506, "learning_rate": 2.2702336153959925e-06, "loss": 0.301, "step": 1630 }, { "epoch": 1.739522546419098, "grad_norm": 2.534090518951416, "learning_rate": 2.2394979365261134e-06, "loss": 0.404, "step": 1640 }, { "epoch": 1.750132625994695, "grad_norm": 2.273122549057007, "learning_rate": 2.208802014570507e-06, "loss": 0.3242, "step": 1650 }, { "epoch": 1.7607427055702918, "grad_norm": 1.8859643936157227, "learning_rate": 2.1781505342334775e-06, "loss": 0.3152, "step": 1660 }, { "epoch": 1.7713527851458886, "grad_norm": 2.567715644836426, "learning_rate": 2.147548173436805e-06, "loss": 0.3302, "step": 1670 }, { "epoch": 1.7819628647214856, "grad_norm": 2.7930519580841064, "learning_rate": 2.116999602605814e-06, "loss": 0.293, "step": 1680 }, { "epoch": 1.7925729442970821, "grad_norm": 2.646296262741089, "learning_rate": 2.086509483956594e-06, "loss": 0.2683, "step": 1690 }, { "epoch": 1.8031830238726791, "grad_norm": 2.3010053634643555, "learning_rate": 2.056082470784469e-06, "loss": 0.313, "step": 1700 }, { "epoch": 1.8137931034482757, "grad_norm": 2.3864669799804688, "learning_rate": 2.0257232067538213e-06, "loss": 0.262, "step": 1710 }, { "epoch": 1.8244031830238727, "grad_norm": 2.63028883934021, "learning_rate": 1.9954363251894007e-06, "loss": 0.3457, "step": 1720 }, { "epoch": 1.8350132625994695, "grad_norm": 2.0011484622955322, "learning_rate": 1.9652264483691933e-06, "loss": 0.2739, "step": 1730 }, { "epoch": 1.8456233421750663, "grad_norm": 2.6818690299987793, "learning_rate": 1.9350981868189944e-06, "loss": 0.3109, "step": 1740 }, { "epoch": 1.856233421750663, "grad_norm": 2.6978225708007812, "learning_rate": 1.9050561386087618e-06, "loss": 0.3269, "step": 1750 }, { "epoch": 1.8668435013262599, "grad_norm": 2.578031301498413, "learning_rate": 1.8751048886508711e-06, "loss": 0.3617, "step": 1760 }, { "epoch": 1.8774535809018569, "grad_norm": 2.5525052547454834, "learning_rate": 1.8452490080003888e-06, "loss": 0.3228, "step": 1770 }, { "epoch": 1.8880636604774534, "grad_norm": 2.1095635890960693, "learning_rate": 1.8154930531574521e-06, "loss": 0.2857, "step": 1780 }, { "epoch": 1.8986737400530505, "grad_norm": 2.3965845108032227, "learning_rate": 1.785841565371868e-06, "loss": 0.3622, "step": 1790 }, { "epoch": 1.9092838196286472, "grad_norm": 2.293715238571167, "learning_rate": 1.7562990699500482e-06, "loss": 0.3031, "step": 1800 }, { "epoch": 1.919893899204244, "grad_norm": 2.026015281677246, "learning_rate": 1.7268700755643708e-06, "loss": 0.3019, "step": 1810 }, { "epoch": 1.9305039787798408, "grad_norm": 1.7175791263580322, "learning_rate": 1.6975590735650812e-06, "loss": 0.3047, "step": 1820 }, { "epoch": 1.9411140583554376, "grad_norm": 2.0024490356445312, "learning_rate": 1.668370537294841e-06, "loss": 0.3048, "step": 1830 }, { "epoch": 1.9517241379310346, "grad_norm": 2.8226239681243896, "learning_rate": 1.6393089214060204e-06, "loss": 0.3205, "step": 1840 }, { "epoch": 1.9623342175066312, "grad_norm": 1.9452221393585205, "learning_rate": 1.6103786611808414e-06, "loss": 0.321, "step": 1850 }, { "epoch": 1.9729442970822282, "grad_norm": 2.304274320602417, "learning_rate": 1.5815841718544884e-06, "loss": 0.2954, "step": 1860 }, { "epoch": 1.983554376657825, "grad_norm": 2.502206802368164, "learning_rate": 1.5529298479412636e-06, "loss": 0.2945, "step": 1870 }, { "epoch": 1.9941644562334218, "grad_norm": 2.5796189308166504, "learning_rate": 1.524420062563912e-06, "loss": 0.3291, "step": 1880 }, { "epoch": 2.004244031830239, "grad_norm": 1.9198871850967407, "learning_rate": 1.4960591667862163e-06, "loss": 0.234, "step": 1890 }, { "epoch": 2.0148541114058354, "grad_norm": 1.7082706689834595, "learning_rate": 1.4678514889489464e-06, "loss": 0.1943, "step": 1900 }, { "epoch": 2.0254641909814324, "grad_norm": 1.8571817874908447, "learning_rate": 1.4398013340092864e-06, "loss": 0.1911, "step": 1910 }, { "epoch": 2.0360742705570294, "grad_norm": 2.454561233520508, "learning_rate": 1.4119129828838275e-06, "loss": 0.1895, "step": 1920 }, { "epoch": 2.046684350132626, "grad_norm": 2.3714683055877686, "learning_rate": 1.384190691795226e-06, "loss": 0.2177, "step": 1930 }, { "epoch": 2.057294429708223, "grad_norm": 2.1356313228607178, "learning_rate": 1.3566386916226373e-06, "loss": 0.2252, "step": 1940 }, { "epoch": 2.0679045092838195, "grad_norm": 2.446906089782715, "learning_rate": 1.3292611872560134e-06, "loss": 0.1982, "step": 1950 }, { "epoch": 2.0785145888594165, "grad_norm": 2.1040875911712646, "learning_rate": 1.302062356954365e-06, "loss": 0.1696, "step": 1960 }, { "epoch": 2.089124668435013, "grad_norm": 2.220742702484131, "learning_rate": 1.2750463517080922e-06, "loss": 0.1936, "step": 1970 }, { "epoch": 2.09973474801061, "grad_norm": 2.7784054279327393, "learning_rate": 1.2482172946054753e-06, "loss": 0.1604, "step": 1980 }, { "epoch": 2.110344827586207, "grad_norm": 2.0539498329162598, "learning_rate": 1.2215792802034187e-06, "loss": 0.2069, "step": 1990 }, { "epoch": 2.1209549071618037, "grad_norm": 1.8337138891220093, "learning_rate": 1.1951363739025618e-06, "loss": 0.1964, "step": 2000 }, { "epoch": 2.1315649867374007, "grad_norm": 1.7631642818450928, "learning_rate": 1.168892611326827e-06, "loss": 0.1871, "step": 2010 }, { "epoch": 2.1421750663129973, "grad_norm": 2.386589527130127, "learning_rate": 1.1428519977075136e-06, "loss": 0.2595, "step": 2020 }, { "epoch": 2.1527851458885943, "grad_norm": 2.553382635116577, "learning_rate": 1.1170185072720434e-06, "loss": 0.185, "step": 2030 }, { "epoch": 2.163395225464191, "grad_norm": 2.870973825454712, "learning_rate": 1.091396082637419e-06, "loss": 0.228, "step": 2040 }, { "epoch": 2.174005305039788, "grad_norm": 2.643745183944702, "learning_rate": 1.065988634208516e-06, "loss": 0.2098, "step": 2050 }, { "epoch": 2.184615384615385, "grad_norm": 2.369596481323242, "learning_rate": 1.0408000395812961e-06, "loss": 0.1982, "step": 2060 }, { "epoch": 2.1952254641909814, "grad_norm": 2.1093883514404297, "learning_rate": 1.0158341429510194e-06, "loss": 0.1844, "step": 2070 }, { "epoch": 2.2058355437665784, "grad_norm": 1.951935052871704, "learning_rate": 9.910947545255523e-07, "loss": 0.1654, "step": 2080 }, { "epoch": 2.216445623342175, "grad_norm": 2.230781078338623, "learning_rate": 9.665856499438744e-07, "loss": 0.2037, "step": 2090 }, { "epoch": 2.227055702917772, "grad_norm": 2.6240904331207275, "learning_rate": 9.423105696998491e-07, "loss": 0.2087, "step": 2100 }, { "epoch": 2.2376657824933686, "grad_norm": 1.712857723236084, "learning_rate": 9.182732185713633e-07, "loss": 0.2105, "step": 2110 }, { "epoch": 2.2482758620689656, "grad_norm": 2.036086082458496, "learning_rate": 8.94477265054918e-07, "loss": 0.2186, "step": 2120 }, { "epoch": 2.2588859416445626, "grad_norm": 2.3545398712158203, "learning_rate": 8.709263408057522e-07, "loss": 0.1879, "step": 2130 }, { "epoch": 2.269496021220159, "grad_norm": 1.9098992347717285, "learning_rate": 8.476240400835972e-07, "loss": 0.2177, "step": 2140 }, { "epoch": 2.280106100795756, "grad_norm": 2.107959270477295, "learning_rate": 8.245739192041311e-07, "loss": 0.165, "step": 2150 }, { "epoch": 2.2907161803713527, "grad_norm": 2.550719976425171, "learning_rate": 8.017794959962225e-07, "loss": 0.2018, "step": 2160 }, { "epoch": 2.3013262599469497, "grad_norm": 2.354701280593872, "learning_rate": 7.792442492650587e-07, "loss": 0.1955, "step": 2170 }, { "epoch": 2.3119363395225463, "grad_norm": 2.3547091484069824, "learning_rate": 7.569716182612177e-07, "loss": 0.1976, "step": 2180 }, { "epoch": 2.3225464190981433, "grad_norm": 1.4048022031784058, "learning_rate": 7.349650021557839e-07, "loss": 0.1685, "step": 2190 }, { "epoch": 2.33315649867374, "grad_norm": 2.568500280380249, "learning_rate": 7.132277595215773e-07, "loss": 0.1519, "step": 2200 }, { "epoch": 2.343766578249337, "grad_norm": 2.205993413925171, "learning_rate": 6.917632078205805e-07, "loss": 0.1573, "step": 2210 }, { "epoch": 2.3543766578249334, "grad_norm": 2.067505121231079, "learning_rate": 6.705746228976387e-07, "loss": 0.184, "step": 2220 }, { "epoch": 2.3649867374005304, "grad_norm": 2.4360201358795166, "learning_rate": 6.496652384805125e-07, "loss": 0.1968, "step": 2230 }, { "epoch": 2.3755968169761275, "grad_norm": 2.042179584503174, "learning_rate": 6.290382456863584e-07, "loss": 0.1846, "step": 2240 }, { "epoch": 2.386206896551724, "grad_norm": 2.849271535873413, "learning_rate": 6.086967925347075e-07, "loss": 0.1858, "step": 2250 }, { "epoch": 2.396816976127321, "grad_norm": 2.0765082836151123, "learning_rate": 5.88643983467033e-07, "loss": 0.1837, "step": 2260 }, { "epoch": 2.4074270557029176, "grad_norm": 1.9958840608596802, "learning_rate": 5.688828788729547e-07, "loss": 0.1659, "step": 2270 }, { "epoch": 2.4180371352785146, "grad_norm": 2.253602981567383, "learning_rate": 5.494164946231747e-07, "loss": 0.2095, "step": 2280 }, { "epoch": 2.428647214854111, "grad_norm": 1.5552992820739746, "learning_rate": 5.302478016092075e-07, "loss": 0.1862, "step": 2290 }, { "epoch": 2.439257294429708, "grad_norm": 2.721445322036743, "learning_rate": 5.113797252899728e-07, "loss": 0.2085, "step": 2300 }, { "epoch": 2.449867374005305, "grad_norm": 2.3488707542419434, "learning_rate": 4.928151452453184e-07, "loss": 0.1914, "step": 2310 }, { "epoch": 2.4604774535809018, "grad_norm": 2.49068021774292, "learning_rate": 4.745568947365542e-07, "loss": 0.1718, "step": 2320 }, { "epoch": 2.4710875331564988, "grad_norm": 1.4638549089431763, "learning_rate": 4.5660776027404654e-07, "loss": 0.1669, "step": 2330 }, { "epoch": 2.4816976127320953, "grad_norm": 2.288776159286499, "learning_rate": 4.389704811919507e-07, "loss": 0.1731, "step": 2340 }, { "epoch": 2.4923076923076923, "grad_norm": 2.385162115097046, "learning_rate": 4.216477492301455e-07, "loss": 0.1802, "step": 2350 }, { "epoch": 2.502917771883289, "grad_norm": 2.0100815296173096, "learning_rate": 4.0464220812342526e-07, "loss": 0.2232, "step": 2360 }, { "epoch": 2.513527851458886, "grad_norm": 1.8439091444015503, "learning_rate": 3.87956453198027e-07, "loss": 0.1432, "step": 2370 }, { "epoch": 2.524137931034483, "grad_norm": 2.3093338012695312, "learning_rate": 3.715930309755389e-07, "loss": 0.1834, "step": 2380 }, { "epoch": 2.5347480106100795, "grad_norm": 2.3250088691711426, "learning_rate": 3.5555443878425635e-07, "loss": 0.2123, "step": 2390 }, { "epoch": 2.5453580901856765, "grad_norm": 1.8003133535385132, "learning_rate": 3.398431243780531e-07, "loss": 0.2034, "step": 2400 }, { "epoch": 2.555968169761273, "grad_norm": 2.8948135375976562, "learning_rate": 3.2446148556281117e-07, "loss": 0.1778, "step": 2410 }, { "epoch": 2.56657824933687, "grad_norm": 1.8556360006332397, "learning_rate": 3.0941186983047543e-07, "loss": 0.1892, "step": 2420 }, { "epoch": 2.5771883289124666, "grad_norm": 2.771932363510132, "learning_rate": 2.9469657400078925e-07, "loss": 0.1935, "step": 2430 }, { "epoch": 2.5877984084880636, "grad_norm": 2.5325114727020264, "learning_rate": 2.8031784387076186e-07, "loss": 0.1858, "step": 2440 }, { "epoch": 2.5984084880636606, "grad_norm": 2.4069302082061768, "learning_rate": 2.6627787387191934e-07, "loss": 0.2118, "step": 2450 }, { "epoch": 2.609018567639257, "grad_norm": 2.053656816482544, "learning_rate": 2.5257880673540376e-07, "loss": 0.1929, "step": 2460 }, { "epoch": 2.6196286472148542, "grad_norm": 1.8820626735687256, "learning_rate": 2.392227331649527e-07, "loss": 0.1745, "step": 2470 }, { "epoch": 2.630238726790451, "grad_norm": 1.9418586492538452, "learning_rate": 2.2621169151782417e-07, "loss": 0.1823, "step": 2480 }, { "epoch": 2.640848806366048, "grad_norm": 2.519037961959839, "learning_rate": 2.1354766749371093e-07, "loss": 0.2037, "step": 2490 }, { "epoch": 2.6514588859416444, "grad_norm": 2.010211944580078, "learning_rate": 2.0123259383169031e-07, "loss": 0.2196, "step": 2500 }, { "epoch": 2.6620689655172414, "grad_norm": 1.9838532209396362, "learning_rate": 1.8926835001525257e-07, "loss": 0.1848, "step": 2510 }, { "epoch": 2.6726790450928384, "grad_norm": 2.3488149642944336, "learning_rate": 1.776567619854655e-07, "loss": 0.1823, "step": 2520 }, { "epoch": 2.683289124668435, "grad_norm": 2.839651584625244, "learning_rate": 1.6639960186230293e-07, "loss": 0.2039, "step": 2530 }, { "epoch": 2.693899204244032, "grad_norm": 2.050480842590332, "learning_rate": 1.5549858767419018e-07, "loss": 0.1796, "step": 2540 }, { "epoch": 2.7045092838196285, "grad_norm": 1.2738044261932373, "learning_rate": 1.449553830958053e-07, "loss": 0.1893, "step": 2550 }, { "epoch": 2.7151193633952255, "grad_norm": 1.8912787437438965, "learning_rate": 1.347715971941746e-07, "loss": 0.1947, "step": 2560 }, { "epoch": 2.725729442970822, "grad_norm": 1.8385730981826782, "learning_rate": 1.2494878418310234e-07, "loss": 0.1744, "step": 2570 }, { "epoch": 2.736339522546419, "grad_norm": 2.1071712970733643, "learning_rate": 1.1548844318597208e-07, "loss": 0.2351, "step": 2580 }, { "epoch": 2.746949602122016, "grad_norm": 2.054392099380493, "learning_rate": 1.0639201800695553e-07, "loss": 0.2245, "step": 2590 }, { "epoch": 2.7575596816976127, "grad_norm": 1.656562328338623, "learning_rate": 9.76608969106646e-08, "loss": 0.2014, "step": 2600 }, { "epoch": 2.7681697612732097, "grad_norm": 2.6887638568878174, "learning_rate": 8.929641241027937e-08, "loss": 0.1824, "step": 2610 }, { "epoch": 2.7787798408488062, "grad_norm": 2.4606659412384033, "learning_rate": 8.129984106418354e-08, "loss": 0.1706, "step": 2620 }, { "epoch": 2.7893899204244033, "grad_norm": 2.5548455715179443, "learning_rate": 7.3672403281142e-08, "loss": 0.2195, "step": 2630 }, { "epoch": 2.8, "grad_norm": 1.7952167987823486, "learning_rate": 6.641526313404534e-08, "loss": 0.1748, "step": 2640 }, { "epoch": 2.810610079575597, "grad_norm": 2.376830816268921, "learning_rate": 5.952952818225416e-08, "loss": 0.2061, "step": 2650 }, { "epoch": 2.821220159151194, "grad_norm": 1.7183632850646973, "learning_rate": 5.3016249302565436e-08, "loss": 0.1742, "step": 2660 }, { "epoch": 2.8318302387267904, "grad_norm": 2.11011004447937, "learning_rate": 4.6876420528833014e-08, "loss": 0.2082, "step": 2670 }, { "epoch": 2.8424403183023874, "grad_norm": 1.8799868822097778, "learning_rate": 4.111097890026089e-08, "loss": 0.1805, "step": 2680 }, { "epoch": 2.853050397877984, "grad_norm": 2.5171291828155518, "learning_rate": 3.5720804318395976e-08, "loss": 0.2058, "step": 2690 }, { "epoch": 2.863660477453581, "grad_norm": 2.142263650894165, "learning_rate": 3.0706719412839926e-08, "loss": 0.2027, "step": 2700 }, { "epoch": 2.8742705570291776, "grad_norm": 2.2124040126800537, "learning_rate": 2.6069489415703197e-08, "loss": 0.1941, "step": 2710 }, { "epoch": 2.8848806366047746, "grad_norm": 2.033259153366089, "learning_rate": 2.18098220448168e-08, "loss": 0.2029, "step": 2720 }, { "epoch": 2.8954907161803716, "grad_norm": 2.416912794113159, "learning_rate": 1.7928367395725066e-08, "loss": 0.2062, "step": 2730 }, { "epoch": 2.906100795755968, "grad_norm": 2.193751096725464, "learning_rate": 1.442571784246699e-08, "loss": 0.1873, "step": 2740 }, { "epoch": 2.916710875331565, "grad_norm": 1.5729731321334839, "learning_rate": 1.1302407947173522e-08, "loss": 0.1653, "step": 2750 }, { "epoch": 2.9273209549071617, "grad_norm": 1.7562044858932495, "learning_rate": 8.558914378481996e-09, "loss": 0.1743, "step": 2760 }, { "epoch": 2.9379310344827587, "grad_norm": 2.183967351913452, "learning_rate": 6.195655838790726e-09, "loss": 0.1821, "step": 2770 }, { "epoch": 2.9485411140583553, "grad_norm": 1.9312433004379272, "learning_rate": 4.212993000356491e-09, "loss": 0.1954, "step": 2780 }, { "epoch": 2.9591511936339523, "grad_norm": 2.2055087089538574, "learning_rate": 2.611228450250802e-09, "loss": 0.1925, "step": 2790 }, { "epoch": 2.9697612732095493, "grad_norm": 1.6606404781341553, "learning_rate": 1.3906066441798927e-09, "loss": 0.1805, "step": 2800 }, { "epoch": 2.980371352785146, "grad_norm": 2.594404458999634, "learning_rate": 5.513138691767839e-10, "loss": 0.2084, "step": 2810 }, { "epoch": 2.9909814323607424, "grad_norm": 2.007861375808716, "learning_rate": 9.347821517069477e-11, "loss": 0.2115, "step": 2820 }, { "epoch": 2.9973474801061006, "step": 2826, "total_flos": 1.0915292825780224e+17, "train_loss": 0.34044326600333263, "train_runtime": 16671.2674, "train_samples_per_second": 2.713, "train_steps_per_second": 0.17 } ], "logging_steps": 10, "max_steps": 2826, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 943, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0915292825780224e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }