{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9973474801061006,
"eval_steps": 500,
"global_step": 2826,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010610079575596816,
"grad_norm": 4.634474754333496,
"learning_rate": 1.5901060070671379e-07,
"loss": 0.741,
"step": 10
},
{
"epoch": 0.021220159151193633,
"grad_norm": 2.9002726078033447,
"learning_rate": 3.356890459363958e-07,
"loss": 0.5551,
"step": 20
},
{
"epoch": 0.03183023872679045,
"grad_norm": 4.242003917694092,
"learning_rate": 5.123674911660778e-07,
"loss": 0.6185,
"step": 30
},
{
"epoch": 0.042440318302387266,
"grad_norm": 3.8156638145446777,
"learning_rate": 6.890459363957598e-07,
"loss": 0.6358,
"step": 40
},
{
"epoch": 0.05305039787798409,
"grad_norm": 3.047624349594116,
"learning_rate": 8.657243816254418e-07,
"loss": 0.5922,
"step": 50
},
{
"epoch": 0.0636604774535809,
"grad_norm": 2.2943954467773438,
"learning_rate": 1.0424028268551239e-06,
"loss": 0.6282,
"step": 60
},
{
"epoch": 0.07427055702917772,
"grad_norm": 2.831937551498413,
"learning_rate": 1.2190812720848057e-06,
"loss": 0.5836,
"step": 70
},
{
"epoch": 0.08488063660477453,
"grad_norm": 3.941297769546509,
"learning_rate": 1.3957597173144876e-06,
"loss": 0.5836,
"step": 80
},
{
"epoch": 0.09549071618037135,
"grad_norm": 2.4598379135131836,
"learning_rate": 1.5724381625441699e-06,
"loss": 0.4983,
"step": 90
},
{
"epoch": 0.10610079575596817,
"grad_norm": 2.533829927444458,
"learning_rate": 1.7491166077738517e-06,
"loss": 0.6057,
"step": 100
},
{
"epoch": 0.11671087533156499,
"grad_norm": 2.412334442138672,
"learning_rate": 1.925795053003534e-06,
"loss": 0.5135,
"step": 110
},
{
"epoch": 0.1273209549071618,
"grad_norm": 2.7505877017974854,
"learning_rate": 2.1024734982332157e-06,
"loss": 0.4844,
"step": 120
},
{
"epoch": 0.13793103448275862,
"grad_norm": 2.701307535171509,
"learning_rate": 2.279151943462898e-06,
"loss": 0.5386,
"step": 130
},
{
"epoch": 0.14854111405835543,
"grad_norm": 2.8261961936950684,
"learning_rate": 2.45583038869258e-06,
"loss": 0.4774,
"step": 140
},
{
"epoch": 0.15915119363395225,
"grad_norm": 2.4490256309509277,
"learning_rate": 2.6325088339222617e-06,
"loss": 0.5035,
"step": 150
},
{
"epoch": 0.16976127320954906,
"grad_norm": 2.418158769607544,
"learning_rate": 2.8091872791519436e-06,
"loss": 0.4897,
"step": 160
},
{
"epoch": 0.18037135278514588,
"grad_norm": 3.5972161293029785,
"learning_rate": 2.985865724381626e-06,
"loss": 0.5196,
"step": 170
},
{
"epoch": 0.1909814323607427,
"grad_norm": 2.814927577972412,
"learning_rate": 3.162544169611308e-06,
"loss": 0.4791,
"step": 180
},
{
"epoch": 0.20159151193633953,
"grad_norm": 2.6151270866394043,
"learning_rate": 3.3392226148409896e-06,
"loss": 0.5024,
"step": 190
},
{
"epoch": 0.21220159151193635,
"grad_norm": 2.8331387042999268,
"learning_rate": 3.5159010600706715e-06,
"loss": 0.5781,
"step": 200
},
{
"epoch": 0.22281167108753316,
"grad_norm": 2.433027744293213,
"learning_rate": 3.6925795053003538e-06,
"loss": 0.4186,
"step": 210
},
{
"epoch": 0.23342175066312998,
"grad_norm": 2.671696186065674,
"learning_rate": 3.869257950530036e-06,
"loss": 0.4819,
"step": 220
},
{
"epoch": 0.2440318302387268,
"grad_norm": 2.5337982177734375,
"learning_rate": 4.045936395759718e-06,
"loss": 0.547,
"step": 230
},
{
"epoch": 0.2546419098143236,
"grad_norm": 2.2034990787506104,
"learning_rate": 4.222614840989399e-06,
"loss": 0.5603,
"step": 240
},
{
"epoch": 0.26525198938992045,
"grad_norm": 2.2893121242523193,
"learning_rate": 4.399293286219082e-06,
"loss": 0.4483,
"step": 250
},
{
"epoch": 0.27586206896551724,
"grad_norm": 1.8757219314575195,
"learning_rate": 4.575971731448763e-06,
"loss": 0.5178,
"step": 260
},
{
"epoch": 0.2864721485411141,
"grad_norm": 2.3748602867126465,
"learning_rate": 4.752650176678445e-06,
"loss": 0.5264,
"step": 270
},
{
"epoch": 0.29708222811671087,
"grad_norm": 3.0481033325195312,
"learning_rate": 4.929328621908128e-06,
"loss": 0.5124,
"step": 280
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.682847023010254,
"learning_rate": 4.99993132201408e-06,
"loss": 0.4977,
"step": 290
},
{
"epoch": 0.3183023872679045,
"grad_norm": 2.472842216491699,
"learning_rate": 4.9995116368759e-06,
"loss": 0.5005,
"step": 300
},
{
"epoch": 0.32891246684350134,
"grad_norm": 2.582815647125244,
"learning_rate": 4.998710485009401e-06,
"loss": 0.4857,
"step": 310
},
{
"epoch": 0.3395225464190981,
"grad_norm": 2.3572824001312256,
"learning_rate": 4.99752798868358e-06,
"loss": 0.4637,
"step": 320
},
{
"epoch": 0.35013262599469497,
"grad_norm": 2.3432295322418213,
"learning_rate": 4.99596432836689e-06,
"loss": 0.4775,
"step": 330
},
{
"epoch": 0.36074270557029176,
"grad_norm": 2.7486777305603027,
"learning_rate": 4.994019742699705e-06,
"loss": 0.5779,
"step": 340
},
{
"epoch": 0.3713527851458886,
"grad_norm": 2.3831562995910645,
"learning_rate": 4.991694528457891e-06,
"loss": 0.5057,
"step": 350
},
{
"epoch": 0.3819628647214854,
"grad_norm": 2.5414721965789795,
"learning_rate": 4.988989040507518e-06,
"loss": 0.5313,
"step": 360
},
{
"epoch": 0.3925729442970822,
"grad_norm": 2.4140472412109375,
"learning_rate": 4.985903691750697e-06,
"loss": 0.4441,
"step": 370
},
{
"epoch": 0.40318302387267907,
"grad_norm": 2.4907593727111816,
"learning_rate": 4.982438953062572e-06,
"loss": 0.4778,
"step": 380
},
{
"epoch": 0.41379310344827586,
"grad_norm": 2.579932928085327,
"learning_rate": 4.978595353219449e-06,
"loss": 0.4848,
"step": 390
},
{
"epoch": 0.4244031830238727,
"grad_norm": 2.5512266159057617,
"learning_rate": 4.974373478818098e-06,
"loss": 0.4891,
"step": 400
},
{
"epoch": 0.4350132625994695,
"grad_norm": 2.3293063640594482,
"learning_rate": 4.969773974186235e-06,
"loss": 0.4954,
"step": 410
},
{
"epoch": 0.44562334217506633,
"grad_norm": 2.6347479820251465,
"learning_rate": 4.964797541284175e-06,
"loss": 0.5353,
"step": 420
},
{
"epoch": 0.4562334217506631,
"grad_norm": 2.7719151973724365,
"learning_rate": 4.959444939597712e-06,
"loss": 0.5726,
"step": 430
},
{
"epoch": 0.46684350132625996,
"grad_norm": 2.1757211685180664,
"learning_rate": 4.953716986022204e-06,
"loss": 0.5642,
"step": 440
},
{
"epoch": 0.47745358090185674,
"grad_norm": 2.432244300842285,
"learning_rate": 4.947614554737904e-06,
"loss": 0.4429,
"step": 450
},
{
"epoch": 0.4880636604774536,
"grad_norm": 1.972844123840332,
"learning_rate": 4.941138577076538e-06,
"loss": 0.4683,
"step": 460
},
{
"epoch": 0.4986737400530504,
"grad_norm": 2.484992742538452,
"learning_rate": 4.934290041379182e-06,
"loss": 0.4385,
"step": 470
},
{
"epoch": 0.5092838196286472,
"grad_norm": 2.0424418449401855,
"learning_rate": 4.92706999284541e-06,
"loss": 0.4935,
"step": 480
},
{
"epoch": 0.519893899204244,
"grad_norm": 2.3754308223724365,
"learning_rate": 4.9194795333737925e-06,
"loss": 0.4548,
"step": 490
},
{
"epoch": 0.5305039787798409,
"grad_norm": 3.0801432132720947,
"learning_rate": 4.911519821393718e-06,
"loss": 0.5486,
"step": 500
},
{
"epoch": 0.5411140583554377,
"grad_norm": 2.2712507247924805,
"learning_rate": 4.9031920716886035e-06,
"loss": 0.5121,
"step": 510
},
{
"epoch": 0.5517241379310345,
"grad_norm": 2.0000548362731934,
"learning_rate": 4.894497555210499e-06,
"loss": 0.4495,
"step": 520
},
{
"epoch": 0.5623342175066313,
"grad_norm": 2.590303897857666,
"learning_rate": 4.8854375988861134e-06,
"loss": 0.5028,
"step": 530
},
{
"epoch": 0.5729442970822282,
"grad_norm": 2.377298355102539,
"learning_rate": 4.87601358541431e-06,
"loss": 0.5193,
"step": 540
},
{
"epoch": 0.583554376657825,
"grad_norm": 2.966008186340332,
"learning_rate": 4.8662269530550825e-06,
"loss": 0.545,
"step": 550
},
{
"epoch": 0.5941644562334217,
"grad_norm": 2.250293254852295,
"learning_rate": 4.856079195410046e-06,
"loss": 0.5219,
"step": 560
},
{
"epoch": 0.6047745358090185,
"grad_norm": 2.437361240386963,
"learning_rate": 4.845571861194501e-06,
"loss": 0.4725,
"step": 570
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.435994863510132,
"learning_rate": 4.834706554001065e-06,
"loss": 0.4232,
"step": 580
},
{
"epoch": 0.6259946949602122,
"grad_norm": 2.705902099609375,
"learning_rate": 4.823484932054937e-06,
"loss": 0.4834,
"step": 590
},
{
"epoch": 0.636604774535809,
"grad_norm": 2.1471517086029053,
"learning_rate": 4.811908707960832e-06,
"loss": 0.5302,
"step": 600
},
{
"epoch": 0.6472148541114059,
"grad_norm": 2.0760443210601807,
"learning_rate": 4.799979648441602e-06,
"loss": 0.494,
"step": 610
},
{
"epoch": 0.6578249336870027,
"grad_norm": 2.334944009780884,
"learning_rate": 4.787699574068611e-06,
"loss": 0.487,
"step": 620
},
{
"epoch": 0.6684350132625995,
"grad_norm": 2.3444855213165283,
"learning_rate": 4.775070358983881e-06,
"loss": 0.4911,
"step": 630
},
{
"epoch": 0.6790450928381963,
"grad_norm": 2.127737045288086,
"learning_rate": 4.7620939306140696e-06,
"loss": 0.4744,
"step": 640
},
{
"epoch": 0.6896551724137931,
"grad_norm": 2.2132568359375,
"learning_rate": 4.748772269376312e-06,
"loss": 0.4789,
"step": 650
},
{
"epoch": 0.7002652519893899,
"grad_norm": 1.9452372789382935,
"learning_rate": 4.735107408375977e-06,
"loss": 0.488,
"step": 660
},
{
"epoch": 0.7108753315649867,
"grad_norm": 2.7268893718719482,
"learning_rate": 4.721101433096381e-06,
"loss": 0.4462,
"step": 670
},
{
"epoch": 0.7214854111405835,
"grad_norm": 2.1095452308654785,
"learning_rate": 4.706756481080511e-06,
"loss": 0.5087,
"step": 680
},
{
"epoch": 0.7320954907161804,
"grad_norm": 2.278555154800415,
"learning_rate": 4.692074741604795e-06,
"loss": 0.5304,
"step": 690
},
{
"epoch": 0.7427055702917772,
"grad_norm": 2.455960512161255,
"learning_rate": 4.677058455344989e-06,
"loss": 0.5177,
"step": 700
},
{
"epoch": 0.753315649867374,
"grad_norm": 2.1136856079101562,
"learning_rate": 4.661709914034209e-06,
"loss": 0.4841,
"step": 710
},
{
"epoch": 0.7639257294429708,
"grad_norm": 2.296614646911621,
"learning_rate": 4.646031460113175e-06,
"loss": 0.4544,
"step": 720
},
{
"epoch": 0.7745358090185677,
"grad_norm": 1.8733782768249512,
"learning_rate": 4.630025486372715e-06,
"loss": 0.4715,
"step": 730
},
{
"epoch": 0.7851458885941645,
"grad_norm": 2.526837110519409,
"learning_rate": 4.613694435588589e-06,
"loss": 0.4824,
"step": 740
},
{
"epoch": 0.7957559681697612,
"grad_norm": 2.2026150226593018,
"learning_rate": 4.597040800148679e-06,
"loss": 0.4852,
"step": 750
},
{
"epoch": 0.8063660477453581,
"grad_norm": 2.214277744293213,
"learning_rate": 4.580067121672607e-06,
"loss": 0.4134,
"step": 760
},
{
"epoch": 0.8169761273209549,
"grad_norm": 2.623305559158325,
"learning_rate": 4.562775990623847e-06,
"loss": 0.4493,
"step": 770
},
{
"epoch": 0.8275862068965517,
"grad_norm": 2.9433794021606445,
"learning_rate": 4.5451700459143735e-06,
"loss": 0.5255,
"step": 780
},
{
"epoch": 0.8381962864721485,
"grad_norm": 2.143739938735962,
"learning_rate": 4.527251974501923e-06,
"loss": 0.4503,
"step": 790
},
{
"epoch": 0.8488063660477454,
"grad_norm": 2.1592986583709717,
"learning_rate": 4.509024510979917e-06,
"loss": 0.4636,
"step": 800
},
{
"epoch": 0.8594164456233422,
"grad_norm": 2.2622759342193604,
"learning_rate": 4.4904904371601176e-06,
"loss": 0.4685,
"step": 810
},
{
"epoch": 0.870026525198939,
"grad_norm": 2.3408522605895996,
"learning_rate": 4.4716525816480816e-06,
"loss": 0.5248,
"step": 820
},
{
"epoch": 0.8806366047745358,
"grad_norm": 2.5351459980010986,
"learning_rate": 4.4525138194114644e-06,
"loss": 0.4747,
"step": 830
},
{
"epoch": 0.8912466843501327,
"grad_norm": 2.4038591384887695,
"learning_rate": 4.4330770713412555e-06,
"loss": 0.4198,
"step": 840
},
{
"epoch": 0.9018567639257294,
"grad_norm": 2.2719292640686035,
"learning_rate": 4.413345303805996e-06,
"loss": 0.4545,
"step": 850
},
{
"epoch": 0.9124668435013262,
"grad_norm": 3.1209301948547363,
"learning_rate": 4.393321528199072e-06,
"loss": 0.5003,
"step": 860
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.414945125579834,
"learning_rate": 4.373008800479118e-06,
"loss": 0.472,
"step": 870
},
{
"epoch": 0.9336870026525199,
"grad_norm": 2.21144437789917,
"learning_rate": 4.352410220703629e-06,
"loss": 0.4661,
"step": 880
},
{
"epoch": 0.9442970822281167,
"grad_norm": 2.210827589035034,
"learning_rate": 4.331528932555844e-06,
"loss": 0.4614,
"step": 890
},
{
"epoch": 0.9549071618037135,
"grad_norm": 2.403038740158081,
"learning_rate": 4.3103681228649626e-06,
"loss": 0.4623,
"step": 900
},
{
"epoch": 0.9655172413793104,
"grad_norm": 2.588114023208618,
"learning_rate": 4.288931021119788e-06,
"loss": 0.4902,
"step": 910
},
{
"epoch": 0.9761273209549072,
"grad_norm": 2.288691997528076,
"learning_rate": 4.267220898975848e-06,
"loss": 0.5047,
"step": 920
},
{
"epoch": 0.986737400530504,
"grad_norm": 2.2487804889678955,
"learning_rate": 4.245241069756092e-06,
"loss": 0.5358,
"step": 930
},
{
"epoch": 0.9973474801061007,
"grad_norm": 2.5266008377075195,
"learning_rate": 4.222994887945219e-06,
"loss": 0.4928,
"step": 940
},
{
"epoch": 1.0074270557029177,
"grad_norm": 2.5962352752685547,
"learning_rate": 4.20048574867773e-06,
"loss": 0.3963,
"step": 950
},
{
"epoch": 1.0180371352785147,
"grad_norm": 2.707613229751587,
"learning_rate": 4.1777170872197725e-06,
"loss": 0.3125,
"step": 960
},
{
"epoch": 1.0286472148541115,
"grad_norm": 2.4237964153289795,
"learning_rate": 4.1546923784448646e-06,
"loss": 0.3457,
"step": 970
},
{
"epoch": 1.0392572944297083,
"grad_norm": 1.6531928777694702,
"learning_rate": 4.1314151363035705e-06,
"loss": 0.3029,
"step": 980
},
{
"epoch": 1.049867374005305,
"grad_norm": 2.1669981479644775,
"learning_rate": 4.1078889132872145e-06,
"loss": 0.3289,
"step": 990
},
{
"epoch": 1.0604774535809018,
"grad_norm": 2.445012092590332,
"learning_rate": 4.084117299885712e-06,
"loss": 0.3234,
"step": 1000
},
{
"epoch": 1.0710875331564986,
"grad_norm": 2.0615527629852295,
"learning_rate": 4.060103924039599e-06,
"loss": 0.3139,
"step": 1010
},
{
"epoch": 1.0816976127320954,
"grad_norm": 1.990400791168213,
"learning_rate": 4.035852450586352e-06,
"loss": 0.3144,
"step": 1020
},
{
"epoch": 1.0923076923076924,
"grad_norm": 2.5510122776031494,
"learning_rate": 4.011366580701073e-06,
"loss": 0.323,
"step": 1030
},
{
"epoch": 1.1029177718832892,
"grad_norm": 2.462083101272583,
"learning_rate": 3.9866500513316274e-06,
"loss": 0.3694,
"step": 1040
},
{
"epoch": 1.113527851458886,
"grad_norm": 2.4385085105895996,
"learning_rate": 3.961706634628323e-06,
"loss": 0.3351,
"step": 1050
},
{
"epoch": 1.1241379310344828,
"grad_norm": 1.7553578615188599,
"learning_rate": 3.936540137368222e-06,
"loss": 0.3459,
"step": 1060
},
{
"epoch": 1.1347480106100796,
"grad_norm": 2.513950824737549,
"learning_rate": 3.911154400374159e-06,
"loss": 0.3186,
"step": 1070
},
{
"epoch": 1.1453580901856764,
"grad_norm": 2.6273515224456787,
"learning_rate": 3.885553297928573e-06,
"loss": 0.3333,
"step": 1080
},
{
"epoch": 1.1559681697612731,
"grad_norm": 2.4155592918395996,
"learning_rate": 3.859740737182222e-06,
"loss": 0.3137,
"step": 1090
},
{
"epoch": 1.16657824933687,
"grad_norm": 2.719611644744873,
"learning_rate": 3.833720657557894e-06,
"loss": 0.3426,
"step": 1100
},
{
"epoch": 1.1771883289124667,
"grad_norm": 2.5729358196258545,
"learning_rate": 3.807497030149181e-06,
"loss": 0.3709,
"step": 1110
},
{
"epoch": 1.1877984084880637,
"grad_norm": 1.9626141786575317,
"learning_rate": 3.7810738571144257e-06,
"loss": 0.329,
"step": 1120
},
{
"epoch": 1.1984084880636605,
"grad_norm": 2.601951837539673,
"learning_rate": 3.7544551710659296e-06,
"loss": 0.305,
"step": 1130
},
{
"epoch": 1.2090185676392573,
"grad_norm": 2.4118540287017822,
"learning_rate": 3.7276450344545024e-06,
"loss": 0.3449,
"step": 1140
},
{
"epoch": 1.219628647214854,
"grad_norm": 2.5080604553222656,
"learning_rate": 3.7006475389494723e-06,
"loss": 0.3403,
"step": 1150
},
{
"epoch": 1.2302387267904509,
"grad_norm": 2.6882951259613037,
"learning_rate": 3.6734668048142273e-06,
"loss": 0.3342,
"step": 1160
},
{
"epoch": 1.2408488063660477,
"grad_norm": 2.3755247592926025,
"learning_rate": 3.646106980277394e-06,
"loss": 0.3589,
"step": 1170
},
{
"epoch": 1.2514588859416444,
"grad_norm": 2.4138166904449463,
"learning_rate": 3.618572240899748e-06,
"loss": 0.3447,
"step": 1180
},
{
"epoch": 1.2620689655172415,
"grad_norm": 2.6930105686187744,
"learning_rate": 3.5908667889369603e-06,
"loss": 0.3787,
"step": 1190
},
{
"epoch": 1.2726790450928382,
"grad_norm": 2.732795476913452,
"learning_rate": 3.5629948526982563e-06,
"loss": 0.3376,
"step": 1200
},
{
"epoch": 1.283289124668435,
"grad_norm": 1.8468087911605835,
"learning_rate": 3.534960685901111e-06,
"loss": 0.3461,
"step": 1210
},
{
"epoch": 1.2938992042440318,
"grad_norm": 2.3408284187316895,
"learning_rate": 3.506768567022062e-06,
"loss": 0.3396,
"step": 1220
},
{
"epoch": 1.3045092838196286,
"grad_norm": 2.7420434951782227,
"learning_rate": 3.478422798643737e-06,
"loss": 0.3364,
"step": 1230
},
{
"epoch": 1.3151193633952254,
"grad_norm": 2.634403705596924,
"learning_rate": 3.4499277067982177e-06,
"loss": 0.3126,
"step": 1240
},
{
"epoch": 1.3257294429708222,
"grad_norm": 2.4217336177825928,
"learning_rate": 3.421287640306809e-06,
"loss": 0.3092,
"step": 1250
},
{
"epoch": 1.3363395225464192,
"grad_norm": 1.7107937335968018,
"learning_rate": 3.3925069701163406e-06,
"loss": 0.3374,
"step": 1260
},
{
"epoch": 1.346949602122016,
"grad_norm": 2.1515822410583496,
"learning_rate": 3.363590088632085e-06,
"loss": 0.3436,
"step": 1270
},
{
"epoch": 1.3575596816976128,
"grad_norm": 2.0105717182159424,
"learning_rate": 3.334541409047408e-06,
"loss": 0.3283,
"step": 1280
},
{
"epoch": 1.3681697612732096,
"grad_norm": 1.8952791690826416,
"learning_rate": 3.3053653646702422e-06,
"loss": 0.358,
"step": 1290
},
{
"epoch": 1.3787798408488063,
"grad_norm": 1.8639928102493286,
"learning_rate": 3.276066408246487e-06,
"loss": 0.3084,
"step": 1300
},
{
"epoch": 1.3893899204244031,
"grad_norm": 2.563251256942749,
"learning_rate": 3.2466490112804484e-06,
"loss": 0.3508,
"step": 1310
},
{
"epoch": 1.4,
"grad_norm": 2.214616060256958,
"learning_rate": 3.217117663352417e-06,
"loss": 0.3215,
"step": 1320
},
{
"epoch": 1.410610079575597,
"grad_norm": 1.793468952178955,
"learning_rate": 3.187476871433478e-06,
"loss": 0.3193,
"step": 1330
},
{
"epoch": 1.4212201591511937,
"grad_norm": 2.204789638519287,
"learning_rate": 3.1577311591976766e-06,
"loss": 0.3019,
"step": 1340
},
{
"epoch": 1.4318302387267905,
"grad_norm": 2.307568311691284,
"learning_rate": 3.1278850663316307e-06,
"loss": 0.3099,
"step": 1350
},
{
"epoch": 1.4424403183023873,
"grad_norm": 2.485848903656006,
"learning_rate": 3.0979431478416987e-06,
"loss": 0.3085,
"step": 1360
},
{
"epoch": 1.453050397877984,
"grad_norm": 1.953053593635559,
"learning_rate": 3.067909973358811e-06,
"loss": 0.3211,
"step": 1370
},
{
"epoch": 1.4636604774535809,
"grad_norm": 2.2350101470947266,
"learning_rate": 3.0377901264410673e-06,
"loss": 0.3329,
"step": 1380
},
{
"epoch": 1.4742705570291776,
"grad_norm": 2.542452335357666,
"learning_rate": 3.0075882038742133e-06,
"loss": 0.3376,
"step": 1390
},
{
"epoch": 1.4848806366047747,
"grad_norm": 2.3203530311584473,
"learning_rate": 2.9773088149700923e-06,
"loss": 0.2896,
"step": 1400
},
{
"epoch": 1.4954907161803712,
"grad_norm": 1.9708584547042847,
"learning_rate": 2.9469565808631888e-06,
"loss": 0.299,
"step": 1410
},
{
"epoch": 1.5061007957559682,
"grad_norm": 2.63698148727417,
"learning_rate": 2.9165361338053683e-06,
"loss": 0.3484,
"step": 1420
},
{
"epoch": 1.516710875331565,
"grad_norm": 2.091648578643799,
"learning_rate": 2.886052116458918e-06,
"loss": 0.3316,
"step": 1430
},
{
"epoch": 1.5273209549071618,
"grad_norm": 1.955355167388916,
"learning_rate": 2.8555091811880004e-06,
"loss": 0.328,
"step": 1440
},
{
"epoch": 1.5379310344827586,
"grad_norm": 1.6724951267242432,
"learning_rate": 2.8249119893486252e-06,
"loss": 0.3215,
"step": 1450
},
{
"epoch": 1.5485411140583554,
"grad_norm": 2.1872570514678955,
"learning_rate": 2.7942652105772516e-06,
"loss": 0.3118,
"step": 1460
},
{
"epoch": 1.5591511936339524,
"grad_norm": 3.0710208415985107,
"learning_rate": 2.7635735220781214e-06,
"loss": 0.2973,
"step": 1470
},
{
"epoch": 1.569761273209549,
"grad_norm": 2.357663631439209,
"learning_rate": 2.7328416079094412e-06,
"loss": 0.3423,
"step": 1480
},
{
"epoch": 1.580371352785146,
"grad_norm": 2.2559144496917725,
"learning_rate": 2.7020741582685217e-06,
"loss": 0.3211,
"step": 1490
},
{
"epoch": 1.5909814323607427,
"grad_norm": 2.0730817317962646,
"learning_rate": 2.6712758687759706e-06,
"loss": 0.2733,
"step": 1500
},
{
"epoch": 1.6015915119363395,
"grad_norm": 2.6119141578674316,
"learning_rate": 2.6404514397590657e-06,
"loss": 0.338,
"step": 1510
},
{
"epoch": 1.6122015915119363,
"grad_norm": 2.315875768661499,
"learning_rate": 2.6096055755344113e-06,
"loss": 0.3124,
"step": 1520
},
{
"epoch": 1.622811671087533,
"grad_norm": 2.2880892753601074,
"learning_rate": 2.578742983689973e-06,
"loss": 0.3538,
"step": 1530
},
{
"epoch": 1.6334217506631301,
"grad_norm": 2.2615041732788086,
"learning_rate": 2.547868374366631e-06,
"loss": 0.3353,
"step": 1540
},
{
"epoch": 1.6440318302387267,
"grad_norm": 1.9062315225601196,
"learning_rate": 2.5169864595393295e-06,
"loss": 0.302,
"step": 1550
},
{
"epoch": 1.6546419098143237,
"grad_norm": 2.7016942501068115,
"learning_rate": 2.4861019522979537e-06,
"loss": 0.3124,
"step": 1560
},
{
"epoch": 1.6652519893899205,
"grad_norm": 2.4618184566497803,
"learning_rate": 2.455219566128034e-06,
"loss": 0.3497,
"step": 1570
},
{
"epoch": 1.6758620689655173,
"grad_norm": 2.8924951553344727,
"learning_rate": 2.4243440141913905e-06,
"loss": 0.3233,
"step": 1580
},
{
"epoch": 1.686472148541114,
"grad_norm": 2.32255482673645,
"learning_rate": 2.393480008606825e-06,
"loss": 0.3067,
"step": 1590
},
{
"epoch": 1.6970822281167108,
"grad_norm": 1.8984359502792358,
"learning_rate": 2.3626322597309774e-06,
"loss": 0.2893,
"step": 1600
},
{
"epoch": 1.7076923076923078,
"grad_norm": 1.8360289335250854,
"learning_rate": 2.331805475439445e-06,
"loss": 0.2825,
"step": 1610
},
{
"epoch": 1.7183023872679044,
"grad_norm": 2.331998109817505,
"learning_rate": 2.3010043604082824e-06,
"loss": 0.3379,
"step": 1620
},
{
"epoch": 1.7289124668435014,
"grad_norm": 2.3304574489593506,
"learning_rate": 2.2702336153959925e-06,
"loss": 0.301,
"step": 1630
},
{
"epoch": 1.739522546419098,
"grad_norm": 2.534090518951416,
"learning_rate": 2.2394979365261134e-06,
"loss": 0.404,
"step": 1640
},
{
"epoch": 1.750132625994695,
"grad_norm": 2.273122549057007,
"learning_rate": 2.208802014570507e-06,
"loss": 0.3242,
"step": 1650
},
{
"epoch": 1.7607427055702918,
"grad_norm": 1.8859643936157227,
"learning_rate": 2.1781505342334775e-06,
"loss": 0.3152,
"step": 1660
},
{
"epoch": 1.7713527851458886,
"grad_norm": 2.567715644836426,
"learning_rate": 2.147548173436805e-06,
"loss": 0.3302,
"step": 1670
},
{
"epoch": 1.7819628647214856,
"grad_norm": 2.7930519580841064,
"learning_rate": 2.116999602605814e-06,
"loss": 0.293,
"step": 1680
},
{
"epoch": 1.7925729442970821,
"grad_norm": 2.646296262741089,
"learning_rate": 2.086509483956594e-06,
"loss": 0.2683,
"step": 1690
},
{
"epoch": 1.8031830238726791,
"grad_norm": 2.3010053634643555,
"learning_rate": 2.056082470784469e-06,
"loss": 0.313,
"step": 1700
},
{
"epoch": 1.8137931034482757,
"grad_norm": 2.3864669799804688,
"learning_rate": 2.0257232067538213e-06,
"loss": 0.262,
"step": 1710
},
{
"epoch": 1.8244031830238727,
"grad_norm": 2.63028883934021,
"learning_rate": 1.9954363251894007e-06,
"loss": 0.3457,
"step": 1720
},
{
"epoch": 1.8350132625994695,
"grad_norm": 2.0011484622955322,
"learning_rate": 1.9652264483691933e-06,
"loss": 0.2739,
"step": 1730
},
{
"epoch": 1.8456233421750663,
"grad_norm": 2.6818690299987793,
"learning_rate": 1.9350981868189944e-06,
"loss": 0.3109,
"step": 1740
},
{
"epoch": 1.856233421750663,
"grad_norm": 2.6978225708007812,
"learning_rate": 1.9050561386087618e-06,
"loss": 0.3269,
"step": 1750
},
{
"epoch": 1.8668435013262599,
"grad_norm": 2.578031301498413,
"learning_rate": 1.8751048886508711e-06,
"loss": 0.3617,
"step": 1760
},
{
"epoch": 1.8774535809018569,
"grad_norm": 2.5525052547454834,
"learning_rate": 1.8452490080003888e-06,
"loss": 0.3228,
"step": 1770
},
{
"epoch": 1.8880636604774534,
"grad_norm": 2.1095635890960693,
"learning_rate": 1.8154930531574521e-06,
"loss": 0.2857,
"step": 1780
},
{
"epoch": 1.8986737400530505,
"grad_norm": 2.3965845108032227,
"learning_rate": 1.785841565371868e-06,
"loss": 0.3622,
"step": 1790
},
{
"epoch": 1.9092838196286472,
"grad_norm": 2.293715238571167,
"learning_rate": 1.7562990699500482e-06,
"loss": 0.3031,
"step": 1800
},
{
"epoch": 1.919893899204244,
"grad_norm": 2.026015281677246,
"learning_rate": 1.7268700755643708e-06,
"loss": 0.3019,
"step": 1810
},
{
"epoch": 1.9305039787798408,
"grad_norm": 1.7175791263580322,
"learning_rate": 1.6975590735650812e-06,
"loss": 0.3047,
"step": 1820
},
{
"epoch": 1.9411140583554376,
"grad_norm": 2.0024490356445312,
"learning_rate": 1.668370537294841e-06,
"loss": 0.3048,
"step": 1830
},
{
"epoch": 1.9517241379310346,
"grad_norm": 2.8226239681243896,
"learning_rate": 1.6393089214060204e-06,
"loss": 0.3205,
"step": 1840
},
{
"epoch": 1.9623342175066312,
"grad_norm": 1.9452221393585205,
"learning_rate": 1.6103786611808414e-06,
"loss": 0.321,
"step": 1850
},
{
"epoch": 1.9729442970822282,
"grad_norm": 2.304274320602417,
"learning_rate": 1.5815841718544884e-06,
"loss": 0.2954,
"step": 1860
},
{
"epoch": 1.983554376657825,
"grad_norm": 2.502206802368164,
"learning_rate": 1.5529298479412636e-06,
"loss": 0.2945,
"step": 1870
},
{
"epoch": 1.9941644562334218,
"grad_norm": 2.5796189308166504,
"learning_rate": 1.524420062563912e-06,
"loss": 0.3291,
"step": 1880
},
{
"epoch": 2.004244031830239,
"grad_norm": 1.9198871850967407,
"learning_rate": 1.4960591667862163e-06,
"loss": 0.234,
"step": 1890
},
{
"epoch": 2.0148541114058354,
"grad_norm": 1.7082706689834595,
"learning_rate": 1.4678514889489464e-06,
"loss": 0.1943,
"step": 1900
},
{
"epoch": 2.0254641909814324,
"grad_norm": 1.8571817874908447,
"learning_rate": 1.4398013340092864e-06,
"loss": 0.1911,
"step": 1910
},
{
"epoch": 2.0360742705570294,
"grad_norm": 2.454561233520508,
"learning_rate": 1.4119129828838275e-06,
"loss": 0.1895,
"step": 1920
},
{
"epoch": 2.046684350132626,
"grad_norm": 2.3714683055877686,
"learning_rate": 1.384190691795226e-06,
"loss": 0.2177,
"step": 1930
},
{
"epoch": 2.057294429708223,
"grad_norm": 2.1356313228607178,
"learning_rate": 1.3566386916226373e-06,
"loss": 0.2252,
"step": 1940
},
{
"epoch": 2.0679045092838195,
"grad_norm": 2.446906089782715,
"learning_rate": 1.3292611872560134e-06,
"loss": 0.1982,
"step": 1950
},
{
"epoch": 2.0785145888594165,
"grad_norm": 2.1040875911712646,
"learning_rate": 1.302062356954365e-06,
"loss": 0.1696,
"step": 1960
},
{
"epoch": 2.089124668435013,
"grad_norm": 2.220742702484131,
"learning_rate": 1.2750463517080922e-06,
"loss": 0.1936,
"step": 1970
},
{
"epoch": 2.09973474801061,
"grad_norm": 2.7784054279327393,
"learning_rate": 1.2482172946054753e-06,
"loss": 0.1604,
"step": 1980
},
{
"epoch": 2.110344827586207,
"grad_norm": 2.0539498329162598,
"learning_rate": 1.2215792802034187e-06,
"loss": 0.2069,
"step": 1990
},
{
"epoch": 2.1209549071618037,
"grad_norm": 1.8337138891220093,
"learning_rate": 1.1951363739025618e-06,
"loss": 0.1964,
"step": 2000
},
{
"epoch": 2.1315649867374007,
"grad_norm": 1.7631642818450928,
"learning_rate": 1.168892611326827e-06,
"loss": 0.1871,
"step": 2010
},
{
"epoch": 2.1421750663129973,
"grad_norm": 2.386589527130127,
"learning_rate": 1.1428519977075136e-06,
"loss": 0.2595,
"step": 2020
},
{
"epoch": 2.1527851458885943,
"grad_norm": 2.553382635116577,
"learning_rate": 1.1170185072720434e-06,
"loss": 0.185,
"step": 2030
},
{
"epoch": 2.163395225464191,
"grad_norm": 2.870973825454712,
"learning_rate": 1.091396082637419e-06,
"loss": 0.228,
"step": 2040
},
{
"epoch": 2.174005305039788,
"grad_norm": 2.643745183944702,
"learning_rate": 1.065988634208516e-06,
"loss": 0.2098,
"step": 2050
},
{
"epoch": 2.184615384615385,
"grad_norm": 2.369596481323242,
"learning_rate": 1.0408000395812961e-06,
"loss": 0.1982,
"step": 2060
},
{
"epoch": 2.1952254641909814,
"grad_norm": 2.1093883514404297,
"learning_rate": 1.0158341429510194e-06,
"loss": 0.1844,
"step": 2070
},
{
"epoch": 2.2058355437665784,
"grad_norm": 1.951935052871704,
"learning_rate": 9.910947545255523e-07,
"loss": 0.1654,
"step": 2080
},
{
"epoch": 2.216445623342175,
"grad_norm": 2.230781078338623,
"learning_rate": 9.665856499438744e-07,
"loss": 0.2037,
"step": 2090
},
{
"epoch": 2.227055702917772,
"grad_norm": 2.6240904331207275,
"learning_rate": 9.423105696998491e-07,
"loss": 0.2087,
"step": 2100
},
{
"epoch": 2.2376657824933686,
"grad_norm": 1.712857723236084,
"learning_rate": 9.182732185713633e-07,
"loss": 0.2105,
"step": 2110
},
{
"epoch": 2.2482758620689656,
"grad_norm": 2.036086082458496,
"learning_rate": 8.94477265054918e-07,
"loss": 0.2186,
"step": 2120
},
{
"epoch": 2.2588859416445626,
"grad_norm": 2.3545398712158203,
"learning_rate": 8.709263408057522e-07,
"loss": 0.1879,
"step": 2130
},
{
"epoch": 2.269496021220159,
"grad_norm": 1.9098992347717285,
"learning_rate": 8.476240400835972e-07,
"loss": 0.2177,
"step": 2140
},
{
"epoch": 2.280106100795756,
"grad_norm": 2.107959270477295,
"learning_rate": 8.245739192041311e-07,
"loss": 0.165,
"step": 2150
},
{
"epoch": 2.2907161803713527,
"grad_norm": 2.550719976425171,
"learning_rate": 8.017794959962225e-07,
"loss": 0.2018,
"step": 2160
},
{
"epoch": 2.3013262599469497,
"grad_norm": 2.354701280593872,
"learning_rate": 7.792442492650587e-07,
"loss": 0.1955,
"step": 2170
},
{
"epoch": 2.3119363395225463,
"grad_norm": 2.3547091484069824,
"learning_rate": 7.569716182612177e-07,
"loss": 0.1976,
"step": 2180
},
{
"epoch": 2.3225464190981433,
"grad_norm": 1.4048022031784058,
"learning_rate": 7.349650021557839e-07,
"loss": 0.1685,
"step": 2190
},
{
"epoch": 2.33315649867374,
"grad_norm": 2.568500280380249,
"learning_rate": 7.132277595215773e-07,
"loss": 0.1519,
"step": 2200
},
{
"epoch": 2.343766578249337,
"grad_norm": 2.205993413925171,
"learning_rate": 6.917632078205805e-07,
"loss": 0.1573,
"step": 2210
},
{
"epoch": 2.3543766578249334,
"grad_norm": 2.067505121231079,
"learning_rate": 6.705746228976387e-07,
"loss": 0.184,
"step": 2220
},
{
"epoch": 2.3649867374005304,
"grad_norm": 2.4360201358795166,
"learning_rate": 6.496652384805125e-07,
"loss": 0.1968,
"step": 2230
},
{
"epoch": 2.3755968169761275,
"grad_norm": 2.042179584503174,
"learning_rate": 6.290382456863584e-07,
"loss": 0.1846,
"step": 2240
},
{
"epoch": 2.386206896551724,
"grad_norm": 2.849271535873413,
"learning_rate": 6.086967925347075e-07,
"loss": 0.1858,
"step": 2250
},
{
"epoch": 2.396816976127321,
"grad_norm": 2.0765082836151123,
"learning_rate": 5.88643983467033e-07,
"loss": 0.1837,
"step": 2260
},
{
"epoch": 2.4074270557029176,
"grad_norm": 1.9958840608596802,
"learning_rate": 5.688828788729547e-07,
"loss": 0.1659,
"step": 2270
},
{
"epoch": 2.4180371352785146,
"grad_norm": 2.253602981567383,
"learning_rate": 5.494164946231747e-07,
"loss": 0.2095,
"step": 2280
},
{
"epoch": 2.428647214854111,
"grad_norm": 1.5552992820739746,
"learning_rate": 5.302478016092075e-07,
"loss": 0.1862,
"step": 2290
},
{
"epoch": 2.439257294429708,
"grad_norm": 2.721445322036743,
"learning_rate": 5.113797252899728e-07,
"loss": 0.2085,
"step": 2300
},
{
"epoch": 2.449867374005305,
"grad_norm": 2.3488707542419434,
"learning_rate": 4.928151452453184e-07,
"loss": 0.1914,
"step": 2310
},
{
"epoch": 2.4604774535809018,
"grad_norm": 2.49068021774292,
"learning_rate": 4.745568947365542e-07,
"loss": 0.1718,
"step": 2320
},
{
"epoch": 2.4710875331564988,
"grad_norm": 1.4638549089431763,
"learning_rate": 4.5660776027404654e-07,
"loss": 0.1669,
"step": 2330
},
{
"epoch": 2.4816976127320953,
"grad_norm": 2.288776159286499,
"learning_rate": 4.389704811919507e-07,
"loss": 0.1731,
"step": 2340
},
{
"epoch": 2.4923076923076923,
"grad_norm": 2.385162115097046,
"learning_rate": 4.216477492301455e-07,
"loss": 0.1802,
"step": 2350
},
{
"epoch": 2.502917771883289,
"grad_norm": 2.0100815296173096,
"learning_rate": 4.0464220812342526e-07,
"loss": 0.2232,
"step": 2360
},
{
"epoch": 2.513527851458886,
"grad_norm": 1.8439091444015503,
"learning_rate": 3.87956453198027e-07,
"loss": 0.1432,
"step": 2370
},
{
"epoch": 2.524137931034483,
"grad_norm": 2.3093338012695312,
"learning_rate": 3.715930309755389e-07,
"loss": 0.1834,
"step": 2380
},
{
"epoch": 2.5347480106100795,
"grad_norm": 2.3250088691711426,
"learning_rate": 3.5555443878425635e-07,
"loss": 0.2123,
"step": 2390
},
{
"epoch": 2.5453580901856765,
"grad_norm": 1.8003133535385132,
"learning_rate": 3.398431243780531e-07,
"loss": 0.2034,
"step": 2400
},
{
"epoch": 2.555968169761273,
"grad_norm": 2.8948135375976562,
"learning_rate": 3.2446148556281117e-07,
"loss": 0.1778,
"step": 2410
},
{
"epoch": 2.56657824933687,
"grad_norm": 1.8556360006332397,
"learning_rate": 3.0941186983047543e-07,
"loss": 0.1892,
"step": 2420
},
{
"epoch": 2.5771883289124666,
"grad_norm": 2.771932363510132,
"learning_rate": 2.9469657400078925e-07,
"loss": 0.1935,
"step": 2430
},
{
"epoch": 2.5877984084880636,
"grad_norm": 2.5325114727020264,
"learning_rate": 2.8031784387076186e-07,
"loss": 0.1858,
"step": 2440
},
{
"epoch": 2.5984084880636606,
"grad_norm": 2.4069302082061768,
"learning_rate": 2.6627787387191934e-07,
"loss": 0.2118,
"step": 2450
},
{
"epoch": 2.609018567639257,
"grad_norm": 2.053656816482544,
"learning_rate": 2.5257880673540376e-07,
"loss": 0.1929,
"step": 2460
},
{
"epoch": 2.6196286472148542,
"grad_norm": 1.8820626735687256,
"learning_rate": 2.392227331649527e-07,
"loss": 0.1745,
"step": 2470
},
{
"epoch": 2.630238726790451,
"grad_norm": 1.9418586492538452,
"learning_rate": 2.2621169151782417e-07,
"loss": 0.1823,
"step": 2480
},
{
"epoch": 2.640848806366048,
"grad_norm": 2.519037961959839,
"learning_rate": 2.1354766749371093e-07,
"loss": 0.2037,
"step": 2490
},
{
"epoch": 2.6514588859416444,
"grad_norm": 2.010211944580078,
"learning_rate": 2.0123259383169031e-07,
"loss": 0.2196,
"step": 2500
},
{
"epoch": 2.6620689655172414,
"grad_norm": 1.9838532209396362,
"learning_rate": 1.8926835001525257e-07,
"loss": 0.1848,
"step": 2510
},
{
"epoch": 2.6726790450928384,
"grad_norm": 2.3488149642944336,
"learning_rate": 1.776567619854655e-07,
"loss": 0.1823,
"step": 2520
},
{
"epoch": 2.683289124668435,
"grad_norm": 2.839651584625244,
"learning_rate": 1.6639960186230293e-07,
"loss": 0.2039,
"step": 2530
},
{
"epoch": 2.693899204244032,
"grad_norm": 2.050480842590332,
"learning_rate": 1.5549858767419018e-07,
"loss": 0.1796,
"step": 2540
},
{
"epoch": 2.7045092838196285,
"grad_norm": 1.2738044261932373,
"learning_rate": 1.449553830958053e-07,
"loss": 0.1893,
"step": 2550
},
{
"epoch": 2.7151193633952255,
"grad_norm": 1.8912787437438965,
"learning_rate": 1.347715971941746e-07,
"loss": 0.1947,
"step": 2560
},
{
"epoch": 2.725729442970822,
"grad_norm": 1.8385730981826782,
"learning_rate": 1.2494878418310234e-07,
"loss": 0.1744,
"step": 2570
},
{
"epoch": 2.736339522546419,
"grad_norm": 2.1071712970733643,
"learning_rate": 1.1548844318597208e-07,
"loss": 0.2351,
"step": 2580
},
{
"epoch": 2.746949602122016,
"grad_norm": 2.054392099380493,
"learning_rate": 1.0639201800695553e-07,
"loss": 0.2245,
"step": 2590
},
{
"epoch": 2.7575596816976127,
"grad_norm": 1.656562328338623,
"learning_rate": 9.76608969106646e-08,
"loss": 0.2014,
"step": 2600
},
{
"epoch": 2.7681697612732097,
"grad_norm": 2.6887638568878174,
"learning_rate": 8.929641241027937e-08,
"loss": 0.1824,
"step": 2610
},
{
"epoch": 2.7787798408488062,
"grad_norm": 2.4606659412384033,
"learning_rate": 8.129984106418354e-08,
"loss": 0.1706,
"step": 2620
},
{
"epoch": 2.7893899204244033,
"grad_norm": 2.5548455715179443,
"learning_rate": 7.3672403281142e-08,
"loss": 0.2195,
"step": 2630
},
{
"epoch": 2.8,
"grad_norm": 1.7952167987823486,
"learning_rate": 6.641526313404534e-08,
"loss": 0.1748,
"step": 2640
},
{
"epoch": 2.810610079575597,
"grad_norm": 2.376830816268921,
"learning_rate": 5.952952818225416e-08,
"loss": 0.2061,
"step": 2650
},
{
"epoch": 2.821220159151194,
"grad_norm": 1.7183632850646973,
"learning_rate": 5.3016249302565436e-08,
"loss": 0.1742,
"step": 2660
},
{
"epoch": 2.8318302387267904,
"grad_norm": 2.11011004447937,
"learning_rate": 4.6876420528833014e-08,
"loss": 0.2082,
"step": 2670
},
{
"epoch": 2.8424403183023874,
"grad_norm": 1.8799868822097778,
"learning_rate": 4.111097890026089e-08,
"loss": 0.1805,
"step": 2680
},
{
"epoch": 2.853050397877984,
"grad_norm": 2.5171291828155518,
"learning_rate": 3.5720804318395976e-08,
"loss": 0.2058,
"step": 2690
},
{
"epoch": 2.863660477453581,
"grad_norm": 2.142263650894165,
"learning_rate": 3.0706719412839926e-08,
"loss": 0.2027,
"step": 2700
},
{
"epoch": 2.8742705570291776,
"grad_norm": 2.2124040126800537,
"learning_rate": 2.6069489415703197e-08,
"loss": 0.1941,
"step": 2710
},
{
"epoch": 2.8848806366047746,
"grad_norm": 2.033259153366089,
"learning_rate": 2.18098220448168e-08,
"loss": 0.2029,
"step": 2720
},
{
"epoch": 2.8954907161803716,
"grad_norm": 2.416912794113159,
"learning_rate": 1.7928367395725066e-08,
"loss": 0.2062,
"step": 2730
},
{
"epoch": 2.906100795755968,
"grad_norm": 2.193751096725464,
"learning_rate": 1.442571784246699e-08,
"loss": 0.1873,
"step": 2740
},
{
"epoch": 2.916710875331565,
"grad_norm": 1.5729731321334839,
"learning_rate": 1.1302407947173522e-08,
"loss": 0.1653,
"step": 2750
},
{
"epoch": 2.9273209549071617,
"grad_norm": 1.7562044858932495,
"learning_rate": 8.558914378481996e-09,
"loss": 0.1743,
"step": 2760
},
{
"epoch": 2.9379310344827587,
"grad_norm": 2.183967351913452,
"learning_rate": 6.195655838790726e-09,
"loss": 0.1821,
"step": 2770
},
{
"epoch": 2.9485411140583553,
"grad_norm": 1.9312433004379272,
"learning_rate": 4.212993000356491e-09,
"loss": 0.1954,
"step": 2780
},
{
"epoch": 2.9591511936339523,
"grad_norm": 2.2055087089538574,
"learning_rate": 2.611228450250802e-09,
"loss": 0.1925,
"step": 2790
},
{
"epoch": 2.9697612732095493,
"grad_norm": 1.6606404781341553,
"learning_rate": 1.3906066441798927e-09,
"loss": 0.1805,
"step": 2800
},
{
"epoch": 2.980371352785146,
"grad_norm": 2.594404458999634,
"learning_rate": 5.513138691767839e-10,
"loss": 0.2084,
"step": 2810
},
{
"epoch": 2.9909814323607424,
"grad_norm": 2.007861375808716,
"learning_rate": 9.347821517069477e-11,
"loss": 0.2115,
"step": 2820
},
{
"epoch": 2.9973474801061006,
"step": 2826,
"total_flos": 1.0915292825780224e+17,
"train_loss": 0.34044326600333263,
"train_runtime": 16671.2674,
"train_samples_per_second": 2.713,
"train_steps_per_second": 0.17
}
],
"logging_steps": 10,
"max_steps": 2826,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 943,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0915292825780224e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}