our_gemini_fail / trainer_state.json
happyhappy-jun's picture
Upload folder using huggingface_hub
5ecb6f8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 4835,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010346611484738748,
"grad_norm": 6.139026641845703,
"learning_rate": 1.8595041322314052e-07,
"loss": 0.9459,
"step": 10
},
{
"epoch": 0.020693222969477496,
"grad_norm": 5.49014139175415,
"learning_rate": 3.925619834710744e-07,
"loss": 0.9235,
"step": 20
},
{
"epoch": 0.031039834454216243,
"grad_norm": 3.730520009994507,
"learning_rate": 5.991735537190084e-07,
"loss": 0.8539,
"step": 30
},
{
"epoch": 0.04138644593895499,
"grad_norm": 2.3603694438934326,
"learning_rate": 8.057851239669421e-07,
"loss": 0.7718,
"step": 40
},
{
"epoch": 0.05173305742369374,
"grad_norm": 2.0935893058776855,
"learning_rate": 1.012396694214876e-06,
"loss": 0.68,
"step": 50
},
{
"epoch": 0.062079668908432487,
"grad_norm": 1.742537021636963,
"learning_rate": 1.21900826446281e-06,
"loss": 0.6411,
"step": 60
},
{
"epoch": 0.07242628039317124,
"grad_norm": 1.5265495777130127,
"learning_rate": 1.425619834710744e-06,
"loss": 0.598,
"step": 70
},
{
"epoch": 0.08277289187790998,
"grad_norm": 1.4529454708099365,
"learning_rate": 1.632231404958678e-06,
"loss": 0.5599,
"step": 80
},
{
"epoch": 0.09311950336264874,
"grad_norm": 1.4188324213027954,
"learning_rate": 1.8388429752066117e-06,
"loss": 0.5307,
"step": 90
},
{
"epoch": 0.10346611484738748,
"grad_norm": 1.5109107494354248,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.525,
"step": 100
},
{
"epoch": 0.11381272633212623,
"grad_norm": 1.3689301013946533,
"learning_rate": 2.2520661157024796e-06,
"loss": 0.5239,
"step": 110
},
{
"epoch": 0.12415933781686497,
"grad_norm": 1.3892041444778442,
"learning_rate": 2.4586776859504134e-06,
"loss": 0.5,
"step": 120
},
{
"epoch": 0.13450594930160373,
"grad_norm": 1.3815661668777466,
"learning_rate": 2.6652892561983473e-06,
"loss": 0.4901,
"step": 130
},
{
"epoch": 0.14485256078634248,
"grad_norm": 1.5329068899154663,
"learning_rate": 2.871900826446281e-06,
"loss": 0.4787,
"step": 140
},
{
"epoch": 0.1551991722710812,
"grad_norm": 1.2414484024047852,
"learning_rate": 3.078512396694215e-06,
"loss": 0.4835,
"step": 150
},
{
"epoch": 0.16554578375581996,
"grad_norm": 1.4333281517028809,
"learning_rate": 3.2851239669421493e-06,
"loss": 0.4706,
"step": 160
},
{
"epoch": 0.17589239524055872,
"grad_norm": 1.2515960931777954,
"learning_rate": 3.491735537190083e-06,
"loss": 0.4713,
"step": 170
},
{
"epoch": 0.18623900672529747,
"grad_norm": 1.5042508840560913,
"learning_rate": 3.6983471074380165e-06,
"loss": 0.4646,
"step": 180
},
{
"epoch": 0.1965856182100362,
"grad_norm": 1.4549144506454468,
"learning_rate": 3.904958677685951e-06,
"loss": 0.4496,
"step": 190
},
{
"epoch": 0.20693222969477496,
"grad_norm": 1.3692591190338135,
"learning_rate": 4.111570247933885e-06,
"loss": 0.4545,
"step": 200
},
{
"epoch": 0.2172788411795137,
"grad_norm": 1.3498661518096924,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.451,
"step": 210
},
{
"epoch": 0.22762545266425246,
"grad_norm": 1.5415807962417603,
"learning_rate": 4.524793388429753e-06,
"loss": 0.4517,
"step": 220
},
{
"epoch": 0.23797206414899122,
"grad_norm": 1.339154601097107,
"learning_rate": 4.731404958677686e-06,
"loss": 0.4397,
"step": 230
},
{
"epoch": 0.24831867563372995,
"grad_norm": 1.3699935674667358,
"learning_rate": 4.9380165289256205e-06,
"loss": 0.4405,
"step": 240
},
{
"epoch": 0.2586652871184687,
"grad_norm": 1.2297754287719727,
"learning_rate": 5.144628099173554e-06,
"loss": 0.4355,
"step": 250
},
{
"epoch": 0.26901189860320746,
"grad_norm": 1.222337007522583,
"learning_rate": 5.351239669421488e-06,
"loss": 0.4435,
"step": 260
},
{
"epoch": 0.2793585100879462,
"grad_norm": 1.474328637123108,
"learning_rate": 5.557851239669422e-06,
"loss": 0.4285,
"step": 270
},
{
"epoch": 0.28970512157268496,
"grad_norm": 1.3817209005355835,
"learning_rate": 5.764462809917356e-06,
"loss": 0.4343,
"step": 280
},
{
"epoch": 0.3000517330574237,
"grad_norm": 1.2763681411743164,
"learning_rate": 5.971074380165289e-06,
"loss": 0.435,
"step": 290
},
{
"epoch": 0.3103983445421624,
"grad_norm": 1.4916797876358032,
"learning_rate": 6.1776859504132236e-06,
"loss": 0.4334,
"step": 300
},
{
"epoch": 0.3207449560269012,
"grad_norm": 1.3628357648849487,
"learning_rate": 6.384297520661158e-06,
"loss": 0.4273,
"step": 310
},
{
"epoch": 0.33109156751163993,
"grad_norm": 1.3091474771499634,
"learning_rate": 6.590909090909091e-06,
"loss": 0.4245,
"step": 320
},
{
"epoch": 0.3414381789963787,
"grad_norm": 1.4376888275146484,
"learning_rate": 6.7975206611570255e-06,
"loss": 0.428,
"step": 330
},
{
"epoch": 0.35178479048111744,
"grad_norm": 1.4433401823043823,
"learning_rate": 7.00413223140496e-06,
"loss": 0.4306,
"step": 340
},
{
"epoch": 0.3621314019658562,
"grad_norm": 1.3882417678833008,
"learning_rate": 7.210743801652893e-06,
"loss": 0.4292,
"step": 350
},
{
"epoch": 0.37247801345059495,
"grad_norm": 1.3860538005828857,
"learning_rate": 7.4173553719008275e-06,
"loss": 0.4292,
"step": 360
},
{
"epoch": 0.3828246249353337,
"grad_norm": 1.1911530494689941,
"learning_rate": 7.623966942148761e-06,
"loss": 0.4303,
"step": 370
},
{
"epoch": 0.3931712364200724,
"grad_norm": 1.315438151359558,
"learning_rate": 7.830578512396695e-06,
"loss": 0.4127,
"step": 380
},
{
"epoch": 0.40351784790481116,
"grad_norm": 1.3872874975204468,
"learning_rate": 8.03719008264463e-06,
"loss": 0.4305,
"step": 390
},
{
"epoch": 0.4138644593895499,
"grad_norm": 1.2904235124588013,
"learning_rate": 8.243801652892562e-06,
"loss": 0.4189,
"step": 400
},
{
"epoch": 0.42421107087428866,
"grad_norm": 1.256445050239563,
"learning_rate": 8.450413223140496e-06,
"loss": 0.4201,
"step": 410
},
{
"epoch": 0.4345576823590274,
"grad_norm": 1.3195158243179321,
"learning_rate": 8.65702479338843e-06,
"loss": 0.4214,
"step": 420
},
{
"epoch": 0.4449042938437662,
"grad_norm": 1.4306244850158691,
"learning_rate": 8.863636363636365e-06,
"loss": 0.4198,
"step": 430
},
{
"epoch": 0.45525090532850493,
"grad_norm": 1.2186839580535889,
"learning_rate": 9.070247933884299e-06,
"loss": 0.4163,
"step": 440
},
{
"epoch": 0.4655975168132437,
"grad_norm": 1.2255494594573975,
"learning_rate": 9.276859504132232e-06,
"loss": 0.4103,
"step": 450
},
{
"epoch": 0.47594412829798244,
"grad_norm": 1.1726787090301514,
"learning_rate": 9.483471074380166e-06,
"loss": 0.4087,
"step": 460
},
{
"epoch": 0.48629073978272114,
"grad_norm": 1.4234670400619507,
"learning_rate": 9.6900826446281e-06,
"loss": 0.4098,
"step": 470
},
{
"epoch": 0.4966373512674599,
"grad_norm": 1.1076370477676392,
"learning_rate": 9.896694214876033e-06,
"loss": 0.4058,
"step": 480
},
{
"epoch": 0.5069839627521987,
"grad_norm": 1.0841426849365234,
"learning_rate": 9.999967416245537e-06,
"loss": 0.3993,
"step": 490
},
{
"epoch": 0.5173305742369374,
"grad_norm": 1.409942865371704,
"learning_rate": 9.999706748757903e-06,
"loss": 0.4133,
"step": 500
},
{
"epoch": 0.5276771857216761,
"grad_norm": 1.2055038213729858,
"learning_rate": 9.999185427372232e-06,
"loss": 0.4043,
"step": 510
},
{
"epoch": 0.5380237972064149,
"grad_norm": 1.4699640274047852,
"learning_rate": 9.998403479267005e-06,
"loss": 0.4075,
"step": 520
},
{
"epoch": 0.5483704086911536,
"grad_norm": 1.1911301612854004,
"learning_rate": 9.99736094520818e-06,
"loss": 0.4032,
"step": 530
},
{
"epoch": 0.5587170201758924,
"grad_norm": 1.1372137069702148,
"learning_rate": 9.996057879547059e-06,
"loss": 0.4073,
"step": 540
},
{
"epoch": 0.5690636316606311,
"grad_norm": 1.1768054962158203,
"learning_rate": 9.994494350217452e-06,
"loss": 0.4035,
"step": 550
},
{
"epoch": 0.5794102431453699,
"grad_norm": 1.343031406402588,
"learning_rate": 9.992670438732146e-06,
"loss": 0.4105,
"step": 560
},
{
"epoch": 0.5897568546301086,
"grad_norm": 1.0798144340515137,
"learning_rate": 9.990586240178642e-06,
"loss": 0.4018,
"step": 570
},
{
"epoch": 0.6001034661148474,
"grad_norm": 1.3839797973632812,
"learning_rate": 9.988241863214212e-06,
"loss": 0.4068,
"step": 580
},
{
"epoch": 0.6104500775995861,
"grad_norm": 1.1565276384353638,
"learning_rate": 9.98563743006022e-06,
"loss": 0.4057,
"step": 590
},
{
"epoch": 0.6207966890843248,
"grad_norm": 1.14547860622406,
"learning_rate": 9.982773076495762e-06,
"loss": 0.3988,
"step": 600
},
{
"epoch": 0.6311433005690636,
"grad_norm": 1.0178049802780151,
"learning_rate": 9.97964895185058e-06,
"loss": 0.3975,
"step": 610
},
{
"epoch": 0.6414899120538023,
"grad_norm": 1.0690793991088867,
"learning_rate": 9.97626521899728e-06,
"loss": 0.4026,
"step": 620
},
{
"epoch": 0.6518365235385412,
"grad_norm": 1.1290843486785889,
"learning_rate": 9.972622054342842e-06,
"loss": 0.3923,
"step": 630
},
{
"epoch": 0.6621831350232799,
"grad_norm": 1.1035066843032837,
"learning_rate": 9.968719647819414e-06,
"loss": 0.4012,
"step": 640
},
{
"epoch": 0.6725297465080187,
"grad_norm": 1.1912881135940552,
"learning_rate": 9.964558202874428e-06,
"loss": 0.4001,
"step": 650
},
{
"epoch": 0.6828763579927574,
"grad_norm": 1.020052433013916,
"learning_rate": 9.960137936459975e-06,
"loss": 0.3962,
"step": 660
},
{
"epoch": 0.6932229694774961,
"grad_norm": 1.0658594369888306,
"learning_rate": 9.955459079021505e-06,
"loss": 0.3984,
"step": 670
},
{
"epoch": 0.7035695809622349,
"grad_norm": 1.0133702754974365,
"learning_rate": 9.95052187448581e-06,
"loss": 0.3924,
"step": 680
},
{
"epoch": 0.7139161924469736,
"grad_norm": 0.9635597467422485,
"learning_rate": 9.94532658024831e-06,
"loss": 0.3928,
"step": 690
},
{
"epoch": 0.7242628039317124,
"grad_norm": 1.1104060411453247,
"learning_rate": 9.939873467159627e-06,
"loss": 0.395,
"step": 700
},
{
"epoch": 0.7346094154164511,
"grad_norm": 1.020979404449463,
"learning_rate": 9.934162819511473e-06,
"loss": 0.4048,
"step": 710
},
{
"epoch": 0.7449560269011899,
"grad_norm": 0.934624969959259,
"learning_rate": 9.928194935021821e-06,
"loss": 0.4007,
"step": 720
},
{
"epoch": 0.7553026383859286,
"grad_norm": 1.048291563987732,
"learning_rate": 9.921970124819392e-06,
"loss": 0.3921,
"step": 730
},
{
"epoch": 0.7656492498706674,
"grad_norm": 1.120107889175415,
"learning_rate": 9.915488713427427e-06,
"loss": 0.39,
"step": 740
},
{
"epoch": 0.7759958613554061,
"grad_norm": 0.9637139439582825,
"learning_rate": 9.908751038746773e-06,
"loss": 0.3998,
"step": 750
},
{
"epoch": 0.7863424728401448,
"grad_norm": 0.9837620854377747,
"learning_rate": 9.901757452038268e-06,
"loss": 0.3861,
"step": 760
},
{
"epoch": 0.7966890843248836,
"grad_norm": 1.0107730627059937,
"learning_rate": 9.894508317904418e-06,
"loss": 0.3918,
"step": 770
},
{
"epoch": 0.8070356958096223,
"grad_norm": 1.0242232084274292,
"learning_rate": 9.887004014270408e-06,
"loss": 0.391,
"step": 780
},
{
"epoch": 0.8173823072943611,
"grad_norm": 0.9810847640037537,
"learning_rate": 9.879244932364375e-06,
"loss": 0.3908,
"step": 790
},
{
"epoch": 0.8277289187790998,
"grad_norm": 0.9811373353004456,
"learning_rate": 9.87123147669704e-06,
"loss": 0.3947,
"step": 800
},
{
"epoch": 0.8380755302638386,
"grad_norm": 0.956855058670044,
"learning_rate": 9.862964065040596e-06,
"loss": 0.3867,
"step": 810
},
{
"epoch": 0.8484221417485773,
"grad_norm": 0.9348600506782532,
"learning_rate": 9.854443128406931e-06,
"loss": 0.3864,
"step": 820
},
{
"epoch": 0.8587687532333161,
"grad_norm": 1.0585615634918213,
"learning_rate": 9.845669111025175e-06,
"loss": 0.3874,
"step": 830
},
{
"epoch": 0.8691153647180548,
"grad_norm": 1.041060447692871,
"learning_rate": 9.83664247031852e-06,
"loss": 0.3841,
"step": 840
},
{
"epoch": 0.8794619762027935,
"grad_norm": 1.0687028169631958,
"learning_rate": 9.827363676880383e-06,
"loss": 0.385,
"step": 850
},
{
"epoch": 0.8898085876875323,
"grad_norm": 1.1607829332351685,
"learning_rate": 9.817833214449873e-06,
"loss": 0.3821,
"step": 860
},
{
"epoch": 0.900155199172271,
"grad_norm": 0.9849240183830261,
"learning_rate": 9.808051579886566e-06,
"loss": 0.3819,
"step": 870
},
{
"epoch": 0.9105018106570099,
"grad_norm": 1.220577597618103,
"learning_rate": 9.798019283144607e-06,
"loss": 0.3886,
"step": 880
},
{
"epoch": 0.9208484221417486,
"grad_norm": 1.0422636270523071,
"learning_rate": 9.787736847246121e-06,
"loss": 0.3856,
"step": 890
},
{
"epoch": 0.9311950336264874,
"grad_norm": 0.875372588634491,
"learning_rate": 9.777204808253949e-06,
"loss": 0.3851,
"step": 900
},
{
"epoch": 0.9415416451112261,
"grad_norm": 0.918285608291626,
"learning_rate": 9.766423715243697e-06,
"loss": 0.3826,
"step": 910
},
{
"epoch": 0.9518882565959649,
"grad_norm": 1.1889722347259521,
"learning_rate": 9.755394130275116e-06,
"loss": 0.3834,
"step": 920
},
{
"epoch": 0.9622348680807036,
"grad_norm": 0.9664179682731628,
"learning_rate": 9.74411662836279e-06,
"loss": 0.3799,
"step": 930
},
{
"epoch": 0.9725814795654423,
"grad_norm": 1.064846396446228,
"learning_rate": 9.73259179744617e-06,
"loss": 0.3816,
"step": 940
},
{
"epoch": 0.9829280910501811,
"grad_norm": 1.0509345531463623,
"learning_rate": 9.720820238358918e-06,
"loss": 0.3838,
"step": 950
},
{
"epoch": 0.9932747025349198,
"grad_norm": 0.9682555794715881,
"learning_rate": 9.70880256479758e-06,
"loss": 0.3785,
"step": 960
},
{
"epoch": 1.0031039834454216,
"grad_norm": 0.9066140651702881,
"learning_rate": 9.696539403289589e-06,
"loss": 0.3685,
"step": 970
},
{
"epoch": 1.0134505949301604,
"grad_norm": 0.9822126030921936,
"learning_rate": 9.684031393160614e-06,
"loss": 0.3446,
"step": 980
},
{
"epoch": 1.023797206414899,
"grad_norm": 0.9022179245948792,
"learning_rate": 9.671279186501221e-06,
"loss": 0.3405,
"step": 990
},
{
"epoch": 1.0341438178996378,
"grad_norm": 0.9742788076400757,
"learning_rate": 9.658283448132875e-06,
"loss": 0.3348,
"step": 1000
},
{
"epoch": 1.0444904293843766,
"grad_norm": 0.9199923276901245,
"learning_rate": 9.645044855573286e-06,
"loss": 0.3425,
"step": 1010
},
{
"epoch": 1.0548370408691154,
"grad_norm": 0.9245515465736389,
"learning_rate": 9.631564099001085e-06,
"loss": 0.344,
"step": 1020
},
{
"epoch": 1.065183652353854,
"grad_norm": 0.865315854549408,
"learning_rate": 9.617841881219841e-06,
"loss": 0.3481,
"step": 1030
},
{
"epoch": 1.0755302638385928,
"grad_norm": 0.8868467211723328,
"learning_rate": 9.603878917621422e-06,
"loss": 0.3458,
"step": 1040
},
{
"epoch": 1.0858768753233317,
"grad_norm": 0.9886205196380615,
"learning_rate": 9.5896759361487e-06,
"loss": 0.3413,
"step": 1050
},
{
"epoch": 1.0962234868080705,
"grad_norm": 0.9479049444198608,
"learning_rate": 9.5752336772576e-06,
"loss": 0.3371,
"step": 1060
},
{
"epoch": 1.106570098292809,
"grad_norm": 0.9921103715896606,
"learning_rate": 9.560552893878499e-06,
"loss": 0.3431,
"step": 1070
},
{
"epoch": 1.1169167097775479,
"grad_norm": 0.9266170263290405,
"learning_rate": 9.545634351376965e-06,
"loss": 0.3412,
"step": 1080
},
{
"epoch": 1.1272633212622867,
"grad_norm": 0.9916461110115051,
"learning_rate": 9.530478827513867e-06,
"loss": 0.3429,
"step": 1090
},
{
"epoch": 1.1376099327470253,
"grad_norm": 0.90711510181427,
"learning_rate": 9.51508711240482e-06,
"loss": 0.3461,
"step": 1100
},
{
"epoch": 1.147956544231764,
"grad_norm": 0.9366750121116638,
"learning_rate": 9.499460008478996e-06,
"loss": 0.3392,
"step": 1110
},
{
"epoch": 1.1583031557165029,
"grad_norm": 1.0261372327804565,
"learning_rate": 9.483598330437285e-06,
"loss": 0.3434,
"step": 1120
},
{
"epoch": 1.1686497672012415,
"grad_norm": 0.9353191256523132,
"learning_rate": 9.46750290520983e-06,
"loss": 0.344,
"step": 1130
},
{
"epoch": 1.1789963786859803,
"grad_norm": 0.9214949011802673,
"learning_rate": 9.451174571912913e-06,
"loss": 0.3404,
"step": 1140
},
{
"epoch": 1.189342990170719,
"grad_norm": 0.9317702054977417,
"learning_rate": 9.434614181805203e-06,
"loss": 0.3453,
"step": 1150
},
{
"epoch": 1.199689601655458,
"grad_norm": 0.8729979395866394,
"learning_rate": 9.417822598243382e-06,
"loss": 0.3505,
"step": 1160
},
{
"epoch": 1.2100362131401965,
"grad_norm": 0.9862754940986633,
"learning_rate": 9.400800696637136e-06,
"loss": 0.3416,
"step": 1170
},
{
"epoch": 1.2203828246249353,
"grad_norm": 0.9723279476165771,
"learning_rate": 9.383549364403515e-06,
"loss": 0.343,
"step": 1180
},
{
"epoch": 1.2307294361096741,
"grad_norm": 0.8829843997955322,
"learning_rate": 9.366069500920662e-06,
"loss": 0.3375,
"step": 1190
},
{
"epoch": 1.241076047594413,
"grad_norm": 0.9550700783729553,
"learning_rate": 9.34836201748094e-06,
"loss": 0.3423,
"step": 1200
},
{
"epoch": 1.2514226590791515,
"grad_norm": 0.9076991081237793,
"learning_rate": 9.330427837243408e-06,
"loss": 0.3369,
"step": 1210
},
{
"epoch": 1.2617692705638903,
"grad_norm": 0.888508677482605,
"learning_rate": 9.312267895185697e-06,
"loss": 0.3405,
"step": 1220
},
{
"epoch": 1.2721158820486291,
"grad_norm": 0.9597383141517639,
"learning_rate": 9.293883138055275e-06,
"loss": 0.351,
"step": 1230
},
{
"epoch": 1.2824624935333677,
"grad_norm": 0.9085503816604614,
"learning_rate": 9.275274524320075e-06,
"loss": 0.3405,
"step": 1240
},
{
"epoch": 1.2928091050181065,
"grad_norm": 0.9536021947860718,
"learning_rate": 9.25644302411854e-06,
"loss": 0.3408,
"step": 1250
},
{
"epoch": 1.3031557165028453,
"grad_norm": 0.9334466457366943,
"learning_rate": 9.237389619209037e-06,
"loss": 0.3447,
"step": 1260
},
{
"epoch": 1.313502327987584,
"grad_norm": 0.9981200098991394,
"learning_rate": 9.218115302918676e-06,
"loss": 0.337,
"step": 1270
},
{
"epoch": 1.3238489394723227,
"grad_norm": 0.9816781282424927,
"learning_rate": 9.198621080091525e-06,
"loss": 0.35,
"step": 1280
},
{
"epoch": 1.3341955509570615,
"grad_norm": 0.9127581119537354,
"learning_rate": 9.178907967036229e-06,
"loss": 0.3432,
"step": 1290
},
{
"epoch": 1.3445421624418004,
"grad_norm": 0.8629497289657593,
"learning_rate": 9.15897699147301e-06,
"loss": 0.3353,
"step": 1300
},
{
"epoch": 1.3548887739265392,
"grad_norm": 0.9100907444953918,
"learning_rate": 9.138829192480106e-06,
"loss": 0.3358,
"step": 1310
},
{
"epoch": 1.3652353854112778,
"grad_norm": 0.953328549861908,
"learning_rate": 9.118465620439594e-06,
"loss": 0.3368,
"step": 1320
},
{
"epoch": 1.3755819968960166,
"grad_norm": 0.9011934399604797,
"learning_rate": 9.097887336982625e-06,
"loss": 0.3438,
"step": 1330
},
{
"epoch": 1.3859286083807554,
"grad_norm": 0.9293943047523499,
"learning_rate": 9.077095414934076e-06,
"loss": 0.3449,
"step": 1340
},
{
"epoch": 1.396275219865494,
"grad_norm": 0.8752354979515076,
"learning_rate": 9.056090938256633e-06,
"loss": 0.3418,
"step": 1350
},
{
"epoch": 1.4066218313502328,
"grad_norm": 0.978129506111145,
"learning_rate": 9.034875001994258e-06,
"loss": 0.3369,
"step": 1360
},
{
"epoch": 1.4169684428349716,
"grad_norm": 0.8396763801574707,
"learning_rate": 9.013448712215127e-06,
"loss": 0.341,
"step": 1370
},
{
"epoch": 1.4273150543197102,
"grad_norm": 0.8739905953407288,
"learning_rate": 8.991813185953942e-06,
"loss": 0.3427,
"step": 1380
},
{
"epoch": 1.437661665804449,
"grad_norm": 0.8390309810638428,
"learning_rate": 8.969969551153706e-06,
"loss": 0.3435,
"step": 1390
},
{
"epoch": 1.4480082772891878,
"grad_norm": 0.9313223958015442,
"learning_rate": 8.947918946606924e-06,
"loss": 0.3348,
"step": 1400
},
{
"epoch": 1.4583548887739266,
"grad_norm": 0.8645867109298706,
"learning_rate": 8.925662521896224e-06,
"loss": 0.3377,
"step": 1410
},
{
"epoch": 1.4687015002586654,
"grad_norm": 0.9142380356788635,
"learning_rate": 8.90320143733443e-06,
"loss": 0.336,
"step": 1420
},
{
"epoch": 1.479048111743404,
"grad_norm": 0.9506648182868958,
"learning_rate": 8.88053686390407e-06,
"loss": 0.3414,
"step": 1430
},
{
"epoch": 1.4893947232281428,
"grad_norm": 0.855963945388794,
"learning_rate": 8.857669983196326e-06,
"loss": 0.3293,
"step": 1440
},
{
"epoch": 1.4997413347128816,
"grad_norm": 0.9154329299926758,
"learning_rate": 8.834601987349436e-06,
"loss": 0.3355,
"step": 1450
},
{
"epoch": 1.5100879461976202,
"grad_norm": 0.8513219356536865,
"learning_rate": 8.811334078986542e-06,
"loss": 0.3289,
"step": 1460
},
{
"epoch": 1.520434557682359,
"grad_norm": 0.9121130108833313,
"learning_rate": 8.787867471152992e-06,
"loss": 0.3413,
"step": 1470
},
{
"epoch": 1.5307811691670978,
"grad_norm": 0.8203767538070679,
"learning_rate": 8.764203387253102e-06,
"loss": 0.3342,
"step": 1480
},
{
"epoch": 1.5411277806518364,
"grad_norm": 0.8152233362197876,
"learning_rate": 8.740343060986366e-06,
"loss": 0.339,
"step": 1490
},
{
"epoch": 1.5514743921365752,
"grad_norm": 0.881718635559082,
"learning_rate": 8.716287736283158e-06,
"loss": 0.3342,
"step": 1500
},
{
"epoch": 1.561821003621314,
"grad_norm": 0.851697564125061,
"learning_rate": 8.692038667239857e-06,
"loss": 0.338,
"step": 1510
},
{
"epoch": 1.5721676151060526,
"grad_norm": 0.9294906258583069,
"learning_rate": 8.66759711805348e-06,
"loss": 0.3359,
"step": 1520
},
{
"epoch": 1.5825142265907917,
"grad_norm": 0.8974320888519287,
"learning_rate": 8.642964362955781e-06,
"loss": 0.3349,
"step": 1530
},
{
"epoch": 1.5928608380755302,
"grad_norm": 0.9681616425514221,
"learning_rate": 8.618141686146803e-06,
"loss": 0.3335,
"step": 1540
},
{
"epoch": 1.603207449560269,
"grad_norm": 0.8356232643127441,
"learning_rate": 8.593130381727938e-06,
"loss": 0.3332,
"step": 1550
},
{
"epoch": 1.6135540610450079,
"grad_norm": 0.8890557289123535,
"learning_rate": 8.567931753634462e-06,
"loss": 0.3317,
"step": 1560
},
{
"epoch": 1.6239006725297465,
"grad_norm": 0.9087671637535095,
"learning_rate": 8.542547115567553e-06,
"loss": 0.3416,
"step": 1570
},
{
"epoch": 1.6342472840144853,
"grad_norm": 0.7936503887176514,
"learning_rate": 8.516977790925799e-06,
"loss": 0.3366,
"step": 1580
},
{
"epoch": 1.644593895499224,
"grad_norm": 0.849400520324707,
"learning_rate": 8.491225112736209e-06,
"loss": 0.3394,
"step": 1590
},
{
"epoch": 1.6549405069839627,
"grad_norm": 0.833982527256012,
"learning_rate": 8.465290423584718e-06,
"loss": 0.3373,
"step": 1600
},
{
"epoch": 1.6652871184687015,
"grad_norm": 0.8153632879257202,
"learning_rate": 8.439175075546191e-06,
"loss": 0.3413,
"step": 1610
},
{
"epoch": 1.6756337299534403,
"grad_norm": 0.8862062692642212,
"learning_rate": 8.412880430113931e-06,
"loss": 0.3334,
"step": 1620
},
{
"epoch": 1.6859803414381789,
"grad_norm": 0.7980442643165588,
"learning_rate": 8.386407858128707e-06,
"loss": 0.331,
"step": 1630
},
{
"epoch": 1.696326952922918,
"grad_norm": 0.8371018171310425,
"learning_rate": 8.359758739707275e-06,
"loss": 0.3409,
"step": 1640
},
{
"epoch": 1.7066735644076565,
"grad_norm": 0.8669474720954895,
"learning_rate": 8.33293446417044e-06,
"loss": 0.3325,
"step": 1650
},
{
"epoch": 1.717020175892395,
"grad_norm": 0.8137986660003662,
"learning_rate": 8.305936429970622e-06,
"loss": 0.3355,
"step": 1660
},
{
"epoch": 1.7273667873771341,
"grad_norm": 0.8193359375,
"learning_rate": 8.278766044618936e-06,
"loss": 0.3323,
"step": 1670
},
{
"epoch": 1.7377133988618727,
"grad_norm": 0.859048068523407,
"learning_rate": 8.251424724611832e-06,
"loss": 0.3314,
"step": 1680
},
{
"epoch": 1.7480600103466115,
"grad_norm": 0.924875795841217,
"learning_rate": 8.22391389535724e-06,
"loss": 0.3377,
"step": 1690
},
{
"epoch": 1.7584066218313503,
"grad_norm": 0.8729901313781738,
"learning_rate": 8.196234991100256e-06,
"loss": 0.3298,
"step": 1700
},
{
"epoch": 1.768753233316089,
"grad_norm": 0.8952008485794067,
"learning_rate": 8.168389454848366e-06,
"loss": 0.3324,
"step": 1710
},
{
"epoch": 1.7790998448008277,
"grad_norm": 0.8685171604156494,
"learning_rate": 8.140378738296233e-06,
"loss": 0.3356,
"step": 1720
},
{
"epoch": 1.7894464562855665,
"grad_norm": 0.7909799218177795,
"learning_rate": 8.112204301749988e-06,
"loss": 0.3331,
"step": 1730
},
{
"epoch": 1.7997930677703051,
"grad_norm": 0.7982770204544067,
"learning_rate": 8.083867614051125e-06,
"loss": 0.3422,
"step": 1740
},
{
"epoch": 1.810139679255044,
"grad_norm": 0.8534769415855408,
"learning_rate": 8.055370152499909e-06,
"loss": 0.3345,
"step": 1750
},
{
"epoch": 1.8204862907397827,
"grad_norm": 0.8252643346786499,
"learning_rate": 8.026713402778362e-06,
"loss": 0.3394,
"step": 1760
},
{
"epoch": 1.8308329022245213,
"grad_norm": 0.8682839274406433,
"learning_rate": 7.99789885887281e-06,
"loss": 0.3329,
"step": 1770
},
{
"epoch": 1.8411795137092604,
"grad_norm": 0.9072077870368958,
"learning_rate": 7.968928022996e-06,
"loss": 0.3349,
"step": 1780
},
{
"epoch": 1.851526125193999,
"grad_norm": 0.802394688129425,
"learning_rate": 7.939802405508772e-06,
"loss": 0.3374,
"step": 1790
},
{
"epoch": 1.8618727366787378,
"grad_norm": 0.8025832772254944,
"learning_rate": 7.910523524841329e-06,
"loss": 0.3329,
"step": 1800
},
{
"epoch": 1.8722193481634766,
"grad_norm": 0.8657801151275635,
"learning_rate": 7.88109290741407e-06,
"loss": 0.3379,
"step": 1810
},
{
"epoch": 1.8825659596482152,
"grad_norm": 0.9007260203361511,
"learning_rate": 7.851512087558016e-06,
"loss": 0.3338,
"step": 1820
},
{
"epoch": 1.892912571132954,
"grad_norm": 0.7836032509803772,
"learning_rate": 7.821782607434815e-06,
"loss": 0.3278,
"step": 1830
},
{
"epoch": 1.9032591826176928,
"grad_norm": 0.8562266230583191,
"learning_rate": 7.791906016956348e-06,
"loss": 0.3352,
"step": 1840
},
{
"epoch": 1.9136057941024314,
"grad_norm": 0.8395200371742249,
"learning_rate": 7.761883873703919e-06,
"loss": 0.337,
"step": 1850
},
{
"epoch": 1.9239524055871702,
"grad_norm": 0.8483885526657104,
"learning_rate": 7.731717742847059e-06,
"loss": 0.335,
"step": 1860
},
{
"epoch": 1.934299017071909,
"grad_norm": 0.8115087151527405,
"learning_rate": 7.701409197061927e-06,
"loss": 0.334,
"step": 1870
},
{
"epoch": 1.9446456285566476,
"grad_norm": 0.8322962522506714,
"learning_rate": 7.670959816449313e-06,
"loss": 0.3291,
"step": 1880
},
{
"epoch": 1.9549922400413866,
"grad_norm": 0.8189446926116943,
"learning_rate": 7.640371188452274e-06,
"loss": 0.3291,
"step": 1890
},
{
"epoch": 1.9653388515261252,
"grad_norm": 0.8812534809112549,
"learning_rate": 7.609644907773365e-06,
"loss": 0.3283,
"step": 1900
},
{
"epoch": 1.9756854630108638,
"grad_norm": 0.8735312223434448,
"learning_rate": 7.578782576291501e-06,
"loss": 0.3291,
"step": 1910
},
{
"epoch": 1.9860320744956028,
"grad_norm": 0.8245293498039246,
"learning_rate": 7.547785802978449e-06,
"loss": 0.3291,
"step": 1920
},
{
"epoch": 1.9963786859803414,
"grad_norm": 0.7975995540618896,
"learning_rate": 7.516656203814945e-06,
"loss": 0.3334,
"step": 1930
},
{
"epoch": 2.0062079668908432,
"grad_norm": 0.8532512187957764,
"learning_rate": 7.485395401706447e-06,
"loss": 0.2926,
"step": 1940
},
{
"epoch": 2.016554578375582,
"grad_norm": 0.8140570521354675,
"learning_rate": 7.454005026398525e-06,
"loss": 0.2638,
"step": 1950
},
{
"epoch": 2.026901189860321,
"grad_norm": 0.8759050965309143,
"learning_rate": 7.422486714391896e-06,
"loss": 0.268,
"step": 1960
},
{
"epoch": 2.0372478013450595,
"grad_norm": 0.8712063431739807,
"learning_rate": 7.390842108857111e-06,
"loss": 0.2633,
"step": 1970
},
{
"epoch": 2.047594412829798,
"grad_norm": 0.8876409530639648,
"learning_rate": 7.359072859548884e-06,
"loss": 0.2695,
"step": 1980
},
{
"epoch": 2.057941024314537,
"grad_norm": 0.8743641972541809,
"learning_rate": 7.327180622720087e-06,
"loss": 0.2661,
"step": 1990
},
{
"epoch": 2.0682876357992757,
"grad_norm": 0.834858238697052,
"learning_rate": 7.29516706103541e-06,
"loss": 0.2681,
"step": 2000
},
{
"epoch": 2.0786342472840147,
"grad_norm": 0.8777759671211243,
"learning_rate": 7.263033843484664e-06,
"loss": 0.2639,
"step": 2010
},
{
"epoch": 2.0889808587687533,
"grad_norm": 0.8941816091537476,
"learning_rate": 7.2307826452957855e-06,
"loss": 0.2648,
"step": 2020
},
{
"epoch": 2.099327470253492,
"grad_norm": 0.9012478590011597,
"learning_rate": 7.198415147847491e-06,
"loss": 0.2664,
"step": 2030
},
{
"epoch": 2.109674081738231,
"grad_norm": 0.8317115902900696,
"learning_rate": 7.165933038581627e-06,
"loss": 0.2703,
"step": 2040
},
{
"epoch": 2.1200206932229695,
"grad_norm": 0.8598811030387878,
"learning_rate": 7.1333380109151916e-06,
"loss": 0.2694,
"step": 2050
},
{
"epoch": 2.130367304707708,
"grad_norm": 0.7854169607162476,
"learning_rate": 7.100631764152054e-06,
"loss": 0.2737,
"step": 2060
},
{
"epoch": 2.140713916192447,
"grad_norm": 0.8262772560119629,
"learning_rate": 7.067816003394359e-06,
"loss": 0.2687,
"step": 2070
},
{
"epoch": 2.1510605276771857,
"grad_norm": 0.8128632307052612,
"learning_rate": 7.034892439453639e-06,
"loss": 0.2682,
"step": 2080
},
{
"epoch": 2.1614071391619243,
"grad_norm": 0.8013981580734253,
"learning_rate": 7.001862788761617e-06,
"loss": 0.2643,
"step": 2090
},
{
"epoch": 2.1717537506466633,
"grad_norm": 0.8377342820167542,
"learning_rate": 6.96872877328073e-06,
"loss": 0.2709,
"step": 2100
},
{
"epoch": 2.182100362131402,
"grad_norm": 0.8179459571838379,
"learning_rate": 6.935492120414347e-06,
"loss": 0.2652,
"step": 2110
},
{
"epoch": 2.192446973616141,
"grad_norm": 0.8413109183311462,
"learning_rate": 6.902154562916722e-06,
"loss": 0.2695,
"step": 2120
},
{
"epoch": 2.2027935851008795,
"grad_norm": 0.8183903098106384,
"learning_rate": 6.86871783880265e-06,
"loss": 0.2725,
"step": 2130
},
{
"epoch": 2.213140196585618,
"grad_norm": 0.8539458513259888,
"learning_rate": 6.835183691256866e-06,
"loss": 0.268,
"step": 2140
},
{
"epoch": 2.223486808070357,
"grad_norm": 0.9063196778297424,
"learning_rate": 6.801553868543163e-06,
"loss": 0.2742,
"step": 2150
},
{
"epoch": 2.2338334195550957,
"grad_norm": 0.816985011100769,
"learning_rate": 6.767830123913241e-06,
"loss": 0.2713,
"step": 2160
},
{
"epoch": 2.2441800310398343,
"grad_norm": 0.8586083650588989,
"learning_rate": 6.734014215515321e-06,
"loss": 0.2746,
"step": 2170
},
{
"epoch": 2.2545266425245734,
"grad_norm": 0.8248201608657837,
"learning_rate": 6.700107906302463e-06,
"loss": 0.2667,
"step": 2180
},
{
"epoch": 2.264873254009312,
"grad_norm": 0.8500617146492004,
"learning_rate": 6.6661129639406785e-06,
"loss": 0.2673,
"step": 2190
},
{
"epoch": 2.2752198654940505,
"grad_norm": 0.8682879209518433,
"learning_rate": 6.6320311607167565e-06,
"loss": 0.267,
"step": 2200
},
{
"epoch": 2.2855664769787896,
"grad_norm": 0.8802211284637451,
"learning_rate": 6.597864273445882e-06,
"loss": 0.2701,
"step": 2210
},
{
"epoch": 2.295913088463528,
"grad_norm": 0.8036802411079407,
"learning_rate": 6.563614083378996e-06,
"loss": 0.2712,
"step": 2220
},
{
"epoch": 2.306259699948267,
"grad_norm": 0.8377422094345093,
"learning_rate": 6.5292823761099355e-06,
"loss": 0.2694,
"step": 2230
},
{
"epoch": 2.3166063114330058,
"grad_norm": 0.8472813963890076,
"learning_rate": 6.494870941482336e-06,
"loss": 0.2709,
"step": 2240
},
{
"epoch": 2.3269529229177444,
"grad_norm": 0.8234586715698242,
"learning_rate": 6.460381573496337e-06,
"loss": 0.2664,
"step": 2250
},
{
"epoch": 2.337299534402483,
"grad_norm": 0.8779090642929077,
"learning_rate": 6.425816070215032e-06,
"loss": 0.2702,
"step": 2260
},
{
"epoch": 2.347646145887222,
"grad_norm": 0.8126579523086548,
"learning_rate": 6.391176233670744e-06,
"loss": 0.2699,
"step": 2270
},
{
"epoch": 2.3579927573719606,
"grad_norm": 0.796710193157196,
"learning_rate": 6.356463869771077e-06,
"loss": 0.2676,
"step": 2280
},
{
"epoch": 2.3683393688566996,
"grad_norm": 0.8544085025787354,
"learning_rate": 6.3216807882047585e-06,
"loss": 0.2699,
"step": 2290
},
{
"epoch": 2.378685980341438,
"grad_norm": 0.7962964773178101,
"learning_rate": 6.286828802347307e-06,
"loss": 0.267,
"step": 2300
},
{
"epoch": 2.3890325918261768,
"grad_norm": 0.8767339587211609,
"learning_rate": 6.251909729166478e-06,
"loss": 0.2646,
"step": 2310
},
{
"epoch": 2.399379203310916,
"grad_norm": 0.8584967255592346,
"learning_rate": 6.216925389127552e-06,
"loss": 0.2645,
"step": 2320
},
{
"epoch": 2.4097258147956544,
"grad_norm": 0.8135905265808105,
"learning_rate": 6.18187760609842e-06,
"loss": 0.2699,
"step": 2330
},
{
"epoch": 2.420072426280393,
"grad_norm": 0.8041039705276489,
"learning_rate": 6.146768207254498e-06,
"loss": 0.2712,
"step": 2340
},
{
"epoch": 2.430419037765132,
"grad_norm": 0.8718064427375793,
"learning_rate": 6.111599022983473e-06,
"loss": 0.2726,
"step": 2350
},
{
"epoch": 2.4407656492498706,
"grad_norm": 0.7888765931129456,
"learning_rate": 6.076371886789875e-06,
"loss": 0.2713,
"step": 2360
},
{
"epoch": 2.451112260734609,
"grad_norm": 0.870827317237854,
"learning_rate": 6.0410886351994864e-06,
"loss": 0.2672,
"step": 2370
},
{
"epoch": 2.4614588722193482,
"grad_norm": 0.8175879716873169,
"learning_rate": 6.005751107663609e-06,
"loss": 0.2606,
"step": 2380
},
{
"epoch": 2.471805483704087,
"grad_norm": 0.9149560332298279,
"learning_rate": 5.970361146463149e-06,
"loss": 0.2707,
"step": 2390
},
{
"epoch": 2.482152095188826,
"grad_norm": 0.85008704662323,
"learning_rate": 5.934920596612588e-06,
"loss": 0.2703,
"step": 2400
},
{
"epoch": 2.4924987066735644,
"grad_norm": 0.7932586669921875,
"learning_rate": 5.89943130576378e-06,
"loss": 0.266,
"step": 2410
},
{
"epoch": 2.502845318158303,
"grad_norm": 0.8065005540847778,
"learning_rate": 5.863895124109642e-06,
"loss": 0.2677,
"step": 2420
},
{
"epoch": 2.513191929643042,
"grad_norm": 0.8142856955528259,
"learning_rate": 5.8283139042876865e-06,
"loss": 0.2699,
"step": 2430
},
{
"epoch": 2.5235385411277806,
"grad_norm": 0.8366583585739136,
"learning_rate": 5.792689501283436e-06,
"loss": 0.262,
"step": 2440
},
{
"epoch": 2.5338851526125197,
"grad_norm": 0.800625205039978,
"learning_rate": 5.757023772333721e-06,
"loss": 0.2641,
"step": 2450
},
{
"epoch": 2.5442317640972583,
"grad_norm": 0.7906954884529114,
"learning_rate": 5.721318576829848e-06,
"loss": 0.2675,
"step": 2460
},
{
"epoch": 2.554578375581997,
"grad_norm": 0.7770724296569824,
"learning_rate": 5.685575776220671e-06,
"loss": 0.2664,
"step": 2470
},
{
"epoch": 2.5649249870667354,
"grad_norm": 0.8194068074226379,
"learning_rate": 5.649797233915539e-06,
"loss": 0.2702,
"step": 2480
},
{
"epoch": 2.5752715985514745,
"grad_norm": 0.881852388381958,
"learning_rate": 5.6139848151871575e-06,
"loss": 0.2681,
"step": 2490
},
{
"epoch": 2.585618210036213,
"grad_norm": 0.8650614023208618,
"learning_rate": 5.578140387074335e-06,
"loss": 0.2676,
"step": 2500
},
{
"epoch": 2.595964821520952,
"grad_norm": 0.8367857933044434,
"learning_rate": 5.542265818284652e-06,
"loss": 0.2702,
"step": 2510
},
{
"epoch": 2.6063114330056907,
"grad_norm": 0.8549493551254272,
"learning_rate": 5.506362979097042e-06,
"loss": 0.2694,
"step": 2520
},
{
"epoch": 2.6166580444904293,
"grad_norm": 0.8078193068504333,
"learning_rate": 5.470433741264281e-06,
"loss": 0.266,
"step": 2530
},
{
"epoch": 2.627004655975168,
"grad_norm": 0.7991025447845459,
"learning_rate": 5.4344799779154045e-06,
"loss": 0.2674,
"step": 2540
},
{
"epoch": 2.637351267459907,
"grad_norm": 0.8104662895202637,
"learning_rate": 5.398503563458064e-06,
"loss": 0.2654,
"step": 2550
},
{
"epoch": 2.6476978789446455,
"grad_norm": 0.8707109093666077,
"learning_rate": 5.362506373480794e-06,
"loss": 0.2665,
"step": 2560
},
{
"epoch": 2.6580444904293845,
"grad_norm": 0.8271766304969788,
"learning_rate": 5.326490284655238e-06,
"loss": 0.2719,
"step": 2570
},
{
"epoch": 2.668391101914123,
"grad_norm": 0.8149173855781555,
"learning_rate": 5.290457174638314e-06,
"loss": 0.2676,
"step": 2580
},
{
"epoch": 2.6787377133988617,
"grad_norm": 0.8067767024040222,
"learning_rate": 5.254408921974312e-06,
"loss": 0.2726,
"step": 2590
},
{
"epoch": 2.6890843248836007,
"grad_norm": 0.8474196791648865,
"learning_rate": 5.218347405996973e-06,
"loss": 0.2681,
"step": 2600
},
{
"epoch": 2.6994309363683393,
"grad_norm": 0.8074548244476318,
"learning_rate": 5.1822745067315e-06,
"loss": 0.267,
"step": 2610
},
{
"epoch": 2.7097775478530783,
"grad_norm": 0.8411250114440918,
"learning_rate": 5.146192104796556e-06,
"loss": 0.2651,
"step": 2620
},
{
"epoch": 2.720124159337817,
"grad_norm": 0.8280041217803955,
"learning_rate": 5.110102081306208e-06,
"loss": 0.2758,
"step": 2630
},
{
"epoch": 2.7304707708225555,
"grad_norm": 0.8358988165855408,
"learning_rate": 5.074006317771873e-06,
"loss": 0.2614,
"step": 2640
},
{
"epoch": 2.740817382307294,
"grad_norm": 0.8451923131942749,
"learning_rate": 5.037906696004209e-06,
"loss": 0.262,
"step": 2650
},
{
"epoch": 2.751163993792033,
"grad_norm": 0.8164005279541016,
"learning_rate": 5.0018050980150244e-06,
"loss": 0.2644,
"step": 2660
},
{
"epoch": 2.7615106052767717,
"grad_norm": 0.8651638031005859,
"learning_rate": 4.965703405919154e-06,
"loss": 0.2702,
"step": 2670
},
{
"epoch": 2.7718572167615108,
"grad_norm": 0.8426215648651123,
"learning_rate": 4.929603501836336e-06,
"loss": 0.2597,
"step": 2680
},
{
"epoch": 2.7822038282462493,
"grad_norm": 0.8239193558692932,
"learning_rate": 4.8935072677931e-06,
"loss": 0.2637,
"step": 2690
},
{
"epoch": 2.792550439730988,
"grad_norm": 0.8287822008132935,
"learning_rate": 4.857416585624635e-06,
"loss": 0.2643,
"step": 2700
},
{
"epoch": 2.802897051215727,
"grad_norm": 0.8135037422180176,
"learning_rate": 4.821333336876691e-06,
"loss": 0.2623,
"step": 2710
},
{
"epoch": 2.8132436627004656,
"grad_norm": 0.8966447710990906,
"learning_rate": 4.785259402707489e-06,
"loss": 0.2633,
"step": 2720
},
{
"epoch": 2.8235902741852046,
"grad_norm": 0.836792528629303,
"learning_rate": 4.749196663789642e-06,
"loss": 0.2683,
"step": 2730
},
{
"epoch": 2.833936885669943,
"grad_norm": 0.806907057762146,
"learning_rate": 4.713147000212112e-06,
"loss": 0.2698,
"step": 2740
},
{
"epoch": 2.8442834971546818,
"grad_norm": 0.8352254629135132,
"learning_rate": 4.677112291382191e-06,
"loss": 0.2673,
"step": 2750
},
{
"epoch": 2.8546301086394203,
"grad_norm": 0.8209269642829895,
"learning_rate": 4.641094415927529e-06,
"loss": 0.2672,
"step": 2760
},
{
"epoch": 2.8649767201241594,
"grad_norm": 0.7578653693199158,
"learning_rate": 4.605095251598184e-06,
"loss": 0.2649,
"step": 2770
},
{
"epoch": 2.875323331608898,
"grad_norm": 0.8572791814804077,
"learning_rate": 4.569116675168729e-06,
"loss": 0.2642,
"step": 2780
},
{
"epoch": 2.885669943093637,
"grad_norm": 0.8002661466598511,
"learning_rate": 4.53316056234041e-06,
"loss": 0.2669,
"step": 2790
},
{
"epoch": 2.8960165545783756,
"grad_norm": 0.8416422009468079,
"learning_rate": 4.4972287876433675e-06,
"loss": 0.2684,
"step": 2800
},
{
"epoch": 2.906363166063114,
"grad_norm": 0.7953474521636963,
"learning_rate": 4.461323224338895e-06,
"loss": 0.2616,
"step": 2810
},
{
"epoch": 2.916709777547853,
"grad_norm": 0.9063292145729065,
"learning_rate": 4.425445744321785e-06,
"loss": 0.2654,
"step": 2820
},
{
"epoch": 2.927056389032592,
"grad_norm": 0.8513032793998718,
"learning_rate": 4.389598218022742e-06,
"loss": 0.2707,
"step": 2830
},
{
"epoch": 2.937403000517331,
"grad_norm": 0.8203792572021484,
"learning_rate": 4.353782514310872e-06,
"loss": 0.2672,
"step": 2840
},
{
"epoch": 2.9477496120020694,
"grad_norm": 0.8411531448364258,
"learning_rate": 4.318000500396242e-06,
"loss": 0.264,
"step": 2850
},
{
"epoch": 2.958096223486808,
"grad_norm": 0.8241459727287292,
"learning_rate": 4.28225404173254e-06,
"loss": 0.2692,
"step": 2860
},
{
"epoch": 2.9684428349715466,
"grad_norm": 0.8481301665306091,
"learning_rate": 4.24654500191983e-06,
"loss": 0.2617,
"step": 2870
},
{
"epoch": 2.9787894464562856,
"grad_norm": 0.8325083255767822,
"learning_rate": 4.210875242607381e-06,
"loss": 0.271,
"step": 2880
},
{
"epoch": 2.989136057941024,
"grad_norm": 0.7716158032417297,
"learning_rate": 4.175246623396619e-06,
"loss": 0.2606,
"step": 2890
},
{
"epoch": 2.9994826694257632,
"grad_norm": 0.8174447417259216,
"learning_rate": 4.139661001744178e-06,
"loss": 0.261,
"step": 2900
},
{
"epoch": 3.009311950336265,
"grad_norm": 0.8203701972961426,
"learning_rate": 4.104120232865072e-06,
"loss": 0.1988,
"step": 2910
},
{
"epoch": 3.0196585618210037,
"grad_norm": 0.7871333956718445,
"learning_rate": 4.06862616963596e-06,
"loss": 0.1842,
"step": 2920
},
{
"epoch": 3.0300051733057423,
"grad_norm": 0.846492350101471,
"learning_rate": 4.033180662498557e-06,
"loss": 0.1912,
"step": 2930
},
{
"epoch": 3.0403517847904813,
"grad_norm": 0.8526950478553772,
"learning_rate": 3.997785559363163e-06,
"loss": 0.1851,
"step": 2940
},
{
"epoch": 3.05069839627522,
"grad_norm": 0.8476978540420532,
"learning_rate": 3.9624427055123285e-06,
"loss": 0.1853,
"step": 2950
},
{
"epoch": 3.0610450077599585,
"grad_norm": 0.8506020307540894,
"learning_rate": 3.927153943504644e-06,
"loss": 0.1863,
"step": 2960
},
{
"epoch": 3.0713916192446975,
"grad_norm": 0.8641788959503174,
"learning_rate": 3.891921113078684e-06,
"loss": 0.1849,
"step": 2970
},
{
"epoch": 3.081738230729436,
"grad_norm": 0.8302586674690247,
"learning_rate": 3.856746051057096e-06,
"loss": 0.1823,
"step": 2980
},
{
"epoch": 3.0920848422141747,
"grad_norm": 0.8413335084915161,
"learning_rate": 3.8216305912508425e-06,
"loss": 0.1819,
"step": 2990
},
{
"epoch": 3.1024314536989137,
"grad_norm": 0.8483896851539612,
"learning_rate": 3.786576564363588e-06,
"loss": 0.1851,
"step": 3000
},
{
"epoch": 3.1127780651836523,
"grad_norm": 0.864549458026886,
"learning_rate": 3.7515857978962666e-06,
"loss": 0.1879,
"step": 3010
},
{
"epoch": 3.1231246766683913,
"grad_norm": 0.8734090328216553,
"learning_rate": 3.7166601160518025e-06,
"loss": 0.185,
"step": 3020
},
{
"epoch": 3.13347128815313,
"grad_norm": 0.8504880666732788,
"learning_rate": 3.681801339640012e-06,
"loss": 0.1866,
"step": 3030
},
{
"epoch": 3.1438178996378685,
"grad_norm": 0.8266710638999939,
"learning_rate": 3.64701128598267e-06,
"loss": 0.1865,
"step": 3040
},
{
"epoch": 3.1541645111226075,
"grad_norm": 0.8905497789382935,
"learning_rate": 3.612291768818772e-06,
"loss": 0.1906,
"step": 3050
},
{
"epoch": 3.164511122607346,
"grad_norm": 0.8969372510910034,
"learning_rate": 3.5776445982099774e-06,
"loss": 0.1829,
"step": 3060
},
{
"epoch": 3.1748577340920847,
"grad_norm": 0.8385536670684814,
"learning_rate": 3.54307158044624e-06,
"loss": 0.1871,
"step": 3070
},
{
"epoch": 3.1852043455768237,
"grad_norm": 0.8617509007453918,
"learning_rate": 3.508574517951642e-06,
"loss": 0.189,
"step": 3080
},
{
"epoch": 3.1955509570615623,
"grad_norm": 0.8500669002532959,
"learning_rate": 3.474155209190425e-06,
"loss": 0.1864,
"step": 3090
},
{
"epoch": 3.205897568546301,
"grad_norm": 0.843228816986084,
"learning_rate": 3.439815448573231e-06,
"loss": 0.1874,
"step": 3100
},
{
"epoch": 3.21624418003104,
"grad_norm": 0.8573629856109619,
"learning_rate": 3.405557026363554e-06,
"loss": 0.1838,
"step": 3110
},
{
"epoch": 3.2265907915157785,
"grad_norm": 0.9198394417762756,
"learning_rate": 3.3713817285844005e-06,
"loss": 0.1896,
"step": 3120
},
{
"epoch": 3.236937403000517,
"grad_norm": 0.8321995735168457,
"learning_rate": 3.337291336925183e-06,
"loss": 0.1869,
"step": 3130
},
{
"epoch": 3.247284014485256,
"grad_norm": 0.820503830909729,
"learning_rate": 3.3032876286488342e-06,
"loss": 0.1852,
"step": 3140
},
{
"epoch": 3.2576306259699948,
"grad_norm": 0.887699544429779,
"learning_rate": 3.269372376499148e-06,
"loss": 0.1874,
"step": 3150
},
{
"epoch": 3.2679772374547333,
"grad_norm": 0.8699961304664612,
"learning_rate": 3.23554734860836e-06,
"loss": 0.1882,
"step": 3160
},
{
"epoch": 3.2783238489394724,
"grad_norm": 0.8927399516105652,
"learning_rate": 3.2018143084049718e-06,
"loss": 0.1865,
"step": 3170
},
{
"epoch": 3.288670460424211,
"grad_norm": 0.9565382599830627,
"learning_rate": 3.1681750145218094e-06,
"loss": 0.1878,
"step": 3180
},
{
"epoch": 3.29901707190895,
"grad_norm": 0.8601614236831665,
"learning_rate": 3.134631220704348e-06,
"loss": 0.1833,
"step": 3190
},
{
"epoch": 3.3093636833936886,
"grad_norm": 0.8467817902565002,
"learning_rate": 3.101184675719274e-06,
"loss": 0.1883,
"step": 3200
},
{
"epoch": 3.319710294878427,
"grad_norm": 0.832239031791687,
"learning_rate": 3.0678371232633232e-06,
"loss": 0.1863,
"step": 3210
},
{
"epoch": 3.330056906363166,
"grad_norm": 0.8744021654129028,
"learning_rate": 3.0345903018723677e-06,
"loss": 0.1868,
"step": 3220
},
{
"epoch": 3.340403517847905,
"grad_norm": 0.8571172952651978,
"learning_rate": 3.001445944830782e-06,
"loss": 0.1905,
"step": 3230
},
{
"epoch": 3.3507501293326434,
"grad_norm": 0.8718222975730896,
"learning_rate": 2.9684057800810844e-06,
"loss": 0.1842,
"step": 3240
},
{
"epoch": 3.3610967408173824,
"grad_norm": 0.8766878247261047,
"learning_rate": 2.9354715301338477e-06,
"loss": 0.1929,
"step": 3250
},
{
"epoch": 3.371443352302121,
"grad_norm": 0.8255637288093567,
"learning_rate": 2.9026449119778978e-06,
"loss": 0.1842,
"step": 3260
},
{
"epoch": 3.3817899637868596,
"grad_norm": 0.8655493855476379,
"learning_rate": 2.8699276369908042e-06,
"loss": 0.1864,
"step": 3270
},
{
"epoch": 3.3921365752715986,
"grad_norm": 0.8465432524681091,
"learning_rate": 2.8373214108496574e-06,
"loss": 0.186,
"step": 3280
},
{
"epoch": 3.402483186756337,
"grad_norm": 0.855995237827301,
"learning_rate": 2.8048279334421468e-06,
"loss": 0.1855,
"step": 3290
},
{
"epoch": 3.4128297982410762,
"grad_norm": 0.8721122741699219,
"learning_rate": 2.772448898777932e-06,
"loss": 0.1874,
"step": 3300
},
{
"epoch": 3.423176409725815,
"grad_norm": 0.8814127445220947,
"learning_rate": 2.740185994900339e-06,
"loss": 0.1842,
"step": 3310
},
{
"epoch": 3.4335230212105534,
"grad_norm": 0.8728169798851013,
"learning_rate": 2.7080409037983484e-06,
"loss": 0.1856,
"step": 3320
},
{
"epoch": 3.4438696326952924,
"grad_norm": 0.8483415842056274,
"learning_rate": 2.6760153013189115e-06,
"loss": 0.183,
"step": 3330
},
{
"epoch": 3.454216244180031,
"grad_norm": 0.9060035943984985,
"learning_rate": 2.6441108570795717e-06,
"loss": 0.1867,
"step": 3340
},
{
"epoch": 3.4645628556647696,
"grad_norm": 0.8824280500411987,
"learning_rate": 2.6123292343814345e-06,
"loss": 0.1853,
"step": 3350
},
{
"epoch": 3.4749094671495087,
"grad_norm": 0.8550218343734741,
"learning_rate": 2.5806720901224474e-06,
"loss": 0.1856,
"step": 3360
},
{
"epoch": 3.4852560786342472,
"grad_norm": 0.9069307446479797,
"learning_rate": 2.549141074711019e-06,
"loss": 0.1845,
"step": 3370
},
{
"epoch": 3.495602690118986,
"grad_norm": 0.891136646270752,
"learning_rate": 2.5177378319799707e-06,
"loss": 0.1826,
"step": 3380
},
{
"epoch": 3.505949301603725,
"grad_norm": 0.8266403079032898,
"learning_rate": 2.4864639991008526e-06,
"loss": 0.1849,
"step": 3390
},
{
"epoch": 3.5162959130884635,
"grad_norm": 0.8583815693855286,
"learning_rate": 2.4553212064985776e-06,
"loss": 0.1867,
"step": 3400
},
{
"epoch": 3.5266425245732025,
"grad_norm": 0.8285118937492371,
"learning_rate": 2.42431107776643e-06,
"loss": 0.1866,
"step": 3410
},
{
"epoch": 3.536989136057941,
"grad_norm": 0.8632713556289673,
"learning_rate": 2.3934352295814094e-06,
"loss": 0.1885,
"step": 3420
},
{
"epoch": 3.5473357475426797,
"grad_norm": 0.8700264692306519,
"learning_rate": 2.3626952716199647e-06,
"loss": 0.1863,
"step": 3430
},
{
"epoch": 3.5576823590274183,
"grad_norm": 0.8530793786048889,
"learning_rate": 2.332092806474061e-06,
"loss": 0.1859,
"step": 3440
},
{
"epoch": 3.5680289705121573,
"grad_norm": 0.8744956254959106,
"learning_rate": 2.301629429567638e-06,
"loss": 0.1835,
"step": 3450
},
{
"epoch": 3.578375581996896,
"grad_norm": 0.919748067855835,
"learning_rate": 2.2713067290734262e-06,
"loss": 0.1802,
"step": 3460
},
{
"epoch": 3.588722193481635,
"grad_norm": 0.8674561977386475,
"learning_rate": 2.2411262858301613e-06,
"loss": 0.1872,
"step": 3470
},
{
"epoch": 3.5990688049663735,
"grad_norm": 0.86203533411026,
"learning_rate": 2.21108967326016e-06,
"loss": 0.1839,
"step": 3480
},
{
"epoch": 3.609415416451112,
"grad_norm": 0.8447973132133484,
"learning_rate": 2.181198457287295e-06,
"loss": 0.1851,
"step": 3490
},
{
"epoch": 3.619762027935851,
"grad_norm": 0.8653585910797119,
"learning_rate": 2.1514541962553533e-06,
"loss": 0.1835,
"step": 3500
},
{
"epoch": 3.6301086394205897,
"grad_norm": 0.8519960045814514,
"learning_rate": 2.1218584408467996e-06,
"loss": 0.1843,
"step": 3510
},
{
"epoch": 3.6404552509053287,
"grad_norm": 0.8715120553970337,
"learning_rate": 2.092412734001932e-06,
"loss": 0.1837,
"step": 3520
},
{
"epoch": 3.6508018623900673,
"grad_norm": 0.8701560497283936,
"learning_rate": 2.06311861083844e-06,
"loss": 0.1826,
"step": 3530
},
{
"epoch": 3.661148473874806,
"grad_norm": 0.8274082541465759,
"learning_rate": 2.03397759857137e-06,
"loss": 0.1832,
"step": 3540
},
{
"epoch": 3.6714950853595445,
"grad_norm": 0.8705015778541565,
"learning_rate": 2.0049912164335155e-06,
"loss": 0.1845,
"step": 3550
},
{
"epoch": 3.6818416968442835,
"grad_norm": 0.8589960932731628,
"learning_rate": 1.9761609755962064e-06,
"loss": 0.1819,
"step": 3560
},
{
"epoch": 3.692188308329022,
"grad_norm": 0.8815021514892578,
"learning_rate": 1.947488379090527e-06,
"loss": 0.1858,
"step": 3570
},
{
"epoch": 3.702534919813761,
"grad_norm": 0.8352708220481873,
"learning_rate": 1.9189749217289576e-06,
"loss": 0.1856,
"step": 3580
},
{
"epoch": 3.7128815312984997,
"grad_norm": 0.8597686886787415,
"learning_rate": 1.890622090027443e-06,
"loss": 0.1871,
"step": 3590
},
{
"epoch": 3.7232281427832383,
"grad_norm": 0.8759775757789612,
"learning_rate": 1.8624313621278984e-06,
"loss": 0.1818,
"step": 3600
},
{
"epoch": 3.7335747542679774,
"grad_norm": 0.8692873120307922,
"learning_rate": 1.8344042077211438e-06,
"loss": 0.1861,
"step": 3610
},
{
"epoch": 3.743921365752716,
"grad_norm": 0.8584245443344116,
"learning_rate": 1.8065420879702888e-06,
"loss": 0.1836,
"step": 3620
},
{
"epoch": 3.754267977237455,
"grad_norm": 0.861050546169281,
"learning_rate": 1.7788464554345468e-06,
"loss": 0.1877,
"step": 3630
},
{
"epoch": 3.7646145887221936,
"grad_norm": 0.879095196723938,
"learning_rate": 1.7513187539935188e-06,
"loss": 0.1861,
"step": 3640
},
{
"epoch": 3.774961200206932,
"grad_norm": 0.8849213719367981,
"learning_rate": 1.7239604187719127e-06,
"loss": 0.1833,
"step": 3650
},
{
"epoch": 3.7853078116916707,
"grad_norm": 0.9280983209609985,
"learning_rate": 1.6967728760647267e-06,
"loss": 0.1829,
"step": 3660
},
{
"epoch": 3.7956544231764098,
"grad_norm": 0.8512122631072998,
"learning_rate": 1.6697575432628842e-06,
"loss": 0.181,
"step": 3670
},
{
"epoch": 3.8060010346611484,
"grad_norm": 0.8548404574394226,
"learning_rate": 1.6429158287793512e-06,
"loss": 0.1804,
"step": 3680
},
{
"epoch": 3.8163476461458874,
"grad_norm": 0.919730007648468,
"learning_rate": 1.6162491319757029e-06,
"loss": 0.1848,
"step": 3690
},
{
"epoch": 3.826694257630626,
"grad_norm": 0.9120166301727295,
"learning_rate": 1.589758843089172e-06,
"loss": 0.1811,
"step": 3700
},
{
"epoch": 3.8370408691153646,
"grad_norm": 0.9141139984130859,
"learning_rate": 1.5634463431601655e-06,
"loss": 0.1815,
"step": 3710
},
{
"epoch": 3.8473874806001036,
"grad_norm": 0.8539378046989441,
"learning_rate": 1.5373130039602753e-06,
"loss": 0.1826,
"step": 3720
},
{
"epoch": 3.857734092084842,
"grad_norm": 0.8497625589370728,
"learning_rate": 1.5113601879207536e-06,
"loss": 0.1826,
"step": 3730
},
{
"epoch": 3.8680807035695812,
"grad_norm": 0.8764408230781555,
"learning_rate": 1.4855892480614903e-06,
"loss": 0.1834,
"step": 3740
},
{
"epoch": 3.87842731505432,
"grad_norm": 0.8945449590682983,
"learning_rate": 1.460001527920467e-06,
"loss": 0.1848,
"step": 3750
},
{
"epoch": 3.8887739265390584,
"grad_norm": 0.890354335308075,
"learning_rate": 1.4345983614837239e-06,
"loss": 0.1805,
"step": 3760
},
{
"epoch": 3.899120538023797,
"grad_norm": 0.8389227986335754,
"learning_rate": 1.4093810731158058e-06,
"loss": 0.1804,
"step": 3770
},
{
"epoch": 3.909467149508536,
"grad_norm": 0.8829708695411682,
"learning_rate": 1.3843509774907222e-06,
"loss": 0.1847,
"step": 3780
},
{
"epoch": 3.9198137609932746,
"grad_norm": 0.8937520384788513,
"learning_rate": 1.359509379523402e-06,
"loss": 0.1835,
"step": 3790
},
{
"epoch": 3.9301603724780136,
"grad_norm": 0.8619959354400635,
"learning_rate": 1.3348575743016735e-06,
"loss": 0.1801,
"step": 3800
},
{
"epoch": 3.9405069839627522,
"grad_norm": 0.8984729051589966,
"learning_rate": 1.3103968470187384e-06,
"loss": 0.1788,
"step": 3810
},
{
"epoch": 3.950853595447491,
"grad_norm": 0.9100914597511292,
"learning_rate": 1.286128472906173e-06,
"loss": 0.1822,
"step": 3820
},
{
"epoch": 3.96120020693223,
"grad_norm": 0.8695194721221924,
"learning_rate": 1.2620537171674418e-06,
"loss": 0.1858,
"step": 3830
},
{
"epoch": 3.9715468184169684,
"grad_norm": 0.950613796710968,
"learning_rate": 1.2381738349119443e-06,
"loss": 0.1849,
"step": 3840
},
{
"epoch": 3.981893429901707,
"grad_norm": 0.8430088758468628,
"learning_rate": 1.2144900710895757e-06,
"loss": 0.1793,
"step": 3850
},
{
"epoch": 3.992240041386446,
"grad_norm": 0.8936494588851929,
"learning_rate": 1.1910036604258268e-06,
"loss": 0.1822,
"step": 3860
},
{
"epoch": 4.002069322296948,
"grad_norm": 0.7884753346443176,
"learning_rate": 1.1677158273574047e-06,
"loss": 0.1719,
"step": 3870
},
{
"epoch": 4.0124159337816865,
"grad_norm": 0.778055727481842,
"learning_rate": 1.1446277859684146e-06,
"loss": 0.1306,
"step": 3880
},
{
"epoch": 4.022762545266425,
"grad_norm": 0.8549284934997559,
"learning_rate": 1.1217407399270459e-06,
"loss": 0.1255,
"step": 3890
},
{
"epoch": 4.033109156751164,
"grad_norm": 0.8523449301719666,
"learning_rate": 1.0990558824228336e-06,
"loss": 0.1266,
"step": 3900
},
{
"epoch": 4.043455768235903,
"grad_norm": 0.816981852054596,
"learning_rate": 1.0765743961044445e-06,
"loss": 0.125,
"step": 3910
},
{
"epoch": 4.053802379720642,
"grad_norm": 0.8329877853393555,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.1256,
"step": 3920
},
{
"epoch": 4.06414899120538,
"grad_norm": 0.8189502954483032,
"learning_rate": 1.0322262145461203e-06,
"loss": 0.1249,
"step": 3930
},
{
"epoch": 4.074495602690119,
"grad_norm": 0.8544175624847412,
"learning_rate": 1.0103618313470642e-06,
"loss": 0.1259,
"step": 3940
},
{
"epoch": 4.0848422141748575,
"grad_norm": 0.8433131575584412,
"learning_rate": 9.88705443295057e-07,
"loss": 0.1265,
"step": 3950
},
{
"epoch": 4.095188825659596,
"grad_norm": 0.84726482629776,
"learning_rate": 9.672581794207141e-07,
"loss": 0.1262,
"step": 3960
},
{
"epoch": 4.1055354371443356,
"grad_norm": 0.8315642476081848,
"learning_rate": 9.460211578521966e-07,
"loss": 0.1247,
"step": 3970
},
{
"epoch": 4.115882048629074,
"grad_norm": 0.8212100267410278,
"learning_rate": 9.249954857569326e-07,
"loss": 0.1243,
"step": 3980
},
{
"epoch": 4.126228660113813,
"grad_norm": 0.9180471897125244,
"learning_rate": 9.041822592838873e-07,
"loss": 0.1237,
"step": 3990
},
{
"epoch": 4.136575271598551,
"grad_norm": 0.8375534415245056,
"learning_rate": 8.835825635064266e-07,
"loss": 0.124,
"step": 4000
},
{
"epoch": 4.14692188308329,
"grad_norm": 0.8534741997718811,
"learning_rate": 8.631974723657344e-07,
"loss": 0.1242,
"step": 4010
},
{
"epoch": 4.157268494568029,
"grad_norm": 0.8509218692779541,
"learning_rate": 8.430280486148368e-07,
"loss": 0.1222,
"step": 4020
},
{
"epoch": 4.167615106052768,
"grad_norm": 0.8550025820732117,
"learning_rate": 8.230753437631889e-07,
"loss": 0.125,
"step": 4030
},
{
"epoch": 4.177961717537507,
"grad_norm": 0.8948724865913391,
"learning_rate": 8.033403980218596e-07,
"loss": 0.1241,
"step": 4040
},
{
"epoch": 4.188308329022245,
"grad_norm": 0.858658492565155,
"learning_rate": 7.838242402492973e-07,
"loss": 0.1237,
"step": 4050
},
{
"epoch": 4.198654940506984,
"grad_norm": 0.8689480423927307,
"learning_rate": 7.645278878976952e-07,
"loss": 0.1272,
"step": 4060
},
{
"epoch": 4.209001551991722,
"grad_norm": 0.8573870062828064,
"learning_rate": 7.454523469599484e-07,
"loss": 0.1254,
"step": 4070
},
{
"epoch": 4.219348163476462,
"grad_norm": 0.9156485199928284,
"learning_rate": 7.265986119172036e-07,
"loss": 0.1227,
"step": 4080
},
{
"epoch": 4.2296947749612,
"grad_norm": 0.9127551317214966,
"learning_rate": 7.079676656870138e-07,
"loss": 0.1272,
"step": 4090
},
{
"epoch": 4.240041386445939,
"grad_norm": 0.8755181431770325,
"learning_rate": 6.895604795720989e-07,
"loss": 0.1273,
"step": 4100
},
{
"epoch": 4.250387997930678,
"grad_norm": 0.8257960677146912,
"learning_rate": 6.713780132097053e-07,
"loss": 0.1248,
"step": 4110
},
{
"epoch": 4.260734609415416,
"grad_norm": 0.8848944902420044,
"learning_rate": 6.534212145215751e-07,
"loss": 0.1234,
"step": 4120
},
{
"epoch": 4.271081220900156,
"grad_norm": 0.8648582100868225,
"learning_rate": 6.356910196645271e-07,
"loss": 0.1251,
"step": 4130
},
{
"epoch": 4.281427832384894,
"grad_norm": 0.8881234526634216,
"learning_rate": 6.181883529816585e-07,
"loss": 0.1275,
"step": 4140
},
{
"epoch": 4.291774443869633,
"grad_norm": 0.8552895784378052,
"learning_rate": 6.009141269541424e-07,
"loss": 0.1242,
"step": 4150
},
{
"epoch": 4.302121055354371,
"grad_norm": 0.8629468679428101,
"learning_rate": 5.838692421536696e-07,
"loss": 0.1228,
"step": 4160
},
{
"epoch": 4.31246766683911,
"grad_norm": 0.8863692283630371,
"learning_rate": 5.67054587195488e-07,
"loss": 0.1232,
"step": 4170
},
{
"epoch": 4.322814278323849,
"grad_norm": 0.8393474221229553,
"learning_rate": 5.504710386920869e-07,
"loss": 0.1225,
"step": 4180
},
{
"epoch": 4.333160889808588,
"grad_norm": 0.8772891759872437,
"learning_rate": 5.341194612074824e-07,
"loss": 0.1248,
"step": 4190
},
{
"epoch": 4.343507501293327,
"grad_norm": 0.8435797095298767,
"learning_rate": 5.180007072121562e-07,
"loss": 0.1233,
"step": 4200
},
{
"epoch": 4.353854112778065,
"grad_norm": 0.860095739364624,
"learning_rate": 5.021156170386021e-07,
"loss": 0.1266,
"step": 4210
},
{
"epoch": 4.364200724262804,
"grad_norm": 0.8484596610069275,
"learning_rate": 4.86465018837532e-07,
"loss": 0.1223,
"step": 4220
},
{
"epoch": 4.374547335747542,
"grad_norm": 0.8837497234344482,
"learning_rate": 4.71049728534681e-07,
"loss": 0.1232,
"step": 4230
},
{
"epoch": 4.384893947232282,
"grad_norm": 0.8328167200088501,
"learning_rate": 4.5587054978828814e-07,
"loss": 0.1285,
"step": 4240
},
{
"epoch": 4.3952405587170205,
"grad_norm": 0.8719879984855652,
"learning_rate": 4.4092827394718485e-07,
"loss": 0.124,
"step": 4250
},
{
"epoch": 4.405587170201759,
"grad_norm": 0.8502692580223083,
"learning_rate": 4.2622368000955207e-07,
"loss": 0.1219,
"step": 4260
},
{
"epoch": 4.415933781686498,
"grad_norm": 0.8925228118896484,
"learning_rate": 4.1175753458229495e-07,
"loss": 0.1266,
"step": 4270
},
{
"epoch": 4.426280393171236,
"grad_norm": 0.8601601719856262,
"learning_rate": 3.9753059184108547e-07,
"loss": 0.1229,
"step": 4280
},
{
"epoch": 4.436627004655975,
"grad_norm": 0.8607948422431946,
"learning_rate": 3.8354359349104143e-07,
"loss": 0.1254,
"step": 4290
},
{
"epoch": 4.446973616140714,
"grad_norm": 0.8845228552818298,
"learning_rate": 3.697972687280599e-07,
"loss": 0.1245,
"step": 4300
},
{
"epoch": 4.457320227625453,
"grad_norm": 0.9021503329277039,
"learning_rate": 3.5629233420079733e-07,
"loss": 0.1242,
"step": 4310
},
{
"epoch": 4.4676668391101915,
"grad_norm": 0.8827393054962158,
"learning_rate": 3.430294939733131e-07,
"loss": 0.1265,
"step": 4320
},
{
"epoch": 4.47801345059493,
"grad_norm": 0.9011342525482178,
"learning_rate": 3.3000943948836183e-07,
"loss": 0.1245,
"step": 4330
},
{
"epoch": 4.488360062079669,
"grad_norm": 0.8301414251327515,
"learning_rate": 3.1723284953134594e-07,
"loss": 0.125,
"step": 4340
},
{
"epoch": 4.498706673564408,
"grad_norm": 0.8322227597236633,
"learning_rate": 3.047003901949258e-07,
"loss": 0.1231,
"step": 4350
},
{
"epoch": 4.509053285049147,
"grad_norm": 0.8764147758483887,
"learning_rate": 2.9241271484429736e-07,
"loss": 0.1229,
"step": 4360
},
{
"epoch": 4.519399896533885,
"grad_norm": 0.8860657215118408,
"learning_rate": 2.803704640831284e-07,
"loss": 0.1232,
"step": 4370
},
{
"epoch": 4.529746508018624,
"grad_norm": 0.9050237536430359,
"learning_rate": 2.685742657201601e-07,
"loss": 0.1221,
"step": 4380
},
{
"epoch": 4.5400931195033625,
"grad_norm": 0.8230318427085876,
"learning_rate": 2.570247347364779e-07,
"loss": 0.1221,
"step": 4390
},
{
"epoch": 4.550439730988101,
"grad_norm": 0.8647773265838623,
"learning_rate": 2.4572247325345135e-07,
"loss": 0.1234,
"step": 4400
},
{
"epoch": 4.56078634247284,
"grad_norm": 0.833219051361084,
"learning_rate": 2.34668070501341e-07,
"loss": 0.1233,
"step": 4410
},
{
"epoch": 4.571132953957579,
"grad_norm": 0.8522652983665466,
"learning_rate": 2.2386210278858124e-07,
"loss": 0.1212,
"step": 4420
},
{
"epoch": 4.581479565442318,
"grad_norm": 0.9003979563713074,
"learning_rate": 2.1330513347173398e-07,
"loss": 0.1255,
"step": 4430
},
{
"epoch": 4.591826176927056,
"grad_norm": 0.8572881817817688,
"learning_rate": 2.0299771292612014e-07,
"loss": 0.1231,
"step": 4440
},
{
"epoch": 4.602172788411795,
"grad_norm": 0.8578411340713501,
"learning_rate": 1.9294037851712465e-07,
"loss": 0.1236,
"step": 4450
},
{
"epoch": 4.612519399896534,
"grad_norm": 0.8606471419334412,
"learning_rate": 1.83133654572184e-07,
"loss": 0.1249,
"step": 4460
},
{
"epoch": 4.622866011381273,
"grad_norm": 0.8600118160247803,
"learning_rate": 1.7357805235344694e-07,
"loss": 0.1231,
"step": 4470
},
{
"epoch": 4.6332126228660115,
"grad_norm": 0.848580539226532,
"learning_rate": 1.6427407003112517e-07,
"loss": 0.1212,
"step": 4480
},
{
"epoch": 4.64355923435075,
"grad_norm": 0.8755767941474915,
"learning_rate": 1.5522219265751925e-07,
"loss": 0.1208,
"step": 4490
},
{
"epoch": 4.653905845835489,
"grad_norm": 0.8640385866165161,
"learning_rate": 1.464228921417321e-07,
"loss": 0.1202,
"step": 4500
},
{
"epoch": 4.664252457320227,
"grad_norm": 0.8280548453330994,
"learning_rate": 1.3787662722506256e-07,
"loss": 0.1206,
"step": 4510
},
{
"epoch": 4.674599068804966,
"grad_norm": 0.8286001682281494,
"learning_rate": 1.295838434570973e-07,
"loss": 0.1211,
"step": 4520
},
{
"epoch": 4.684945680289705,
"grad_norm": 0.8765918612480164,
"learning_rate": 1.2154497317247439e-07,
"loss": 0.1238,
"step": 4530
},
{
"epoch": 4.695292291774444,
"grad_norm": 0.9076476693153381,
"learning_rate": 1.137604354683497e-07,
"loss": 0.1261,
"step": 4540
},
{
"epoch": 4.7056389032591825,
"grad_norm": 0.8170851469039917,
"learning_rate": 1.0623063618254548e-07,
"loss": 0.1222,
"step": 4550
},
{
"epoch": 4.715985514743921,
"grad_norm": 0.8892994523048401,
"learning_rate": 9.895596787239114e-08,
"loss": 0.1244,
"step": 4560
},
{
"epoch": 4.726332126228661,
"grad_norm": 0.864942729473114,
"learning_rate": 9.193680979426189e-08,
"loss": 0.1227,
"step": 4570
},
{
"epoch": 4.736678737713399,
"grad_norm": 0.873085618019104,
"learning_rate": 8.517352788380173e-08,
"loss": 0.1241,
"step": 4580
},
{
"epoch": 4.747025349198138,
"grad_norm": 0.8733391761779785,
"learning_rate": 7.866647473685041e-08,
"loss": 0.1222,
"step": 4590
},
{
"epoch": 4.757371960682876,
"grad_norm": 0.8656638264656067,
"learning_rate": 7.241598959105645e-08,
"loss": 0.1196,
"step": 4600
},
{
"epoch": 4.767718572167615,
"grad_norm": 0.8693123459815979,
"learning_rate": 6.642239830819574e-08,
"loss": 0.123,
"step": 4610
},
{
"epoch": 4.7780651836523536,
"grad_norm": 0.8679444789886475,
"learning_rate": 6.068601335718127e-08,
"loss": 0.1236,
"step": 4620
},
{
"epoch": 4.788411795137092,
"grad_norm": 0.9099794030189514,
"learning_rate": 5.520713379777276e-08,
"loss": 0.1237,
"step": 4630
},
{
"epoch": 4.798758406621832,
"grad_norm": 0.8581921458244324,
"learning_rate": 4.9986045264984185e-08,
"loss": 0.1235,
"step": 4640
},
{
"epoch": 4.80910501810657,
"grad_norm": 0.8875443339347839,
"learning_rate": 4.5023019954197334e-08,
"loss": 0.121,
"step": 4650
},
{
"epoch": 4.819451629591309,
"grad_norm": 0.9026442170143127,
"learning_rate": 4.031831660696484e-08,
"loss": 0.1232,
"step": 4660
},
{
"epoch": 4.829798241076047,
"grad_norm": 0.8439733982086182,
"learning_rate": 3.5872180497526497e-08,
"loss": 0.1204,
"step": 4670
},
{
"epoch": 4.840144852560786,
"grad_norm": 0.8329970240592957,
"learning_rate": 3.1684843420017316e-08,
"loss": 0.1251,
"step": 4680
},
{
"epoch": 4.8504914640455254,
"grad_norm": 0.8968592286109924,
"learning_rate": 2.775652367638826e-08,
"loss": 0.1207,
"step": 4690
},
{
"epoch": 4.860838075530264,
"grad_norm": 0.8602431416511536,
"learning_rate": 2.4087426065020902e-08,
"loss": 0.1223,
"step": 4700
},
{
"epoch": 4.871184687015003,
"grad_norm": 0.8694334626197815,
"learning_rate": 2.0677741870053227e-08,
"loss": 0.1247,
"step": 4710
},
{
"epoch": 4.881531298499741,
"grad_norm": 0.8522930145263672,
"learning_rate": 1.7527648851406453e-08,
"loss": 0.1245,
"step": 4720
},
{
"epoch": 4.89187790998448,
"grad_norm": 0.8731361627578735,
"learning_rate": 1.4637311235516926e-08,
"loss": 0.121,
"step": 4730
},
{
"epoch": 4.902224521469218,
"grad_norm": 0.8769620656967163,
"learning_rate": 1.2006879706776276e-08,
"loss": 0.1225,
"step": 4740
},
{
"epoch": 4.912571132953958,
"grad_norm": 0.8584715127944946,
"learning_rate": 9.636491399673264e-09,
"loss": 0.1194,
"step": 4750
},
{
"epoch": 4.9229177444386965,
"grad_norm": 0.8418834805488586,
"learning_rate": 7.526269891646176e-09,
"loss": 0.122,
"step": 4760
},
{
"epoch": 4.933264355923435,
"grad_norm": 0.8689851760864258,
"learning_rate": 5.676325196640187e-09,
"loss": 0.1208,
"step": 4770
},
{
"epoch": 4.943610967408174,
"grad_norm": 0.8356772661209106,
"learning_rate": 4.086753759370288e-09,
"loss": 0.1204,
"step": 4780
},
{
"epoch": 4.953957578892912,
"grad_norm": 0.9326891303062439,
"learning_rate": 2.757638450295308e-09,
"loss": 0.125,
"step": 4790
},
{
"epoch": 4.964304190377652,
"grad_norm": 0.892158031463623,
"learning_rate": 1.6890485612963691e-09,
"loss": 0.1217,
"step": 4800
},
{
"epoch": 4.97465080186239,
"grad_norm": 0.8835722208023071,
"learning_rate": 8.810398020653311e-10,
"loss": 0.1231,
"step": 4810
},
{
"epoch": 4.984997413347129,
"grad_norm": 0.8562841415405273,
"learning_rate": 3.3365429719933727e-10,
"loss": 0.1219,
"step": 4820
},
{
"epoch": 4.9953440248318675,
"grad_norm": 0.8547988533973694,
"learning_rate": 4.692058400479482e-11,
"loss": 0.1233,
"step": 4830
},
{
"epoch": 5.0,
"step": 4835,
"total_flos": 7.821379912118514e+19,
"train_loss": 0.2716523540056972,
"train_runtime": 48548.948,
"train_samples_per_second": 6.369,
"train_steps_per_second": 0.1
}
],
"logging_steps": 10,
"max_steps": 4835,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.821379912118514e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}