Munnafaisal's picture
Upload folder using huggingface_hub
5b87fe6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.2558087074216107,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022558087074216106,
"grad_norm": 2.141827344894409,
"learning_rate": 0.0002998781863297992,
"loss": 0.7581,
"mean_token_accuracy": 0.8328841328620911,
"num_tokens": 20380.0,
"step": 10
},
{
"epoch": 0.004511617414843221,
"grad_norm": 1.2108855247497559,
"learning_rate": 0.0002997428378073539,
"loss": 0.5535,
"mean_token_accuracy": 0.8761807084083557,
"num_tokens": 30607.0,
"step": 20
},
{
"epoch": 0.006767426122264832,
"grad_norm": 1.2379798889160156,
"learning_rate": 0.0002996074892849086,
"loss": 0.5122,
"mean_token_accuracy": 0.8828346908092499,
"num_tokens": 40842.0,
"step": 30
},
{
"epoch": 0.009023234829686443,
"grad_norm": 1.1645982265472412,
"learning_rate": 0.0002994721407624633,
"loss": 0.5342,
"mean_token_accuracy": 0.8773131012916565,
"num_tokens": 51063.0,
"step": 40
},
{
"epoch": 0.011279043537108053,
"grad_norm": 0.6872125267982483,
"learning_rate": 0.000299336792240018,
"loss": 0.4112,
"mean_token_accuracy": 0.9047324001789093,
"num_tokens": 61285.0,
"step": 50
},
{
"epoch": 0.013534852244529664,
"grad_norm": 0.7911831736564636,
"learning_rate": 0.00029920144371757275,
"loss": 0.449,
"mean_token_accuracy": 0.8905096411705017,
"num_tokens": 71440.0,
"step": 60
},
{
"epoch": 0.015790660951951276,
"grad_norm": 0.8042737245559692,
"learning_rate": 0.00029906609519512746,
"loss": 0.5045,
"mean_token_accuracy": 0.8851543009281159,
"num_tokens": 81592.0,
"step": 70
},
{
"epoch": 0.018046469659372885,
"grad_norm": 0.8313478827476501,
"learning_rate": 0.00029893074667268217,
"loss": 0.4416,
"mean_token_accuracy": 0.8926478564739228,
"num_tokens": 91800.0,
"step": 80
},
{
"epoch": 0.020302278366794498,
"grad_norm": 0.7176661491394043,
"learning_rate": 0.0002987953981502368,
"loss": 0.4081,
"mean_token_accuracy": 0.8967904210090637,
"num_tokens": 102030.0,
"step": 90
},
{
"epoch": 0.022558087074216106,
"grad_norm": 0.6425905227661133,
"learning_rate": 0.00029866004962779153,
"loss": 0.445,
"mean_token_accuracy": 0.8905546844005585,
"num_tokens": 112153.0,
"step": 100
},
{
"epoch": 0.02481389578163772,
"grad_norm": 0.4888269305229187,
"learning_rate": 0.00029852470110534624,
"loss": 0.4923,
"mean_token_accuracy": 0.8825848281383515,
"num_tokens": 122331.0,
"step": 110
},
{
"epoch": 0.027069704489059328,
"grad_norm": 0.5978611707687378,
"learning_rate": 0.00029838935258290095,
"loss": 0.362,
"mean_token_accuracy": 0.9085110425949097,
"num_tokens": 132562.0,
"step": 120
},
{
"epoch": 0.02932551319648094,
"grad_norm": 0.5825199484825134,
"learning_rate": 0.0002982540040604556,
"loss": 0.322,
"mean_token_accuracy": 0.911745023727417,
"num_tokens": 142724.0,
"step": 130
},
{
"epoch": 0.03158132190390255,
"grad_norm": 0.5778432488441467,
"learning_rate": 0.0002981186555380103,
"loss": 0.3264,
"mean_token_accuracy": 0.9135629117488862,
"num_tokens": 152950.0,
"step": 140
},
{
"epoch": 0.03383713061132416,
"grad_norm": 0.8815613389015198,
"learning_rate": 0.0002979833070155651,
"loss": 0.3902,
"mean_token_accuracy": 0.9044237017631531,
"num_tokens": 162418.0,
"step": 150
},
{
"epoch": 0.03609293931874577,
"grad_norm": 0.5860086679458618,
"learning_rate": 0.0002978479584931198,
"loss": 0.3639,
"mean_token_accuracy": 0.903467881679535,
"num_tokens": 172644.0,
"step": 160
},
{
"epoch": 0.03834874802616738,
"grad_norm": 0.8777345418930054,
"learning_rate": 0.0002977126099706745,
"loss": 0.3447,
"mean_token_accuracy": 0.9112663745880127,
"num_tokens": 182800.0,
"step": 170
},
{
"epoch": 0.040604556733588995,
"grad_norm": 0.5808679461479187,
"learning_rate": 0.00029757726144822916,
"loss": 0.3847,
"mean_token_accuracy": 0.9011166810989379,
"num_tokens": 192976.0,
"step": 180
},
{
"epoch": 0.0428603654410106,
"grad_norm": 0.8659303784370422,
"learning_rate": 0.00029744191292578387,
"loss": 0.3495,
"mean_token_accuracy": 0.9079599380493164,
"num_tokens": 203148.0,
"step": 190
},
{
"epoch": 0.04511617414843221,
"grad_norm": 0.6106019616127014,
"learning_rate": 0.0002973065644033386,
"loss": 0.3652,
"mean_token_accuracy": 0.9020362496376038,
"num_tokens": 213312.0,
"step": 200
},
{
"epoch": 0.047371982855853825,
"grad_norm": 0.4957314431667328,
"learning_rate": 0.0002971712158808933,
"loss": 0.4095,
"mean_token_accuracy": 0.8981463789939881,
"num_tokens": 223516.0,
"step": 210
},
{
"epoch": 0.04962779156327544,
"grad_norm": 0.7348946928977966,
"learning_rate": 0.00029703586735844794,
"loss": 0.3619,
"mean_token_accuracy": 0.9077386081218719,
"num_tokens": 233691.0,
"step": 220
},
{
"epoch": 0.05188360027069704,
"grad_norm": 1.0192636251449585,
"learning_rate": 0.00029690051883600265,
"loss": 0.3664,
"mean_token_accuracy": 0.9049350261688233,
"num_tokens": 243886.0,
"step": 230
},
{
"epoch": 0.054139408978118655,
"grad_norm": 0.6710547804832458,
"learning_rate": 0.00029676517031355736,
"loss": 0.4232,
"mean_token_accuracy": 0.8985056221485138,
"num_tokens": 254069.0,
"step": 240
},
{
"epoch": 0.05639521768554027,
"grad_norm": 0.49837666749954224,
"learning_rate": 0.0002966298217911121,
"loss": 0.391,
"mean_token_accuracy": 0.8973807156085968,
"num_tokens": 264303.0,
"step": 250
},
{
"epoch": 0.05865102639296188,
"grad_norm": 0.5633454918861389,
"learning_rate": 0.0002964944732686668,
"loss": 0.3754,
"mean_token_accuracy": 0.9069048821926117,
"num_tokens": 274483.0,
"step": 260
},
{
"epoch": 0.060906835100383486,
"grad_norm": 0.4087347686290741,
"learning_rate": 0.0002963591247462215,
"loss": 0.2673,
"mean_token_accuracy": 0.9232019662857056,
"num_tokens": 284690.0,
"step": 270
},
{
"epoch": 0.0631626438078051,
"grad_norm": 0.5431217551231384,
"learning_rate": 0.0002962237762237762,
"loss": 0.4413,
"mean_token_accuracy": 0.895687735080719,
"num_tokens": 294929.0,
"step": 280
},
{
"epoch": 0.06541845251522671,
"grad_norm": 0.646186351776123,
"learning_rate": 0.0002960884277013309,
"loss": 0.3284,
"mean_token_accuracy": 0.9093968510627747,
"num_tokens": 305144.0,
"step": 290
},
{
"epoch": 0.06767426122264832,
"grad_norm": 0.8424251079559326,
"learning_rate": 0.00029595307917888557,
"loss": 0.3412,
"mean_token_accuracy": 0.9124035537242889,
"num_tokens": 315322.0,
"step": 300
},
{
"epoch": 0.06993006993006994,
"grad_norm": 0.4986512362957001,
"learning_rate": 0.0002958177306564403,
"loss": 0.3524,
"mean_token_accuracy": 0.9054706990718842,
"num_tokens": 325526.0,
"step": 310
},
{
"epoch": 0.07218587863749154,
"grad_norm": 0.3398238718509674,
"learning_rate": 0.000295682382133995,
"loss": 0.3065,
"mean_token_accuracy": 0.9175658822059631,
"num_tokens": 335687.0,
"step": 320
},
{
"epoch": 0.07444168734491315,
"grad_norm": 0.5017867088317871,
"learning_rate": 0.0002955470336115497,
"loss": 0.409,
"mean_token_accuracy": 0.8957060396671295,
"num_tokens": 345870.0,
"step": 330
},
{
"epoch": 0.07669749605233477,
"grad_norm": 0.6093907356262207,
"learning_rate": 0.00029541168508910446,
"loss": 0.2775,
"mean_token_accuracy": 0.9198772490024567,
"num_tokens": 356074.0,
"step": 340
},
{
"epoch": 0.07895330475975637,
"grad_norm": 0.6241074800491333,
"learning_rate": 0.0002952763365666591,
"loss": 0.3153,
"mean_token_accuracy": 0.9163550198078155,
"num_tokens": 366305.0,
"step": 350
},
{
"epoch": 0.08120911346717799,
"grad_norm": 0.5702779293060303,
"learning_rate": 0.0002951409880442138,
"loss": 0.2969,
"mean_token_accuracy": 0.9146891295909881,
"num_tokens": 376503.0,
"step": 360
},
{
"epoch": 0.0834649221745996,
"grad_norm": 0.9562923908233643,
"learning_rate": 0.00029500563952176853,
"loss": 0.3551,
"mean_token_accuracy": 0.9118620991706848,
"num_tokens": 386698.0,
"step": 370
},
{
"epoch": 0.0857207308820212,
"grad_norm": 0.4826742112636566,
"learning_rate": 0.00029487029099932324,
"loss": 0.2599,
"mean_token_accuracy": 0.9280467808246613,
"num_tokens": 396740.0,
"step": 380
},
{
"epoch": 0.08797653958944282,
"grad_norm": 0.4703806936740875,
"learning_rate": 0.0002947349424768779,
"loss": 0.3232,
"mean_token_accuracy": 0.9132184386253357,
"num_tokens": 406940.0,
"step": 390
},
{
"epoch": 0.09023234829686443,
"grad_norm": 0.6047152876853943,
"learning_rate": 0.0002945995939544326,
"loss": 0.3714,
"mean_token_accuracy": 0.9036615669727326,
"num_tokens": 417166.0,
"step": 400
},
{
"epoch": 0.09248815700428603,
"grad_norm": 0.49187320470809937,
"learning_rate": 0.0002944642454319873,
"loss": 0.3322,
"mean_token_accuracy": 0.9098089098930359,
"num_tokens": 427377.0,
"step": 410
},
{
"epoch": 0.09474396571170765,
"grad_norm": 0.3881978988647461,
"learning_rate": 0.00029432889690954203,
"loss": 0.3955,
"mean_token_accuracy": 0.9027588307857514,
"num_tokens": 437556.0,
"step": 420
},
{
"epoch": 0.09699977441912926,
"grad_norm": 0.6978868842124939,
"learning_rate": 0.00029419354838709674,
"loss": 0.3128,
"mean_token_accuracy": 0.9132681012153625,
"num_tokens": 447662.0,
"step": 430
},
{
"epoch": 0.09925558312655088,
"grad_norm": 0.47630035877227783,
"learning_rate": 0.00029405819986465145,
"loss": 0.3218,
"mean_token_accuracy": 0.9099625885486603,
"num_tokens": 457829.0,
"step": 440
},
{
"epoch": 0.10151139183397248,
"grad_norm": 0.6838335990905762,
"learning_rate": 0.00029392285134220616,
"loss": 0.3311,
"mean_token_accuracy": 0.9098457574844361,
"num_tokens": 468031.0,
"step": 450
},
{
"epoch": 0.10376720054139409,
"grad_norm": 0.49004867672920227,
"learning_rate": 0.00029378750281976087,
"loss": 0.3187,
"mean_token_accuracy": 0.910044276714325,
"num_tokens": 478268.0,
"step": 460
},
{
"epoch": 0.1060230092488157,
"grad_norm": 0.4654693007469177,
"learning_rate": 0.0002936521542973156,
"loss": 0.3103,
"mean_token_accuracy": 0.9164334952831268,
"num_tokens": 488472.0,
"step": 470
},
{
"epoch": 0.10827881795623731,
"grad_norm": 0.45917677879333496,
"learning_rate": 0.00029351680577487023,
"loss": 0.2829,
"mean_token_accuracy": 0.9221859157085419,
"num_tokens": 498643.0,
"step": 480
},
{
"epoch": 0.11053462666365892,
"grad_norm": 0.5772504210472107,
"learning_rate": 0.00029338145725242494,
"loss": 0.2643,
"mean_token_accuracy": 0.9243384599685669,
"num_tokens": 508807.0,
"step": 490
},
{
"epoch": 0.11279043537108054,
"grad_norm": 0.39301231503486633,
"learning_rate": 0.00029324610872997965,
"loss": 0.3203,
"mean_token_accuracy": 0.9116944551467896,
"num_tokens": 519035.0,
"step": 500
},
{
"epoch": 0.11504624407850214,
"grad_norm": 0.5303699374198914,
"learning_rate": 0.00029311076020753436,
"loss": 0.2946,
"mean_token_accuracy": 0.9144574999809265,
"num_tokens": 529274.0,
"step": 510
},
{
"epoch": 0.11730205278592376,
"grad_norm": 0.7598997354507446,
"learning_rate": 0.0002929754116850891,
"loss": 0.2854,
"mean_token_accuracy": 0.916255134344101,
"num_tokens": 539479.0,
"step": 520
},
{
"epoch": 0.11955786149334537,
"grad_norm": 0.5954500436782837,
"learning_rate": 0.0002928400631626438,
"loss": 0.2921,
"mean_token_accuracy": 0.9176213085651398,
"num_tokens": 549688.0,
"step": 530
},
{
"epoch": 0.12181367020076697,
"grad_norm": 0.4145963191986084,
"learning_rate": 0.0002927047146401985,
"loss": 0.3326,
"mean_token_accuracy": 0.9075924575328826,
"num_tokens": 559924.0,
"step": 540
},
{
"epoch": 0.12406947890818859,
"grad_norm": 0.4656633138656616,
"learning_rate": 0.0002925693661177532,
"loss": 0.3169,
"mean_token_accuracy": 0.915722268819809,
"num_tokens": 570150.0,
"step": 550
},
{
"epoch": 0.1263252876156102,
"grad_norm": 0.5974048972129822,
"learning_rate": 0.00029243401759530786,
"loss": 0.3155,
"mean_token_accuracy": 0.9142574846744538,
"num_tokens": 580375.0,
"step": 560
},
{
"epoch": 0.1285810963230318,
"grad_norm": 0.6624695658683777,
"learning_rate": 0.00029229866907286257,
"loss": 0.2879,
"mean_token_accuracy": 0.9158149421215057,
"num_tokens": 590551.0,
"step": 570
},
{
"epoch": 0.13083690503045342,
"grad_norm": 0.3690856099128723,
"learning_rate": 0.0002921633205504173,
"loss": 0.2915,
"mean_token_accuracy": 0.9185068488121033,
"num_tokens": 600656.0,
"step": 580
},
{
"epoch": 0.13309271373787504,
"grad_norm": 0.45435959100723267,
"learning_rate": 0.000292027972027972,
"loss": 0.3514,
"mean_token_accuracy": 0.9071869254112244,
"num_tokens": 610856.0,
"step": 590
},
{
"epoch": 0.13534852244529663,
"grad_norm": 0.9437525868415833,
"learning_rate": 0.0002918926235055267,
"loss": 0.2643,
"mean_token_accuracy": 0.9242019116878509,
"num_tokens": 621079.0,
"step": 600
},
{
"epoch": 0.13760433115271825,
"grad_norm": 0.6915507912635803,
"learning_rate": 0.0002917572749830814,
"loss": 0.3138,
"mean_token_accuracy": 0.9152492702007293,
"num_tokens": 631319.0,
"step": 610
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.5025702714920044,
"learning_rate": 0.0002916219264606361,
"loss": 0.3617,
"mean_token_accuracy": 0.9049429833889008,
"num_tokens": 641505.0,
"step": 620
},
{
"epoch": 0.14211594856756146,
"grad_norm": 0.46047383546829224,
"learning_rate": 0.00029148657793819083,
"loss": 0.3169,
"mean_token_accuracy": 0.9166115164756775,
"num_tokens": 651730.0,
"step": 630
},
{
"epoch": 0.14437175727498308,
"grad_norm": 0.7841493487358093,
"learning_rate": 0.00029135122941574554,
"loss": 0.3393,
"mean_token_accuracy": 0.9131287157535553,
"num_tokens": 661844.0,
"step": 640
},
{
"epoch": 0.1466275659824047,
"grad_norm": 0.402266263961792,
"learning_rate": 0.0002912158808933002,
"loss": 0.3463,
"mean_token_accuracy": 0.9074052631855011,
"num_tokens": 672009.0,
"step": 650
},
{
"epoch": 0.1488833746898263,
"grad_norm": 0.49512675404548645,
"learning_rate": 0.0002910805323708549,
"loss": 0.2876,
"mean_token_accuracy": 0.9215861678123474,
"num_tokens": 682162.0,
"step": 660
},
{
"epoch": 0.1511391833972479,
"grad_norm": 0.3438393175601959,
"learning_rate": 0.0002909451838484096,
"loss": 0.2377,
"mean_token_accuracy": 0.9303798198699951,
"num_tokens": 692399.0,
"step": 670
},
{
"epoch": 0.15339499210466953,
"grad_norm": 0.29958051443099976,
"learning_rate": 0.0002908098353259643,
"loss": 0.296,
"mean_token_accuracy": 0.9132875919342041,
"num_tokens": 702578.0,
"step": 680
},
{
"epoch": 0.15565080081209112,
"grad_norm": 0.4435781240463257,
"learning_rate": 0.00029067448680351903,
"loss": 0.3736,
"mean_token_accuracy": 0.8985945582389832,
"num_tokens": 712750.0,
"step": 690
},
{
"epoch": 0.15790660951951274,
"grad_norm": 0.8465012311935425,
"learning_rate": 0.00029053913828107374,
"loss": 0.2601,
"mean_token_accuracy": 0.9246440231800079,
"num_tokens": 722979.0,
"step": 700
},
{
"epoch": 0.16016241822693436,
"grad_norm": 0.6590988636016846,
"learning_rate": 0.00029040378975862845,
"loss": 0.3242,
"mean_token_accuracy": 0.9071028470993042,
"num_tokens": 733197.0,
"step": 710
},
{
"epoch": 0.16241822693435598,
"grad_norm": 0.6681052446365356,
"learning_rate": 0.00029026844123618316,
"loss": 0.2906,
"mean_token_accuracy": 0.9161325991153717,
"num_tokens": 743426.0,
"step": 720
},
{
"epoch": 0.16467403564177757,
"grad_norm": 0.3675503134727478,
"learning_rate": 0.0002901330927137378,
"loss": 0.2886,
"mean_token_accuracy": 0.9143774032592773,
"num_tokens": 753654.0,
"step": 730
},
{
"epoch": 0.1669298443491992,
"grad_norm": 0.38154202699661255,
"learning_rate": 0.00028999774419129253,
"loss": 0.389,
"mean_token_accuracy": 0.9036036729812622,
"num_tokens": 763516.0,
"step": 740
},
{
"epoch": 0.1691856530566208,
"grad_norm": 0.86027991771698,
"learning_rate": 0.00028986239566884724,
"loss": 0.288,
"mean_token_accuracy": 0.918215936422348,
"num_tokens": 773748.0,
"step": 750
},
{
"epoch": 0.1714414617640424,
"grad_norm": 0.33669188618659973,
"learning_rate": 0.00028972704714640195,
"loss": 0.2693,
"mean_token_accuracy": 0.9215926826000214,
"num_tokens": 783906.0,
"step": 760
},
{
"epoch": 0.17369727047146402,
"grad_norm": 0.534418523311615,
"learning_rate": 0.00028959169862395666,
"loss": 0.2914,
"mean_token_accuracy": 0.9151590466499329,
"num_tokens": 794110.0,
"step": 770
},
{
"epoch": 0.17595307917888564,
"grad_norm": 0.4096381366252899,
"learning_rate": 0.00028945635010151137,
"loss": 0.3095,
"mean_token_accuracy": 0.9127452373504639,
"num_tokens": 804320.0,
"step": 780
},
{
"epoch": 0.17820888788630723,
"grad_norm": 0.4493885636329651,
"learning_rate": 0.0002893210015790661,
"loss": 0.3061,
"mean_token_accuracy": 0.9174495875835419,
"num_tokens": 814498.0,
"step": 790
},
{
"epoch": 0.18046469659372885,
"grad_norm": 0.3796924948692322,
"learning_rate": 0.0002891856530566208,
"loss": 0.3325,
"mean_token_accuracy": 0.9142964720726013,
"num_tokens": 824645.0,
"step": 800
},
{
"epoch": 0.18272050530115047,
"grad_norm": 0.43683764338493347,
"learning_rate": 0.0002890503045341755,
"loss": 0.3365,
"mean_token_accuracy": 0.9071278512477875,
"num_tokens": 834842.0,
"step": 810
},
{
"epoch": 0.18497631400857206,
"grad_norm": 0.7502022385597229,
"learning_rate": 0.00028891495601173015,
"loss": 0.2807,
"mean_token_accuracy": 0.9220738291740418,
"num_tokens": 845039.0,
"step": 820
},
{
"epoch": 0.18723212271599368,
"grad_norm": 0.5301753282546997,
"learning_rate": 0.00028877960748928486,
"loss": 0.3563,
"mean_token_accuracy": 0.9055928111076355,
"num_tokens": 855269.0,
"step": 830
},
{
"epoch": 0.1894879314234153,
"grad_norm": 0.32157281041145325,
"learning_rate": 0.0002886442589668396,
"loss": 0.2889,
"mean_token_accuracy": 0.9182112574577331,
"num_tokens": 865472.0,
"step": 840
},
{
"epoch": 0.1917437401308369,
"grad_norm": 0.4322583079338074,
"learning_rate": 0.0002885089104443943,
"loss": 0.2695,
"mean_token_accuracy": 0.9204015076160431,
"num_tokens": 875623.0,
"step": 850
},
{
"epoch": 0.1939995488382585,
"grad_norm": 1.213575839996338,
"learning_rate": 0.000288373561921949,
"loss": 0.3,
"mean_token_accuracy": 0.9166274607181549,
"num_tokens": 885852.0,
"step": 860
},
{
"epoch": 0.19625535754568013,
"grad_norm": 0.5853558778762817,
"learning_rate": 0.0002882382133995037,
"loss": 0.3285,
"mean_token_accuracy": 0.9095144391059875,
"num_tokens": 896040.0,
"step": 870
},
{
"epoch": 0.19851116625310175,
"grad_norm": 0.6323179602622986,
"learning_rate": 0.0002881028648770584,
"loss": 0.2662,
"mean_token_accuracy": 0.9212876856327057,
"num_tokens": 906277.0,
"step": 880
},
{
"epoch": 0.20076697496052334,
"grad_norm": 0.3301967680454254,
"learning_rate": 0.0002879675163546131,
"loss": 0.2902,
"mean_token_accuracy": 0.9195581197738647,
"num_tokens": 915505.0,
"step": 890
},
{
"epoch": 0.20302278366794496,
"grad_norm": 0.4250761866569519,
"learning_rate": 0.0002878321678321678,
"loss": 0.2529,
"mean_token_accuracy": 0.9280913352966309,
"num_tokens": 925676.0,
"step": 900
},
{
"epoch": 0.20527859237536658,
"grad_norm": 0.4531536102294922,
"learning_rate": 0.0002876968193097225,
"loss": 0.3005,
"mean_token_accuracy": 0.9147605180740357,
"num_tokens": 935916.0,
"step": 910
},
{
"epoch": 0.20753440108278817,
"grad_norm": 0.5399945974349976,
"learning_rate": 0.0002875614707872772,
"loss": 0.2359,
"mean_token_accuracy": 0.9297860860824585,
"num_tokens": 946142.0,
"step": 920
},
{
"epoch": 0.2097902097902098,
"grad_norm": 0.4450409412384033,
"learning_rate": 0.0002874261222648319,
"loss": 0.2989,
"mean_token_accuracy": 0.9163881063461303,
"num_tokens": 956351.0,
"step": 930
},
{
"epoch": 0.2120460184976314,
"grad_norm": 0.3771935999393463,
"learning_rate": 0.0002872907737423866,
"loss": 0.3236,
"mean_token_accuracy": 0.9066317915916443,
"num_tokens": 966521.0,
"step": 940
},
{
"epoch": 0.214301827205053,
"grad_norm": 0.6111851930618286,
"learning_rate": 0.00028715542521994133,
"loss": 0.2397,
"mean_token_accuracy": 0.9319941163063049,
"num_tokens": 975920.0,
"step": 950
},
{
"epoch": 0.21655763591247462,
"grad_norm": 0.7245665788650513,
"learning_rate": 0.00028702007669749604,
"loss": 0.3517,
"mean_token_accuracy": 0.9068562746047973,
"num_tokens": 986149.0,
"step": 960
},
{
"epoch": 0.21881344461989624,
"grad_norm": 0.7466909289360046,
"learning_rate": 0.00028688472817505075,
"loss": 0.2991,
"mean_token_accuracy": 0.9244312167167663,
"num_tokens": 996354.0,
"step": 970
},
{
"epoch": 0.22106925332731783,
"grad_norm": 0.7455071210861206,
"learning_rate": 0.00028674937965260546,
"loss": 0.3036,
"mean_token_accuracy": 0.9157386660575867,
"num_tokens": 1006580.0,
"step": 980
},
{
"epoch": 0.22332506203473945,
"grad_norm": 0.5593414902687073,
"learning_rate": 0.0002866140311301601,
"loss": 0.2942,
"mean_token_accuracy": 0.9155135810375213,
"num_tokens": 1016754.0,
"step": 990
},
{
"epoch": 0.22558087074216107,
"grad_norm": 0.3225398659706116,
"learning_rate": 0.0002864786826077148,
"loss": 0.2647,
"mean_token_accuracy": 0.9190112292766571,
"num_tokens": 1026969.0,
"step": 1000
},
{
"epoch": 0.22783667944958266,
"grad_norm": 1.3003923892974854,
"learning_rate": 0.00028634333408526953,
"loss": 0.3036,
"mean_token_accuracy": 0.9123725950717926,
"num_tokens": 1037080.0,
"step": 1010
},
{
"epoch": 0.23009248815700428,
"grad_norm": 0.4882226884365082,
"learning_rate": 0.00028620798556282424,
"loss": 0.2961,
"mean_token_accuracy": 0.9176577150821685,
"num_tokens": 1047291.0,
"step": 1020
},
{
"epoch": 0.2323482968644259,
"grad_norm": 0.3756118714809418,
"learning_rate": 0.00028607263704037895,
"loss": 0.278,
"mean_token_accuracy": 0.9258804321289062,
"num_tokens": 1057514.0,
"step": 1030
},
{
"epoch": 0.23460410557184752,
"grad_norm": 0.5116491317749023,
"learning_rate": 0.00028593728851793366,
"loss": 0.2741,
"mean_token_accuracy": 0.9172999203205109,
"num_tokens": 1067690.0,
"step": 1040
},
{
"epoch": 0.2368599142792691,
"grad_norm": 0.6103722453117371,
"learning_rate": 0.00028580193999548837,
"loss": 0.3095,
"mean_token_accuracy": 0.9097223103046417,
"num_tokens": 1077902.0,
"step": 1050
},
{
"epoch": 0.23911572298669073,
"grad_norm": 0.36436164379119873,
"learning_rate": 0.0002856665914730431,
"loss": 0.2427,
"mean_token_accuracy": 0.9301242768764496,
"num_tokens": 1088129.0,
"step": 1060
},
{
"epoch": 0.24137153169411235,
"grad_norm": 0.4570798873901367,
"learning_rate": 0.00028553124295059774,
"loss": 0.2629,
"mean_token_accuracy": 0.9264809966087342,
"num_tokens": 1098350.0,
"step": 1070
},
{
"epoch": 0.24362734040153394,
"grad_norm": 0.36314597725868225,
"learning_rate": 0.00028539589442815245,
"loss": 0.3147,
"mean_token_accuracy": 0.916249018907547,
"num_tokens": 1108454.0,
"step": 1080
},
{
"epoch": 0.24588314910895556,
"grad_norm": 0.4478132724761963,
"learning_rate": 0.00028526054590570716,
"loss": 0.2765,
"mean_token_accuracy": 0.9221442401409149,
"num_tokens": 1118688.0,
"step": 1090
},
{
"epoch": 0.24813895781637718,
"grad_norm": 0.42659443616867065,
"learning_rate": 0.00028512519738326187,
"loss": 0.2727,
"mean_token_accuracy": 0.917762154340744,
"num_tokens": 1128861.0,
"step": 1100
},
{
"epoch": 0.2503947665237988,
"grad_norm": 0.4264533221721649,
"learning_rate": 0.0002849898488608166,
"loss": 0.2285,
"mean_token_accuracy": 0.9330006301403045,
"num_tokens": 1139094.0,
"step": 1110
},
{
"epoch": 0.2526505752312204,
"grad_norm": 0.6633393168449402,
"learning_rate": 0.0002848545003383713,
"loss": 0.3234,
"mean_token_accuracy": 0.914995151758194,
"num_tokens": 1149311.0,
"step": 1120
},
{
"epoch": 0.254906383938642,
"grad_norm": 0.4123522937297821,
"learning_rate": 0.000284719151815926,
"loss": 0.3043,
"mean_token_accuracy": 0.9156239628791809,
"num_tokens": 1159539.0,
"step": 1130
},
{
"epoch": 0.2571621926460636,
"grad_norm": 0.5609915256500244,
"learning_rate": 0.0002845838032934807,
"loss": 0.2797,
"mean_token_accuracy": 0.9212962448596954,
"num_tokens": 1169703.0,
"step": 1140
},
{
"epoch": 0.25941800135348525,
"grad_norm": 0.4521436393260956,
"learning_rate": 0.0002844484547710354,
"loss": 0.2507,
"mean_token_accuracy": 0.9307784140110016,
"num_tokens": 1179907.0,
"step": 1150
},
{
"epoch": 0.26167381006090684,
"grad_norm": 0.45429497957229614,
"learning_rate": 0.00028431310624859007,
"loss": 0.258,
"mean_token_accuracy": 0.9245835185050965,
"num_tokens": 1190130.0,
"step": 1160
},
{
"epoch": 0.26392961876832843,
"grad_norm": 0.677976131439209,
"learning_rate": 0.0002841777577261448,
"loss": 0.3474,
"mean_token_accuracy": 0.914616483449936,
"num_tokens": 1200295.0,
"step": 1170
},
{
"epoch": 0.2661854274757501,
"grad_norm": 0.7482908368110657,
"learning_rate": 0.0002840424092036995,
"loss": 0.2255,
"mean_token_accuracy": 0.9334000170230865,
"num_tokens": 1210496.0,
"step": 1180
},
{
"epoch": 0.26844123618317167,
"grad_norm": 0.6767547726631165,
"learning_rate": 0.0002839070606812542,
"loss": 0.2856,
"mean_token_accuracy": 0.9159020364284516,
"num_tokens": 1220649.0,
"step": 1190
},
{
"epoch": 0.27069704489059326,
"grad_norm": 0.5904352068901062,
"learning_rate": 0.0002837717121588089,
"loss": 0.3042,
"mean_token_accuracy": 0.9098004937171936,
"num_tokens": 1230803.0,
"step": 1200
},
{
"epoch": 0.2729528535980149,
"grad_norm": 0.5992192625999451,
"learning_rate": 0.0002836363636363636,
"loss": 0.2964,
"mean_token_accuracy": 0.9183099627494812,
"num_tokens": 1240929.0,
"step": 1210
},
{
"epoch": 0.2752086623054365,
"grad_norm": 1.1047067642211914,
"learning_rate": 0.00028350101511391833,
"loss": 0.2256,
"mean_token_accuracy": 0.9382818222045899,
"num_tokens": 1251084.0,
"step": 1220
},
{
"epoch": 0.2774644710128581,
"grad_norm": 0.30373415350914,
"learning_rate": 0.00028336566659147304,
"loss": 0.2873,
"mean_token_accuracy": 0.9165784835815429,
"num_tokens": 1261319.0,
"step": 1230
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.5601250529289246,
"learning_rate": 0.0002832303180690277,
"loss": 0.3025,
"mean_token_accuracy": 0.915376091003418,
"num_tokens": 1271463.0,
"step": 1240
},
{
"epoch": 0.28197608842770133,
"grad_norm": 0.3787858784198761,
"learning_rate": 0.0002830949695465824,
"loss": 0.2926,
"mean_token_accuracy": 0.9160200178623199,
"num_tokens": 1281613.0,
"step": 1250
},
{
"epoch": 0.2842318971351229,
"grad_norm": 0.41434407234191895,
"learning_rate": 0.0002829596210241371,
"loss": 0.2365,
"mean_token_accuracy": 0.9311029613018036,
"num_tokens": 1291786.0,
"step": 1260
},
{
"epoch": 0.28648770584254457,
"grad_norm": 0.5667926073074341,
"learning_rate": 0.0002828242725016918,
"loss": 0.231,
"mean_token_accuracy": 0.9344726800918579,
"num_tokens": 1301555.0,
"step": 1270
},
{
"epoch": 0.28874351454996616,
"grad_norm": 0.5918124318122864,
"learning_rate": 0.00028268892397924654,
"loss": 0.2174,
"mean_token_accuracy": 0.9305562138557434,
"num_tokens": 1311787.0,
"step": 1280
},
{
"epoch": 0.29099932325738775,
"grad_norm": 0.3638257682323456,
"learning_rate": 0.00028255357545680125,
"loss": 0.2193,
"mean_token_accuracy": 0.9338454186916352,
"num_tokens": 1321926.0,
"step": 1290
},
{
"epoch": 0.2932551319648094,
"grad_norm": 0.3877502977848053,
"learning_rate": 0.00028241822693435596,
"loss": 0.2489,
"mean_token_accuracy": 0.92558473944664,
"num_tokens": 1332162.0,
"step": 1300
},
{
"epoch": 0.295510940672231,
"grad_norm": 0.7278009653091431,
"learning_rate": 0.00028228287841191067,
"loss": 0.2429,
"mean_token_accuracy": 0.9280037820339203,
"num_tokens": 1342350.0,
"step": 1310
},
{
"epoch": 0.2977667493796526,
"grad_norm": 0.43354156613349915,
"learning_rate": 0.0002821475298894654,
"loss": 0.2416,
"mean_token_accuracy": 0.9291642665863037,
"num_tokens": 1352516.0,
"step": 1320
},
{
"epoch": 0.30002255808707423,
"grad_norm": 0.5411070585250854,
"learning_rate": 0.00028201218136702003,
"loss": 0.263,
"mean_token_accuracy": 0.92619389295578,
"num_tokens": 1362735.0,
"step": 1330
},
{
"epoch": 0.3022783667944958,
"grad_norm": 0.40434572100639343,
"learning_rate": 0.00028187683284457474,
"loss": 0.2217,
"mean_token_accuracy": 0.9316042363643646,
"num_tokens": 1372866.0,
"step": 1340
},
{
"epoch": 0.3045341755019174,
"grad_norm": 0.5762608051300049,
"learning_rate": 0.00028174148432212945,
"loss": 0.282,
"mean_token_accuracy": 0.9198409378528595,
"num_tokens": 1383044.0,
"step": 1350
},
{
"epoch": 0.30678998420933906,
"grad_norm": 0.35077670216560364,
"learning_rate": 0.00028160613579968416,
"loss": 0.2627,
"mean_token_accuracy": 0.9248356401920319,
"num_tokens": 1392552.0,
"step": 1360
},
{
"epoch": 0.30904579291676065,
"grad_norm": 0.4673321843147278,
"learning_rate": 0.00028147078727723887,
"loss": 0.2734,
"mean_token_accuracy": 0.9211951434612274,
"num_tokens": 1402739.0,
"step": 1370
},
{
"epoch": 0.31130160162418224,
"grad_norm": 0.68485426902771,
"learning_rate": 0.0002813354387547936,
"loss": 0.2287,
"mean_token_accuracy": 0.9316923320293427,
"num_tokens": 1412889.0,
"step": 1380
},
{
"epoch": 0.3135574103316039,
"grad_norm": 0.48916900157928467,
"learning_rate": 0.0002812000902323483,
"loss": 0.2509,
"mean_token_accuracy": 0.9270589649677277,
"num_tokens": 1423073.0,
"step": 1390
},
{
"epoch": 0.3158132190390255,
"grad_norm": 0.49239906668663025,
"learning_rate": 0.000281064741709903,
"loss": 0.2615,
"mean_token_accuracy": 0.9245356857776642,
"num_tokens": 1433313.0,
"step": 1400
},
{
"epoch": 0.3180690277464471,
"grad_norm": 0.5933843851089478,
"learning_rate": 0.00028092939318745766,
"loss": 0.2958,
"mean_token_accuracy": 0.9148936092853546,
"num_tokens": 1443495.0,
"step": 1410
},
{
"epoch": 0.3203248364538687,
"grad_norm": 0.410168319940567,
"learning_rate": 0.00028079404466501237,
"loss": 0.2521,
"mean_token_accuracy": 0.9283298313617706,
"num_tokens": 1453715.0,
"step": 1420
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.5238193273544312,
"learning_rate": 0.0002806586961425671,
"loss": 0.3188,
"mean_token_accuracy": 0.9118652939796448,
"num_tokens": 1463900.0,
"step": 1430
},
{
"epoch": 0.32483645386871196,
"grad_norm": 0.347003310918808,
"learning_rate": 0.0002805233476201218,
"loss": 0.2707,
"mean_token_accuracy": 0.921840351819992,
"num_tokens": 1474039.0,
"step": 1440
},
{
"epoch": 0.32709226257613355,
"grad_norm": 0.49886128306388855,
"learning_rate": 0.0002803879990976765,
"loss": 0.2386,
"mean_token_accuracy": 0.9320064127445221,
"num_tokens": 1484218.0,
"step": 1450
},
{
"epoch": 0.32934807128355514,
"grad_norm": 0.7472530603408813,
"learning_rate": 0.0002802526505752312,
"loss": 0.2504,
"mean_token_accuracy": 0.9290566742420197,
"num_tokens": 1494448.0,
"step": 1460
},
{
"epoch": 0.3316038799909768,
"grad_norm": 0.39901986718177795,
"learning_rate": 0.0002801173020527859,
"loss": 0.3014,
"mean_token_accuracy": 0.9148159503936768,
"num_tokens": 1504582.0,
"step": 1470
},
{
"epoch": 0.3338596886983984,
"grad_norm": 0.6466052532196045,
"learning_rate": 0.0002799819535303406,
"loss": 0.2645,
"mean_token_accuracy": 0.9249064564704895,
"num_tokens": 1514759.0,
"step": 1480
},
{
"epoch": 0.33611549740582,
"grad_norm": 0.5997007489204407,
"learning_rate": 0.00027984660500789534,
"loss": 0.246,
"mean_token_accuracy": 0.9261491954326629,
"num_tokens": 1524964.0,
"step": 1490
},
{
"epoch": 0.3383713061132416,
"grad_norm": 0.8568662405014038,
"learning_rate": 0.00027971125648545,
"loss": 0.2607,
"mean_token_accuracy": 0.9246239781379699,
"num_tokens": 1535168.0,
"step": 1500
},
{
"epoch": 0.3406271148206632,
"grad_norm": 0.4184776544570923,
"learning_rate": 0.0002795759079630047,
"loss": 0.2996,
"mean_token_accuracy": 0.9106807947158814,
"num_tokens": 1545312.0,
"step": 1510
},
{
"epoch": 0.3428829235280848,
"grad_norm": 0.7694135308265686,
"learning_rate": 0.0002794405594405594,
"loss": 0.2432,
"mean_token_accuracy": 0.9285914778709412,
"num_tokens": 1555532.0,
"step": 1520
},
{
"epoch": 0.34513873223550645,
"grad_norm": 0.5472086071968079,
"learning_rate": 0.0002793052109181141,
"loss": 0.2343,
"mean_token_accuracy": 0.935352087020874,
"num_tokens": 1565702.0,
"step": 1530
},
{
"epoch": 0.34739454094292804,
"grad_norm": 0.5112503170967102,
"learning_rate": 0.00027916986239566883,
"loss": 0.3255,
"mean_token_accuracy": 0.9154317140579223,
"num_tokens": 1574947.0,
"step": 1540
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.3371566832065582,
"learning_rate": 0.00027903451387322354,
"loss": 0.2386,
"mean_token_accuracy": 0.9288936614990234,
"num_tokens": 1585140.0,
"step": 1550
},
{
"epoch": 0.3519061583577713,
"grad_norm": 0.6675296425819397,
"learning_rate": 0.00027889916535077825,
"loss": 0.2715,
"mean_token_accuracy": 0.9202642977237702,
"num_tokens": 1595358.0,
"step": 1560
},
{
"epoch": 0.3541619670651929,
"grad_norm": 0.921124279499054,
"learning_rate": 0.00027876381682833296,
"loss": 0.2739,
"mean_token_accuracy": 0.9199114978313446,
"num_tokens": 1605534.0,
"step": 1570
},
{
"epoch": 0.35641777577261446,
"grad_norm": 0.3380034565925598,
"learning_rate": 0.0002786284683058876,
"loss": 0.2683,
"mean_token_accuracy": 0.9206540703773498,
"num_tokens": 1615751.0,
"step": 1580
},
{
"epoch": 0.3586735844800361,
"grad_norm": 0.6300131678581238,
"learning_rate": 0.0002784931197834423,
"loss": 0.2902,
"mean_token_accuracy": 0.9209059238433838,
"num_tokens": 1625928.0,
"step": 1590
},
{
"epoch": 0.3609293931874577,
"grad_norm": 0.6872547268867493,
"learning_rate": 0.00027835777126099704,
"loss": 0.3625,
"mean_token_accuracy": 0.9109048128128052,
"num_tokens": 1636144.0,
"step": 1600
},
{
"epoch": 0.3631852018948793,
"grad_norm": 0.4150646924972534,
"learning_rate": 0.00027822242273855174,
"loss": 0.2687,
"mean_token_accuracy": 0.9247509896755218,
"num_tokens": 1646331.0,
"step": 1610
},
{
"epoch": 0.36544101060230094,
"grad_norm": 0.44850772619247437,
"learning_rate": 0.00027808707421610645,
"loss": 0.2995,
"mean_token_accuracy": 0.9172296404838562,
"num_tokens": 1656544.0,
"step": 1620
},
{
"epoch": 0.36769681930972253,
"grad_norm": 0.48656392097473145,
"learning_rate": 0.00027795172569366116,
"loss": 0.277,
"mean_token_accuracy": 0.9188252389431,
"num_tokens": 1666778.0,
"step": 1630
},
{
"epoch": 0.3699526280171441,
"grad_norm": 0.5935309529304504,
"learning_rate": 0.0002778163771712159,
"loss": 0.2761,
"mean_token_accuracy": 0.9251180648803711,
"num_tokens": 1676959.0,
"step": 1640
},
{
"epoch": 0.37220843672456577,
"grad_norm": 0.7823290228843689,
"learning_rate": 0.0002776810286487706,
"loss": 0.2672,
"mean_token_accuracy": 0.922256076335907,
"num_tokens": 1687115.0,
"step": 1650
},
{
"epoch": 0.37446424543198736,
"grad_norm": 0.8532506227493286,
"learning_rate": 0.0002775456801263253,
"loss": 0.2739,
"mean_token_accuracy": 0.9220530927181244,
"num_tokens": 1697239.0,
"step": 1660
},
{
"epoch": 0.37672005413940896,
"grad_norm": 0.5606103539466858,
"learning_rate": 0.00027741033160387995,
"loss": 0.2311,
"mean_token_accuracy": 0.9349769711494446,
"num_tokens": 1707409.0,
"step": 1670
},
{
"epoch": 0.3789758628468306,
"grad_norm": 0.4320582151412964,
"learning_rate": 0.00027727498308143466,
"loss": 0.3131,
"mean_token_accuracy": 0.916411018371582,
"num_tokens": 1717583.0,
"step": 1680
},
{
"epoch": 0.3812316715542522,
"grad_norm": 0.5167767405509949,
"learning_rate": 0.00027713963455898937,
"loss": 0.2443,
"mean_token_accuracy": 0.9284303724765778,
"num_tokens": 1727785.0,
"step": 1690
},
{
"epoch": 0.3834874802616738,
"grad_norm": 0.40260276198387146,
"learning_rate": 0.0002770042860365441,
"loss": 0.254,
"mean_token_accuracy": 0.9235418915748597,
"num_tokens": 1738023.0,
"step": 1700
},
{
"epoch": 0.38574328896909543,
"grad_norm": 0.2824370563030243,
"learning_rate": 0.0002768689375140988,
"loss": 0.2086,
"mean_token_accuracy": 0.939811784029007,
"num_tokens": 1748222.0,
"step": 1710
},
{
"epoch": 0.387999097676517,
"grad_norm": 0.37908050417900085,
"learning_rate": 0.0002767335889916535,
"loss": 0.2753,
"mean_token_accuracy": 0.923250812292099,
"num_tokens": 1758431.0,
"step": 1720
},
{
"epoch": 0.3902549063839386,
"grad_norm": 0.5015047192573547,
"learning_rate": 0.0002765982404692082,
"loss": 0.2278,
"mean_token_accuracy": 0.9315616250038147,
"num_tokens": 1768555.0,
"step": 1730
},
{
"epoch": 0.39251071509136026,
"grad_norm": 1.0540778636932373,
"learning_rate": 0.0002764628919467629,
"loss": 0.2969,
"mean_token_accuracy": 0.9173891127109528,
"num_tokens": 1778765.0,
"step": 1740
},
{
"epoch": 0.39476652379878185,
"grad_norm": 0.37756413221359253,
"learning_rate": 0.00027632754342431763,
"loss": 0.2314,
"mean_token_accuracy": 0.9306451320648194,
"num_tokens": 1788880.0,
"step": 1750
},
{
"epoch": 0.3970223325062035,
"grad_norm": 0.3676840662956238,
"learning_rate": 0.0002761921949018723,
"loss": 0.2115,
"mean_token_accuracy": 0.9343382716178894,
"num_tokens": 1799070.0,
"step": 1760
},
{
"epoch": 0.3992781412136251,
"grad_norm": 0.33806851506233215,
"learning_rate": 0.000276056846379427,
"loss": 0.2772,
"mean_token_accuracy": 0.9242539584636689,
"num_tokens": 1809273.0,
"step": 1770
},
{
"epoch": 0.4015339499210467,
"grad_norm": 0.8772180676460266,
"learning_rate": 0.0002759214978569817,
"loss": 0.2342,
"mean_token_accuracy": 0.9347168564796448,
"num_tokens": 1819458.0,
"step": 1780
},
{
"epoch": 0.40378975862846833,
"grad_norm": 0.8381493091583252,
"learning_rate": 0.0002757861493345364,
"loss": 0.2367,
"mean_token_accuracy": 0.9324899137020111,
"num_tokens": 1829636.0,
"step": 1790
},
{
"epoch": 0.4060455673358899,
"grad_norm": 0.3281577527523041,
"learning_rate": 0.0002756508008120911,
"loss": 0.2424,
"mean_token_accuracy": 0.9290665745735168,
"num_tokens": 1839857.0,
"step": 1800
},
{
"epoch": 0.4083013760433115,
"grad_norm": 0.4396291971206665,
"learning_rate": 0.00027551545228964583,
"loss": 0.3203,
"mean_token_accuracy": 0.919403862953186,
"num_tokens": 1850052.0,
"step": 1810
},
{
"epoch": 0.41055718475073316,
"grad_norm": 0.3047076165676117,
"learning_rate": 0.00027538010376720054,
"loss": 0.208,
"mean_token_accuracy": 0.937256783246994,
"num_tokens": 1860236.0,
"step": 1820
},
{
"epoch": 0.41281299345815475,
"grad_norm": 0.592738151550293,
"learning_rate": 0.00027524475524475525,
"loss": 0.2222,
"mean_token_accuracy": 0.9334645926952362,
"num_tokens": 1870414.0,
"step": 1830
},
{
"epoch": 0.41506880216557634,
"grad_norm": 0.30050572752952576,
"learning_rate": 0.0002751094067223099,
"loss": 0.2439,
"mean_token_accuracy": 0.9300739288330078,
"num_tokens": 1880626.0,
"step": 1840
},
{
"epoch": 0.417324610872998,
"grad_norm": 0.39272746443748474,
"learning_rate": 0.0002749740581998646,
"loss": 0.2569,
"mean_token_accuracy": 0.9248918652534485,
"num_tokens": 1890861.0,
"step": 1850
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.3860650062561035,
"learning_rate": 0.00027483870967741933,
"loss": 0.2459,
"mean_token_accuracy": 0.9251980602741241,
"num_tokens": 1901049.0,
"step": 1860
},
{
"epoch": 0.4218362282878412,
"grad_norm": 0.36546534299850464,
"learning_rate": 0.00027470336115497404,
"loss": 0.2296,
"mean_token_accuracy": 0.9312162160873413,
"num_tokens": 1911268.0,
"step": 1870
},
{
"epoch": 0.4240920369952628,
"grad_norm": 0.469930499792099,
"learning_rate": 0.00027456801263252875,
"loss": 0.2448,
"mean_token_accuracy": 0.9255866289138794,
"num_tokens": 1921450.0,
"step": 1880
},
{
"epoch": 0.4263478457026844,
"grad_norm": 0.43246912956237793,
"learning_rate": 0.00027443266411008346,
"loss": 0.2746,
"mean_token_accuracy": 0.9213305950164795,
"num_tokens": 1931629.0,
"step": 1890
},
{
"epoch": 0.428603654410106,
"grad_norm": 0.5429182648658752,
"learning_rate": 0.00027429731558763817,
"loss": 0.29,
"mean_token_accuracy": 0.9243273913860321,
"num_tokens": 1941815.0,
"step": 1900
},
{
"epoch": 0.43085946311752765,
"grad_norm": 0.2941581904888153,
"learning_rate": 0.0002741619670651929,
"loss": 0.2088,
"mean_token_accuracy": 0.9382153749465942,
"num_tokens": 1952040.0,
"step": 1910
},
{
"epoch": 0.43311527182494924,
"grad_norm": 0.3362172544002533,
"learning_rate": 0.0002740266185427476,
"loss": 0.2576,
"mean_token_accuracy": 0.9279570162296296,
"num_tokens": 1962222.0,
"step": 1920
},
{
"epoch": 0.43537108053237084,
"grad_norm": 0.6598260998725891,
"learning_rate": 0.00027389127002030224,
"loss": 0.2667,
"mean_token_accuracy": 0.9221050620079041,
"num_tokens": 1972369.0,
"step": 1930
},
{
"epoch": 0.4376268892397925,
"grad_norm": 0.5338302254676819,
"learning_rate": 0.00027375592149785695,
"loss": 0.2525,
"mean_token_accuracy": 0.9305463373661041,
"num_tokens": 1982601.0,
"step": 1940
},
{
"epoch": 0.4398826979472141,
"grad_norm": 0.5087529420852661,
"learning_rate": 0.00027362057297541166,
"loss": 0.3157,
"mean_token_accuracy": 0.9138010561466217,
"num_tokens": 1992715.0,
"step": 1950
},
{
"epoch": 0.44213850665463567,
"grad_norm": 0.9174799919128418,
"learning_rate": 0.0002734852244529664,
"loss": 0.3144,
"mean_token_accuracy": 0.9178781092166901,
"num_tokens": 2002936.0,
"step": 1960
},
{
"epoch": 0.4443943153620573,
"grad_norm": 0.6057170033454895,
"learning_rate": 0.00027334987593052103,
"loss": 0.2902,
"mean_token_accuracy": 0.9211130678653717,
"num_tokens": 2013103.0,
"step": 1970
},
{
"epoch": 0.4466501240694789,
"grad_norm": 0.2543405592441559,
"learning_rate": 0.0002732145274080758,
"loss": 0.2468,
"mean_token_accuracy": 0.9278612732887268,
"num_tokens": 2023318.0,
"step": 1980
},
{
"epoch": 0.4489059327769005,
"grad_norm": 0.7571674585342407,
"learning_rate": 0.0002730791788856305,
"loss": 0.2915,
"mean_token_accuracy": 0.9226119935512542,
"num_tokens": 2033414.0,
"step": 1990
},
{
"epoch": 0.45116174148432214,
"grad_norm": 0.2644379436969757,
"learning_rate": 0.0002729438303631852,
"loss": 0.2211,
"mean_token_accuracy": 0.9352829337120057,
"num_tokens": 2043581.0,
"step": 2000
},
{
"epoch": 0.45341755019174373,
"grad_norm": 0.39117881655693054,
"learning_rate": 0.00027280848184073987,
"loss": 0.2923,
"mean_token_accuracy": 0.9223450303077698,
"num_tokens": 2053806.0,
"step": 2010
},
{
"epoch": 0.4556733588991653,
"grad_norm": 0.46938055753707886,
"learning_rate": 0.0002726731333182946,
"loss": 0.2391,
"mean_token_accuracy": 0.932051545381546,
"num_tokens": 2064004.0,
"step": 2020
},
{
"epoch": 0.457929167606587,
"grad_norm": 0.5239212512969971,
"learning_rate": 0.0002725377847958493,
"loss": 0.2686,
"mean_token_accuracy": 0.922615134716034,
"num_tokens": 2074201.0,
"step": 2030
},
{
"epoch": 0.46018497631400856,
"grad_norm": 0.5799335241317749,
"learning_rate": 0.000272402436273404,
"loss": 0.28,
"mean_token_accuracy": 0.9241182446479798,
"num_tokens": 2084366.0,
"step": 2040
},
{
"epoch": 0.46244078502143016,
"grad_norm": 0.38910114765167236,
"learning_rate": 0.00027226708775095865,
"loss": 0.2841,
"mean_token_accuracy": 0.9211917340755462,
"num_tokens": 2094603.0,
"step": 2050
},
{
"epoch": 0.4646965937288518,
"grad_norm": 0.5675938725471497,
"learning_rate": 0.00027213173922851336,
"loss": 0.268,
"mean_token_accuracy": 0.9224383175373078,
"num_tokens": 2104772.0,
"step": 2060
},
{
"epoch": 0.4669524024362734,
"grad_norm": 0.3569573163986206,
"learning_rate": 0.00027199639070606813,
"loss": 0.2722,
"mean_token_accuracy": 0.9221753597259521,
"num_tokens": 2114988.0,
"step": 2070
},
{
"epoch": 0.46920821114369504,
"grad_norm": 0.7027791142463684,
"learning_rate": 0.00027186104218362284,
"loss": 0.2583,
"mean_token_accuracy": 0.926697039604187,
"num_tokens": 2125174.0,
"step": 2080
},
{
"epoch": 0.47146401985111663,
"grad_norm": 0.35335201025009155,
"learning_rate": 0.00027172569366117755,
"loss": 0.2886,
"mean_token_accuracy": 0.9233569622039794,
"num_tokens": 2135341.0,
"step": 2090
},
{
"epoch": 0.4737198285585382,
"grad_norm": 0.28648829460144043,
"learning_rate": 0.0002715903451387322,
"loss": 0.2232,
"mean_token_accuracy": 0.9318738996982574,
"num_tokens": 2145570.0,
"step": 2100
},
{
"epoch": 0.47597563726595987,
"grad_norm": 0.264739990234375,
"learning_rate": 0.0002714549966162869,
"loss": 0.271,
"mean_token_accuracy": 0.9208465099334717,
"num_tokens": 2155801.0,
"step": 2110
},
{
"epoch": 0.47823144597338146,
"grad_norm": 0.7098460793495178,
"learning_rate": 0.0002713196480938416,
"loss": 0.2683,
"mean_token_accuracy": 0.9297817826271058,
"num_tokens": 2166020.0,
"step": 2120
},
{
"epoch": 0.48048725468080306,
"grad_norm": 0.4726192355155945,
"learning_rate": 0.00027118429957139633,
"loss": 0.242,
"mean_token_accuracy": 0.9297101378440857,
"num_tokens": 2176220.0,
"step": 2130
},
{
"epoch": 0.4827430633882247,
"grad_norm": 0.42030471563339233,
"learning_rate": 0.000271048951048951,
"loss": 0.2867,
"mean_token_accuracy": 0.9191558003425598,
"num_tokens": 2186379.0,
"step": 2140
},
{
"epoch": 0.4849988720956463,
"grad_norm": 0.2791607677936554,
"learning_rate": 0.0002709136025265057,
"loss": 0.2437,
"mean_token_accuracy": 0.9270163774490356,
"num_tokens": 2196596.0,
"step": 2150
},
{
"epoch": 0.4872546808030679,
"grad_norm": 0.5415903329849243,
"learning_rate": 0.00027077825400406046,
"loss": 0.232,
"mean_token_accuracy": 0.9309392213821411,
"num_tokens": 2206772.0,
"step": 2160
},
{
"epoch": 0.48951048951048953,
"grad_norm": 0.446532666683197,
"learning_rate": 0.00027064290548161517,
"loss": 0.2198,
"mean_token_accuracy": 0.9335504353046418,
"num_tokens": 2216969.0,
"step": 2170
},
{
"epoch": 0.4917662982179111,
"grad_norm": 0.5093636512756348,
"learning_rate": 0.00027050755695916983,
"loss": 0.2644,
"mean_token_accuracy": 0.9249256730079651,
"num_tokens": 2227148.0,
"step": 2180
},
{
"epoch": 0.4940221069253327,
"grad_norm": 0.33594754338264465,
"learning_rate": 0.00027037220843672454,
"loss": 0.2651,
"mean_token_accuracy": 0.9275108754634858,
"num_tokens": 2237357.0,
"step": 2190
},
{
"epoch": 0.49627791563275436,
"grad_norm": 0.3259807229042053,
"learning_rate": 0.00027023685991427925,
"loss": 0.2641,
"mean_token_accuracy": 0.9278050780296325,
"num_tokens": 2247548.0,
"step": 2200
},
{
"epoch": 0.49853372434017595,
"grad_norm": 0.5676048398017883,
"learning_rate": 0.00027010151139183396,
"loss": 0.1965,
"mean_token_accuracy": 0.9407685935497284,
"num_tokens": 2257703.0,
"step": 2210
},
{
"epoch": 0.5007895330475975,
"grad_norm": 0.5289788246154785,
"learning_rate": 0.0002699661628693886,
"loss": 0.2754,
"mean_token_accuracy": 0.9240131139755249,
"num_tokens": 2267831.0,
"step": 2220
},
{
"epoch": 0.5030453417550191,
"grad_norm": 0.8352831602096558,
"learning_rate": 0.0002698308143469433,
"loss": 0.2304,
"mean_token_accuracy": 0.9294693827629089,
"num_tokens": 2277973.0,
"step": 2230
},
{
"epoch": 0.5053011504624408,
"grad_norm": 0.5197424292564392,
"learning_rate": 0.00026969546582449803,
"loss": 0.2337,
"mean_token_accuracy": 0.9289329349994659,
"num_tokens": 2288198.0,
"step": 2240
},
{
"epoch": 0.5075569591698624,
"grad_norm": 0.4816972613334656,
"learning_rate": 0.00026956011730205274,
"loss": 0.243,
"mean_token_accuracy": 0.9350267231464386,
"num_tokens": 2298386.0,
"step": 2250
},
{
"epoch": 0.509812767877284,
"grad_norm": 0.40002062916755676,
"learning_rate": 0.0002694247687796075,
"loss": 0.2129,
"mean_token_accuracy": 0.9361107170581817,
"num_tokens": 2307935.0,
"step": 2260
},
{
"epoch": 0.5120685765847056,
"grad_norm": 0.3949548602104187,
"learning_rate": 0.00026928942025716216,
"loss": 0.2367,
"mean_token_accuracy": 0.9292949497699737,
"num_tokens": 2318169.0,
"step": 2270
},
{
"epoch": 0.5143243852921272,
"grad_norm": 0.6020212173461914,
"learning_rate": 0.00026915407173471687,
"loss": 0.2293,
"mean_token_accuracy": 0.9293237924575806,
"num_tokens": 2328312.0,
"step": 2280
},
{
"epoch": 0.5165801939995488,
"grad_norm": 0.460500031709671,
"learning_rate": 0.0002690187232122716,
"loss": 0.2328,
"mean_token_accuracy": 0.9300384640693664,
"num_tokens": 2338522.0,
"step": 2290
},
{
"epoch": 0.5188360027069705,
"grad_norm": 0.37255415320396423,
"learning_rate": 0.0002688833746898263,
"loss": 0.213,
"mean_token_accuracy": 0.9355834662914276,
"num_tokens": 2348691.0,
"step": 2300
},
{
"epoch": 0.5210918114143921,
"grad_norm": 0.3450946807861328,
"learning_rate": 0.00026874802616738095,
"loss": 0.2733,
"mean_token_accuracy": 0.9193537712097168,
"num_tokens": 2358900.0,
"step": 2310
},
{
"epoch": 0.5233476201218137,
"grad_norm": 0.5672959685325623,
"learning_rate": 0.00026861267764493566,
"loss": 0.2934,
"mean_token_accuracy": 0.9169221520423889,
"num_tokens": 2369048.0,
"step": 2320
},
{
"epoch": 0.5256034288292353,
"grad_norm": 0.349274218082428,
"learning_rate": 0.00026847732912249037,
"loss": 0.2575,
"mean_token_accuracy": 0.9255650520324707,
"num_tokens": 2379209.0,
"step": 2330
},
{
"epoch": 0.5278592375366569,
"grad_norm": 0.7018752694129944,
"learning_rate": 0.0002683419806000451,
"loss": 0.2404,
"mean_token_accuracy": 0.9304228484630584,
"num_tokens": 2389369.0,
"step": 2340
},
{
"epoch": 0.5301150462440785,
"grad_norm": 0.6136831045150757,
"learning_rate": 0.0002682066320775998,
"loss": 0.2828,
"mean_token_accuracy": 0.9221972823143005,
"num_tokens": 2399599.0,
"step": 2350
},
{
"epoch": 0.5323708549515002,
"grad_norm": 0.4122675955295563,
"learning_rate": 0.0002680712835551545,
"loss": 0.2509,
"mean_token_accuracy": 0.9257995843887329,
"num_tokens": 2409765.0,
"step": 2360
},
{
"epoch": 0.5346266636589218,
"grad_norm": 0.6374993324279785,
"learning_rate": 0.0002679359350327092,
"loss": 0.233,
"mean_token_accuracy": 0.9314781188964844,
"num_tokens": 2419954.0,
"step": 2370
},
{
"epoch": 0.5368824723663433,
"grad_norm": 0.5488554239273071,
"learning_rate": 0.0002678005865102639,
"loss": 0.238,
"mean_token_accuracy": 0.930390727519989,
"num_tokens": 2430192.0,
"step": 2380
},
{
"epoch": 0.5391382810737649,
"grad_norm": 0.40995532274246216,
"learning_rate": 0.0002676652379878186,
"loss": 0.2214,
"mean_token_accuracy": 0.9314346790313721,
"num_tokens": 2440278.0,
"step": 2390
},
{
"epoch": 0.5413940897811865,
"grad_norm": 0.3758867084980011,
"learning_rate": 0.0002675298894653733,
"loss": 0.3038,
"mean_token_accuracy": 0.918562513589859,
"num_tokens": 2450475.0,
"step": 2400
},
{
"epoch": 0.5436498984886081,
"grad_norm": 0.3616684675216675,
"learning_rate": 0.000267394540942928,
"loss": 0.2434,
"mean_token_accuracy": 0.9236611008644104,
"num_tokens": 2460701.0,
"step": 2410
},
{
"epoch": 0.5459057071960298,
"grad_norm": 0.3775900602340698,
"learning_rate": 0.0002672591924204827,
"loss": 0.1832,
"mean_token_accuracy": 0.9434400200843811,
"num_tokens": 2470851.0,
"step": 2420
},
{
"epoch": 0.5481615159034514,
"grad_norm": 0.40669533610343933,
"learning_rate": 0.0002671238438980374,
"loss": 0.2112,
"mean_token_accuracy": 0.9340478003025054,
"num_tokens": 2481053.0,
"step": 2430
},
{
"epoch": 0.550417324610873,
"grad_norm": 0.3886403739452362,
"learning_rate": 0.0002669884953755921,
"loss": 0.2396,
"mean_token_accuracy": 0.9314120352268219,
"num_tokens": 2491228.0,
"step": 2440
},
{
"epoch": 0.5526731333182946,
"grad_norm": 0.36737385392189026,
"learning_rate": 0.00026685314685314683,
"loss": 0.2247,
"mean_token_accuracy": 0.9330944359302521,
"num_tokens": 2501462.0,
"step": 2450
},
{
"epoch": 0.5549289420257162,
"grad_norm": 0.3836795389652252,
"learning_rate": 0.00026671779833070154,
"loss": 0.2207,
"mean_token_accuracy": 0.9301238179206848,
"num_tokens": 2511691.0,
"step": 2460
},
{
"epoch": 0.5571847507331378,
"grad_norm": 0.41037148237228394,
"learning_rate": 0.00026658244980825625,
"loss": 0.2079,
"mean_token_accuracy": 0.9370009183883667,
"num_tokens": 2521883.0,
"step": 2470
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.5383064150810242,
"learning_rate": 0.0002664471012858109,
"loss": 0.2509,
"mean_token_accuracy": 0.9259045839309692,
"num_tokens": 2532053.0,
"step": 2480
},
{
"epoch": 0.5616963681479811,
"grad_norm": 0.4217754900455475,
"learning_rate": 0.0002663117527633656,
"loss": 0.2151,
"mean_token_accuracy": 0.9337125539779663,
"num_tokens": 2542249.0,
"step": 2490
},
{
"epoch": 0.5639521768554027,
"grad_norm": 0.3590957820415497,
"learning_rate": 0.0002661764042409203,
"loss": 0.2406,
"mean_token_accuracy": 0.9339839398860932,
"num_tokens": 2552448.0,
"step": 2500
},
{
"epoch": 0.5662079855628243,
"grad_norm": 0.643915593624115,
"learning_rate": 0.00026604105571847504,
"loss": 0.2362,
"mean_token_accuracy": 0.9307669878005982,
"num_tokens": 2562490.0,
"step": 2510
},
{
"epoch": 0.5684637942702458,
"grad_norm": 0.3068816661834717,
"learning_rate": 0.00026590570719602975,
"loss": 0.1945,
"mean_token_accuracy": 0.9392113566398621,
"num_tokens": 2572707.0,
"step": 2520
},
{
"epoch": 0.5707196029776674,
"grad_norm": 0.42321914434432983,
"learning_rate": 0.00026577035867358446,
"loss": 0.2256,
"mean_token_accuracy": 0.9326376616954803,
"num_tokens": 2582925.0,
"step": 2530
},
{
"epoch": 0.5729754116850891,
"grad_norm": 0.6707413196563721,
"learning_rate": 0.00026563501015113917,
"loss": 0.2324,
"mean_token_accuracy": 0.9275128602981567,
"num_tokens": 2593099.0,
"step": 2540
},
{
"epoch": 0.5752312203925107,
"grad_norm": 0.4198610186576843,
"learning_rate": 0.0002654996616286939,
"loss": 0.2607,
"mean_token_accuracy": 0.9212849795818329,
"num_tokens": 2603310.0,
"step": 2550
},
{
"epoch": 0.5774870290999323,
"grad_norm": 0.9983633160591125,
"learning_rate": 0.0002653643131062486,
"loss": 0.2259,
"mean_token_accuracy": 0.9339877903461457,
"num_tokens": 2613439.0,
"step": 2560
},
{
"epoch": 0.5797428378073539,
"grad_norm": 0.6598926782608032,
"learning_rate": 0.00026522896458380324,
"loss": 0.2703,
"mean_token_accuracy": 0.9226635038852692,
"num_tokens": 2623574.0,
"step": 2570
},
{
"epoch": 0.5819986465147755,
"grad_norm": 0.307011216878891,
"learning_rate": 0.00026509361606135795,
"loss": 0.2233,
"mean_token_accuracy": 0.9347078561782837,
"num_tokens": 2633755.0,
"step": 2580
},
{
"epoch": 0.5842544552221972,
"grad_norm": 0.49420446157455444,
"learning_rate": 0.00026495826753891266,
"loss": 0.2208,
"mean_token_accuracy": 0.9309853732585907,
"num_tokens": 2643927.0,
"step": 2590
},
{
"epoch": 0.5865102639296188,
"grad_norm": 0.46068477630615234,
"learning_rate": 0.00026482291901646737,
"loss": 0.257,
"mean_token_accuracy": 0.9284857034683227,
"num_tokens": 2654118.0,
"step": 2600
},
{
"epoch": 0.5887660726370404,
"grad_norm": 0.5037050247192383,
"learning_rate": 0.0002646875704940221,
"loss": 0.2189,
"mean_token_accuracy": 0.9315983355045319,
"num_tokens": 2664170.0,
"step": 2610
},
{
"epoch": 0.591021881344462,
"grad_norm": 0.47586536407470703,
"learning_rate": 0.0002645522219715768,
"loss": 0.2318,
"mean_token_accuracy": 0.9364044010639191,
"num_tokens": 2674349.0,
"step": 2620
},
{
"epoch": 0.5932776900518836,
"grad_norm": 0.9741955399513245,
"learning_rate": 0.0002644168734491315,
"loss": 0.1985,
"mean_token_accuracy": 0.9385083496570588,
"num_tokens": 2684419.0,
"step": 2630
},
{
"epoch": 0.5955334987593052,
"grad_norm": 0.5099119544029236,
"learning_rate": 0.0002642815249266862,
"loss": 0.2197,
"mean_token_accuracy": 0.9376397907733918,
"num_tokens": 2694553.0,
"step": 2640
},
{
"epoch": 0.5977893074667269,
"grad_norm": 0.30093830823898315,
"learning_rate": 0.00026414617640424087,
"loss": 0.2122,
"mean_token_accuracy": 0.9366639375686645,
"num_tokens": 2704780.0,
"step": 2650
},
{
"epoch": 0.6000451161741485,
"grad_norm": 0.6657422780990601,
"learning_rate": 0.0002640108278817956,
"loss": 0.2269,
"mean_token_accuracy": 0.9306385815143585,
"num_tokens": 2714979.0,
"step": 2660
},
{
"epoch": 0.60230092488157,
"grad_norm": 0.37251630425453186,
"learning_rate": 0.0002638754793593503,
"loss": 0.1887,
"mean_token_accuracy": 0.9465994536876678,
"num_tokens": 2724781.0,
"step": 2670
},
{
"epoch": 0.6045567335889916,
"grad_norm": 0.5274510979652405,
"learning_rate": 0.000263740130836905,
"loss": 0.2289,
"mean_token_accuracy": 0.9338556706905365,
"num_tokens": 2734993.0,
"step": 2680
},
{
"epoch": 0.6068125422964132,
"grad_norm": 0.5593155026435852,
"learning_rate": 0.0002636047823144597,
"loss": 0.2237,
"mean_token_accuracy": 0.9336115419864655,
"num_tokens": 2744872.0,
"step": 2690
},
{
"epoch": 0.6090683510038348,
"grad_norm": 0.7985104918479919,
"learning_rate": 0.0002634694337920144,
"loss": 0.2949,
"mean_token_accuracy": 0.9198081076145173,
"num_tokens": 2755075.0,
"step": 2700
},
{
"epoch": 0.6113241597112565,
"grad_norm": 0.41699644923210144,
"learning_rate": 0.0002633340852695691,
"loss": 0.1981,
"mean_token_accuracy": 0.9422951698303222,
"num_tokens": 2765210.0,
"step": 2710
},
{
"epoch": 0.6135799684186781,
"grad_norm": 0.5942256450653076,
"learning_rate": 0.00026319873674712384,
"loss": 0.251,
"mean_token_accuracy": 0.9282591760158538,
"num_tokens": 2775369.0,
"step": 2720
},
{
"epoch": 0.6158357771260997,
"grad_norm": 0.5558121800422668,
"learning_rate": 0.00026306338822467855,
"loss": 0.2312,
"mean_token_accuracy": 0.9309241354465485,
"num_tokens": 2785569.0,
"step": 2730
},
{
"epoch": 0.6180915858335213,
"grad_norm": 0.3139027953147888,
"learning_rate": 0.0002629280397022332,
"loss": 0.2244,
"mean_token_accuracy": 0.9360554933547973,
"num_tokens": 2795773.0,
"step": 2740
},
{
"epoch": 0.6203473945409429,
"grad_norm": 0.44965052604675293,
"learning_rate": 0.0002627926911797879,
"loss": 0.2224,
"mean_token_accuracy": 0.9316644787788391,
"num_tokens": 2806012.0,
"step": 2750
},
{
"epoch": 0.6226032032483645,
"grad_norm": 0.5226691365242004,
"learning_rate": 0.0002626573426573426,
"loss": 0.2398,
"mean_token_accuracy": 0.9320449590682983,
"num_tokens": 2816212.0,
"step": 2760
},
{
"epoch": 0.6248590119557862,
"grad_norm": 0.5641390681266785,
"learning_rate": 0.00026252199413489733,
"loss": 0.2383,
"mean_token_accuracy": 0.9291798174381256,
"num_tokens": 2826418.0,
"step": 2770
},
{
"epoch": 0.6271148206632078,
"grad_norm": 0.6037927865982056,
"learning_rate": 0.00026238664561245204,
"loss": 0.2289,
"mean_token_accuracy": 0.9318684577941895,
"num_tokens": 2836641.0,
"step": 2780
},
{
"epoch": 0.6293706293706294,
"grad_norm": 0.47394031286239624,
"learning_rate": 0.00026225129709000675,
"loss": 0.2258,
"mean_token_accuracy": 0.9298848390579224,
"num_tokens": 2846865.0,
"step": 2790
},
{
"epoch": 0.631626438078051,
"grad_norm": 0.6290483474731445,
"learning_rate": 0.00026211594856756146,
"loss": 0.2514,
"mean_token_accuracy": 0.9306814074516296,
"num_tokens": 2857102.0,
"step": 2800
},
{
"epoch": 0.6338822467854726,
"grad_norm": 0.6657432317733765,
"learning_rate": 0.00026198060004511617,
"loss": 0.2262,
"mean_token_accuracy": 0.9337584257125855,
"num_tokens": 2867299.0,
"step": 2810
},
{
"epoch": 0.6361380554928941,
"grad_norm": 0.53950434923172,
"learning_rate": 0.0002618452515226708,
"loss": 0.2341,
"mean_token_accuracy": 0.9332832813262939,
"num_tokens": 2877485.0,
"step": 2820
},
{
"epoch": 0.6383938642003159,
"grad_norm": 0.5629047155380249,
"learning_rate": 0.00026170990300022554,
"loss": 0.1989,
"mean_token_accuracy": 0.9396433293819427,
"num_tokens": 2887688.0,
"step": 2830
},
{
"epoch": 0.6406496729077374,
"grad_norm": 0.4131242334842682,
"learning_rate": 0.00026157455447778025,
"loss": 0.2266,
"mean_token_accuracy": 0.9329676389694214,
"num_tokens": 2897887.0,
"step": 2840
},
{
"epoch": 0.642905481615159,
"grad_norm": 0.5747202634811401,
"learning_rate": 0.00026143920595533496,
"loss": 0.2601,
"mean_token_accuracy": 0.9348642468452454,
"num_tokens": 2908059.0,
"step": 2850
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6643345952033997,
"learning_rate": 0.00026130385743288967,
"loss": 0.2004,
"mean_token_accuracy": 0.9394226372241974,
"num_tokens": 2918269.0,
"step": 2860
},
{
"epoch": 0.6474170990300022,
"grad_norm": 0.5732499361038208,
"learning_rate": 0.0002611685089104444,
"loss": 0.2398,
"mean_token_accuracy": 0.9273908972740174,
"num_tokens": 2928456.0,
"step": 2870
},
{
"epoch": 0.6496729077374239,
"grad_norm": 0.4198378622531891,
"learning_rate": 0.0002610331603879991,
"loss": 0.239,
"mean_token_accuracy": 0.9306033670902252,
"num_tokens": 2938625.0,
"step": 2880
},
{
"epoch": 0.6519287164448455,
"grad_norm": 0.8750539422035217,
"learning_rate": 0.0002608978118655538,
"loss": 0.2565,
"mean_token_accuracy": 0.9291310131549835,
"num_tokens": 2948751.0,
"step": 2890
},
{
"epoch": 0.6541845251522671,
"grad_norm": 0.5238725543022156,
"learning_rate": 0.0002607624633431085,
"loss": 0.2972,
"mean_token_accuracy": 0.9199288547039032,
"num_tokens": 2958976.0,
"step": 2900
},
{
"epoch": 0.6564403338596887,
"grad_norm": 0.6253301501274109,
"learning_rate": 0.00026062711482066316,
"loss": 0.2486,
"mean_token_accuracy": 0.9288740932941437,
"num_tokens": 2969180.0,
"step": 2910
},
{
"epoch": 0.6586961425671103,
"grad_norm": 0.5708739161491394,
"learning_rate": 0.00026049176629821787,
"loss": 0.2126,
"mean_token_accuracy": 0.9343720078468323,
"num_tokens": 2979355.0,
"step": 2920
},
{
"epoch": 0.6609519512745319,
"grad_norm": 0.5066877603530884,
"learning_rate": 0.0002603564177757726,
"loss": 0.2085,
"mean_token_accuracy": 0.9371103644371033,
"num_tokens": 2989482.0,
"step": 2930
},
{
"epoch": 0.6632077599819536,
"grad_norm": 0.71226966381073,
"learning_rate": 0.0002602210692533273,
"loss": 0.2329,
"mean_token_accuracy": 0.9393065094947814,
"num_tokens": 2999682.0,
"step": 2940
},
{
"epoch": 0.6654635686893752,
"grad_norm": 0.6169939637184143,
"learning_rate": 0.000260085720730882,
"loss": 0.2863,
"mean_token_accuracy": 0.9209560215473175,
"num_tokens": 3009855.0,
"step": 2950
},
{
"epoch": 0.6677193773967968,
"grad_norm": 0.41732147336006165,
"learning_rate": 0.0002599503722084367,
"loss": 0.2062,
"mean_token_accuracy": 0.9381140649318696,
"num_tokens": 3020060.0,
"step": 2960
},
{
"epoch": 0.6699751861042184,
"grad_norm": 0.5298788547515869,
"learning_rate": 0.0002598150236859914,
"loss": 0.2941,
"mean_token_accuracy": 0.9189365446567536,
"num_tokens": 3030266.0,
"step": 2970
},
{
"epoch": 0.67223099481164,
"grad_norm": 0.500662088394165,
"learning_rate": 0.00025967967516354613,
"loss": 0.2187,
"mean_token_accuracy": 0.9356793701648712,
"num_tokens": 3040506.0,
"step": 2980
},
{
"epoch": 0.6744868035190615,
"grad_norm": 0.9615169763565063,
"learning_rate": 0.0002595443266411008,
"loss": 0.2256,
"mean_token_accuracy": 0.9331699669361114,
"num_tokens": 3050615.0,
"step": 2990
},
{
"epoch": 0.6767426122264832,
"grad_norm": 0.7886420488357544,
"learning_rate": 0.0002594089781186555,
"loss": 0.3148,
"mean_token_accuracy": 0.9215950310230255,
"num_tokens": 3060815.0,
"step": 3000
},
{
"epoch": 0.6789984209339048,
"grad_norm": 0.5439404845237732,
"learning_rate": 0.0002592736295962102,
"loss": 0.2323,
"mean_token_accuracy": 0.9314812004566193,
"num_tokens": 3071015.0,
"step": 3010
},
{
"epoch": 0.6812542296413264,
"grad_norm": 0.4758981764316559,
"learning_rate": 0.0002591382810737649,
"loss": 0.2825,
"mean_token_accuracy": 0.9206907093524933,
"num_tokens": 3081209.0,
"step": 3020
},
{
"epoch": 0.683510038348748,
"grad_norm": 0.48459869623184204,
"learning_rate": 0.0002590029325513196,
"loss": 0.2391,
"mean_token_accuracy": 0.9332017719745636,
"num_tokens": 3091395.0,
"step": 3030
},
{
"epoch": 0.6857658470561696,
"grad_norm": 0.5378035306930542,
"learning_rate": 0.00025886758402887433,
"loss": 0.2443,
"mean_token_accuracy": 0.9284430742263794,
"num_tokens": 3101565.0,
"step": 3040
},
{
"epoch": 0.6880216557635912,
"grad_norm": 0.45487913489341736,
"learning_rate": 0.00025873223550642904,
"loss": 0.2181,
"mean_token_accuracy": 0.9427953720092773,
"num_tokens": 3111730.0,
"step": 3050
},
{
"epoch": 0.6902774644710129,
"grad_norm": 0.9604068398475647,
"learning_rate": 0.00025859688698398375,
"loss": 0.2565,
"mean_token_accuracy": 0.9298881590366364,
"num_tokens": 3121965.0,
"step": 3060
},
{
"epoch": 0.6925332731784345,
"grad_norm": 0.5915318131446838,
"learning_rate": 0.00025846153846153846,
"loss": 0.2816,
"mean_token_accuracy": 0.9280205249786377,
"num_tokens": 3132199.0,
"step": 3070
},
{
"epoch": 0.6947890818858561,
"grad_norm": 0.47584712505340576,
"learning_rate": 0.0002583261899390931,
"loss": 0.1796,
"mean_token_accuracy": 0.945578533411026,
"num_tokens": 3142379.0,
"step": 3080
},
{
"epoch": 0.6970448905932777,
"grad_norm": 0.4383523464202881,
"learning_rate": 0.00025819084141664783,
"loss": 0.2353,
"mean_token_accuracy": 0.9329768180847168,
"num_tokens": 3152556.0,
"step": 3090
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.9148581624031067,
"learning_rate": 0.00025805549289420254,
"loss": 0.2627,
"mean_token_accuracy": 0.9258742690086365,
"num_tokens": 3162716.0,
"step": 3100
},
{
"epoch": 0.7015565080081209,
"grad_norm": 0.667614758014679,
"learning_rate": 0.00025792014437175725,
"loss": 0.2331,
"mean_token_accuracy": 0.9343131303787231,
"num_tokens": 3172920.0,
"step": 3110
},
{
"epoch": 0.7038123167155426,
"grad_norm": 0.33134734630584717,
"learning_rate": 0.00025778479584931196,
"loss": 0.2198,
"mean_token_accuracy": 0.938484913110733,
"num_tokens": 3183152.0,
"step": 3120
},
{
"epoch": 0.7060681254229642,
"grad_norm": 0.3356286883354187,
"learning_rate": 0.00025764944732686667,
"loss": 0.2183,
"mean_token_accuracy": 0.932779735326767,
"num_tokens": 3193354.0,
"step": 3130
},
{
"epoch": 0.7083239341303857,
"grad_norm": 0.3519227206707001,
"learning_rate": 0.0002575140988044214,
"loss": 0.2357,
"mean_token_accuracy": 0.9315909683704376,
"num_tokens": 3203523.0,
"step": 3140
},
{
"epoch": 0.7105797428378073,
"grad_norm": 0.844898521900177,
"learning_rate": 0.0002573787502819761,
"loss": 0.2278,
"mean_token_accuracy": 0.9361746132373809,
"num_tokens": 3213735.0,
"step": 3150
},
{
"epoch": 0.7128355515452289,
"grad_norm": 0.4979631304740906,
"learning_rate": 0.00025724340175953074,
"loss": 0.2013,
"mean_token_accuracy": 0.9381689071655274,
"num_tokens": 3223919.0,
"step": 3160
},
{
"epoch": 0.7150913602526505,
"grad_norm": 0.5635648369789124,
"learning_rate": 0.00025710805323708545,
"loss": 0.2066,
"mean_token_accuracy": 0.9401957809925079,
"num_tokens": 3234145.0,
"step": 3170
},
{
"epoch": 0.7173471689600722,
"grad_norm": 0.7126004695892334,
"learning_rate": 0.00025697270471464016,
"loss": 0.2267,
"mean_token_accuracy": 0.9334402441978454,
"num_tokens": 3244336.0,
"step": 3180
},
{
"epoch": 0.7196029776674938,
"grad_norm": 0.7653904557228088,
"learning_rate": 0.0002568373561921949,
"loss": 0.2367,
"mean_token_accuracy": 0.9352998495101928,
"num_tokens": 3254473.0,
"step": 3190
},
{
"epoch": 0.7218587863749154,
"grad_norm": 0.44523507356643677,
"learning_rate": 0.0002567020076697496,
"loss": 0.195,
"mean_token_accuracy": 0.9434902846813202,
"num_tokens": 3264693.0,
"step": 3200
},
{
"epoch": 0.724114595082337,
"grad_norm": 0.5072572827339172,
"learning_rate": 0.0002565666591473043,
"loss": 0.1927,
"mean_token_accuracy": 0.9416171848773957,
"num_tokens": 3274913.0,
"step": 3210
},
{
"epoch": 0.7263704037897586,
"grad_norm": 0.42991572618484497,
"learning_rate": 0.000256431310624859,
"loss": 0.2878,
"mean_token_accuracy": 0.9258961975574493,
"num_tokens": 3285114.0,
"step": 3220
},
{
"epoch": 0.7286262124971803,
"grad_norm": 0.61916583776474,
"learning_rate": 0.0002562959621024137,
"loss": 0.2563,
"mean_token_accuracy": 0.9288025736808777,
"num_tokens": 3295220.0,
"step": 3230
},
{
"epoch": 0.7308820212046019,
"grad_norm": 0.725781261920929,
"learning_rate": 0.0002561606135799684,
"loss": 0.2307,
"mean_token_accuracy": 0.9322963118553161,
"num_tokens": 3305451.0,
"step": 3240
},
{
"epoch": 0.7331378299120235,
"grad_norm": 0.6131793260574341,
"learning_rate": 0.0002560252650575231,
"loss": 0.2268,
"mean_token_accuracy": 0.9349699139595031,
"num_tokens": 3315579.0,
"step": 3250
},
{
"epoch": 0.7353936386194451,
"grad_norm": 0.6788907051086426,
"learning_rate": 0.0002558899165350778,
"loss": 0.2282,
"mean_token_accuracy": 0.935053151845932,
"num_tokens": 3325813.0,
"step": 3260
},
{
"epoch": 0.7376494473268667,
"grad_norm": 0.4873131811618805,
"learning_rate": 0.0002557545680126325,
"loss": 0.1999,
"mean_token_accuracy": 0.9406931400299072,
"num_tokens": 3335985.0,
"step": 3270
},
{
"epoch": 0.7399052560342883,
"grad_norm": 0.5387445688247681,
"learning_rate": 0.0002556192194901872,
"loss": 0.2805,
"mean_token_accuracy": 0.9226358532905579,
"num_tokens": 3346199.0,
"step": 3280
},
{
"epoch": 0.74216106474171,
"grad_norm": 0.6286031603813171,
"learning_rate": 0.0002554838709677419,
"loss": 0.192,
"mean_token_accuracy": 0.940284013748169,
"num_tokens": 3356424.0,
"step": 3290
},
{
"epoch": 0.7444168734491315,
"grad_norm": 0.8641782402992249,
"learning_rate": 0.00025534852244529663,
"loss": 0.1805,
"mean_token_accuracy": 0.9438497364521027,
"num_tokens": 3366633.0,
"step": 3300
},
{
"epoch": 0.7466726821565531,
"grad_norm": 0.6660944223403931,
"learning_rate": 0.00025521317392285134,
"loss": 0.2103,
"mean_token_accuracy": 0.9389594733715058,
"num_tokens": 3376532.0,
"step": 3310
},
{
"epoch": 0.7489284908639747,
"grad_norm": 0.6905304193496704,
"learning_rate": 0.00025507782540040605,
"loss": 0.1885,
"mean_token_accuracy": 0.9422814130783081,
"num_tokens": 3386719.0,
"step": 3320
},
{
"epoch": 0.7511842995713963,
"grad_norm": 1.0209304094314575,
"learning_rate": 0.0002549424768779607,
"loss": 0.2332,
"mean_token_accuracy": 0.9365878164768219,
"num_tokens": 3396927.0,
"step": 3330
},
{
"epoch": 0.7534401082788179,
"grad_norm": 0.3851501941680908,
"learning_rate": 0.0002548071283555154,
"loss": 0.2496,
"mean_token_accuracy": 0.9241067111492157,
"num_tokens": 3407161.0,
"step": 3340
},
{
"epoch": 0.7556959169862396,
"grad_norm": 0.5775346755981445,
"learning_rate": 0.0002546717798330701,
"loss": 0.2284,
"mean_token_accuracy": 0.9372583270072937,
"num_tokens": 3417372.0,
"step": 3350
},
{
"epoch": 0.7579517256936612,
"grad_norm": 0.513624370098114,
"learning_rate": 0.00025453643131062483,
"loss": 0.2308,
"mean_token_accuracy": 0.9305113971233367,
"num_tokens": 3427598.0,
"step": 3360
},
{
"epoch": 0.7602075344010828,
"grad_norm": 0.3917955160140991,
"learning_rate": 0.00025440108278817954,
"loss": 0.3156,
"mean_token_accuracy": 0.9161118984222412,
"num_tokens": 3437826.0,
"step": 3370
},
{
"epoch": 0.7624633431085044,
"grad_norm": 0.9259962439537048,
"learning_rate": 0.00025426573426573425,
"loss": 0.2738,
"mean_token_accuracy": 0.926971298456192,
"num_tokens": 3447977.0,
"step": 3380
},
{
"epoch": 0.764719151815926,
"grad_norm": 0.5740894079208374,
"learning_rate": 0.00025413038574328896,
"loss": 0.2039,
"mean_token_accuracy": 0.940199863910675,
"num_tokens": 3458171.0,
"step": 3390
},
{
"epoch": 0.7669749605233476,
"grad_norm": 0.6751810908317566,
"learning_rate": 0.00025399503722084367,
"loss": 0.226,
"mean_token_accuracy": 0.9333556115627288,
"num_tokens": 3468359.0,
"step": 3400
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.663020133972168,
"learning_rate": 0.0002538596886983984,
"loss": 0.2014,
"mean_token_accuracy": 0.938248485326767,
"num_tokens": 3478585.0,
"step": 3410
},
{
"epoch": 0.7714865779381909,
"grad_norm": 0.6453900337219238,
"learning_rate": 0.00025372434017595304,
"loss": 0.2258,
"mean_token_accuracy": 0.9338716864585876,
"num_tokens": 3488706.0,
"step": 3420
},
{
"epoch": 0.7737423866456125,
"grad_norm": 0.3844014108181,
"learning_rate": 0.00025358899165350775,
"loss": 0.2112,
"mean_token_accuracy": 0.9385376691818237,
"num_tokens": 3498930.0,
"step": 3430
},
{
"epoch": 0.775998195353034,
"grad_norm": 0.5107812285423279,
"learning_rate": 0.00025345364313106246,
"loss": 0.2576,
"mean_token_accuracy": 0.929598605632782,
"num_tokens": 3509052.0,
"step": 3440
},
{
"epoch": 0.7782540040604556,
"grad_norm": 0.4670540392398834,
"learning_rate": 0.00025331829460861717,
"loss": 0.1817,
"mean_token_accuracy": 0.9437798678874969,
"num_tokens": 3519236.0,
"step": 3450
},
{
"epoch": 0.7805098127678772,
"grad_norm": 0.5630539655685425,
"learning_rate": 0.0002531829460861719,
"loss": 0.2292,
"mean_token_accuracy": 0.9310850441455841,
"num_tokens": 3529476.0,
"step": 3460
},
{
"epoch": 0.7827656214752989,
"grad_norm": 2.5165903568267822,
"learning_rate": 0.0002530475975637266,
"loss": 0.1869,
"mean_token_accuracy": 0.9452771425247193,
"num_tokens": 3539702.0,
"step": 3470
},
{
"epoch": 0.7850214301827205,
"grad_norm": 0.32777339220046997,
"learning_rate": 0.0002529122490412813,
"loss": 0.2173,
"mean_token_accuracy": 0.9382402420043945,
"num_tokens": 3549863.0,
"step": 3480
},
{
"epoch": 0.7872772388901421,
"grad_norm": 0.9197781682014465,
"learning_rate": 0.000252776900518836,
"loss": 0.2861,
"mean_token_accuracy": 0.9292895257472992,
"num_tokens": 3560004.0,
"step": 3490
},
{
"epoch": 0.7895330475975637,
"grad_norm": 0.8597625494003296,
"learning_rate": 0.00025264155199639066,
"loss": 0.1964,
"mean_token_accuracy": 0.9433383345603943,
"num_tokens": 3570177.0,
"step": 3500
},
{
"epoch": 0.7917888563049853,
"grad_norm": 0.5637295842170715,
"learning_rate": 0.00025250620347394537,
"loss": 0.19,
"mean_token_accuracy": 0.94198077917099,
"num_tokens": 3580404.0,
"step": 3510
},
{
"epoch": 0.794044665012407,
"grad_norm": 0.4464856684207916,
"learning_rate": 0.0002523708549515001,
"loss": 0.1661,
"mean_token_accuracy": 0.9511018455028534,
"num_tokens": 3590640.0,
"step": 3520
},
{
"epoch": 0.7963004737198286,
"grad_norm": 0.4399365484714508,
"learning_rate": 0.0002522355064290548,
"loss": 0.2588,
"mean_token_accuracy": 0.9256435573101044,
"num_tokens": 3600798.0,
"step": 3530
},
{
"epoch": 0.7985562824272502,
"grad_norm": 0.3305966854095459,
"learning_rate": 0.0002521001579066095,
"loss": 0.233,
"mean_token_accuracy": 0.9323022544384003,
"num_tokens": 3610941.0,
"step": 3540
},
{
"epoch": 0.8008120911346718,
"grad_norm": 0.44945451617240906,
"learning_rate": 0.0002519648093841642,
"loss": 0.1944,
"mean_token_accuracy": 0.9411108553409576,
"num_tokens": 3621174.0,
"step": 3550
},
{
"epoch": 0.8030678998420934,
"grad_norm": 0.40088480710983276,
"learning_rate": 0.0002518294608617189,
"loss": 0.1724,
"mean_token_accuracy": 0.9453522205352783,
"num_tokens": 3631338.0,
"step": 3560
},
{
"epoch": 0.805323708549515,
"grad_norm": 0.6701235771179199,
"learning_rate": 0.00025169411233927363,
"loss": 0.2435,
"mean_token_accuracy": 0.9314075529575347,
"num_tokens": 3640587.0,
"step": 3570
},
{
"epoch": 0.8075795172569367,
"grad_norm": 0.4659329354763031,
"learning_rate": 0.00025155876381682834,
"loss": 0.2118,
"mean_token_accuracy": 0.9398883581161499,
"num_tokens": 3650742.0,
"step": 3580
},
{
"epoch": 0.8098353259643583,
"grad_norm": 0.5958136320114136,
"learning_rate": 0.000251423415294383,
"loss": 0.1925,
"mean_token_accuracy": 0.9447057127952576,
"num_tokens": 3660870.0,
"step": 3590
},
{
"epoch": 0.8120911346717798,
"grad_norm": 0.45198115706443787,
"learning_rate": 0.0002512880667719377,
"loss": 0.1964,
"mean_token_accuracy": 0.9426902115345002,
"num_tokens": 3671105.0,
"step": 3600
},
{
"epoch": 0.8143469433792014,
"grad_norm": 0.5948301553726196,
"learning_rate": 0.0002511527182494924,
"loss": 0.1765,
"mean_token_accuracy": 0.9449096560478211,
"num_tokens": 3681293.0,
"step": 3610
},
{
"epoch": 0.816602752086623,
"grad_norm": 0.7184245586395264,
"learning_rate": 0.0002510173697270471,
"loss": 0.1735,
"mean_token_accuracy": 0.9478671312332153,
"num_tokens": 3691481.0,
"step": 3620
},
{
"epoch": 0.8188585607940446,
"grad_norm": 0.603680431842804,
"learning_rate": 0.00025088202120460184,
"loss": 0.2391,
"mean_token_accuracy": 0.9321896970272064,
"num_tokens": 3701670.0,
"step": 3630
},
{
"epoch": 0.8211143695014663,
"grad_norm": 0.5140425562858582,
"learning_rate": 0.00025074667268215655,
"loss": 0.2408,
"mean_token_accuracy": 0.933314174413681,
"num_tokens": 3711844.0,
"step": 3640
},
{
"epoch": 0.8233701782088879,
"grad_norm": 0.516891360282898,
"learning_rate": 0.00025061132415971126,
"loss": 0.2389,
"mean_token_accuracy": 0.9327298462390899,
"num_tokens": 3722013.0,
"step": 3650
},
{
"epoch": 0.8256259869163095,
"grad_norm": 0.5612062215805054,
"learning_rate": 0.00025047597563726597,
"loss": 0.2317,
"mean_token_accuracy": 0.9376449286937714,
"num_tokens": 3731596.0,
"step": 3660
},
{
"epoch": 0.8278817956237311,
"grad_norm": 0.4045218825340271,
"learning_rate": 0.0002503406271148207,
"loss": 0.1935,
"mean_token_accuracy": 0.9420763790607453,
"num_tokens": 3741743.0,
"step": 3670
},
{
"epoch": 0.8301376043311527,
"grad_norm": 0.45677274465560913,
"learning_rate": 0.00025020527859237533,
"loss": 0.2115,
"mean_token_accuracy": 0.9396097540855408,
"num_tokens": 3751873.0,
"step": 3680
},
{
"epoch": 0.8323934130385743,
"grad_norm": 0.3399205505847931,
"learning_rate": 0.00025006993006993004,
"loss": 0.1888,
"mean_token_accuracy": 0.9462385714054108,
"num_tokens": 3762092.0,
"step": 3690
},
{
"epoch": 0.834649221745996,
"grad_norm": 0.6759438514709473,
"learning_rate": 0.00024993458154748475,
"loss": 0.237,
"mean_token_accuracy": 0.9307721853256226,
"num_tokens": 3772329.0,
"step": 3700
},
{
"epoch": 0.8369050304534176,
"grad_norm": 0.39573797583580017,
"learning_rate": 0.00024979923302503946,
"loss": 0.23,
"mean_token_accuracy": 0.9365652084350586,
"num_tokens": 3782523.0,
"step": 3710
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.6500651240348816,
"learning_rate": 0.00024966388450259417,
"loss": 0.2188,
"mean_token_accuracy": 0.9423346817493439,
"num_tokens": 3792662.0,
"step": 3720
},
{
"epoch": 0.8414166478682608,
"grad_norm": 0.5717751383781433,
"learning_rate": 0.0002495285359801489,
"loss": 0.2367,
"mean_token_accuracy": 0.9352416634559632,
"num_tokens": 3802837.0,
"step": 3730
},
{
"epoch": 0.8436724565756824,
"grad_norm": 0.6442503929138184,
"learning_rate": 0.0002493931874577036,
"loss": 0.1753,
"mean_token_accuracy": 0.9477633893489837,
"num_tokens": 3813070.0,
"step": 3740
},
{
"epoch": 0.8459282652831039,
"grad_norm": 0.4221307337284088,
"learning_rate": 0.0002492578389352583,
"loss": 0.1869,
"mean_token_accuracy": 0.9431292831897735,
"num_tokens": 3823243.0,
"step": 3750
},
{
"epoch": 0.8481840739905256,
"grad_norm": 0.5930933952331543,
"learning_rate": 0.00024912249041281296,
"loss": 0.2455,
"mean_token_accuracy": 0.9271996915340424,
"num_tokens": 3833364.0,
"step": 3760
},
{
"epoch": 0.8504398826979472,
"grad_norm": 0.42684584856033325,
"learning_rate": 0.00024898714189036767,
"loss": 0.197,
"mean_token_accuracy": 0.941864401102066,
"num_tokens": 3843576.0,
"step": 3770
},
{
"epoch": 0.8526956914053688,
"grad_norm": 0.4301314949989319,
"learning_rate": 0.0002488517933679224,
"loss": 0.2091,
"mean_token_accuracy": 0.9357395350933075,
"num_tokens": 3853722.0,
"step": 3780
},
{
"epoch": 0.8549515001127904,
"grad_norm": 0.7294553518295288,
"learning_rate": 0.0002487164448454771,
"loss": 0.2457,
"mean_token_accuracy": 0.930441266298294,
"num_tokens": 3863953.0,
"step": 3790
},
{
"epoch": 0.857207308820212,
"grad_norm": 0.7167822122573853,
"learning_rate": 0.0002485810963230318,
"loss": 0.1813,
"mean_token_accuracy": 0.9461415946483612,
"num_tokens": 3874173.0,
"step": 3800
},
{
"epoch": 0.8594631175276336,
"grad_norm": 0.45225095748901367,
"learning_rate": 0.0002484457478005865,
"loss": 0.2711,
"mean_token_accuracy": 0.9266981244087219,
"num_tokens": 3884297.0,
"step": 3810
},
{
"epoch": 0.8617189262350553,
"grad_norm": 0.5794314742088318,
"learning_rate": 0.0002483103992781412,
"loss": 0.1905,
"mean_token_accuracy": 0.9450049281120301,
"num_tokens": 3894478.0,
"step": 3820
},
{
"epoch": 0.8639747349424769,
"grad_norm": 0.6747323870658875,
"learning_rate": 0.0002481750507556959,
"loss": 0.2213,
"mean_token_accuracy": 0.9325950682163239,
"num_tokens": 3904677.0,
"step": 3830
},
{
"epoch": 0.8662305436498985,
"grad_norm": 0.664412796497345,
"learning_rate": 0.00024803970223325064,
"loss": 0.1948,
"mean_token_accuracy": 0.943292647600174,
"num_tokens": 3914898.0,
"step": 3840
},
{
"epoch": 0.8684863523573201,
"grad_norm": 1.082534909248352,
"learning_rate": 0.0002479043537108053,
"loss": 0.2153,
"mean_token_accuracy": 0.9366356670856476,
"num_tokens": 3925076.0,
"step": 3850
},
{
"epoch": 0.8707421610647417,
"grad_norm": 0.9012080430984497,
"learning_rate": 0.00024776900518836,
"loss": 0.2589,
"mean_token_accuracy": 0.9262183904647827,
"num_tokens": 3935233.0,
"step": 3860
},
{
"epoch": 0.8729979697721634,
"grad_norm": 0.6924684047698975,
"learning_rate": 0.0002476336566659147,
"loss": 0.2281,
"mean_token_accuracy": 0.9359488189220428,
"num_tokens": 3945432.0,
"step": 3870
},
{
"epoch": 0.875253778479585,
"grad_norm": 0.4230230152606964,
"learning_rate": 0.0002474983081434694,
"loss": 0.2253,
"mean_token_accuracy": 0.9308477878570557,
"num_tokens": 3955665.0,
"step": 3880
},
{
"epoch": 0.8775095871870066,
"grad_norm": 0.5450705885887146,
"learning_rate": 0.00024736295962102413,
"loss": 0.2488,
"mean_token_accuracy": 0.9260222852230072,
"num_tokens": 3965842.0,
"step": 3890
},
{
"epoch": 0.8797653958944281,
"grad_norm": 0.7850773930549622,
"learning_rate": 0.00024722761109857884,
"loss": 0.1976,
"mean_token_accuracy": 0.9407764256000519,
"num_tokens": 3976064.0,
"step": 3900
},
{
"epoch": 0.8820212046018497,
"grad_norm": 0.6162165999412537,
"learning_rate": 0.00024709226257613355,
"loss": 0.2032,
"mean_token_accuracy": 0.939377635717392,
"num_tokens": 3986249.0,
"step": 3910
},
{
"epoch": 0.8842770133092713,
"grad_norm": 0.3530557453632355,
"learning_rate": 0.00024695691405368826,
"loss": 0.2005,
"mean_token_accuracy": 0.9405902981758117,
"num_tokens": 3996443.0,
"step": 3920
},
{
"epoch": 0.886532822016693,
"grad_norm": 0.47658300399780273,
"learning_rate": 0.0002468215655312429,
"loss": 0.2388,
"mean_token_accuracy": 0.934827721118927,
"num_tokens": 4006670.0,
"step": 3930
},
{
"epoch": 0.8887886307241146,
"grad_norm": 0.45294544100761414,
"learning_rate": 0.0002466862170087976,
"loss": 0.1719,
"mean_token_accuracy": 0.9483415603637695,
"num_tokens": 4016900.0,
"step": 3940
},
{
"epoch": 0.8910444394315362,
"grad_norm": 0.650928258895874,
"learning_rate": 0.00024655086848635234,
"loss": 0.2177,
"mean_token_accuracy": 0.9376309931278228,
"num_tokens": 4027125.0,
"step": 3950
},
{
"epoch": 0.8933002481389578,
"grad_norm": 0.3383767604827881,
"learning_rate": 0.00024641551996390705,
"loss": 0.1752,
"mean_token_accuracy": 0.94934002161026,
"num_tokens": 4037360.0,
"step": 3960
},
{
"epoch": 0.8955560568463794,
"grad_norm": 1.4327523708343506,
"learning_rate": 0.0002462801714414617,
"loss": 0.1836,
"mean_token_accuracy": 0.9417614221572876,
"num_tokens": 4047540.0,
"step": 3970
},
{
"epoch": 0.897811865553801,
"grad_norm": 0.5295197367668152,
"learning_rate": 0.0002461448229190164,
"loss": 0.2672,
"mean_token_accuracy": 0.9268444120883942,
"num_tokens": 4057757.0,
"step": 3980
},
{
"epoch": 0.9000676742612227,
"grad_norm": 0.7444450855255127,
"learning_rate": 0.0002460094743965712,
"loss": 0.1885,
"mean_token_accuracy": 0.9425229966640473,
"num_tokens": 4067940.0,
"step": 3990
},
{
"epoch": 0.9023234829686443,
"grad_norm": 0.45960938930511475,
"learning_rate": 0.0002458741258741259,
"loss": 0.1876,
"mean_token_accuracy": 0.9434334516525269,
"num_tokens": 4078168.0,
"step": 4000
},
{
"epoch": 0.9045792916760659,
"grad_norm": 0.42837294936180115,
"learning_rate": 0.0002457387773516806,
"loss": 0.181,
"mean_token_accuracy": 0.9469284176826477,
"num_tokens": 4088367.0,
"step": 4010
},
{
"epoch": 0.9068351003834875,
"grad_norm": 0.6162718534469604,
"learning_rate": 0.00024560342882923525,
"loss": 0.1887,
"mean_token_accuracy": 0.9453383386135101,
"num_tokens": 4098559.0,
"step": 4020
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.6896116137504578,
"learning_rate": 0.00024546808030678996,
"loss": 0.2,
"mean_token_accuracy": 0.9440081357955933,
"num_tokens": 4108750.0,
"step": 4030
},
{
"epoch": 0.9113467177983307,
"grad_norm": 0.4971648156642914,
"learning_rate": 0.00024533273178434467,
"loss": 0.1918,
"mean_token_accuracy": 0.943715387582779,
"num_tokens": 4118931.0,
"step": 4040
},
{
"epoch": 0.9136025265057524,
"grad_norm": 0.4439462125301361,
"learning_rate": 0.0002451973832618994,
"loss": 0.1893,
"mean_token_accuracy": 0.9430592775344848,
"num_tokens": 4128996.0,
"step": 4050
},
{
"epoch": 0.915858335213174,
"grad_norm": 0.5335512161254883,
"learning_rate": 0.00024506203473945404,
"loss": 0.2348,
"mean_token_accuracy": 0.938604736328125,
"num_tokens": 4139224.0,
"step": 4060
},
{
"epoch": 0.9181141439205955,
"grad_norm": 0.40199655294418335,
"learning_rate": 0.00024492668621700875,
"loss": 0.2864,
"mean_token_accuracy": 0.928019517660141,
"num_tokens": 4149335.0,
"step": 4070
},
{
"epoch": 0.9203699526280171,
"grad_norm": 0.47056302428245544,
"learning_rate": 0.0002447913376945635,
"loss": 0.1971,
"mean_token_accuracy": 0.9424499988555908,
"num_tokens": 4159563.0,
"step": 4080
},
{
"epoch": 0.9226257613354387,
"grad_norm": 0.6596531271934509,
"learning_rate": 0.0002446559891721182,
"loss": 0.2427,
"mean_token_accuracy": 0.933097630739212,
"num_tokens": 4169779.0,
"step": 4090
},
{
"epoch": 0.9248815700428603,
"grad_norm": 0.8583846092224121,
"learning_rate": 0.0002445206406496729,
"loss": 0.2194,
"mean_token_accuracy": 0.9355522513389587,
"num_tokens": 4179948.0,
"step": 4100
},
{
"epoch": 0.927137378750282,
"grad_norm": 0.9784001708030701,
"learning_rate": 0.0002443852921272276,
"loss": 0.2335,
"mean_token_accuracy": 0.9337855577468872,
"num_tokens": 4190170.0,
"step": 4110
},
{
"epoch": 0.9293931874577036,
"grad_norm": 0.6085280179977417,
"learning_rate": 0.0002442499436047823,
"loss": 0.2133,
"mean_token_accuracy": 0.9390449047088623,
"num_tokens": 4200347.0,
"step": 4120
},
{
"epoch": 0.9316489961651252,
"grad_norm": 0.8368825912475586,
"learning_rate": 0.000244114595082337,
"loss": 0.2028,
"mean_token_accuracy": 0.9399807095527649,
"num_tokens": 4210514.0,
"step": 4130
},
{
"epoch": 0.9339048048725468,
"grad_norm": 0.8934887647628784,
"learning_rate": 0.0002439792465598917,
"loss": 0.2312,
"mean_token_accuracy": 0.9369701504707336,
"num_tokens": 4220722.0,
"step": 4140
},
{
"epoch": 0.9361606135799684,
"grad_norm": 0.8492236137390137,
"learning_rate": 0.0002438438980374464,
"loss": 0.1979,
"mean_token_accuracy": 0.9407924175262451,
"num_tokens": 4230927.0,
"step": 4150
},
{
"epoch": 0.9384164222873901,
"grad_norm": 0.6423715949058533,
"learning_rate": 0.0002437085495150011,
"loss": 0.151,
"mean_token_accuracy": 0.9534207642078399,
"num_tokens": 4241153.0,
"step": 4160
},
{
"epoch": 0.9406722309948117,
"grad_norm": 0.7069862484931946,
"learning_rate": 0.00024357320099255582,
"loss": 0.1772,
"mean_token_accuracy": 0.947028785943985,
"num_tokens": 4251293.0,
"step": 4170
},
{
"epoch": 0.9429280397022333,
"grad_norm": 0.5257987380027771,
"learning_rate": 0.00024343785247011053,
"loss": 0.2198,
"mean_token_accuracy": 0.9393032670021058,
"num_tokens": 4261424.0,
"step": 4180
},
{
"epoch": 0.9451838484096549,
"grad_norm": 0.5543506741523743,
"learning_rate": 0.0002433025039476652,
"loss": 0.1789,
"mean_token_accuracy": 0.9457716882228852,
"num_tokens": 4271649.0,
"step": 4190
},
{
"epoch": 0.9474396571170765,
"grad_norm": 0.5377987623214722,
"learning_rate": 0.00024316715542521992,
"loss": 0.1661,
"mean_token_accuracy": 0.9503224551677704,
"num_tokens": 4281830.0,
"step": 4200
},
{
"epoch": 0.949695465824498,
"grad_norm": 0.5515894889831543,
"learning_rate": 0.00024303180690277463,
"loss": 0.2387,
"mean_token_accuracy": 0.9334891140460968,
"num_tokens": 4292015.0,
"step": 4210
},
{
"epoch": 0.9519512745319197,
"grad_norm": 0.44863930344581604,
"learning_rate": 0.00024289645838032934,
"loss": 0.2127,
"mean_token_accuracy": 0.9371248781681061,
"num_tokens": 4302252.0,
"step": 4220
},
{
"epoch": 0.9542070832393413,
"grad_norm": 0.6548046469688416,
"learning_rate": 0.00024276110985788402,
"loss": 0.1915,
"mean_token_accuracy": 0.9437361776828765,
"num_tokens": 4312481.0,
"step": 4230
},
{
"epoch": 0.9564628919467629,
"grad_norm": 0.7284629940986633,
"learning_rate": 0.00024262576133543873,
"loss": 0.2319,
"mean_token_accuracy": 0.9398886144161225,
"num_tokens": 4322667.0,
"step": 4240
},
{
"epoch": 0.9587187006541845,
"grad_norm": 0.5784673690795898,
"learning_rate": 0.00024249041281299344,
"loss": 0.2213,
"mean_token_accuracy": 0.9347177267074585,
"num_tokens": 4332825.0,
"step": 4250
},
{
"epoch": 0.9609745093616061,
"grad_norm": 0.7612512707710266,
"learning_rate": 0.00024235506429054815,
"loss": 0.1805,
"mean_token_accuracy": 0.9467396676540375,
"num_tokens": 4342910.0,
"step": 4260
},
{
"epoch": 0.9632303180690277,
"grad_norm": 0.6591640114784241,
"learning_rate": 0.00024221971576810283,
"loss": 0.2696,
"mean_token_accuracy": 0.9323035717010498,
"num_tokens": 4352998.0,
"step": 4270
},
{
"epoch": 0.9654861267764494,
"grad_norm": 0.6853541135787964,
"learning_rate": 0.00024208436724565754,
"loss": 0.2211,
"mean_token_accuracy": 0.9332985162734986,
"num_tokens": 4363163.0,
"step": 4280
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.5725349187850952,
"learning_rate": 0.00024194901872321225,
"loss": 0.2287,
"mean_token_accuracy": 0.9395670652389526,
"num_tokens": 4373354.0,
"step": 4290
},
{
"epoch": 0.9699977441912926,
"grad_norm": 0.7916859984397888,
"learning_rate": 0.00024181367020076696,
"loss": 0.2157,
"mean_token_accuracy": 0.9374566435813904,
"num_tokens": 4383585.0,
"step": 4300
},
{
"epoch": 0.9722535528987142,
"grad_norm": 0.5497978925704956,
"learning_rate": 0.00024167832167832167,
"loss": 0.1723,
"mean_token_accuracy": 0.9466311931610107,
"num_tokens": 4393705.0,
"step": 4310
},
{
"epoch": 0.9745093616061358,
"grad_norm": 0.5773023366928101,
"learning_rate": 0.00024154297315587636,
"loss": 0.2037,
"mean_token_accuracy": 0.9397094666957855,
"num_tokens": 4403932.0,
"step": 4320
},
{
"epoch": 0.9767651703135574,
"grad_norm": 0.7037113308906555,
"learning_rate": 0.00024140762463343107,
"loss": 0.2129,
"mean_token_accuracy": 0.935580176115036,
"num_tokens": 4414060.0,
"step": 4330
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.44150829315185547,
"learning_rate": 0.00024127227611098578,
"loss": 0.1892,
"mean_token_accuracy": 0.9447483420372009,
"num_tokens": 4424251.0,
"step": 4340
},
{
"epoch": 0.9812767877284007,
"grad_norm": 1.1108092069625854,
"learning_rate": 0.0002411369275885405,
"loss": 0.2264,
"mean_token_accuracy": 0.9399719953536987,
"num_tokens": 4434422.0,
"step": 4350
},
{
"epoch": 0.9835325964358222,
"grad_norm": 0.5706174969673157,
"learning_rate": 0.00024100157906609517,
"loss": 0.1756,
"mean_token_accuracy": 0.9461008369922638,
"num_tokens": 4444650.0,
"step": 4360
},
{
"epoch": 0.9857884051432438,
"grad_norm": 0.6826101541519165,
"learning_rate": 0.00024086623054364988,
"loss": 0.2267,
"mean_token_accuracy": 0.9352168619632721,
"num_tokens": 4454873.0,
"step": 4370
},
{
"epoch": 0.9880442138506654,
"grad_norm": 0.957870602607727,
"learning_rate": 0.0002407308820212046,
"loss": 0.2347,
"mean_token_accuracy": 0.9347853481769561,
"num_tokens": 4465075.0,
"step": 4380
},
{
"epoch": 0.990300022558087,
"grad_norm": 0.6360123157501221,
"learning_rate": 0.0002405955334987593,
"loss": 0.1876,
"mean_token_accuracy": 0.9438979625701904,
"num_tokens": 4475268.0,
"step": 4390
},
{
"epoch": 0.9925558312655087,
"grad_norm": 0.5211275815963745,
"learning_rate": 0.00024046018497631398,
"loss": 0.1791,
"mean_token_accuracy": 0.9438638925552368,
"num_tokens": 4485503.0,
"step": 4400
},
{
"epoch": 0.9948116399729303,
"grad_norm": 0.977278470993042,
"learning_rate": 0.0002403248364538687,
"loss": 0.2014,
"mean_token_accuracy": 0.9406830132007599,
"num_tokens": 4495710.0,
"step": 4410
},
{
"epoch": 0.9970674486803519,
"grad_norm": 0.45597749948501587,
"learning_rate": 0.0002401894879314234,
"loss": 0.2089,
"mean_token_accuracy": 0.9395563662052154,
"num_tokens": 4505875.0,
"step": 4420
},
{
"epoch": 0.9993232573877735,
"grad_norm": 0.5130080580711365,
"learning_rate": 0.0002400541394089781,
"loss": 0.1854,
"mean_token_accuracy": 0.9489345788955689,
"num_tokens": 4516011.0,
"step": 4430
},
{
"epoch": 1.001579066095195,
"grad_norm": 0.3131079077720642,
"learning_rate": 0.0002399187908865328,
"loss": 0.1764,
"mean_token_accuracy": 0.9488461136817932,
"num_tokens": 4526245.0,
"step": 4440
},
{
"epoch": 1.0038348748026167,
"grad_norm": 0.8407638072967529,
"learning_rate": 0.0002397834423640875,
"loss": 0.1264,
"mean_token_accuracy": 0.9605377078056335,
"num_tokens": 4536415.0,
"step": 4450
},
{
"epoch": 1.0060906835100383,
"grad_norm": 0.5652706027030945,
"learning_rate": 0.00023964809384164221,
"loss": 0.1479,
"mean_token_accuracy": 0.9554095208644867,
"num_tokens": 4546571.0,
"step": 4460
},
{
"epoch": 1.0083464922174599,
"grad_norm": 0.920248806476593,
"learning_rate": 0.00023951274531919692,
"loss": 0.1819,
"mean_token_accuracy": 0.9463054478168488,
"num_tokens": 4556755.0,
"step": 4470
},
{
"epoch": 1.0106023009248817,
"grad_norm": 1.09098482131958,
"learning_rate": 0.00023937739679675163,
"loss": 0.188,
"mean_token_accuracy": 0.9430878520011902,
"num_tokens": 4566934.0,
"step": 4480
},
{
"epoch": 1.0128581096323033,
"grad_norm": 0.6064813137054443,
"learning_rate": 0.00023924204827430632,
"loss": 0.1509,
"mean_token_accuracy": 0.9536380171775818,
"num_tokens": 4577167.0,
"step": 4490
},
{
"epoch": 1.0151139183397249,
"grad_norm": 0.4210309386253357,
"learning_rate": 0.00023910669975186103,
"loss": 0.1356,
"mean_token_accuracy": 0.958275294303894,
"num_tokens": 4587306.0,
"step": 4500
},
{
"epoch": 1.0173697270471465,
"grad_norm": 0.6188949942588806,
"learning_rate": 0.00023897135122941574,
"loss": 0.1699,
"mean_token_accuracy": 0.9480436682701111,
"num_tokens": 4597517.0,
"step": 4510
},
{
"epoch": 1.019625535754568,
"grad_norm": 0.9677643775939941,
"learning_rate": 0.00023883600270697045,
"loss": 0.1729,
"mean_token_accuracy": 0.9452177286148071,
"num_tokens": 4607748.0,
"step": 4520
},
{
"epoch": 1.0218813444619896,
"grad_norm": 1.0022025108337402,
"learning_rate": 0.00023870065418452513,
"loss": 0.226,
"mean_token_accuracy": 0.9398165047168732,
"num_tokens": 4617907.0,
"step": 4530
},
{
"epoch": 1.0241371531694112,
"grad_norm": 0.5817978382110596,
"learning_rate": 0.00023856530566207984,
"loss": 0.1559,
"mean_token_accuracy": 0.9518312573432922,
"num_tokens": 4628126.0,
"step": 4540
},
{
"epoch": 1.0263929618768328,
"grad_norm": 0.5886203646659851,
"learning_rate": 0.00023842995713963455,
"loss": 0.194,
"mean_token_accuracy": 0.9445899367332459,
"num_tokens": 4638349.0,
"step": 4550
},
{
"epoch": 1.0286487705842544,
"grad_norm": 0.565995991230011,
"learning_rate": 0.00023829460861718926,
"loss": 0.1571,
"mean_token_accuracy": 0.9537485420703888,
"num_tokens": 4648556.0,
"step": 4560
},
{
"epoch": 1.030904579291676,
"grad_norm": 0.809699535369873,
"learning_rate": 0.00023815926009474394,
"loss": 0.1824,
"mean_token_accuracy": 0.9448329627513885,
"num_tokens": 4658749.0,
"step": 4570
},
{
"epoch": 1.0331603879990976,
"grad_norm": 0.5532633662223816,
"learning_rate": 0.00023802391157229865,
"loss": 0.199,
"mean_token_accuracy": 0.9399157762527466,
"num_tokens": 4668925.0,
"step": 4580
},
{
"epoch": 1.0354161967065192,
"grad_norm": 0.7670914530754089,
"learning_rate": 0.00023788856304985336,
"loss": 0.1814,
"mean_token_accuracy": 0.9444401502609253,
"num_tokens": 4679158.0,
"step": 4590
},
{
"epoch": 1.037672005413941,
"grad_norm": 0.24954642355442047,
"learning_rate": 0.00023775321452740807,
"loss": 0.1463,
"mean_token_accuracy": 0.9550969898700714,
"num_tokens": 4689389.0,
"step": 4600
},
{
"epoch": 1.0399278141213626,
"grad_norm": 0.5538378953933716,
"learning_rate": 0.00023761786600496275,
"loss": 0.1642,
"mean_token_accuracy": 0.9498744547367096,
"num_tokens": 4699613.0,
"step": 4610
},
{
"epoch": 1.0421836228287842,
"grad_norm": 0.5744291543960571,
"learning_rate": 0.00023748251748251746,
"loss": 0.1616,
"mean_token_accuracy": 0.9534083902835846,
"num_tokens": 4709817.0,
"step": 4620
},
{
"epoch": 1.0444394315362058,
"grad_norm": 0.6914228796958923,
"learning_rate": 0.00023734716896007217,
"loss": 0.1511,
"mean_token_accuracy": 0.9552951693534851,
"num_tokens": 4720047.0,
"step": 4630
},
{
"epoch": 1.0466952402436274,
"grad_norm": 0.7641374468803406,
"learning_rate": 0.00023721182043762688,
"loss": 0.1868,
"mean_token_accuracy": 0.9446970582008362,
"num_tokens": 4730215.0,
"step": 4640
},
{
"epoch": 1.048951048951049,
"grad_norm": 0.4645124673843384,
"learning_rate": 0.0002370764719151816,
"loss": 0.1831,
"mean_token_accuracy": 0.9454750001430512,
"num_tokens": 4740428.0,
"step": 4650
},
{
"epoch": 1.0512068576584706,
"grad_norm": 0.6806756854057312,
"learning_rate": 0.00023694112339273628,
"loss": 0.1451,
"mean_token_accuracy": 0.957587742805481,
"num_tokens": 4750619.0,
"step": 4660
},
{
"epoch": 1.0534626663658921,
"grad_norm": 0.6630045175552368,
"learning_rate": 0.00023680577487029099,
"loss": 0.1618,
"mean_token_accuracy": 0.950842559337616,
"num_tokens": 4760831.0,
"step": 4670
},
{
"epoch": 1.0557184750733137,
"grad_norm": 0.5879353284835815,
"learning_rate": 0.0002366704263478457,
"loss": 0.1502,
"mean_token_accuracy": 0.9544346511363984,
"num_tokens": 4771006.0,
"step": 4680
},
{
"epoch": 1.0579742837807353,
"grad_norm": 0.6933557987213135,
"learning_rate": 0.0002365350778254004,
"loss": 0.2272,
"mean_token_accuracy": 0.938098531961441,
"num_tokens": 4781241.0,
"step": 4690
},
{
"epoch": 1.060230092488157,
"grad_norm": 0.6140124201774597,
"learning_rate": 0.0002363997293029551,
"loss": 0.2053,
"mean_token_accuracy": 0.9433477580547333,
"num_tokens": 4791472.0,
"step": 4700
},
{
"epoch": 1.0624859011955785,
"grad_norm": 0.6066429615020752,
"learning_rate": 0.0002362643807805098,
"loss": 0.1449,
"mean_token_accuracy": 0.9551500737667084,
"num_tokens": 4801650.0,
"step": 4710
},
{
"epoch": 1.0647417099030003,
"grad_norm": 0.6566262245178223,
"learning_rate": 0.0002361290322580645,
"loss": 0.1428,
"mean_token_accuracy": 0.9594050168991088,
"num_tokens": 4811882.0,
"step": 4720
},
{
"epoch": 1.066997518610422,
"grad_norm": 2.896930694580078,
"learning_rate": 0.00023599368373561922,
"loss": 0.2053,
"mean_token_accuracy": 0.9418795704841614,
"num_tokens": 4821080.0,
"step": 4730
},
{
"epoch": 1.0692533273178435,
"grad_norm": 0.45278868079185486,
"learning_rate": 0.0002358583352131739,
"loss": 0.172,
"mean_token_accuracy": 0.9501473903656006,
"num_tokens": 4831283.0,
"step": 4740
},
{
"epoch": 1.071509136025265,
"grad_norm": 0.4739263951778412,
"learning_rate": 0.0002357229866907286,
"loss": 0.2002,
"mean_token_accuracy": 0.9419821977615357,
"num_tokens": 4841434.0,
"step": 4750
},
{
"epoch": 1.0737649447326867,
"grad_norm": 0.8713507652282715,
"learning_rate": 0.00023558763816828332,
"loss": 0.178,
"mean_token_accuracy": 0.9494198262691498,
"num_tokens": 4851630.0,
"step": 4760
},
{
"epoch": 1.0760207534401083,
"grad_norm": 0.6262771487236023,
"learning_rate": 0.00023545228964583803,
"loss": 0.1591,
"mean_token_accuracy": 0.9523061156272888,
"num_tokens": 4861774.0,
"step": 4770
},
{
"epoch": 1.0782765621475299,
"grad_norm": 0.6477741599082947,
"learning_rate": 0.0002353169411233927,
"loss": 0.203,
"mean_token_accuracy": 0.9406819343566895,
"num_tokens": 4872001.0,
"step": 4780
},
{
"epoch": 1.0805323708549515,
"grad_norm": 0.37195727229118347,
"learning_rate": 0.00023518159260094742,
"loss": 0.1458,
"mean_token_accuracy": 0.953748595714569,
"num_tokens": 4882143.0,
"step": 4790
},
{
"epoch": 1.082788179562373,
"grad_norm": 0.5744128227233887,
"learning_rate": 0.00023504624407850213,
"loss": 0.1852,
"mean_token_accuracy": 0.9459884941577912,
"num_tokens": 4892371.0,
"step": 4800
},
{
"epoch": 1.0850439882697946,
"grad_norm": 0.7021632194519043,
"learning_rate": 0.00023491089555605684,
"loss": 0.1736,
"mean_token_accuracy": 0.9457263708114624,
"num_tokens": 4902458.0,
"step": 4810
},
{
"epoch": 1.0872997969772162,
"grad_norm": 0.8485981822013855,
"learning_rate": 0.00023477554703361155,
"loss": 0.1775,
"mean_token_accuracy": 0.9493930280208588,
"num_tokens": 4912682.0,
"step": 4820
},
{
"epoch": 1.0895556056846378,
"grad_norm": 0.3264307975769043,
"learning_rate": 0.00023464019851116623,
"loss": 0.1782,
"mean_token_accuracy": 0.9486259996891022,
"num_tokens": 4922905.0,
"step": 4830
},
{
"epoch": 1.0918114143920596,
"grad_norm": 0.7745571136474609,
"learning_rate": 0.00023450484998872094,
"loss": 0.1528,
"mean_token_accuracy": 0.9531067311763763,
"num_tokens": 4933131.0,
"step": 4840
},
{
"epoch": 1.0940672230994812,
"grad_norm": 0.5187065601348877,
"learning_rate": 0.00023436950146627565,
"loss": 0.1421,
"mean_token_accuracy": 0.9564592063426971,
"num_tokens": 4943356.0,
"step": 4850
},
{
"epoch": 1.0963230318069028,
"grad_norm": 0.3341211676597595,
"learning_rate": 0.00023423415294383036,
"loss": 0.1483,
"mean_token_accuracy": 0.957532000541687,
"num_tokens": 4953583.0,
"step": 4860
},
{
"epoch": 1.0985788405143244,
"grad_norm": 0.6960042715072632,
"learning_rate": 0.00023409880442138502,
"loss": 0.1702,
"mean_token_accuracy": 0.950124329328537,
"num_tokens": 4963819.0,
"step": 4870
},
{
"epoch": 1.100834649221746,
"grad_norm": 0.5740098357200623,
"learning_rate": 0.00023396345589893976,
"loss": 0.1589,
"mean_token_accuracy": 0.9537279188632966,
"num_tokens": 4973668.0,
"step": 4880
},
{
"epoch": 1.1030904579291676,
"grad_norm": 0.7132306098937988,
"learning_rate": 0.00023382810737649447,
"loss": 0.1633,
"mean_token_accuracy": 0.9555535554885864,
"num_tokens": 4983783.0,
"step": 4890
},
{
"epoch": 1.1053462666365892,
"grad_norm": 0.5167339444160461,
"learning_rate": 0.00023369275885404918,
"loss": 0.2036,
"mean_token_accuracy": 0.9411836564540863,
"num_tokens": 4994007.0,
"step": 4900
},
{
"epoch": 1.1076020753440108,
"grad_norm": 0.4466972351074219,
"learning_rate": 0.00023355741033160383,
"loss": 0.167,
"mean_token_accuracy": 0.9488433957099914,
"num_tokens": 5004178.0,
"step": 4910
},
{
"epoch": 1.1098578840514324,
"grad_norm": 0.6669716835021973,
"learning_rate": 0.00023342206180915854,
"loss": 0.1465,
"mean_token_accuracy": 0.9539471685886383,
"num_tokens": 5014415.0,
"step": 4920
},
{
"epoch": 1.112113692758854,
"grad_norm": 0.5115512609481812,
"learning_rate": 0.00023328671328671328,
"loss": 0.1902,
"mean_token_accuracy": 0.94353746175766,
"num_tokens": 5024619.0,
"step": 4930
},
{
"epoch": 1.1143695014662756,
"grad_norm": 0.36398640275001526,
"learning_rate": 0.000233151364764268,
"loss": 0.1665,
"mean_token_accuracy": 0.951126629114151,
"num_tokens": 5034405.0,
"step": 4940
},
{
"epoch": 1.1166253101736974,
"grad_norm": 0.5980937480926514,
"learning_rate": 0.0002330160162418227,
"loss": 0.1722,
"mean_token_accuracy": 0.9499568104743957,
"num_tokens": 5044616.0,
"step": 4950
},
{
"epoch": 1.118881118881119,
"grad_norm": 0.7885400652885437,
"learning_rate": 0.00023288066771937735,
"loss": 0.1909,
"mean_token_accuracy": 0.9432080864906311,
"num_tokens": 5054818.0,
"step": 4960
},
{
"epoch": 1.1211369275885406,
"grad_norm": 0.7380874752998352,
"learning_rate": 0.0002327453191969321,
"loss": 0.1843,
"mean_token_accuracy": 0.9453416705131531,
"num_tokens": 5064910.0,
"step": 4970
},
{
"epoch": 1.1233927362959621,
"grad_norm": 0.46912482380867004,
"learning_rate": 0.0002326099706744868,
"loss": 0.1377,
"mean_token_accuracy": 0.9612941980361939,
"num_tokens": 5075124.0,
"step": 4980
},
{
"epoch": 1.1256485450033837,
"grad_norm": 0.7229384183883667,
"learning_rate": 0.0002324746221520415,
"loss": 0.1846,
"mean_token_accuracy": 0.9464283585548401,
"num_tokens": 5085268.0,
"step": 4990
},
{
"epoch": 1.1279043537108053,
"grad_norm": 0.6191129088401794,
"learning_rate": 0.00023233927362959617,
"loss": 0.1863,
"mean_token_accuracy": 0.9472849369049072,
"num_tokens": 5095405.0,
"step": 5000
},
{
"epoch": 1.130160162418227,
"grad_norm": 0.9127920866012573,
"learning_rate": 0.00023220392510715088,
"loss": 0.1621,
"mean_token_accuracy": 0.9521571576595307,
"num_tokens": 5105635.0,
"step": 5010
},
{
"epoch": 1.1324159711256485,
"grad_norm": 0.7310436367988586,
"learning_rate": 0.00023206857658470561,
"loss": 0.1762,
"mean_token_accuracy": 0.9507647454738617,
"num_tokens": 5115845.0,
"step": 5020
},
{
"epoch": 1.13467177983307,
"grad_norm": 0.6670770049095154,
"learning_rate": 0.00023193322806226032,
"loss": 0.1825,
"mean_token_accuracy": 0.9468953788280488,
"num_tokens": 5125969.0,
"step": 5030
},
{
"epoch": 1.1369275885404917,
"grad_norm": 0.970748245716095,
"learning_rate": 0.00023179787953981498,
"loss": 0.1982,
"mean_token_accuracy": 0.940336960554123,
"num_tokens": 5136130.0,
"step": 5040
},
{
"epoch": 1.1391833972479133,
"grad_norm": 0.6120514869689941,
"learning_rate": 0.0002316625310173697,
"loss": 0.1629,
"mean_token_accuracy": 0.9530200004577637,
"num_tokens": 5146326.0,
"step": 5050
},
{
"epoch": 1.141439205955335,
"grad_norm": 0.6671406030654907,
"learning_rate": 0.0002315271824949244,
"loss": 0.1627,
"mean_token_accuracy": 0.9505042195320129,
"num_tokens": 5156501.0,
"step": 5060
},
{
"epoch": 1.1436950146627567,
"grad_norm": 0.6213026642799377,
"learning_rate": 0.00023139183397247914,
"loss": 0.1497,
"mean_token_accuracy": 0.9575595736503602,
"num_tokens": 5166737.0,
"step": 5070
},
{
"epoch": 1.1459508233701783,
"grad_norm": 0.8337035179138184,
"learning_rate": 0.0002312564854500338,
"loss": 0.1522,
"mean_token_accuracy": 0.9550527095794678,
"num_tokens": 5176918.0,
"step": 5080
},
{
"epoch": 1.1482066320775999,
"grad_norm": 0.5575427412986755,
"learning_rate": 0.0002311211369275885,
"loss": 0.1688,
"mean_token_accuracy": 0.9502976775169373,
"num_tokens": 5187057.0,
"step": 5090
},
{
"epoch": 1.1504624407850215,
"grad_norm": 0.6455193758010864,
"learning_rate": 0.0002309857884051432,
"loss": 0.1778,
"mean_token_accuracy": 0.9466927468776702,
"num_tokens": 5197290.0,
"step": 5100
},
{
"epoch": 1.152718249492443,
"grad_norm": 0.5528632402420044,
"learning_rate": 0.00023085043988269795,
"loss": 0.2195,
"mean_token_accuracy": 0.9405810832977295,
"num_tokens": 5207449.0,
"step": 5110
},
{
"epoch": 1.1549740581998647,
"grad_norm": 0.5751885771751404,
"learning_rate": 0.00023071509136025266,
"loss": 0.1519,
"mean_token_accuracy": 0.9569954037666321,
"num_tokens": 5217669.0,
"step": 5120
},
{
"epoch": 1.1572298669072862,
"grad_norm": 0.7101620435714722,
"learning_rate": 0.00023057974283780731,
"loss": 0.1448,
"mean_token_accuracy": 0.9569979846477509,
"num_tokens": 5227874.0,
"step": 5130
},
{
"epoch": 1.1594856756147078,
"grad_norm": 0.684057354927063,
"learning_rate": 0.00023044439431536202,
"loss": 0.137,
"mean_token_accuracy": 0.959661203622818,
"num_tokens": 5238096.0,
"step": 5140
},
{
"epoch": 1.1617414843221294,
"grad_norm": 0.6585609912872314,
"learning_rate": 0.00023030904579291673,
"loss": 0.1685,
"mean_token_accuracy": 0.950641131401062,
"num_tokens": 5248279.0,
"step": 5150
},
{
"epoch": 1.163997293029551,
"grad_norm": 0.8113618493080139,
"learning_rate": 0.00023017369727047147,
"loss": 0.1723,
"mean_token_accuracy": 0.9519901633262634,
"num_tokens": 5258400.0,
"step": 5160
},
{
"epoch": 1.1662531017369726,
"grad_norm": 0.4892439842224121,
"learning_rate": 0.00023003834874802613,
"loss": 0.154,
"mean_token_accuracy": 0.9551227807998657,
"num_tokens": 5268609.0,
"step": 5170
},
{
"epoch": 1.1685089104443942,
"grad_norm": 0.7058376669883728,
"learning_rate": 0.00022990300022558084,
"loss": 0.2199,
"mean_token_accuracy": 0.9406746447086334,
"num_tokens": 5278776.0,
"step": 5180
},
{
"epoch": 1.170764719151816,
"grad_norm": 0.3844442367553711,
"learning_rate": 0.00022976765170313555,
"loss": 0.208,
"mean_token_accuracy": 0.9386930227279663,
"num_tokens": 5289013.0,
"step": 5190
},
{
"epoch": 1.1730205278592376,
"grad_norm": 0.7708050608634949,
"learning_rate": 0.00022963230318069026,
"loss": 0.1549,
"mean_token_accuracy": 0.9572183132171631,
"num_tokens": 5299229.0,
"step": 5200
},
{
"epoch": 1.1752763365666592,
"grad_norm": 0.9352420568466187,
"learning_rate": 0.00022949695465824494,
"loss": 0.1566,
"mean_token_accuracy": 0.9551738202571869,
"num_tokens": 5309341.0,
"step": 5210
},
{
"epoch": 1.1775321452740808,
"grad_norm": 0.7025801539421082,
"learning_rate": 0.00022936160613579965,
"loss": 0.1285,
"mean_token_accuracy": 0.9594053149223327,
"num_tokens": 5319543.0,
"step": 5220
},
{
"epoch": 1.1797879539815024,
"grad_norm": 0.5407170057296753,
"learning_rate": 0.00022922625761335436,
"loss": 0.1351,
"mean_token_accuracy": 0.9600762069225312,
"num_tokens": 5329768.0,
"step": 5230
},
{
"epoch": 1.182043762688924,
"grad_norm": 0.8326946496963501,
"learning_rate": 0.00022909090909090907,
"loss": 0.194,
"mean_token_accuracy": 0.9461723744869233,
"num_tokens": 5339915.0,
"step": 5240
},
{
"epoch": 1.1842995713963456,
"grad_norm": 0.7867940664291382,
"learning_rate": 0.00022895556056846375,
"loss": 0.2338,
"mean_token_accuracy": 0.9370923578739166,
"num_tokens": 5349979.0,
"step": 5250
},
{
"epoch": 1.1865553801037672,
"grad_norm": 0.5456581711769104,
"learning_rate": 0.00022882021204601846,
"loss": 0.1983,
"mean_token_accuracy": 0.9475960373878479,
"num_tokens": 5360121.0,
"step": 5260
},
{
"epoch": 1.1888111888111887,
"grad_norm": 0.6422356367111206,
"learning_rate": 0.00022868486352357317,
"loss": 0.1762,
"mean_token_accuracy": 0.9557766139507293,
"num_tokens": 5370302.0,
"step": 5270
},
{
"epoch": 1.1910669975186103,
"grad_norm": 0.5232251286506653,
"learning_rate": 0.00022854951500112788,
"loss": 0.1307,
"mean_token_accuracy": 0.9604880928993225,
"num_tokens": 5380536.0,
"step": 5280
},
{
"epoch": 1.193322806226032,
"grad_norm": 0.5937042832374573,
"learning_rate": 0.0002284141664786826,
"loss": 0.1605,
"mean_token_accuracy": 0.9531785011291504,
"num_tokens": 5390757.0,
"step": 5290
},
{
"epoch": 1.1955786149334537,
"grad_norm": 0.6249987483024597,
"learning_rate": 0.00022827881795623727,
"loss": 0.1344,
"mean_token_accuracy": 0.959484601020813,
"num_tokens": 5400977.0,
"step": 5300
},
{
"epoch": 1.1978344236408753,
"grad_norm": 0.7910540699958801,
"learning_rate": 0.00022814346943379198,
"loss": 0.1517,
"mean_token_accuracy": 0.9554309785366059,
"num_tokens": 5411187.0,
"step": 5310
},
{
"epoch": 1.200090232348297,
"grad_norm": 0.9017763137817383,
"learning_rate": 0.0002280081209113467,
"loss": 0.2021,
"mean_token_accuracy": 0.9464009702205658,
"num_tokens": 5421260.0,
"step": 5320
},
{
"epoch": 1.2023460410557185,
"grad_norm": 0.4037693738937378,
"learning_rate": 0.0002278727723889014,
"loss": 0.1439,
"mean_token_accuracy": 0.9570986866950989,
"num_tokens": 5431402.0,
"step": 5330
},
{
"epoch": 1.20460184976314,
"grad_norm": 0.36305737495422363,
"learning_rate": 0.00022773742386645609,
"loss": 0.1387,
"mean_token_accuracy": 0.9598490238189697,
"num_tokens": 5441591.0,
"step": 5340
},
{
"epoch": 1.2068576584705617,
"grad_norm": 0.8021289110183716,
"learning_rate": 0.0002276020753440108,
"loss": 0.1593,
"mean_token_accuracy": 0.9541407644748687,
"num_tokens": 5451828.0,
"step": 5350
},
{
"epoch": 1.2091134671779833,
"grad_norm": 0.8966614603996277,
"learning_rate": 0.0002274667268215655,
"loss": 0.1826,
"mean_token_accuracy": 0.9492299973964691,
"num_tokens": 5462020.0,
"step": 5360
},
{
"epoch": 1.2113692758854049,
"grad_norm": 0.5038102269172668,
"learning_rate": 0.00022733137829912022,
"loss": 0.1776,
"mean_token_accuracy": 0.9493005454540253,
"num_tokens": 5472213.0,
"step": 5370
},
{
"epoch": 1.2136250845928265,
"grad_norm": 0.8426568508148193,
"learning_rate": 0.0002271960297766749,
"loss": 0.1681,
"mean_token_accuracy": 0.9485830664634705,
"num_tokens": 5482433.0,
"step": 5380
},
{
"epoch": 1.215880893300248,
"grad_norm": 0.5926401019096375,
"learning_rate": 0.0002270606812542296,
"loss": 0.1725,
"mean_token_accuracy": 0.9492874026298523,
"num_tokens": 5492653.0,
"step": 5390
},
{
"epoch": 1.2181367020076697,
"grad_norm": 0.7410178184509277,
"learning_rate": 0.00022692533273178432,
"loss": 0.1678,
"mean_token_accuracy": 0.9544486105442047,
"num_tokens": 5502830.0,
"step": 5400
},
{
"epoch": 1.2203925107150915,
"grad_norm": 0.5833625793457031,
"learning_rate": 0.00022678998420933903,
"loss": 0.195,
"mean_token_accuracy": 0.9458029627799988,
"num_tokens": 5513013.0,
"step": 5410
},
{
"epoch": 1.222648319422513,
"grad_norm": 0.753453254699707,
"learning_rate": 0.0002266546356868937,
"loss": 0.1485,
"mean_token_accuracy": 0.9566802024841309,
"num_tokens": 5523249.0,
"step": 5420
},
{
"epoch": 1.2249041281299347,
"grad_norm": 0.8071026802062988,
"learning_rate": 0.00022651928716444842,
"loss": 0.162,
"mean_token_accuracy": 0.9501352787017823,
"num_tokens": 5533427.0,
"step": 5430
},
{
"epoch": 1.2271599368373562,
"grad_norm": 0.6357564926147461,
"learning_rate": 0.00022638393864200313,
"loss": 0.1418,
"mean_token_accuracy": 0.9574578821659088,
"num_tokens": 5543579.0,
"step": 5440
},
{
"epoch": 1.2294157455447778,
"grad_norm": 0.8859591484069824,
"learning_rate": 0.00022624859011955784,
"loss": 0.1743,
"mean_token_accuracy": 0.950443959236145,
"num_tokens": 5553797.0,
"step": 5450
},
{
"epoch": 1.2316715542521994,
"grad_norm": 0.6225905418395996,
"learning_rate": 0.00022611324159711255,
"loss": 0.1332,
"mean_token_accuracy": 0.9579242885112762,
"num_tokens": 5563967.0,
"step": 5460
},
{
"epoch": 1.233927362959621,
"grad_norm": 0.6247864961624146,
"learning_rate": 0.00022597789307466723,
"loss": 0.1506,
"mean_token_accuracy": 0.9570437967777252,
"num_tokens": 5574167.0,
"step": 5470
},
{
"epoch": 1.2361831716670426,
"grad_norm": 0.8502485752105713,
"learning_rate": 0.00022584254455222194,
"loss": 0.1444,
"mean_token_accuracy": 0.9581893026828766,
"num_tokens": 5584392.0,
"step": 5480
},
{
"epoch": 1.2384389803744642,
"grad_norm": 0.5714454054832458,
"learning_rate": 0.00022570719602977665,
"loss": 0.1446,
"mean_token_accuracy": 0.9594759464263916,
"num_tokens": 5594584.0,
"step": 5490
},
{
"epoch": 1.2406947890818858,
"grad_norm": 0.6027330756187439,
"learning_rate": 0.00022557184750733136,
"loss": 0.141,
"mean_token_accuracy": 0.9551207423210144,
"num_tokens": 5604775.0,
"step": 5500
},
{
"epoch": 1.2429505977893074,
"grad_norm": 0.6891913414001465,
"learning_rate": 0.00022543649898488605,
"loss": 0.1513,
"mean_token_accuracy": 0.9528867900371552,
"num_tokens": 5614990.0,
"step": 5510
},
{
"epoch": 1.2452064064967292,
"grad_norm": 0.5027481913566589,
"learning_rate": 0.00022530115046244075,
"loss": 0.1628,
"mean_token_accuracy": 0.9531277477741241,
"num_tokens": 5625183.0,
"step": 5520
},
{
"epoch": 1.2474622152041506,
"grad_norm": 0.4770359694957733,
"learning_rate": 0.00022516580193999546,
"loss": 0.1523,
"mean_token_accuracy": 0.9534553825855255,
"num_tokens": 5635367.0,
"step": 5530
},
{
"epoch": 1.2497180239115724,
"grad_norm": 0.8309736847877502,
"learning_rate": 0.00022503045341755017,
"loss": 0.1461,
"mean_token_accuracy": 0.9559681415557861,
"num_tokens": 5645594.0,
"step": 5540
},
{
"epoch": 1.251973832618994,
"grad_norm": 0.8505803942680359,
"learning_rate": 0.00022489510489510486,
"loss": 0.1787,
"mean_token_accuracy": 0.9484643459320068,
"num_tokens": 5655750.0,
"step": 5550
},
{
"epoch": 1.2542296413264156,
"grad_norm": 0.47796082496643066,
"learning_rate": 0.00022475975637265957,
"loss": 0.1481,
"mean_token_accuracy": 0.9564223170280457,
"num_tokens": 5665927.0,
"step": 5560
},
{
"epoch": 1.2564854500338372,
"grad_norm": 0.5004701018333435,
"learning_rate": 0.00022462440785021428,
"loss": 0.1376,
"mean_token_accuracy": 0.9571360588073731,
"num_tokens": 5676155.0,
"step": 5570
},
{
"epoch": 1.2587412587412588,
"grad_norm": 0.603234052658081,
"learning_rate": 0.000224489059327769,
"loss": 0.1299,
"mean_token_accuracy": 0.9614892840385437,
"num_tokens": 5686280.0,
"step": 5580
},
{
"epoch": 1.2609970674486803,
"grad_norm": 0.6253412961959839,
"learning_rate": 0.0002243537108053237,
"loss": 0.1412,
"mean_token_accuracy": 0.9608110129833222,
"num_tokens": 5696432.0,
"step": 5590
},
{
"epoch": 1.263252876156102,
"grad_norm": 0.46702322363853455,
"learning_rate": 0.00022421836228287838,
"loss": 0.1521,
"mean_token_accuracy": 0.9547717452049256,
"num_tokens": 5706592.0,
"step": 5600
},
{
"epoch": 1.2655086848635235,
"grad_norm": 0.9571554064750671,
"learning_rate": 0.0002240830137604331,
"loss": 0.1641,
"mean_token_accuracy": 0.9535882234573364,
"num_tokens": 5716770.0,
"step": 5610
},
{
"epoch": 1.2677644935709451,
"grad_norm": 0.5119358897209167,
"learning_rate": 0.0002239476652379878,
"loss": 0.1245,
"mean_token_accuracy": 0.9615546584129333,
"num_tokens": 5726960.0,
"step": 5620
},
{
"epoch": 1.270020302278367,
"grad_norm": 0.7958256602287292,
"learning_rate": 0.0002238123167155425,
"loss": 0.1158,
"mean_token_accuracy": 0.9650822639465332,
"num_tokens": 5737064.0,
"step": 5630
},
{
"epoch": 1.2722761109857883,
"grad_norm": 0.5314520001411438,
"learning_rate": 0.0002236769681930972,
"loss": 0.133,
"mean_token_accuracy": 0.9586548745632172,
"num_tokens": 5747248.0,
"step": 5640
},
{
"epoch": 1.2745319196932101,
"grad_norm": 0.578899085521698,
"learning_rate": 0.0002235416196706519,
"loss": 0.1621,
"mean_token_accuracy": 0.9567394137382508,
"num_tokens": 5757414.0,
"step": 5650
},
{
"epoch": 1.2767877284006317,
"grad_norm": 0.5973330140113831,
"learning_rate": 0.0002234062711482066,
"loss": 0.1489,
"mean_token_accuracy": 0.9594153523445129,
"num_tokens": 5767569.0,
"step": 5660
},
{
"epoch": 1.2790435371080533,
"grad_norm": 0.7263054251670837,
"learning_rate": 0.00022327092262576132,
"loss": 0.1831,
"mean_token_accuracy": 0.9471005976200104,
"num_tokens": 5777761.0,
"step": 5670
},
{
"epoch": 1.2812993458154749,
"grad_norm": 0.5503787994384766,
"learning_rate": 0.000223135574103316,
"loss": 0.1487,
"mean_token_accuracy": 0.9587855756282806,
"num_tokens": 5787938.0,
"step": 5680
},
{
"epoch": 1.2835551545228965,
"grad_norm": 0.974582314491272,
"learning_rate": 0.00022300022558087071,
"loss": 0.114,
"mean_token_accuracy": 0.9670296013355255,
"num_tokens": 5798169.0,
"step": 5690
},
{
"epoch": 1.285810963230318,
"grad_norm": 0.5530260801315308,
"learning_rate": 0.00022286487705842542,
"loss": 0.139,
"mean_token_accuracy": 0.958290958404541,
"num_tokens": 5808388.0,
"step": 5700
},
{
"epoch": 1.2880667719377397,
"grad_norm": 0.614759087562561,
"learning_rate": 0.00022272952853598013,
"loss": 0.1319,
"mean_token_accuracy": 0.960501229763031,
"num_tokens": 5818546.0,
"step": 5710
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.561374306678772,
"learning_rate": 0.00022259418001353482,
"loss": 0.1439,
"mean_token_accuracy": 0.9606938123703003,
"num_tokens": 5828710.0,
"step": 5720
},
{
"epoch": 1.2925783893525828,
"grad_norm": 1.0184314250946045,
"learning_rate": 0.00022245883149108953,
"loss": 0.1619,
"mean_token_accuracy": 0.954375433921814,
"num_tokens": 5838907.0,
"step": 5730
},
{
"epoch": 1.2948341980600044,
"grad_norm": 0.4517951011657715,
"learning_rate": 0.00022232348296864424,
"loss": 0.1599,
"mean_token_accuracy": 0.9561059713363648,
"num_tokens": 5848145.0,
"step": 5740
},
{
"epoch": 1.297090006767426,
"grad_norm": 0.6121286749839783,
"learning_rate": 0.00022218813444619895,
"loss": 0.1724,
"mean_token_accuracy": 0.9532230257987976,
"num_tokens": 5858356.0,
"step": 5750
},
{
"epoch": 1.2993458154748478,
"grad_norm": 0.5834189057350159,
"learning_rate": 0.00022205278592375366,
"loss": 0.1724,
"mean_token_accuracy": 0.9492375791072846,
"num_tokens": 5868590.0,
"step": 5760
},
{
"epoch": 1.3016016241822692,
"grad_norm": 0.6871075630187988,
"learning_rate": 0.00022191743740130834,
"loss": 0.1496,
"mean_token_accuracy": 0.9571017861366272,
"num_tokens": 5878761.0,
"step": 5770
},
{
"epoch": 1.303857432889691,
"grad_norm": 0.6542716026306152,
"learning_rate": 0.00022178208887886305,
"loss": 0.154,
"mean_token_accuracy": 0.9534028053283692,
"num_tokens": 5888986.0,
"step": 5780
},
{
"epoch": 1.3061132415971126,
"grad_norm": 0.6185471415519714,
"learning_rate": 0.00022164674035641776,
"loss": 0.1868,
"mean_token_accuracy": 0.9496482908725739,
"num_tokens": 5899224.0,
"step": 5790
},
{
"epoch": 1.3083690503045342,
"grad_norm": 0.7506686449050903,
"learning_rate": 0.00022151139183397247,
"loss": 0.1572,
"mean_token_accuracy": 0.9547667086124421,
"num_tokens": 5909447.0,
"step": 5800
},
{
"epoch": 1.3106248590119558,
"grad_norm": 0.5120002031326294,
"learning_rate": 0.00022137604331152715,
"loss": 0.1437,
"mean_token_accuracy": 0.9579054772853851,
"num_tokens": 5919631.0,
"step": 5810
},
{
"epoch": 1.3128806677193774,
"grad_norm": 0.4987918436527252,
"learning_rate": 0.00022124069478908186,
"loss": 0.1623,
"mean_token_accuracy": 0.9541470289230347,
"num_tokens": 5929849.0,
"step": 5820
},
{
"epoch": 1.315136476426799,
"grad_norm": 0.5856090784072876,
"learning_rate": 0.00022110534626663657,
"loss": 0.1631,
"mean_token_accuracy": 0.9521755337715149,
"num_tokens": 5940060.0,
"step": 5830
},
{
"epoch": 1.3173922851342206,
"grad_norm": 0.5429736971855164,
"learning_rate": 0.00022096999774419128,
"loss": 0.1354,
"mean_token_accuracy": 0.9596905529499054,
"num_tokens": 5950290.0,
"step": 5840
},
{
"epoch": 1.3196480938416422,
"grad_norm": 0.8014978170394897,
"learning_rate": 0.00022083464922174596,
"loss": 0.1462,
"mean_token_accuracy": 0.9545858979225159,
"num_tokens": 5960485.0,
"step": 5850
},
{
"epoch": 1.3219039025490638,
"grad_norm": 0.6086113452911377,
"learning_rate": 0.00022069930069930067,
"loss": 0.1698,
"mean_token_accuracy": 0.9500519633293152,
"num_tokens": 5970670.0,
"step": 5860
},
{
"epoch": 1.3241597112564856,
"grad_norm": 0.9041001200675964,
"learning_rate": 0.00022056395217685538,
"loss": 0.1866,
"mean_token_accuracy": 0.9488508880138398,
"num_tokens": 5980902.0,
"step": 5870
},
{
"epoch": 1.326415519963907,
"grad_norm": 0.9248059391975403,
"learning_rate": 0.0002204286036544101,
"loss": 0.157,
"mean_token_accuracy": 0.9538485944271088,
"num_tokens": 5991043.0,
"step": 5880
},
{
"epoch": 1.3286713286713288,
"grad_norm": 0.3965446650981903,
"learning_rate": 0.00022029325513196478,
"loss": 0.1668,
"mean_token_accuracy": 0.9556223392486572,
"num_tokens": 6001261.0,
"step": 5890
},
{
"epoch": 1.3309271373787503,
"grad_norm": 0.4829697906970978,
"learning_rate": 0.00022015790660951949,
"loss": 0.137,
"mean_token_accuracy": 0.9602304875850678,
"num_tokens": 6011366.0,
"step": 5900
},
{
"epoch": 1.333182946086172,
"grad_norm": 0.7699381709098816,
"learning_rate": 0.0002200225580870742,
"loss": 0.1797,
"mean_token_accuracy": 0.949476546049118,
"num_tokens": 6021531.0,
"step": 5910
},
{
"epoch": 1.3354387547935935,
"grad_norm": 0.3457069396972656,
"learning_rate": 0.0002198872095646289,
"loss": 0.136,
"mean_token_accuracy": 0.9613852322101593,
"num_tokens": 6031652.0,
"step": 5920
},
{
"epoch": 1.3376945635010151,
"grad_norm": 0.7896465063095093,
"learning_rate": 0.00021975186104218362,
"loss": 0.1561,
"mean_token_accuracy": 0.95490260720253,
"num_tokens": 6041831.0,
"step": 5930
},
{
"epoch": 1.3399503722084367,
"grad_norm": 0.5957865118980408,
"learning_rate": 0.0002196165125197383,
"loss": 0.1384,
"mean_token_accuracy": 0.9616434097290039,
"num_tokens": 6052062.0,
"step": 5940
},
{
"epoch": 1.3422061809158583,
"grad_norm": 0.986088216304779,
"learning_rate": 0.000219481163997293,
"loss": 0.1509,
"mean_token_accuracy": 0.9559194803237915,
"num_tokens": 6062238.0,
"step": 5950
},
{
"epoch": 1.34446198962328,
"grad_norm": 0.6079812049865723,
"learning_rate": 0.00021934581547484772,
"loss": 0.1378,
"mean_token_accuracy": 0.9577659904956818,
"num_tokens": 6072477.0,
"step": 5960
},
{
"epoch": 1.3467177983307015,
"grad_norm": 0.8939962387084961,
"learning_rate": 0.00021921046695240243,
"loss": 0.161,
"mean_token_accuracy": 0.9524900197982789,
"num_tokens": 6082680.0,
"step": 5970
},
{
"epoch": 1.3489736070381233,
"grad_norm": 0.7223296761512756,
"learning_rate": 0.0002190751184299571,
"loss": 0.2035,
"mean_token_accuracy": 0.948406583070755,
"num_tokens": 6092874.0,
"step": 5980
},
{
"epoch": 1.3512294157455447,
"grad_norm": 0.6591929793357849,
"learning_rate": 0.00021893976990751182,
"loss": 0.1405,
"mean_token_accuracy": 0.95966557264328,
"num_tokens": 6103032.0,
"step": 5990
},
{
"epoch": 1.3534852244529665,
"grad_norm": 0.9769535064697266,
"learning_rate": 0.00021880442138506653,
"loss": 0.1602,
"mean_token_accuracy": 0.9543199658393859,
"num_tokens": 6113265.0,
"step": 6000
},
{
"epoch": 1.355741033160388,
"grad_norm": 0.6889714598655701,
"learning_rate": 0.00021866907286262124,
"loss": 0.1598,
"mean_token_accuracy": 0.9535452902317048,
"num_tokens": 6123472.0,
"step": 6010
},
{
"epoch": 1.3579968418678097,
"grad_norm": 0.6904102563858032,
"learning_rate": 0.00021853372434017592,
"loss": 0.1482,
"mean_token_accuracy": 0.9565614700317383,
"num_tokens": 6133638.0,
"step": 6020
},
{
"epoch": 1.3602526505752313,
"grad_norm": 1.0406423807144165,
"learning_rate": 0.00021839837581773063,
"loss": 0.1532,
"mean_token_accuracy": 0.9554621934890747,
"num_tokens": 6143811.0,
"step": 6030
},
{
"epoch": 1.3625084592826529,
"grad_norm": 0.5132746696472168,
"learning_rate": 0.00021826302729528534,
"loss": 0.1688,
"mean_token_accuracy": 0.9485044598579406,
"num_tokens": 6154026.0,
"step": 6040
},
{
"epoch": 1.3647642679900744,
"grad_norm": 0.7688167095184326,
"learning_rate": 0.00021812767877284005,
"loss": 0.1777,
"mean_token_accuracy": 0.9504620730876923,
"num_tokens": 6164212.0,
"step": 6050
},
{
"epoch": 1.367020076697496,
"grad_norm": 0.8709307312965393,
"learning_rate": 0.00021799233025039474,
"loss": 0.2142,
"mean_token_accuracy": 0.9431596338748932,
"num_tokens": 6174378.0,
"step": 6060
},
{
"epoch": 1.3692758854049176,
"grad_norm": 0.4955613911151886,
"learning_rate": 0.00021785698172794945,
"loss": 0.1385,
"mean_token_accuracy": 0.9571300268173217,
"num_tokens": 6184541.0,
"step": 6070
},
{
"epoch": 1.3715316941123392,
"grad_norm": 0.9195837378501892,
"learning_rate": 0.00021772163320550416,
"loss": 0.151,
"mean_token_accuracy": 0.9560777604579925,
"num_tokens": 6194772.0,
"step": 6080
},
{
"epoch": 1.3737875028197608,
"grad_norm": 0.30228182673454285,
"learning_rate": 0.00021758628468305886,
"loss": 0.1203,
"mean_token_accuracy": 0.9646747708320618,
"num_tokens": 6205001.0,
"step": 6090
},
{
"epoch": 1.3760433115271824,
"grad_norm": 1.0205188989639282,
"learning_rate": 0.00021745093616061357,
"loss": 0.1578,
"mean_token_accuracy": 0.9546098172664642,
"num_tokens": 6215129.0,
"step": 6100
},
{
"epoch": 1.3782991202346042,
"grad_norm": 0.45732060074806213,
"learning_rate": 0.00021731558763816826,
"loss": 0.1372,
"mean_token_accuracy": 0.9615266501903534,
"num_tokens": 6225355.0,
"step": 6110
},
{
"epoch": 1.3805549289420256,
"grad_norm": 0.6275461316108704,
"learning_rate": 0.00021718023911572297,
"loss": 0.1343,
"mean_token_accuracy": 0.9584670960903168,
"num_tokens": 6235539.0,
"step": 6120
},
{
"epoch": 1.3828107376494474,
"grad_norm": 0.724653959274292,
"learning_rate": 0.00021704489059327768,
"loss": 0.1223,
"mean_token_accuracy": 0.9638613402843476,
"num_tokens": 6245761.0,
"step": 6130
},
{
"epoch": 1.385066546356869,
"grad_norm": 0.6194751858711243,
"learning_rate": 0.0002169095420708324,
"loss": 0.1206,
"mean_token_accuracy": 0.9652007341384887,
"num_tokens": 6255660.0,
"step": 6140
},
{
"epoch": 1.3873223550642906,
"grad_norm": 1.0562235116958618,
"learning_rate": 0.00021677419354838707,
"loss": 0.2017,
"mean_token_accuracy": 0.9480350255966187,
"num_tokens": 6265817.0,
"step": 6150
},
{
"epoch": 1.3895781637717122,
"grad_norm": 0.23528407514095306,
"learning_rate": 0.00021663884502594178,
"loss": 0.1662,
"mean_token_accuracy": 0.9554067730903626,
"num_tokens": 6275388.0,
"step": 6160
},
{
"epoch": 1.3918339724791338,
"grad_norm": 0.9895961284637451,
"learning_rate": 0.0002165034965034965,
"loss": 0.1573,
"mean_token_accuracy": 0.9531720578670502,
"num_tokens": 6285627.0,
"step": 6170
},
{
"epoch": 1.3940897811865554,
"grad_norm": 0.5095604062080383,
"learning_rate": 0.0002163681479810512,
"loss": 0.1517,
"mean_token_accuracy": 0.9555117368698121,
"num_tokens": 6295805.0,
"step": 6180
},
{
"epoch": 1.396345589893977,
"grad_norm": 0.787956953048706,
"learning_rate": 0.00021623279945860588,
"loss": 0.1872,
"mean_token_accuracy": 0.9512019693851471,
"num_tokens": 6306039.0,
"step": 6190
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.7012321352958679,
"learning_rate": 0.0002160974509361606,
"loss": 0.1559,
"mean_token_accuracy": 0.9556455969810486,
"num_tokens": 6316186.0,
"step": 6200
},
{
"epoch": 1.4008572073088201,
"grad_norm": 0.5284978747367859,
"learning_rate": 0.0002159621024137153,
"loss": 0.13,
"mean_token_accuracy": 0.9619249105453491,
"num_tokens": 6326413.0,
"step": 6210
},
{
"epoch": 1.403113016016242,
"grad_norm": 0.45027801394462585,
"learning_rate": 0.00021582675389127,
"loss": 0.1408,
"mean_token_accuracy": 0.9602871358394622,
"num_tokens": 6336576.0,
"step": 6220
},
{
"epoch": 1.4053688247236633,
"grad_norm": 0.5853649973869324,
"learning_rate": 0.00021569140536882472,
"loss": 0.1737,
"mean_token_accuracy": 0.9547618448734283,
"num_tokens": 6346767.0,
"step": 6230
},
{
"epoch": 1.4076246334310851,
"grad_norm": 0.4630914628505707,
"learning_rate": 0.0002155560568463794,
"loss": 0.1327,
"mean_token_accuracy": 0.9596076071262359,
"num_tokens": 6356979.0,
"step": 6240
},
{
"epoch": 1.4098804421385067,
"grad_norm": 0.6433979868888855,
"learning_rate": 0.00021542070832393411,
"loss": 0.1746,
"mean_token_accuracy": 0.9512283504009247,
"num_tokens": 6367167.0,
"step": 6250
},
{
"epoch": 1.4121362508459283,
"grad_norm": 0.649756908416748,
"learning_rate": 0.00021528535980148882,
"loss": 0.1216,
"mean_token_accuracy": 0.9655301630496979,
"num_tokens": 6377389.0,
"step": 6260
},
{
"epoch": 1.41439205955335,
"grad_norm": 0.4103543162345886,
"learning_rate": 0.00021515001127904353,
"loss": 0.153,
"mean_token_accuracy": 0.9575583398342132,
"num_tokens": 6387589.0,
"step": 6270
},
{
"epoch": 1.4166478682607715,
"grad_norm": 0.7882847189903259,
"learning_rate": 0.00021501466275659822,
"loss": 0.1386,
"mean_token_accuracy": 0.9569086253643035,
"num_tokens": 6397755.0,
"step": 6280
},
{
"epoch": 1.418903676968193,
"grad_norm": 0.8500162363052368,
"learning_rate": 0.00021487931423415293,
"loss": 0.1614,
"mean_token_accuracy": 0.9510623872280121,
"num_tokens": 6407938.0,
"step": 6290
},
{
"epoch": 1.4211594856756147,
"grad_norm": 0.5798035264015198,
"learning_rate": 0.00021474396571170764,
"loss": 0.121,
"mean_token_accuracy": 0.9637139558792114,
"num_tokens": 6418138.0,
"step": 6300
},
{
"epoch": 1.4234152943830363,
"grad_norm": 0.44802120327949524,
"learning_rate": 0.00021460861718926235,
"loss": 0.1261,
"mean_token_accuracy": 0.9629803776741028,
"num_tokens": 6428343.0,
"step": 6310
},
{
"epoch": 1.4256711030904579,
"grad_norm": 0.7741478085517883,
"learning_rate": 0.00021447326866681703,
"loss": 0.1576,
"mean_token_accuracy": 0.9575680911540985,
"num_tokens": 6438530.0,
"step": 6320
},
{
"epoch": 1.4279269117978797,
"grad_norm": 0.7088395357131958,
"learning_rate": 0.00021433792014437174,
"loss": 0.1479,
"mean_token_accuracy": 0.9561203837394714,
"num_tokens": 6448706.0,
"step": 6330
},
{
"epoch": 1.430182720505301,
"grad_norm": 0.8038097023963928,
"learning_rate": 0.00021420257162192645,
"loss": 0.1493,
"mean_token_accuracy": 0.9569381237030029,
"num_tokens": 6458868.0,
"step": 6340
},
{
"epoch": 1.4324385292127229,
"grad_norm": 0.6985066533088684,
"learning_rate": 0.00021406722309948116,
"loss": 0.1469,
"mean_token_accuracy": 0.9614906787872315,
"num_tokens": 6469064.0,
"step": 6350
},
{
"epoch": 1.4346943379201444,
"grad_norm": 0.5924888849258423,
"learning_rate": 0.00021393187457703584,
"loss": 0.1702,
"mean_token_accuracy": 0.9545105516910553,
"num_tokens": 6479264.0,
"step": 6360
},
{
"epoch": 1.436950146627566,
"grad_norm": 0.4779655933380127,
"learning_rate": 0.00021379652605459055,
"loss": 0.1185,
"mean_token_accuracy": 0.9624211072921753,
"num_tokens": 6489432.0,
"step": 6370
},
{
"epoch": 1.4392059553349876,
"grad_norm": 0.5000889301300049,
"learning_rate": 0.00021366117753214526,
"loss": 0.1218,
"mean_token_accuracy": 0.9632967412471771,
"num_tokens": 6499626.0,
"step": 6380
},
{
"epoch": 1.4414617640424092,
"grad_norm": 0.8437454700469971,
"learning_rate": 0.00021352582900969997,
"loss": 0.1697,
"mean_token_accuracy": 0.9546212613582611,
"num_tokens": 6509837.0,
"step": 6390
},
{
"epoch": 1.4437175727498308,
"grad_norm": 0.4896928668022156,
"learning_rate": 0.00021339048048725468,
"loss": 0.1125,
"mean_token_accuracy": 0.967903059720993,
"num_tokens": 6520000.0,
"step": 6400
},
{
"epoch": 1.4459733814572524,
"grad_norm": 0.49815863370895386,
"learning_rate": 0.00021325513196480936,
"loss": 0.1267,
"mean_token_accuracy": 0.9628502309322358,
"num_tokens": 6530239.0,
"step": 6410
},
{
"epoch": 1.448229190164674,
"grad_norm": 0.6222156286239624,
"learning_rate": 0.00021311978344236407,
"loss": 0.1311,
"mean_token_accuracy": 0.9611857533454895,
"num_tokens": 6540447.0,
"step": 6420
},
{
"epoch": 1.4504849988720956,
"grad_norm": 0.9778371453285217,
"learning_rate": 0.00021298443491991878,
"loss": 0.1832,
"mean_token_accuracy": 0.9495638191699982,
"num_tokens": 6550622.0,
"step": 6430
},
{
"epoch": 1.4527408075795172,
"grad_norm": 0.6410164833068848,
"learning_rate": 0.0002128490863974735,
"loss": 0.175,
"mean_token_accuracy": 0.9525443017482758,
"num_tokens": 6560832.0,
"step": 6440
},
{
"epoch": 1.4549966162869388,
"grad_norm": 0.389616459608078,
"learning_rate": 0.00021271373787502818,
"loss": 0.1227,
"mean_token_accuracy": 0.9669749975204468,
"num_tokens": 6571005.0,
"step": 6450
},
{
"epoch": 1.4572524249943606,
"grad_norm": 0.8085306286811829,
"learning_rate": 0.00021257838935258289,
"loss": 0.1726,
"mean_token_accuracy": 0.9466955840587616,
"num_tokens": 6581166.0,
"step": 6460
},
{
"epoch": 1.459508233701782,
"grad_norm": 0.5888795256614685,
"learning_rate": 0.0002124430408301376,
"loss": 0.1511,
"mean_token_accuracy": 0.9562407910823822,
"num_tokens": 6591391.0,
"step": 6470
},
{
"epoch": 1.4617640424092038,
"grad_norm": 0.6471384763717651,
"learning_rate": 0.0002123076923076923,
"loss": 0.1102,
"mean_token_accuracy": 0.9670383930206299,
"num_tokens": 6601626.0,
"step": 6480
},
{
"epoch": 1.4640198511166254,
"grad_norm": 0.5917261242866516,
"learning_rate": 0.000212172343785247,
"loss": 0.135,
"mean_token_accuracy": 0.9612210988998413,
"num_tokens": 6611751.0,
"step": 6490
},
{
"epoch": 1.466275659824047,
"grad_norm": 0.704924464225769,
"learning_rate": 0.0002120369952628017,
"loss": 0.1177,
"mean_token_accuracy": 0.9678965330123901,
"num_tokens": 6621977.0,
"step": 6500
},
{
"epoch": 1.4685314685314685,
"grad_norm": 0.47947514057159424,
"learning_rate": 0.0002119016467403564,
"loss": 0.1495,
"mean_token_accuracy": 0.958800095319748,
"num_tokens": 6632164.0,
"step": 6510
},
{
"epoch": 1.4707872772388901,
"grad_norm": 0.7660108208656311,
"learning_rate": 0.00021176629821791112,
"loss": 0.1384,
"mean_token_accuracy": 0.9571514368057251,
"num_tokens": 6642397.0,
"step": 6520
},
{
"epoch": 1.4730430859463117,
"grad_norm": 0.5778849124908447,
"learning_rate": 0.0002116309496954658,
"loss": 0.1679,
"mean_token_accuracy": 0.9551712095737457,
"num_tokens": 6652607.0,
"step": 6530
},
{
"epoch": 1.4752988946537333,
"grad_norm": 0.6587995290756226,
"learning_rate": 0.0002114956011730205,
"loss": 0.1685,
"mean_token_accuracy": 0.9550755620002747,
"num_tokens": 6662770.0,
"step": 6540
},
{
"epoch": 1.477554703361155,
"grad_norm": 0.95082026720047,
"learning_rate": 0.00021136025265057522,
"loss": 0.1931,
"mean_token_accuracy": 0.9517137944698334,
"num_tokens": 6672969.0,
"step": 6550
},
{
"epoch": 1.4798105120685765,
"grad_norm": 0.4370115399360657,
"learning_rate": 0.00021122490412812993,
"loss": 0.1635,
"mean_token_accuracy": 0.9526604056358338,
"num_tokens": 6683167.0,
"step": 6560
},
{
"epoch": 1.4820663207759983,
"grad_norm": 0.4797029197216034,
"learning_rate": 0.00021108955560568464,
"loss": 0.1795,
"mean_token_accuracy": 0.9507135689258576,
"num_tokens": 6693384.0,
"step": 6570
},
{
"epoch": 1.4843221294834197,
"grad_norm": 0.9287253022193909,
"learning_rate": 0.00021095420708323932,
"loss": 0.1423,
"mean_token_accuracy": 0.960996150970459,
"num_tokens": 6703601.0,
"step": 6580
},
{
"epoch": 1.4865779381908415,
"grad_norm": 1.040830135345459,
"learning_rate": 0.00021081885856079403,
"loss": 0.1147,
"mean_token_accuracy": 0.9659351110458374,
"num_tokens": 6713774.0,
"step": 6590
},
{
"epoch": 1.488833746898263,
"grad_norm": 0.5492799282073975,
"learning_rate": 0.00021068351003834874,
"loss": 0.1376,
"mean_token_accuracy": 0.9622784972190856,
"num_tokens": 6723950.0,
"step": 6600
},
{
"epoch": 1.4910895556056847,
"grad_norm": 0.5340641736984253,
"learning_rate": 0.00021054816151590345,
"loss": 0.1377,
"mean_token_accuracy": 0.9588437557220459,
"num_tokens": 6734151.0,
"step": 6610
},
{
"epoch": 1.4933453643131063,
"grad_norm": 0.3860383629798889,
"learning_rate": 0.00021041281299345814,
"loss": 0.1291,
"mean_token_accuracy": 0.9634852707386017,
"num_tokens": 6744322.0,
"step": 6620
},
{
"epoch": 1.4956011730205279,
"grad_norm": 0.8424534201622009,
"learning_rate": 0.00021027746447101285,
"loss": 0.1228,
"mean_token_accuracy": 0.9646449565887452,
"num_tokens": 6754539.0,
"step": 6630
},
{
"epoch": 1.4978569817279495,
"grad_norm": 0.8719351887702942,
"learning_rate": 0.00021014211594856756,
"loss": 0.1232,
"mean_token_accuracy": 0.9655321180820465,
"num_tokens": 6764694.0,
"step": 6640
},
{
"epoch": 1.500112790435371,
"grad_norm": 0.4581547677516937,
"learning_rate": 0.00021000676742612227,
"loss": 0.1326,
"mean_token_accuracy": 0.9618205249309539,
"num_tokens": 6774840.0,
"step": 6650
},
{
"epoch": 1.5023685991427926,
"grad_norm": 0.8103078007698059,
"learning_rate": 0.00020987141890367695,
"loss": 0.1246,
"mean_token_accuracy": 0.9630819737911225,
"num_tokens": 6785066.0,
"step": 6660
},
{
"epoch": 1.5046244078502142,
"grad_norm": 0.6315047740936279,
"learning_rate": 0.00020973607038123166,
"loss": 0.1473,
"mean_token_accuracy": 0.9575309336185456,
"num_tokens": 6795252.0,
"step": 6670
},
{
"epoch": 1.506880216557636,
"grad_norm": 0.8702731132507324,
"learning_rate": 0.00020960072185878637,
"loss": 0.1364,
"mean_token_accuracy": 0.9622900664806366,
"num_tokens": 6805452.0,
"step": 6680
},
{
"epoch": 1.5091360252650574,
"grad_norm": 0.6531904339790344,
"learning_rate": 0.00020946537333634108,
"loss": 0.1476,
"mean_token_accuracy": 0.9573485612869262,
"num_tokens": 6815659.0,
"step": 6690
},
{
"epoch": 1.5113918339724792,
"grad_norm": 0.5445907115936279,
"learning_rate": 0.00020933002481389576,
"loss": 0.1645,
"mean_token_accuracy": 0.953666216135025,
"num_tokens": 6825881.0,
"step": 6700
},
{
"epoch": 1.5136476426799006,
"grad_norm": 0.823176920413971,
"learning_rate": 0.00020919467629145047,
"loss": 0.1168,
"mean_token_accuracy": 0.9674039006233215,
"num_tokens": 6836081.0,
"step": 6710
},
{
"epoch": 1.5159034513873224,
"grad_norm": 0.9127213954925537,
"learning_rate": 0.00020905932776900518,
"loss": 0.1511,
"mean_token_accuracy": 0.9592125058174134,
"num_tokens": 6846311.0,
"step": 6720
},
{
"epoch": 1.518159260094744,
"grad_norm": 0.5406370162963867,
"learning_rate": 0.0002089239792465599,
"loss": 0.1493,
"mean_token_accuracy": 0.9566941797733307,
"num_tokens": 6856531.0,
"step": 6730
},
{
"epoch": 1.5204150688021656,
"grad_norm": 0.5357968807220459,
"learning_rate": 0.0002087886307241146,
"loss": 0.1742,
"mean_token_accuracy": 0.950842946767807,
"num_tokens": 6866646.0,
"step": 6740
},
{
"epoch": 1.5226708775095872,
"grad_norm": 0.3849286437034607,
"learning_rate": 0.00020865328220166928,
"loss": 0.1016,
"mean_token_accuracy": 0.9700107276439667,
"num_tokens": 6876800.0,
"step": 6750
},
{
"epoch": 1.5249266862170088,
"grad_norm": 0.6728788614273071,
"learning_rate": 0.000208517933679224,
"loss": 0.1458,
"mean_token_accuracy": 0.9588227331638336,
"num_tokens": 6886918.0,
"step": 6760
},
{
"epoch": 1.5271824949244304,
"grad_norm": 1.1982979774475098,
"learning_rate": 0.0002083825851567787,
"loss": 0.1186,
"mean_token_accuracy": 0.9652492642402649,
"num_tokens": 6897103.0,
"step": 6770
},
{
"epoch": 1.529438303631852,
"grad_norm": 0.6889848113059998,
"learning_rate": 0.0002082472366343334,
"loss": 0.1214,
"mean_token_accuracy": 0.9651449739933013,
"num_tokens": 6907282.0,
"step": 6780
},
{
"epoch": 1.5316941123392738,
"grad_norm": 0.5323916077613831,
"learning_rate": 0.00020811188811188807,
"loss": 0.117,
"mean_token_accuracy": 0.9649187743663787,
"num_tokens": 6917456.0,
"step": 6790
},
{
"epoch": 1.5339499210466951,
"grad_norm": 0.41671881079673767,
"learning_rate": 0.0002079765395894428,
"loss": 0.1259,
"mean_token_accuracy": 0.9648886620998383,
"num_tokens": 6927691.0,
"step": 6800
},
{
"epoch": 1.536205729754117,
"grad_norm": 0.7938935160636902,
"learning_rate": 0.00020784119106699751,
"loss": 0.1343,
"mean_token_accuracy": 0.9606435716152191,
"num_tokens": 6937915.0,
"step": 6810
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.6101523637771606,
"learning_rate": 0.00020770584254455222,
"loss": 0.1521,
"mean_token_accuracy": 0.9544383645057678,
"num_tokens": 6948128.0,
"step": 6820
},
{
"epoch": 1.5407173471689601,
"grad_norm": 0.46042972803115845,
"learning_rate": 0.00020757049402210688,
"loss": 0.1374,
"mean_token_accuracy": 0.9600911319255829,
"num_tokens": 6958310.0,
"step": 6830
},
{
"epoch": 1.5429731558763817,
"grad_norm": 0.47602182626724243,
"learning_rate": 0.00020743514549966162,
"loss": 0.162,
"mean_token_accuracy": 0.954825884103775,
"num_tokens": 6968547.0,
"step": 6840
},
{
"epoch": 1.5452289645838033,
"grad_norm": 1.1721361875534058,
"learning_rate": 0.00020729979697721633,
"loss": 0.141,
"mean_token_accuracy": 0.9617628633975983,
"num_tokens": 6978722.0,
"step": 6850
},
{
"epoch": 1.547484773291225,
"grad_norm": 0.7760915160179138,
"learning_rate": 0.00020716444845477104,
"loss": 0.1268,
"mean_token_accuracy": 0.9629230618476867,
"num_tokens": 6988914.0,
"step": 6860
},
{
"epoch": 1.5497405819986465,
"grad_norm": 0.6691769957542419,
"learning_rate": 0.00020702909993232575,
"loss": 0.1492,
"mean_token_accuracy": 0.9562802612781525,
"num_tokens": 6999110.0,
"step": 6870
},
{
"epoch": 1.551996390706068,
"grad_norm": 0.565916895866394,
"learning_rate": 0.0002068937514098804,
"loss": 0.1294,
"mean_token_accuracy": 0.9653188228607178,
"num_tokens": 7009304.0,
"step": 6880
},
{
"epoch": 1.5542521994134897,
"grad_norm": 0.6766910552978516,
"learning_rate": 0.00020675840288743514,
"loss": 0.1093,
"mean_token_accuracy": 0.9655160486698151,
"num_tokens": 7019515.0,
"step": 6890
},
{
"epoch": 1.5565080081209115,
"grad_norm": 0.6299457550048828,
"learning_rate": 0.00020662305436498985,
"loss": 0.1523,
"mean_token_accuracy": 0.9556167304515839,
"num_tokens": 7029604.0,
"step": 6900
},
{
"epoch": 1.5587638168283329,
"grad_norm": 0.8913159370422363,
"learning_rate": 0.00020648770584254456,
"loss": 0.1272,
"mean_token_accuracy": 0.9642847359180451,
"num_tokens": 7039834.0,
"step": 6910
},
{
"epoch": 1.5610196255357547,
"grad_norm": 0.7572903037071228,
"learning_rate": 0.00020635235732009921,
"loss": 0.1319,
"mean_token_accuracy": 0.9626257240772247,
"num_tokens": 7049997.0,
"step": 6920
},
{
"epoch": 1.563275434243176,
"grad_norm": 0.5362765789031982,
"learning_rate": 0.00020621700879765392,
"loss": 0.1401,
"mean_token_accuracy": 0.9602336525917053,
"num_tokens": 7060185.0,
"step": 6930
},
{
"epoch": 1.5655312429505979,
"grad_norm": 0.48010891675949097,
"learning_rate": 0.00020608166027520866,
"loss": 0.1291,
"mean_token_accuracy": 0.9639032125473023,
"num_tokens": 7070418.0,
"step": 6940
},
{
"epoch": 1.5677870516580192,
"grad_norm": 0.5274568200111389,
"learning_rate": 0.00020594631175276337,
"loss": 0.1453,
"mean_token_accuracy": 0.961913114786148,
"num_tokens": 7080638.0,
"step": 6950
},
{
"epoch": 1.570042860365441,
"grad_norm": 0.7161872386932373,
"learning_rate": 0.00020581096323031803,
"loss": 0.1229,
"mean_token_accuracy": 0.966279947757721,
"num_tokens": 7090667.0,
"step": 6960
},
{
"epoch": 1.5722986690728626,
"grad_norm": 0.6061383485794067,
"learning_rate": 0.00020567561470787274,
"loss": 0.1413,
"mean_token_accuracy": 0.958649742603302,
"num_tokens": 7100841.0,
"step": 6970
},
{
"epoch": 1.5745544777802842,
"grad_norm": 0.8167970180511475,
"learning_rate": 0.00020554026618542745,
"loss": 0.1267,
"mean_token_accuracy": 0.9638277113437652,
"num_tokens": 7111056.0,
"step": 6980
},
{
"epoch": 1.5768102864877058,
"grad_norm": 0.6468409895896912,
"learning_rate": 0.00020540491766298218,
"loss": 0.1179,
"mean_token_accuracy": 0.9672865152359009,
"num_tokens": 7121242.0,
"step": 6990
},
{
"epoch": 1.5790660951951274,
"grad_norm": 0.7238821983337402,
"learning_rate": 0.00020526956914053684,
"loss": 0.1433,
"mean_token_accuracy": 0.9589922726154327,
"num_tokens": 7131470.0,
"step": 7000
},
{
"epoch": 1.581321903902549,
"grad_norm": 0.5528222322463989,
"learning_rate": 0.00020513422061809155,
"loss": 0.1357,
"mean_token_accuracy": 0.9585762560367584,
"num_tokens": 7141689.0,
"step": 7010
},
{
"epoch": 1.5835777126099706,
"grad_norm": 0.5497334003448486,
"learning_rate": 0.00020499887209564626,
"loss": 0.1169,
"mean_token_accuracy": 0.9675468802452087,
"num_tokens": 7151896.0,
"step": 7020
},
{
"epoch": 1.5858335213173924,
"grad_norm": 0.6677653193473816,
"learning_rate": 0.000204863523573201,
"loss": 0.1397,
"mean_token_accuracy": 0.961426842212677,
"num_tokens": 7162088.0,
"step": 7030
},
{
"epoch": 1.5880893300248138,
"grad_norm": 0.8899350762367249,
"learning_rate": 0.0002047281750507557,
"loss": 0.1373,
"mean_token_accuracy": 0.9617144644260407,
"num_tokens": 7172269.0,
"step": 7040
},
{
"epoch": 1.5903451387322356,
"grad_norm": 0.5083749890327454,
"learning_rate": 0.00020459282652831036,
"loss": 0.1105,
"mean_token_accuracy": 0.9664644658565521,
"num_tokens": 7182507.0,
"step": 7050
},
{
"epoch": 1.592600947439657,
"grad_norm": 0.4294250011444092,
"learning_rate": 0.00020445747800586507,
"loss": 0.1074,
"mean_token_accuracy": 0.9671262919902801,
"num_tokens": 7192719.0,
"step": 7060
},
{
"epoch": 1.5948567561470788,
"grad_norm": 0.46395134925842285,
"learning_rate": 0.00020432212948341978,
"loss": 0.1865,
"mean_token_accuracy": 0.9541430771350861,
"num_tokens": 7202885.0,
"step": 7070
},
{
"epoch": 1.5971125648545004,
"grad_norm": 0.6415492296218872,
"learning_rate": 0.00020418678096097452,
"loss": 0.1065,
"mean_token_accuracy": 0.9670535027980804,
"num_tokens": 7213086.0,
"step": 7080
},
{
"epoch": 1.599368373561922,
"grad_norm": 0.4826420247554779,
"learning_rate": 0.00020405143243852917,
"loss": 0.1314,
"mean_token_accuracy": 0.9614757120609283,
"num_tokens": 7223238.0,
"step": 7090
},
{
"epoch": 1.6016241822693436,
"grad_norm": 0.8876851201057434,
"learning_rate": 0.00020391608391608388,
"loss": 0.1365,
"mean_token_accuracy": 0.9598813712596893,
"num_tokens": 7233407.0,
"step": 7100
},
{
"epoch": 1.6038799909767651,
"grad_norm": 0.8092009425163269,
"learning_rate": 0.0002037807353936386,
"loss": 0.1901,
"mean_token_accuracy": 0.9505953013896942,
"num_tokens": 7243542.0,
"step": 7110
},
{
"epoch": 1.6061357996841867,
"grad_norm": 0.5989572405815125,
"learning_rate": 0.0002036453868711933,
"loss": 0.1304,
"mean_token_accuracy": 0.9629263341426849,
"num_tokens": 7253772.0,
"step": 7120
},
{
"epoch": 1.6083916083916083,
"grad_norm": 0.832798421382904,
"learning_rate": 0.00020351003834874799,
"loss": 0.0883,
"mean_token_accuracy": 0.9736064672470093,
"num_tokens": 7263938.0,
"step": 7130
},
{
"epoch": 1.6106474170990301,
"grad_norm": 0.7204803228378296,
"learning_rate": 0.0002033746898263027,
"loss": 0.114,
"mean_token_accuracy": 0.9653593361377716,
"num_tokens": 7274164.0,
"step": 7140
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.5595267415046692,
"learning_rate": 0.0002032393413038574,
"loss": 0.1233,
"mean_token_accuracy": 0.9639688670635224,
"num_tokens": 7284362.0,
"step": 7150
},
{
"epoch": 1.6151590345138733,
"grad_norm": 0.4636051654815674,
"learning_rate": 0.00020310399278141212,
"loss": 0.1573,
"mean_token_accuracy": 0.9560793101787567,
"num_tokens": 7294517.0,
"step": 7160
},
{
"epoch": 1.6174148432212947,
"grad_norm": 0.5867611169815063,
"learning_rate": 0.0002029686442589668,
"loss": 0.1055,
"mean_token_accuracy": 0.967164421081543,
"num_tokens": 7304698.0,
"step": 7170
},
{
"epoch": 1.6196706519287165,
"grad_norm": 0.5442803502082825,
"learning_rate": 0.0002028332957365215,
"loss": 0.2436,
"mean_token_accuracy": 0.9414969682693481,
"num_tokens": 7314848.0,
"step": 7180
},
{
"epoch": 1.621926460636138,
"grad_norm": 1.0676794052124023,
"learning_rate": 0.00020269794721407622,
"loss": 0.129,
"mean_token_accuracy": 0.965214467048645,
"num_tokens": 7325069.0,
"step": 7190
},
{
"epoch": 1.6241822693435597,
"grad_norm": 0.5160236954689026,
"learning_rate": 0.00020256259869163093,
"loss": 0.1241,
"mean_token_accuracy": 0.9648967385292053,
"num_tokens": 7335222.0,
"step": 7200
},
{
"epoch": 1.6264380780509813,
"grad_norm": 0.6989278197288513,
"learning_rate": 0.00020242725016918564,
"loss": 0.1307,
"mean_token_accuracy": 0.9661418199539185,
"num_tokens": 7345433.0,
"step": 7210
},
{
"epoch": 1.6286938867584029,
"grad_norm": 0.6635163426399231,
"learning_rate": 0.00020229190164674032,
"loss": 0.1211,
"mean_token_accuracy": 0.963777381181717,
"num_tokens": 7355635.0,
"step": 7220
},
{
"epoch": 1.6309496954658245,
"grad_norm": 0.763543963432312,
"learning_rate": 0.00020215655312429503,
"loss": 0.1534,
"mean_token_accuracy": 0.9606209695339203,
"num_tokens": 7365833.0,
"step": 7230
},
{
"epoch": 1.633205504173246,
"grad_norm": 0.5316635966300964,
"learning_rate": 0.00020202120460184974,
"loss": 0.1378,
"mean_token_accuracy": 0.9582312524318695,
"num_tokens": 7376065.0,
"step": 7240
},
{
"epoch": 1.6354613128806679,
"grad_norm": 0.683691680431366,
"learning_rate": 0.00020188585607940445,
"loss": 0.0918,
"mean_token_accuracy": 0.9717151343822479,
"num_tokens": 7386250.0,
"step": 7250
},
{
"epoch": 1.6377171215880892,
"grad_norm": 0.584567129611969,
"learning_rate": 0.00020175050755695913,
"loss": 0.1201,
"mean_token_accuracy": 0.9670240402221679,
"num_tokens": 7396448.0,
"step": 7260
},
{
"epoch": 1.639972930295511,
"grad_norm": 0.34023529291152954,
"learning_rate": 0.00020161515903451384,
"loss": 0.1148,
"mean_token_accuracy": 0.9664259791374207,
"num_tokens": 7406629.0,
"step": 7270
},
{
"epoch": 1.6422287390029324,
"grad_norm": 0.6313138008117676,
"learning_rate": 0.00020147981051206855,
"loss": 0.1484,
"mean_token_accuracy": 0.9585749089717865,
"num_tokens": 7416830.0,
"step": 7280
},
{
"epoch": 1.6444845477103542,
"grad_norm": 0.7951473593711853,
"learning_rate": 0.00020134446198962326,
"loss": 0.1256,
"mean_token_accuracy": 0.9634339690208436,
"num_tokens": 7427068.0,
"step": 7290
},
{
"epoch": 1.6467403564177756,
"grad_norm": 0.7924121618270874,
"learning_rate": 0.00020120911346717795,
"loss": 0.1771,
"mean_token_accuracy": 0.9546219527721405,
"num_tokens": 7437275.0,
"step": 7300
},
{
"epoch": 1.6489961651251974,
"grad_norm": 0.3072904944419861,
"learning_rate": 0.00020107376494473266,
"loss": 0.1016,
"mean_token_accuracy": 0.9672701954841614,
"num_tokens": 7447492.0,
"step": 7310
},
{
"epoch": 1.651251973832619,
"grad_norm": 0.46338242292404175,
"learning_rate": 0.00020093841642228737,
"loss": 0.1036,
"mean_token_accuracy": 0.968935513496399,
"num_tokens": 7457654.0,
"step": 7320
},
{
"epoch": 1.6535077825400406,
"grad_norm": 0.8118281960487366,
"learning_rate": 0.00020080306789984208,
"loss": 0.1108,
"mean_token_accuracy": 0.9685357689857483,
"num_tokens": 7467849.0,
"step": 7330
},
{
"epoch": 1.6557635912474622,
"grad_norm": 0.7755963802337646,
"learning_rate": 0.00020066771937739676,
"loss": 0.1259,
"mean_token_accuracy": 0.9649538099765778,
"num_tokens": 7477315.0,
"step": 7340
},
{
"epoch": 1.6580193999548838,
"grad_norm": 0.5522985458374023,
"learning_rate": 0.00020053237085495147,
"loss": 0.1309,
"mean_token_accuracy": 0.9658150196075439,
"num_tokens": 7487505.0,
"step": 7350
},
{
"epoch": 1.6602752086623056,
"grad_norm": 0.5838783383369446,
"learning_rate": 0.00020039702233250618,
"loss": 0.1244,
"mean_token_accuracy": 0.9633826553821564,
"num_tokens": 7497725.0,
"step": 7360
},
{
"epoch": 1.662531017369727,
"grad_norm": 0.9119488596916199,
"learning_rate": 0.0002002616738100609,
"loss": 0.0837,
"mean_token_accuracy": 0.9742124974727631,
"num_tokens": 7507941.0,
"step": 7370
},
{
"epoch": 1.6647868260771488,
"grad_norm": 1.0427502393722534,
"learning_rate": 0.0002001263252876156,
"loss": 0.1457,
"mean_token_accuracy": 0.9598093390464782,
"num_tokens": 7518134.0,
"step": 7380
},
{
"epoch": 1.6670426347845702,
"grad_norm": 0.8163001537322998,
"learning_rate": 0.00019999097676517028,
"loss": 0.1522,
"mean_token_accuracy": 0.9592224955558777,
"num_tokens": 7528369.0,
"step": 7390
},
{
"epoch": 1.669298443491992,
"grad_norm": 0.37992551922798157,
"learning_rate": 0.000199855628242725,
"loss": 0.1174,
"mean_token_accuracy": 0.9663581132888794,
"num_tokens": 7538476.0,
"step": 7400
},
{
"epoch": 1.6715542521994133,
"grad_norm": 0.7863211035728455,
"learning_rate": 0.0001997202797202797,
"loss": 0.1304,
"mean_token_accuracy": 0.9637087225914002,
"num_tokens": 7548657.0,
"step": 7410
},
{
"epoch": 1.6738100609068352,
"grad_norm": 0.6097214818000793,
"learning_rate": 0.0001995849311978344,
"loss": 0.1527,
"mean_token_accuracy": 0.9590888619422913,
"num_tokens": 7558840.0,
"step": 7420
},
{
"epoch": 1.6760658696142567,
"grad_norm": 0.44298750162124634,
"learning_rate": 0.0001994495826753891,
"loss": 0.1137,
"mean_token_accuracy": 0.9671386659145356,
"num_tokens": 7569072.0,
"step": 7430
},
{
"epoch": 1.6783216783216783,
"grad_norm": 0.7080332636833191,
"learning_rate": 0.0001993142341529438,
"loss": 0.1486,
"mean_token_accuracy": 0.9594590961933136,
"num_tokens": 7579245.0,
"step": 7440
},
{
"epoch": 1.6805774870291,
"grad_norm": 0.4493122100830078,
"learning_rate": 0.0001991788856304985,
"loss": 0.1082,
"mean_token_accuracy": 0.9686892807483674,
"num_tokens": 7589468.0,
"step": 7450
},
{
"epoch": 1.6828332957365215,
"grad_norm": 1.414336919784546,
"learning_rate": 0.00019904353710805322,
"loss": 0.1421,
"mean_token_accuracy": 0.9614466667175293,
"num_tokens": 7599607.0,
"step": 7460
},
{
"epoch": 1.685089104443943,
"grad_norm": 0.7419303059577942,
"learning_rate": 0.0001989081885856079,
"loss": 0.1083,
"mean_token_accuracy": 0.9705542147159576,
"num_tokens": 7609836.0,
"step": 7470
},
{
"epoch": 1.6873449131513647,
"grad_norm": 0.7655690908432007,
"learning_rate": 0.00019877284006316261,
"loss": 0.1705,
"mean_token_accuracy": 0.9575229167938233,
"num_tokens": 7620039.0,
"step": 7480
},
{
"epoch": 1.6896007218587865,
"grad_norm": 0.6142362356185913,
"learning_rate": 0.00019863749154071732,
"loss": 0.1352,
"mean_token_accuracy": 0.9614336788654327,
"num_tokens": 7630163.0,
"step": 7490
},
{
"epoch": 1.6918565305662079,
"grad_norm": 0.4549749791622162,
"learning_rate": 0.00019850214301827203,
"loss": 0.1184,
"mean_token_accuracy": 0.9650885164737701,
"num_tokens": 7640378.0,
"step": 7500
},
{
"epoch": 1.6941123392736297,
"grad_norm": 0.7649783492088318,
"learning_rate": 0.00019836679449582674,
"loss": 0.1125,
"mean_token_accuracy": 0.9654013931751251,
"num_tokens": 7650518.0,
"step": 7510
},
{
"epoch": 1.696368147981051,
"grad_norm": 0.8831228613853455,
"learning_rate": 0.00019823144597338143,
"loss": 0.1224,
"mean_token_accuracy": 0.9647625327110291,
"num_tokens": 7660716.0,
"step": 7520
},
{
"epoch": 1.6986239566884729,
"grad_norm": 0.5036336779594421,
"learning_rate": 0.00019809609745093614,
"loss": 0.1187,
"mean_token_accuracy": 0.9658556759357453,
"num_tokens": 7670880.0,
"step": 7530
},
{
"epoch": 1.7008797653958945,
"grad_norm": 0.909724235534668,
"learning_rate": 0.00019796074892849085,
"loss": 0.1226,
"mean_token_accuracy": 0.9632156550884247,
"num_tokens": 7681090.0,
"step": 7540
},
{
"epoch": 1.703135574103316,
"grad_norm": 0.35965245962142944,
"learning_rate": 0.00019782540040604556,
"loss": 0.1177,
"mean_token_accuracy": 0.964445275068283,
"num_tokens": 7691232.0,
"step": 7550
},
{
"epoch": 1.7053913828107377,
"grad_norm": 0.5753873586654663,
"learning_rate": 0.00019769005188360024,
"loss": 0.1234,
"mean_token_accuracy": 0.9669915854930877,
"num_tokens": 7701429.0,
"step": 7560
},
{
"epoch": 1.7076471915181592,
"grad_norm": 0.4937607944011688,
"learning_rate": 0.00019755470336115495,
"loss": 0.1287,
"mean_token_accuracy": 0.963130658864975,
"num_tokens": 7711663.0,
"step": 7570
},
{
"epoch": 1.7099030002255808,
"grad_norm": 0.48959338665008545,
"learning_rate": 0.00019741935483870966,
"loss": 0.0879,
"mean_token_accuracy": 0.9724132180213928,
"num_tokens": 7721895.0,
"step": 7580
},
{
"epoch": 1.7121588089330024,
"grad_norm": 0.7161306738853455,
"learning_rate": 0.00019728400631626437,
"loss": 0.1306,
"mean_token_accuracy": 0.9645233631134034,
"num_tokens": 7732066.0,
"step": 7590
},
{
"epoch": 1.7144146176404242,
"grad_norm": 0.9491952061653137,
"learning_rate": 0.00019714865779381905,
"loss": 0.1113,
"mean_token_accuracy": 0.9660575449466705,
"num_tokens": 7742256.0,
"step": 7600
},
{
"epoch": 1.7166704263478456,
"grad_norm": 0.7226278185844421,
"learning_rate": 0.00019701330927137376,
"loss": 0.0989,
"mean_token_accuracy": 0.9679902136325836,
"num_tokens": 7752469.0,
"step": 7610
},
{
"epoch": 1.7189262350552674,
"grad_norm": 0.3322307765483856,
"learning_rate": 0.00019687796074892847,
"loss": 0.1009,
"mean_token_accuracy": 0.970562607049942,
"num_tokens": 7762644.0,
"step": 7620
},
{
"epoch": 1.7211820437626888,
"grad_norm": 0.629764199256897,
"learning_rate": 0.00019674261222648318,
"loss": 0.0939,
"mean_token_accuracy": 0.9733090102672577,
"num_tokens": 7772882.0,
"step": 7630
},
{
"epoch": 1.7234378524701106,
"grad_norm": 0.4462561011314392,
"learning_rate": 0.00019660726370403786,
"loss": 0.0984,
"mean_token_accuracy": 0.9717909157276153,
"num_tokens": 7783096.0,
"step": 7640
},
{
"epoch": 1.725693661177532,
"grad_norm": 0.504454493522644,
"learning_rate": 0.00019647191518159257,
"loss": 0.0827,
"mean_token_accuracy": 0.9726765751838684,
"num_tokens": 7793293.0,
"step": 7650
},
{
"epoch": 1.7279494698849538,
"grad_norm": 0.6186042428016663,
"learning_rate": 0.00019633656665914728,
"loss": 0.0995,
"mean_token_accuracy": 0.9694512605667114,
"num_tokens": 7803513.0,
"step": 7660
},
{
"epoch": 1.7302052785923754,
"grad_norm": 0.6722903251647949,
"learning_rate": 0.000196201218136702,
"loss": 0.1414,
"mean_token_accuracy": 0.9589116036891937,
"num_tokens": 7813743.0,
"step": 7670
},
{
"epoch": 1.732461087299797,
"grad_norm": 0.818533182144165,
"learning_rate": 0.0001960658696142567,
"loss": 0.1103,
"mean_token_accuracy": 0.9665028691291809,
"num_tokens": 7823182.0,
"step": 7680
},
{
"epoch": 1.7347168960072186,
"grad_norm": 0.43735024333000183,
"learning_rate": 0.0001959305210918114,
"loss": 0.0965,
"mean_token_accuracy": 0.969660896062851,
"num_tokens": 7833362.0,
"step": 7690
},
{
"epoch": 1.7369727047146402,
"grad_norm": 0.8122400045394897,
"learning_rate": 0.0001957951725693661,
"loss": 0.1217,
"mean_token_accuracy": 0.9671069800853729,
"num_tokens": 7843548.0,
"step": 7700
},
{
"epoch": 1.739228513422062,
"grad_norm": 0.9384815692901611,
"learning_rate": 0.0001956598240469208,
"loss": 0.1085,
"mean_token_accuracy": 0.9692832350730896,
"num_tokens": 7853778.0,
"step": 7710
},
{
"epoch": 1.7414843221294833,
"grad_norm": 0.6304382085800171,
"learning_rate": 0.00019552447552447552,
"loss": 0.1068,
"mean_token_accuracy": 0.9693370401859284,
"num_tokens": 7863983.0,
"step": 7720
},
{
"epoch": 1.7437401308369052,
"grad_norm": 0.6020340323448181,
"learning_rate": 0.0001953891270020302,
"loss": 0.0795,
"mean_token_accuracy": 0.9766138076782227,
"num_tokens": 7874213.0,
"step": 7730
},
{
"epoch": 1.7459959395443265,
"grad_norm": 0.6551034450531006,
"learning_rate": 0.0001952537784795849,
"loss": 0.094,
"mean_token_accuracy": 0.9719179630279541,
"num_tokens": 7884443.0,
"step": 7740
},
{
"epoch": 1.7482517482517483,
"grad_norm": 0.9742296934127808,
"learning_rate": 0.00019511842995713962,
"loss": 0.1558,
"mean_token_accuracy": 0.9584499776363373,
"num_tokens": 7894597.0,
"step": 7750
},
{
"epoch": 1.7505075569591697,
"grad_norm": 0.6788113117218018,
"learning_rate": 0.00019498308143469433,
"loss": 0.1265,
"mean_token_accuracy": 0.9651906430721283,
"num_tokens": 7904692.0,
"step": 7760
},
{
"epoch": 1.7527633656665915,
"grad_norm": 0.8864907026290894,
"learning_rate": 0.000194847732912249,
"loss": 0.1254,
"mean_token_accuracy": 0.9674174845218658,
"num_tokens": 7914807.0,
"step": 7770
},
{
"epoch": 1.7550191743740131,
"grad_norm": 0.9627271294593811,
"learning_rate": 0.00019471238438980372,
"loss": 0.0996,
"mean_token_accuracy": 0.9694191575050354,
"num_tokens": 7925013.0,
"step": 7780
},
{
"epoch": 1.7572749830814347,
"grad_norm": 0.5663970708847046,
"learning_rate": 0.00019457703586735843,
"loss": 0.1393,
"mean_token_accuracy": 0.9625206768512726,
"num_tokens": 7935221.0,
"step": 7790
},
{
"epoch": 1.7595307917888563,
"grad_norm": 0.506974995136261,
"learning_rate": 0.00019444168734491314,
"loss": 0.1484,
"mean_token_accuracy": 0.9573349118232727,
"num_tokens": 7945383.0,
"step": 7800
},
{
"epoch": 1.7617866004962779,
"grad_norm": 0.3117770254611969,
"learning_rate": 0.00019430633882246782,
"loss": 0.1104,
"mean_token_accuracy": 0.9667424440383912,
"num_tokens": 7955526.0,
"step": 7810
},
{
"epoch": 1.7640424092036995,
"grad_norm": 0.7051275968551636,
"learning_rate": 0.00019417099030002253,
"loss": 0.1049,
"mean_token_accuracy": 0.9673680782318115,
"num_tokens": 7965740.0,
"step": 7820
},
{
"epoch": 1.766298217911121,
"grad_norm": 0.6643967032432556,
"learning_rate": 0.00019403564177757724,
"loss": 0.1313,
"mean_token_accuracy": 0.9644637405872345,
"num_tokens": 7975965.0,
"step": 7830
},
{
"epoch": 1.7685540266185429,
"grad_norm": 0.8619371652603149,
"learning_rate": 0.00019390029325513195,
"loss": 0.1165,
"mean_token_accuracy": 0.9674583613872528,
"num_tokens": 7986109.0,
"step": 7840
},
{
"epoch": 1.7708098353259643,
"grad_norm": 0.7922738790512085,
"learning_rate": 0.00019376494473268666,
"loss": 0.1456,
"mean_token_accuracy": 0.9613803088665008,
"num_tokens": 7996242.0,
"step": 7850
},
{
"epoch": 1.773065644033386,
"grad_norm": 0.49722597002983093,
"learning_rate": 0.00019362959621024135,
"loss": 0.1501,
"mean_token_accuracy": 0.9591331839561462,
"num_tokens": 8006480.0,
"step": 7860
},
{
"epoch": 1.7753214527408074,
"grad_norm": 0.797990083694458,
"learning_rate": 0.00019349424768779606,
"loss": 0.1185,
"mean_token_accuracy": 0.9682628512382507,
"num_tokens": 8016688.0,
"step": 7870
},
{
"epoch": 1.7775772614482293,
"grad_norm": 0.6355772614479065,
"learning_rate": 0.00019335889916535077,
"loss": 0.1186,
"mean_token_accuracy": 0.965477454662323,
"num_tokens": 8026854.0,
"step": 7880
},
{
"epoch": 1.7798330701556508,
"grad_norm": 0.4601174294948578,
"learning_rate": 0.00019322355064290548,
"loss": 0.1037,
"mean_token_accuracy": 0.969682228565216,
"num_tokens": 8037088.0,
"step": 7890
},
{
"epoch": 1.7820888788630724,
"grad_norm": 0.6090744733810425,
"learning_rate": 0.00019308820212046016,
"loss": 0.0997,
"mean_token_accuracy": 0.9699548482894897,
"num_tokens": 8047280.0,
"step": 7900
},
{
"epoch": 1.784344687570494,
"grad_norm": 0.6074991226196289,
"learning_rate": 0.00019295285359801487,
"loss": 0.1002,
"mean_token_accuracy": 0.9707536101341248,
"num_tokens": 8057474.0,
"step": 7910
},
{
"epoch": 1.7866004962779156,
"grad_norm": 0.4148198068141937,
"learning_rate": 0.00019281750507556958,
"loss": 0.106,
"mean_token_accuracy": 0.9689879715442657,
"num_tokens": 8067611.0,
"step": 7920
},
{
"epoch": 1.7888563049853372,
"grad_norm": 0.6076057553291321,
"learning_rate": 0.0001926821565531243,
"loss": 0.1021,
"mean_token_accuracy": 0.9703522503376008,
"num_tokens": 8077802.0,
"step": 7930
},
{
"epoch": 1.7911121136927588,
"grad_norm": 1.0338988304138184,
"learning_rate": 0.00019254680803067897,
"loss": 0.1797,
"mean_token_accuracy": 0.9537634730339051,
"num_tokens": 8087956.0,
"step": 7940
},
{
"epoch": 1.7933679224001806,
"grad_norm": 0.5177671909332275,
"learning_rate": 0.00019241145950823368,
"loss": 0.094,
"mean_token_accuracy": 0.9720286428928375,
"num_tokens": 8098191.0,
"step": 7950
},
{
"epoch": 1.795623731107602,
"grad_norm": 0.6910755038261414,
"learning_rate": 0.0001922761109857884,
"loss": 0.1018,
"mean_token_accuracy": 0.9682543575763702,
"num_tokens": 8108272.0,
"step": 7960
},
{
"epoch": 1.7978795398150238,
"grad_norm": 0.6151471138000488,
"learning_rate": 0.0001921407624633431,
"loss": 0.1203,
"mean_token_accuracy": 0.9646850287914276,
"num_tokens": 8118449.0,
"step": 7970
},
{
"epoch": 1.8001353485224452,
"grad_norm": 0.8103246092796326,
"learning_rate": 0.00019200541394089778,
"loss": 0.1128,
"mean_token_accuracy": 0.9669484615325927,
"num_tokens": 8128647.0,
"step": 7980
},
{
"epoch": 1.802391157229867,
"grad_norm": 0.6422575116157532,
"learning_rate": 0.0001918700654184525,
"loss": 0.0805,
"mean_token_accuracy": 0.9736741304397583,
"num_tokens": 8138873.0,
"step": 7990
},
{
"epoch": 1.8046469659372886,
"grad_norm": 0.5007938742637634,
"learning_rate": 0.0001917347168960072,
"loss": 0.1595,
"mean_token_accuracy": 0.9583836376667023,
"num_tokens": 8149003.0,
"step": 8000
},
{
"epoch": 1.8069027746447102,
"grad_norm": 0.8241115808486938,
"learning_rate": 0.0001915993683735619,
"loss": 0.1099,
"mean_token_accuracy": 0.9675259053707123,
"num_tokens": 8159211.0,
"step": 8010
},
{
"epoch": 1.8091585833521318,
"grad_norm": 0.9143343567848206,
"learning_rate": 0.00019146401985111662,
"loss": 0.1099,
"mean_token_accuracy": 0.9669674336910248,
"num_tokens": 8169432.0,
"step": 8020
},
{
"epoch": 1.8114143920595533,
"grad_norm": 0.9340387582778931,
"learning_rate": 0.0001913286713286713,
"loss": 0.1322,
"mean_token_accuracy": 0.9651595711708069,
"num_tokens": 8179601.0,
"step": 8030
},
{
"epoch": 1.813670200766975,
"grad_norm": 1.0919502973556519,
"learning_rate": 0.00019119332280622601,
"loss": 0.157,
"mean_token_accuracy": 0.95865718126297,
"num_tokens": 8189813.0,
"step": 8040
},
{
"epoch": 1.8159260094743965,
"grad_norm": 0.9550254940986633,
"learning_rate": 0.00019105797428378072,
"loss": 0.1022,
"mean_token_accuracy": 0.9707346975803375,
"num_tokens": 8200037.0,
"step": 8050
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.6538637280464172,
"learning_rate": 0.00019092262576133543,
"loss": 0.1331,
"mean_token_accuracy": 0.9647262156009674,
"num_tokens": 8210241.0,
"step": 8060
},
{
"epoch": 1.8204376268892397,
"grad_norm": 0.5565653443336487,
"learning_rate": 0.00019078727723889012,
"loss": 0.1207,
"mean_token_accuracy": 0.9637064814567566,
"num_tokens": 8220471.0,
"step": 8070
},
{
"epoch": 1.8226934355966615,
"grad_norm": 0.804456889629364,
"learning_rate": 0.00019065192871644483,
"loss": 0.1091,
"mean_token_accuracy": 0.9688849687576294,
"num_tokens": 8230699.0,
"step": 8080
},
{
"epoch": 1.824949244304083,
"grad_norm": 0.5959724187850952,
"learning_rate": 0.00019051658019399954,
"loss": 0.1176,
"mean_token_accuracy": 0.9674373865127563,
"num_tokens": 8240936.0,
"step": 8090
},
{
"epoch": 1.8272050530115047,
"grad_norm": 0.671328067779541,
"learning_rate": 0.00019038123167155425,
"loss": 0.1,
"mean_token_accuracy": 0.9702531158924103,
"num_tokens": 8251138.0,
"step": 8100
},
{
"epoch": 1.829460861718926,
"grad_norm": 0.5372695922851562,
"learning_rate": 0.00019024588314910893,
"loss": 0.1265,
"mean_token_accuracy": 0.9620508432388306,
"num_tokens": 8261303.0,
"step": 8110
},
{
"epoch": 1.831716670426348,
"grad_norm": 0.6847373843193054,
"learning_rate": 0.00019011053462666364,
"loss": 0.1472,
"mean_token_accuracy": 0.9624595642089844,
"num_tokens": 8271418.0,
"step": 8120
},
{
"epoch": 1.8339724791337695,
"grad_norm": 0.5652275085449219,
"learning_rate": 0.00018997518610421835,
"loss": 0.0966,
"mean_token_accuracy": 0.970450347661972,
"num_tokens": 8281647.0,
"step": 8130
},
{
"epoch": 1.836228287841191,
"grad_norm": 0.21371297538280487,
"learning_rate": 0.00018983983758177306,
"loss": 0.0867,
"mean_token_accuracy": 0.9742642045021057,
"num_tokens": 8291483.0,
"step": 8140
},
{
"epoch": 1.8384840965486127,
"grad_norm": 0.7062776684761047,
"learning_rate": 0.00018970448905932777,
"loss": 0.1026,
"mean_token_accuracy": 0.9685595810413361,
"num_tokens": 8301705.0,
"step": 8150
},
{
"epoch": 1.8407399052560343,
"grad_norm": 0.6951163411140442,
"learning_rate": 0.00018956914053688245,
"loss": 0.0935,
"mean_token_accuracy": 0.9725548028945923,
"num_tokens": 8311909.0,
"step": 8160
},
{
"epoch": 1.8429957139634559,
"grad_norm": 0.5619714260101318,
"learning_rate": 0.00018943379201443716,
"loss": 0.105,
"mean_token_accuracy": 0.9708254158496856,
"num_tokens": 8322106.0,
"step": 8170
},
{
"epoch": 1.8452515226708774,
"grad_norm": 0.7393009662628174,
"learning_rate": 0.00018929844349199187,
"loss": 0.1224,
"mean_token_accuracy": 0.9665905058383941,
"num_tokens": 8332162.0,
"step": 8180
},
{
"epoch": 1.8475073313782993,
"grad_norm": 0.705487847328186,
"learning_rate": 0.00018916309496954658,
"loss": 0.1046,
"mean_token_accuracy": 0.9709407091140747,
"num_tokens": 8341682.0,
"step": 8190
},
{
"epoch": 1.8497631400857206,
"grad_norm": 0.7456323504447937,
"learning_rate": 0.00018902774644710126,
"loss": 0.1602,
"mean_token_accuracy": 0.9597473442554474,
"num_tokens": 8351800.0,
"step": 8200
},
{
"epoch": 1.8520189487931424,
"grad_norm": 0.4004161059856415,
"learning_rate": 0.00018889239792465597,
"loss": 0.072,
"mean_token_accuracy": 0.9788155972957611,
"num_tokens": 8362016.0,
"step": 8210
},
{
"epoch": 1.8542747575005638,
"grad_norm": 0.8284559845924377,
"learning_rate": 0.00018875704940221068,
"loss": 0.156,
"mean_token_accuracy": 0.9555248856544495,
"num_tokens": 8372161.0,
"step": 8220
},
{
"epoch": 1.8565305662079856,
"grad_norm": 0.5218929052352905,
"learning_rate": 0.0001886217008797654,
"loss": 0.1166,
"mean_token_accuracy": 0.9678529024124145,
"num_tokens": 8382286.0,
"step": 8230
},
{
"epoch": 1.8587863749154072,
"grad_norm": 0.65278160572052,
"learning_rate": 0.00018848635235732008,
"loss": 0.1258,
"mean_token_accuracy": 0.9670017480850219,
"num_tokens": 8392414.0,
"step": 8240
},
{
"epoch": 1.8610421836228288,
"grad_norm": 0.507361888885498,
"learning_rate": 0.0001883510038348748,
"loss": 0.1036,
"mean_token_accuracy": 0.9709287106990814,
"num_tokens": 8402535.0,
"step": 8250
},
{
"epoch": 1.8632979923302504,
"grad_norm": 0.5099123120307922,
"learning_rate": 0.0001882156553124295,
"loss": 0.1396,
"mean_token_accuracy": 0.9616460084915162,
"num_tokens": 8412680.0,
"step": 8260
},
{
"epoch": 1.865553801037672,
"grad_norm": 0.8495880961418152,
"learning_rate": 0.0001880803067899842,
"loss": 0.1025,
"mean_token_accuracy": 0.9701466143131257,
"num_tokens": 8422874.0,
"step": 8270
},
{
"epoch": 1.8678096097450936,
"grad_norm": 0.20612621307373047,
"learning_rate": 0.0001879449582675389,
"loss": 0.1181,
"mean_token_accuracy": 0.9679302871227264,
"num_tokens": 8433083.0,
"step": 8280
},
{
"epoch": 1.8700654184525152,
"grad_norm": 0.5034496188163757,
"learning_rate": 0.0001878096097450936,
"loss": 0.1,
"mean_token_accuracy": 0.9701763093471527,
"num_tokens": 8443293.0,
"step": 8290
},
{
"epoch": 1.872321227159937,
"grad_norm": 0.8790525794029236,
"learning_rate": 0.0001876742612226483,
"loss": 0.0974,
"mean_token_accuracy": 0.9718030214309692,
"num_tokens": 8453487.0,
"step": 8300
},
{
"epoch": 1.8745770358673584,
"grad_norm": 0.4784752428531647,
"learning_rate": 0.00018753891270020302,
"loss": 0.092,
"mean_token_accuracy": 0.9723727226257324,
"num_tokens": 8463707.0,
"step": 8310
},
{
"epoch": 1.8768328445747802,
"grad_norm": 0.7611798644065857,
"learning_rate": 0.00018740356417775773,
"loss": 0.1092,
"mean_token_accuracy": 0.9694570362567901,
"num_tokens": 8473928.0,
"step": 8320
},
{
"epoch": 1.8790886532822015,
"grad_norm": 0.4173850417137146,
"learning_rate": 0.0001872682156553124,
"loss": 0.1328,
"mean_token_accuracy": 0.9634264826774597,
"num_tokens": 8484091.0,
"step": 8330
},
{
"epoch": 1.8813444619896234,
"grad_norm": 0.515573263168335,
"learning_rate": 0.00018713286713286712,
"loss": 0.1139,
"mean_token_accuracy": 0.966448575258255,
"num_tokens": 8494322.0,
"step": 8340
},
{
"epoch": 1.883600270697045,
"grad_norm": 0.7708337306976318,
"learning_rate": 0.00018699751861042183,
"loss": 0.1153,
"mean_token_accuracy": 0.9649886667728425,
"num_tokens": 8504469.0,
"step": 8350
},
{
"epoch": 1.8858560794044665,
"grad_norm": 1.0154062509536743,
"learning_rate": 0.00018686217008797654,
"loss": 0.1283,
"mean_token_accuracy": 0.9634427964687348,
"num_tokens": 8514619.0,
"step": 8360
},
{
"epoch": 1.8881118881118881,
"grad_norm": 0.6789688467979431,
"learning_rate": 0.00018672682156553122,
"loss": 0.1395,
"mean_token_accuracy": 0.9595559239387512,
"num_tokens": 8524806.0,
"step": 8370
},
{
"epoch": 1.8903676968193097,
"grad_norm": 0.7808207273483276,
"learning_rate": 0.00018659147304308593,
"loss": 0.1796,
"mean_token_accuracy": 0.9548425853252411,
"num_tokens": 8534972.0,
"step": 8380
},
{
"epoch": 1.8926235055267313,
"grad_norm": 0.8315436840057373,
"learning_rate": 0.00018645612452064064,
"loss": 0.1295,
"mean_token_accuracy": 0.9659779012203217,
"num_tokens": 8545209.0,
"step": 8390
},
{
"epoch": 1.894879314234153,
"grad_norm": 0.6535826921463013,
"learning_rate": 0.00018632077599819535,
"loss": 0.1134,
"mean_token_accuracy": 0.9662691533565522,
"num_tokens": 8555446.0,
"step": 8400
},
{
"epoch": 1.8971351229415747,
"grad_norm": 0.5366471409797668,
"learning_rate": 0.00018618542747575004,
"loss": 0.1369,
"mean_token_accuracy": 0.9663753628730773,
"num_tokens": 8565580.0,
"step": 8410
},
{
"epoch": 1.899390931648996,
"grad_norm": 0.5309544205665588,
"learning_rate": 0.00018605007895330475,
"loss": 0.0862,
"mean_token_accuracy": 0.9748947978019714,
"num_tokens": 8575743.0,
"step": 8420
},
{
"epoch": 1.901646740356418,
"grad_norm": 0.4496230483055115,
"learning_rate": 0.00018591473043085946,
"loss": 0.1105,
"mean_token_accuracy": 0.969891893863678,
"num_tokens": 8585905.0,
"step": 8430
},
{
"epoch": 1.9039025490638393,
"grad_norm": 0.5235305428504944,
"learning_rate": 0.00018577938190841417,
"loss": 0.1068,
"mean_token_accuracy": 0.9693613052368164,
"num_tokens": 8596130.0,
"step": 8440
},
{
"epoch": 1.906158357771261,
"grad_norm": 0.3482052683830261,
"learning_rate": 0.00018564403338596885,
"loss": 0.0934,
"mean_token_accuracy": 0.9705743670463562,
"num_tokens": 8606333.0,
"step": 8450
},
{
"epoch": 1.9084141664786824,
"grad_norm": 0.3155059814453125,
"learning_rate": 0.00018550868486352356,
"loss": 0.1494,
"mean_token_accuracy": 0.9634097695350647,
"num_tokens": 8616546.0,
"step": 8460
},
{
"epoch": 1.9106699751861043,
"grad_norm": 0.6361089944839478,
"learning_rate": 0.00018537333634107827,
"loss": 0.1315,
"mean_token_accuracy": 0.9672301173210144,
"num_tokens": 8626008.0,
"step": 8470
},
{
"epoch": 1.9129257838935259,
"grad_norm": 0.37798941135406494,
"learning_rate": 0.00018523798781863298,
"loss": 0.1064,
"mean_token_accuracy": 0.9712181925773621,
"num_tokens": 8636230.0,
"step": 8480
},
{
"epoch": 1.9151815926009474,
"grad_norm": 0.60988849401474,
"learning_rate": 0.0001851026392961877,
"loss": 0.0886,
"mean_token_accuracy": 0.9740067481994629,
"num_tokens": 8646377.0,
"step": 8490
},
{
"epoch": 1.917437401308369,
"grad_norm": 0.9676559567451477,
"learning_rate": 0.00018496729077374237,
"loss": 0.1134,
"mean_token_accuracy": 0.9681664168834686,
"num_tokens": 8656547.0,
"step": 8500
},
{
"epoch": 1.9196932100157906,
"grad_norm": 0.7825080156326294,
"learning_rate": 0.00018483194225129708,
"loss": 0.1107,
"mean_token_accuracy": 0.9664545893669129,
"num_tokens": 8666781.0,
"step": 8510
},
{
"epoch": 1.9219490187232122,
"grad_norm": 0.34856459498405457,
"learning_rate": 0.0001846965937288518,
"loss": 0.1477,
"mean_token_accuracy": 0.9601014196872711,
"num_tokens": 8676896.0,
"step": 8520
},
{
"epoch": 1.9242048274306338,
"grad_norm": 0.4000522494316101,
"learning_rate": 0.0001845612452064065,
"loss": 0.1003,
"mean_token_accuracy": 0.9701106131076813,
"num_tokens": 8687076.0,
"step": 8530
},
{
"epoch": 1.9264606361380556,
"grad_norm": 0.859682559967041,
"learning_rate": 0.00018442589668396118,
"loss": 0.1115,
"mean_token_accuracy": 0.9683105528354645,
"num_tokens": 8697310.0,
"step": 8540
},
{
"epoch": 1.928716444845477,
"grad_norm": 0.39500635862350464,
"learning_rate": 0.0001842905481615159,
"loss": 0.114,
"mean_token_accuracy": 0.9657883048057556,
"num_tokens": 8707496.0,
"step": 8550
},
{
"epoch": 1.9309722535528988,
"grad_norm": 0.7265903353691101,
"learning_rate": 0.0001841551996390706,
"loss": 0.1104,
"mean_token_accuracy": 0.9682238936424256,
"num_tokens": 8717691.0,
"step": 8560
},
{
"epoch": 1.9332280622603202,
"grad_norm": 0.7054004073143005,
"learning_rate": 0.0001840198511166253,
"loss": 0.1201,
"mean_token_accuracy": 0.9653518259525299,
"num_tokens": 8727910.0,
"step": 8570
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.630401611328125,
"learning_rate": 0.00018388450259418,
"loss": 0.0861,
"mean_token_accuracy": 0.97560213804245,
"num_tokens": 8737095.0,
"step": 8580
},
{
"epoch": 1.9377396796751636,
"grad_norm": 0.4708097279071808,
"learning_rate": 0.0001837491540717347,
"loss": 0.106,
"mean_token_accuracy": 0.9685471832752228,
"num_tokens": 8747301.0,
"step": 8590
},
{
"epoch": 1.9399954883825852,
"grad_norm": 0.43274563550949097,
"learning_rate": 0.00018361380554928942,
"loss": 0.1082,
"mean_token_accuracy": 0.9686918735504151,
"num_tokens": 8757435.0,
"step": 8600
},
{
"epoch": 1.9422512970900068,
"grad_norm": 0.4595888555049896,
"learning_rate": 0.00018347845702684412,
"loss": 0.0887,
"mean_token_accuracy": 0.9735994637012482,
"num_tokens": 8767568.0,
"step": 8610
},
{
"epoch": 1.9445071057974284,
"grad_norm": 0.6666122674942017,
"learning_rate": 0.0001833431085043988,
"loss": 0.1097,
"mean_token_accuracy": 0.9671940863132477,
"num_tokens": 8777783.0,
"step": 8620
},
{
"epoch": 1.94676291450485,
"grad_norm": 0.45760247111320496,
"learning_rate": 0.00018320775998195352,
"loss": 0.1247,
"mean_token_accuracy": 0.9661338448524475,
"num_tokens": 8787974.0,
"step": 8630
},
{
"epoch": 1.9490187232122715,
"grad_norm": 0.8178554177284241,
"learning_rate": 0.00018307241145950823,
"loss": 0.0977,
"mean_token_accuracy": 0.9710348606109619,
"num_tokens": 8798199.0,
"step": 8640
},
{
"epoch": 1.9512745319196934,
"grad_norm": 0.5238173007965088,
"learning_rate": 0.00018293706293706294,
"loss": 0.0859,
"mean_token_accuracy": 0.9734928429126739,
"num_tokens": 8808393.0,
"step": 8650
},
{
"epoch": 1.9535303406271147,
"grad_norm": 0.5230134129524231,
"learning_rate": 0.00018280171441461765,
"loss": 0.1213,
"mean_token_accuracy": 0.963700121641159,
"num_tokens": 8818596.0,
"step": 8660
},
{
"epoch": 1.9557861493345365,
"grad_norm": 0.4369744062423706,
"learning_rate": 0.00018266636589217233,
"loss": 0.0795,
"mean_token_accuracy": 0.9761726558208466,
"num_tokens": 8828762.0,
"step": 8670
},
{
"epoch": 1.958041958041958,
"grad_norm": 0.5722286105155945,
"learning_rate": 0.00018253101736972704,
"loss": 0.1321,
"mean_token_accuracy": 0.965042096376419,
"num_tokens": 8838866.0,
"step": 8680
},
{
"epoch": 1.9602977667493797,
"grad_norm": 0.6464564800262451,
"learning_rate": 0.00018239566884728175,
"loss": 0.114,
"mean_token_accuracy": 0.9651407182216645,
"num_tokens": 8849045.0,
"step": 8690
},
{
"epoch": 1.9625535754568013,
"grad_norm": 0.49602097272872925,
"learning_rate": 0.00018226032032483646,
"loss": 0.1277,
"mean_token_accuracy": 0.9648896515369415,
"num_tokens": 8859264.0,
"step": 8700
},
{
"epoch": 1.964809384164223,
"grad_norm": 0.4740694463253021,
"learning_rate": 0.00018212497180239112,
"loss": 0.0997,
"mean_token_accuracy": 0.9698693633079529,
"num_tokens": 8869487.0,
"step": 8710
},
{
"epoch": 1.9670651928716445,
"grad_norm": 0.672024130821228,
"learning_rate": 0.00018198962327994585,
"loss": 0.1151,
"mean_token_accuracy": 0.9668726742267608,
"num_tokens": 8879699.0,
"step": 8720
},
{
"epoch": 1.969321001579066,
"grad_norm": 0.4891161620616913,
"learning_rate": 0.00018185427475750056,
"loss": 0.1115,
"mean_token_accuracy": 0.9718928813934327,
"num_tokens": 8889850.0,
"step": 8730
},
{
"epoch": 1.9715768102864877,
"grad_norm": 0.7066110968589783,
"learning_rate": 0.00018171892623505527,
"loss": 0.132,
"mean_token_accuracy": 0.9635340571403503,
"num_tokens": 8899992.0,
"step": 8740
},
{
"epoch": 1.9738326189939093,
"grad_norm": 0.727239727973938,
"learning_rate": 0.00018158357771260993,
"loss": 0.1036,
"mean_token_accuracy": 0.970716518163681,
"num_tokens": 8910215.0,
"step": 8750
},
{
"epoch": 1.976088427701331,
"grad_norm": 0.4740559160709381,
"learning_rate": 0.00018144822919016466,
"loss": 0.1136,
"mean_token_accuracy": 0.9690234005451203,
"num_tokens": 8920410.0,
"step": 8760
},
{
"epoch": 1.9783442364087525,
"grad_norm": 0.45103248953819275,
"learning_rate": 0.00018131288066771937,
"loss": 0.0803,
"mean_token_accuracy": 0.9752394497394562,
"num_tokens": 8930634.0,
"step": 8770
},
{
"epoch": 1.9806000451161743,
"grad_norm": 0.7137285470962524,
"learning_rate": 0.00018117753214527408,
"loss": 0.0891,
"mean_token_accuracy": 0.9740745484828949,
"num_tokens": 8940866.0,
"step": 8780
},
{
"epoch": 1.9828558538235956,
"grad_norm": 0.7460572123527527,
"learning_rate": 0.0001810421836228288,
"loss": 0.1182,
"mean_token_accuracy": 0.9643153727054596,
"num_tokens": 8951069.0,
"step": 8790
},
{
"epoch": 1.9851116625310175,
"grad_norm": 0.7210149168968201,
"learning_rate": 0.00018090683510038345,
"loss": 0.0918,
"mean_token_accuracy": 0.9729661166667938,
"num_tokens": 8960810.0,
"step": 8800
},
{
"epoch": 1.9873674712384388,
"grad_norm": 0.4664352238178253,
"learning_rate": 0.0001807714865779382,
"loss": 0.0964,
"mean_token_accuracy": 0.9717876076698303,
"num_tokens": 8971031.0,
"step": 8810
},
{
"epoch": 1.9896232799458606,
"grad_norm": 0.7352085113525391,
"learning_rate": 0.0001806361380554929,
"loss": 0.1127,
"mean_token_accuracy": 0.9690958023071289,
"num_tokens": 8981169.0,
"step": 8820
},
{
"epoch": 1.9918790886532822,
"grad_norm": 0.6887751221656799,
"learning_rate": 0.0001805007895330476,
"loss": 0.1206,
"mean_token_accuracy": 0.964536988735199,
"num_tokens": 8991349.0,
"step": 8830
},
{
"epoch": 1.9941348973607038,
"grad_norm": 0.5474720001220703,
"learning_rate": 0.00018036544101060226,
"loss": 0.0761,
"mean_token_accuracy": 0.9767542481422424,
"num_tokens": 9001504.0,
"step": 8840
},
{
"epoch": 1.9963907060681254,
"grad_norm": 0.7074306607246399,
"learning_rate": 0.00018023009248815697,
"loss": 0.108,
"mean_token_accuracy": 0.9695817053318023,
"num_tokens": 9011695.0,
"step": 8850
},
{
"epoch": 1.998646514775547,
"grad_norm": 0.5465816259384155,
"learning_rate": 0.0001800947439657117,
"loss": 0.1602,
"mean_token_accuracy": 0.9601962268352509,
"num_tokens": 9021880.0,
"step": 8860
},
{
"epoch": 2.000902323482969,
"grad_norm": 0.6290394067764282,
"learning_rate": 0.00017995939544326642,
"loss": 0.0804,
"mean_token_accuracy": 0.9762711524963379,
"num_tokens": 9032059.0,
"step": 8870
},
{
"epoch": 2.00315813219039,
"grad_norm": 0.5912337303161621,
"learning_rate": 0.00017982404692082107,
"loss": 0.1058,
"mean_token_accuracy": 0.9690792024135589,
"num_tokens": 9042287.0,
"step": 8880
},
{
"epoch": 2.005413940897812,
"grad_norm": 0.591269850730896,
"learning_rate": 0.00017968869839837578,
"loss": 0.0892,
"mean_token_accuracy": 0.9733344137668609,
"num_tokens": 9052495.0,
"step": 8890
},
{
"epoch": 2.0076697496052334,
"grad_norm": 1.021296739578247,
"learning_rate": 0.0001795533498759305,
"loss": 0.1061,
"mean_token_accuracy": 0.9714495182037354,
"num_tokens": 9062732.0,
"step": 8900
},
{
"epoch": 2.009925558312655,
"grad_norm": 0.3600910007953644,
"learning_rate": 0.00017941800135348523,
"loss": 0.089,
"mean_token_accuracy": 0.9722425937652588,
"num_tokens": 9072909.0,
"step": 8910
},
{
"epoch": 2.0121813670200766,
"grad_norm": 0.8148804903030396,
"learning_rate": 0.0001792826528310399,
"loss": 0.0982,
"mean_token_accuracy": 0.9704652488231659,
"num_tokens": 9083116.0,
"step": 8920
},
{
"epoch": 2.0144371757274984,
"grad_norm": 0.6611748933792114,
"learning_rate": 0.0001791473043085946,
"loss": 0.0848,
"mean_token_accuracy": 0.9746616184711456,
"num_tokens": 9093299.0,
"step": 8930
},
{
"epoch": 2.0166929844349197,
"grad_norm": 0.32425668835639954,
"learning_rate": 0.0001790119557861493,
"loss": 0.0813,
"mean_token_accuracy": 0.976304441690445,
"num_tokens": 9103515.0,
"step": 8940
},
{
"epoch": 2.0189487931423415,
"grad_norm": 0.61649489402771,
"learning_rate": 0.00017887660726370404,
"loss": 0.0898,
"mean_token_accuracy": 0.9724894106388092,
"num_tokens": 9113738.0,
"step": 8950
},
{
"epoch": 2.0212046018497634,
"grad_norm": 0.4641760289669037,
"learning_rate": 0.00017874125874125875,
"loss": 0.0822,
"mean_token_accuracy": 0.9755397379398346,
"num_tokens": 9123963.0,
"step": 8960
},
{
"epoch": 2.0234604105571847,
"grad_norm": 0.46858569979667664,
"learning_rate": 0.0001786059102188134,
"loss": 0.0884,
"mean_token_accuracy": 0.974696409702301,
"num_tokens": 9134154.0,
"step": 8970
},
{
"epoch": 2.0257162192646065,
"grad_norm": 0.7182545065879822,
"learning_rate": 0.00017847056169636812,
"loss": 0.0936,
"mean_token_accuracy": 0.9723126113414764,
"num_tokens": 9144310.0,
"step": 8980
},
{
"epoch": 2.027972027972028,
"grad_norm": 0.832270622253418,
"learning_rate": 0.00017833521317392283,
"loss": 0.0871,
"mean_token_accuracy": 0.9737198114395141,
"num_tokens": 9154434.0,
"step": 8990
},
{
"epoch": 2.0302278366794497,
"grad_norm": 0.6002981066703796,
"learning_rate": 0.00017819986465147757,
"loss": 0.0994,
"mean_token_accuracy": 0.9703204393386841,
"num_tokens": 9164498.0,
"step": 9000
},
{
"epoch": 2.032483645386871,
"grad_norm": 0.41282710433006287,
"learning_rate": 0.00017806451612903222,
"loss": 0.105,
"mean_token_accuracy": 0.9722214996814728,
"num_tokens": 9174726.0,
"step": 9010
},
{
"epoch": 2.034739454094293,
"grad_norm": 0.5806052088737488,
"learning_rate": 0.00017792916760658693,
"loss": 0.1056,
"mean_token_accuracy": 0.9686106681823731,
"num_tokens": 9184936.0,
"step": 9020
},
{
"epoch": 2.0369952628017143,
"grad_norm": 0.4993740916252136,
"learning_rate": 0.00017779381908414164,
"loss": 0.1028,
"mean_token_accuracy": 0.970165628194809,
"num_tokens": 9195123.0,
"step": 9030
},
{
"epoch": 2.039251071509136,
"grad_norm": 0.9050776958465576,
"learning_rate": 0.00017765847056169635,
"loss": 0.0967,
"mean_token_accuracy": 0.9713717639446259,
"num_tokens": 9205265.0,
"step": 9040
},
{
"epoch": 2.0415068802165575,
"grad_norm": 0.6305001974105835,
"learning_rate": 0.00017752312203925103,
"loss": 0.0995,
"mean_token_accuracy": 0.9726520895957946,
"num_tokens": 9215412.0,
"step": 9050
},
{
"epoch": 2.0437626889239793,
"grad_norm": 0.458783894777298,
"learning_rate": 0.00017738777351680574,
"loss": 0.1517,
"mean_token_accuracy": 0.9607601463794708,
"num_tokens": 9225569.0,
"step": 9060
},
{
"epoch": 2.0460184976314006,
"grad_norm": 0.47300443053245544,
"learning_rate": 0.00017725242499436045,
"loss": 0.0904,
"mean_token_accuracy": 0.972942465543747,
"num_tokens": 9235688.0,
"step": 9070
},
{
"epoch": 2.0482743063388225,
"grad_norm": 0.7441193461418152,
"learning_rate": 0.00017711707647191516,
"loss": 0.0901,
"mean_token_accuracy": 0.9721292495727539,
"num_tokens": 9245923.0,
"step": 9080
},
{
"epoch": 2.0505301150462443,
"grad_norm": 0.5683284997940063,
"learning_rate": 0.00017698172794946985,
"loss": 0.0899,
"mean_token_accuracy": 0.9734027445316314,
"num_tokens": 9256159.0,
"step": 9090
},
{
"epoch": 2.0527859237536656,
"grad_norm": 0.4916854798793793,
"learning_rate": 0.00017684637942702456,
"loss": 0.0858,
"mean_token_accuracy": 0.9744769513607026,
"num_tokens": 9266394.0,
"step": 9100
},
{
"epoch": 2.0550417324610875,
"grad_norm": 0.6196737885475159,
"learning_rate": 0.00017671103090457927,
"loss": 0.0822,
"mean_token_accuracy": 0.9734901905059814,
"num_tokens": 9276563.0,
"step": 9110
},
{
"epoch": 2.057297541168509,
"grad_norm": 0.4199332594871521,
"learning_rate": 0.00017657568238213398,
"loss": 0.0832,
"mean_token_accuracy": 0.9740483283996582,
"num_tokens": 9286778.0,
"step": 9120
},
{
"epoch": 2.0595533498759306,
"grad_norm": 0.8482980132102966,
"learning_rate": 0.00017644033385968869,
"loss": 0.1052,
"mean_token_accuracy": 0.9692319512367249,
"num_tokens": 9296960.0,
"step": 9130
},
{
"epoch": 2.061809158583352,
"grad_norm": 0.7818817496299744,
"learning_rate": 0.00017630498533724337,
"loss": 0.1009,
"mean_token_accuracy": 0.972999757528305,
"num_tokens": 9307194.0,
"step": 9140
},
{
"epoch": 2.064064967290774,
"grad_norm": 0.9999156594276428,
"learning_rate": 0.00017616963681479808,
"loss": 0.0764,
"mean_token_accuracy": 0.9767278075218201,
"num_tokens": 9317430.0,
"step": 9150
},
{
"epoch": 2.066320775998195,
"grad_norm": 0.3332657217979431,
"learning_rate": 0.0001760342882923528,
"loss": 0.0953,
"mean_token_accuracy": 0.9738193869590759,
"num_tokens": 9327609.0,
"step": 9160
},
{
"epoch": 2.068576584705617,
"grad_norm": 0.5017508268356323,
"learning_rate": 0.0001758989397699075,
"loss": 0.1039,
"mean_token_accuracy": 0.9686254799365998,
"num_tokens": 9337754.0,
"step": 9170
},
{
"epoch": 2.0708323934130384,
"grad_norm": 0.4979402422904968,
"learning_rate": 0.00017576359124746218,
"loss": 0.1049,
"mean_token_accuracy": 0.9714104771614075,
"num_tokens": 9347920.0,
"step": 9180
},
{
"epoch": 2.07308820212046,
"grad_norm": 0.4234994649887085,
"learning_rate": 0.0001756282427250169,
"loss": 0.0924,
"mean_token_accuracy": 0.9733720302581788,
"num_tokens": 9358064.0,
"step": 9190
},
{
"epoch": 2.075344010827882,
"grad_norm": 1.5676687955856323,
"learning_rate": 0.0001754928942025716,
"loss": 0.0943,
"mean_token_accuracy": 0.9714210510253907,
"num_tokens": 9368295.0,
"step": 9200
},
{
"epoch": 2.0775998195353034,
"grad_norm": 0.46907201409339905,
"learning_rate": 0.0001753575456801263,
"loss": 0.0821,
"mean_token_accuracy": 0.9753334045410156,
"num_tokens": 9378521.0,
"step": 9210
},
{
"epoch": 2.079855628242725,
"grad_norm": 0.7339596152305603,
"learning_rate": 0.000175222197157681,
"loss": 0.0935,
"mean_token_accuracy": 0.9735293567180634,
"num_tokens": 9388697.0,
"step": 9220
},
{
"epoch": 2.0821114369501466,
"grad_norm": 0.542509138584137,
"learning_rate": 0.0001750868486352357,
"loss": 0.0897,
"mean_token_accuracy": 0.9746010303497314,
"num_tokens": 9398881.0,
"step": 9230
},
{
"epoch": 2.0843672456575684,
"grad_norm": 0.5223485231399536,
"learning_rate": 0.0001749515001127904,
"loss": 0.1131,
"mean_token_accuracy": 0.9674809694290161,
"num_tokens": 9409095.0,
"step": 9240
},
{
"epoch": 2.0866230543649897,
"grad_norm": 0.7211313843727112,
"learning_rate": 0.00017481615159034512,
"loss": 0.1069,
"mean_token_accuracy": 0.9681223928928375,
"num_tokens": 9419227.0,
"step": 9250
},
{
"epoch": 2.0888788630724116,
"grad_norm": 0.5504807829856873,
"learning_rate": 0.0001746808030678998,
"loss": 0.1201,
"mean_token_accuracy": 0.9665165781974793,
"num_tokens": 9429353.0,
"step": 9260
},
{
"epoch": 2.091134671779833,
"grad_norm": 0.5223894715309143,
"learning_rate": 0.00017454545454545452,
"loss": 0.0947,
"mean_token_accuracy": 0.9707706809043884,
"num_tokens": 9439548.0,
"step": 9270
},
{
"epoch": 2.0933904804872547,
"grad_norm": 0.5483041405677795,
"learning_rate": 0.00017441010602300923,
"loss": 0.0905,
"mean_token_accuracy": 0.9730142235755921,
"num_tokens": 9449758.0,
"step": 9280
},
{
"epoch": 2.095646289194676,
"grad_norm": 0.4744812250137329,
"learning_rate": 0.00017427475750056394,
"loss": 0.0999,
"mean_token_accuracy": 0.9714322686195374,
"num_tokens": 9459901.0,
"step": 9290
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.47873175144195557,
"learning_rate": 0.00017413940897811864,
"loss": 0.1186,
"mean_token_accuracy": 0.9649298131465912,
"num_tokens": 9470029.0,
"step": 9300
},
{
"epoch": 2.1001579066095193,
"grad_norm": 0.5018359422683716,
"learning_rate": 0.00017400406045567333,
"loss": 0.1124,
"mean_token_accuracy": 0.9666579306125641,
"num_tokens": 9480263.0,
"step": 9310
},
{
"epoch": 2.102413715316941,
"grad_norm": 0.6129780411720276,
"learning_rate": 0.00017386871193322804,
"loss": 0.0862,
"mean_token_accuracy": 0.9743798732757568,
"num_tokens": 9490447.0,
"step": 9320
},
{
"epoch": 2.104669524024363,
"grad_norm": 0.37456271052360535,
"learning_rate": 0.00017373336341078275,
"loss": 0.1023,
"mean_token_accuracy": 0.9715150356292724,
"num_tokens": 9500613.0,
"step": 9330
},
{
"epoch": 2.1069253327317843,
"grad_norm": 1.205664038658142,
"learning_rate": 0.00017359801488833746,
"loss": 0.1083,
"mean_token_accuracy": 0.9699222087860108,
"num_tokens": 9510834.0,
"step": 9340
},
{
"epoch": 2.109181141439206,
"grad_norm": 1.3629356622695923,
"learning_rate": 0.00017346266636589214,
"loss": 0.0932,
"mean_token_accuracy": 0.9720664858818054,
"num_tokens": 9521014.0,
"step": 9350
},
{
"epoch": 2.1114369501466275,
"grad_norm": 0.7105079293251038,
"learning_rate": 0.00017332731784344685,
"loss": 0.0966,
"mean_token_accuracy": 0.9712238907814026,
"num_tokens": 9531193.0,
"step": 9360
},
{
"epoch": 2.1136927588540493,
"grad_norm": 0.6067842841148376,
"learning_rate": 0.00017319196932100156,
"loss": 0.0939,
"mean_token_accuracy": 0.9708913087844848,
"num_tokens": 9541408.0,
"step": 9370
},
{
"epoch": 2.1159485675614707,
"grad_norm": 0.6547707915306091,
"learning_rate": 0.00017305662079855627,
"loss": 0.0896,
"mean_token_accuracy": 0.9725289940834045,
"num_tokens": 9551647.0,
"step": 9380
},
{
"epoch": 2.1182043762688925,
"grad_norm": 0.44134852290153503,
"learning_rate": 0.00017292127227611095,
"loss": 0.1144,
"mean_token_accuracy": 0.9662079870700836,
"num_tokens": 9561784.0,
"step": 9390
},
{
"epoch": 2.120460184976314,
"grad_norm": 0.8880122303962708,
"learning_rate": 0.00017278592375366566,
"loss": 0.1135,
"mean_token_accuracy": 0.967824912071228,
"num_tokens": 9571964.0,
"step": 9400
},
{
"epoch": 2.1227159936837356,
"grad_norm": 0.44176623225212097,
"learning_rate": 0.00017265057523122037,
"loss": 0.092,
"mean_token_accuracy": 0.9716949999332428,
"num_tokens": 9582130.0,
"step": 9410
},
{
"epoch": 2.124971802391157,
"grad_norm": 0.6585561037063599,
"learning_rate": 0.00017251522670877508,
"loss": 0.0847,
"mean_token_accuracy": 0.9737663388252258,
"num_tokens": 9592325.0,
"step": 9420
},
{
"epoch": 2.127227611098579,
"grad_norm": 0.6109176874160767,
"learning_rate": 0.0001723798781863298,
"loss": 0.1147,
"mean_token_accuracy": 0.9680899202823638,
"num_tokens": 9602555.0,
"step": 9430
},
{
"epoch": 2.1294834198060006,
"grad_norm": 0.4303933382034302,
"learning_rate": 0.00017224452966388447,
"loss": 0.0873,
"mean_token_accuracy": 0.9742409646511078,
"num_tokens": 9612766.0,
"step": 9440
},
{
"epoch": 2.131739228513422,
"grad_norm": 0.5676075220108032,
"learning_rate": 0.00017210918114143918,
"loss": 0.0815,
"mean_token_accuracy": 0.9740564048290252,
"num_tokens": 9622987.0,
"step": 9450
},
{
"epoch": 2.133995037220844,
"grad_norm": 0.3643551468849182,
"learning_rate": 0.0001719738326189939,
"loss": 0.0962,
"mean_token_accuracy": 0.9718107163906098,
"num_tokens": 9633207.0,
"step": 9460
},
{
"epoch": 2.136250845928265,
"grad_norm": 0.31422924995422363,
"learning_rate": 0.0001718384840965486,
"loss": 0.0781,
"mean_token_accuracy": 0.9776098966598511,
"num_tokens": 9643377.0,
"step": 9470
},
{
"epoch": 2.138506654635687,
"grad_norm": 0.5971875786781311,
"learning_rate": 0.0001717031355741033,
"loss": 0.1022,
"mean_token_accuracy": 0.9688220500946045,
"num_tokens": 9653564.0,
"step": 9480
},
{
"epoch": 2.1407624633431084,
"grad_norm": 0.5687423944473267,
"learning_rate": 0.000171567787051658,
"loss": 0.0996,
"mean_token_accuracy": 0.971611624956131,
"num_tokens": 9663760.0,
"step": 9490
},
{
"epoch": 2.14301827205053,
"grad_norm": 0.44779282808303833,
"learning_rate": 0.0001714324385292127,
"loss": 0.0887,
"mean_token_accuracy": 0.9741411805152893,
"num_tokens": 9673629.0,
"step": 9500
},
{
"epoch": 2.1452740807579516,
"grad_norm": 0.7396071553230286,
"learning_rate": 0.00017129709000676742,
"loss": 0.1019,
"mean_token_accuracy": 0.9707873880863189,
"num_tokens": 9683832.0,
"step": 9510
},
{
"epoch": 2.1475298894653734,
"grad_norm": 0.6024800539016724,
"learning_rate": 0.0001711617414843221,
"loss": 0.0935,
"mean_token_accuracy": 0.9714532136917114,
"num_tokens": 9694005.0,
"step": 9520
},
{
"epoch": 2.1497856981727947,
"grad_norm": 0.8652124404907227,
"learning_rate": 0.0001710263929618768,
"loss": 0.0953,
"mean_token_accuracy": 0.9726161181926727,
"num_tokens": 9704147.0,
"step": 9530
},
{
"epoch": 2.1520415068802166,
"grad_norm": 0.36145398020744324,
"learning_rate": 0.00017089104443943152,
"loss": 0.0884,
"mean_token_accuracy": 0.9719941794872284,
"num_tokens": 9714357.0,
"step": 9540
},
{
"epoch": 2.1542973155876384,
"grad_norm": 0.6179787516593933,
"learning_rate": 0.00017075569591698623,
"loss": 0.0783,
"mean_token_accuracy": 0.9779595613479615,
"num_tokens": 9723620.0,
"step": 9550
},
{
"epoch": 2.1565531242950597,
"grad_norm": 0.7116292119026184,
"learning_rate": 0.0001706203473945409,
"loss": 0.1045,
"mean_token_accuracy": 0.9707187950611115,
"num_tokens": 9733848.0,
"step": 9560
},
{
"epoch": 2.1588089330024816,
"grad_norm": 0.4667085111141205,
"learning_rate": 0.00017048499887209562,
"loss": 0.0713,
"mean_token_accuracy": 0.9795779466629029,
"num_tokens": 9743989.0,
"step": 9570
},
{
"epoch": 2.161064741709903,
"grad_norm": 0.5031459927558899,
"learning_rate": 0.00017034965034965033,
"loss": 0.0955,
"mean_token_accuracy": 0.9732905268669129,
"num_tokens": 9754221.0,
"step": 9580
},
{
"epoch": 2.1633205504173247,
"grad_norm": 0.42553097009658813,
"learning_rate": 0.00017021430182720504,
"loss": 0.0831,
"mean_token_accuracy": 0.9736292123794555,
"num_tokens": 9764370.0,
"step": 9590
},
{
"epoch": 2.165576359124746,
"grad_norm": 0.43609100580215454,
"learning_rate": 0.00017007895330475975,
"loss": 0.0977,
"mean_token_accuracy": 0.9721475183963776,
"num_tokens": 9774519.0,
"step": 9600
},
{
"epoch": 2.167832167832168,
"grad_norm": 0.3701200783252716,
"learning_rate": 0.00016994360478231443,
"loss": 0.0966,
"mean_token_accuracy": 0.9708240926265717,
"num_tokens": 9784743.0,
"step": 9610
},
{
"epoch": 2.1700879765395893,
"grad_norm": 0.7588269114494324,
"learning_rate": 0.00016980825625986914,
"loss": 0.0812,
"mean_token_accuracy": 0.9742601990699769,
"num_tokens": 9794970.0,
"step": 9620
},
{
"epoch": 2.172343785247011,
"grad_norm": 0.547471821308136,
"learning_rate": 0.00016967290773742385,
"loss": 0.1067,
"mean_token_accuracy": 0.9700693309307098,
"num_tokens": 9805155.0,
"step": 9630
},
{
"epoch": 2.1745995939544325,
"grad_norm": 0.5657344460487366,
"learning_rate": 0.00016953755921497856,
"loss": 0.1072,
"mean_token_accuracy": 0.9706630408763885,
"num_tokens": 9815333.0,
"step": 9640
},
{
"epoch": 2.1768554026618543,
"grad_norm": 0.6330400705337524,
"learning_rate": 0.00016940221069253325,
"loss": 0.0767,
"mean_token_accuracy": 0.9774750828742981,
"num_tokens": 9825549.0,
"step": 9650
},
{
"epoch": 2.1791112113692757,
"grad_norm": 1.0108215808868408,
"learning_rate": 0.00016926686217008796,
"loss": 0.1165,
"mean_token_accuracy": 0.9687487602233886,
"num_tokens": 9835749.0,
"step": 9660
},
{
"epoch": 2.1813670200766975,
"grad_norm": 0.559883177280426,
"learning_rate": 0.00016913151364764267,
"loss": 0.0684,
"mean_token_accuracy": 0.9800824344158172,
"num_tokens": 9845942.0,
"step": 9670
},
{
"epoch": 2.1836228287841193,
"grad_norm": 0.3940314054489136,
"learning_rate": 0.00016899616512519738,
"loss": 0.0889,
"mean_token_accuracy": 0.9734469890594483,
"num_tokens": 9856099.0,
"step": 9680
},
{
"epoch": 2.1858786374915407,
"grad_norm": 0.5258107781410217,
"learning_rate": 0.00016886081660275206,
"loss": 0.1021,
"mean_token_accuracy": 0.9695713877677917,
"num_tokens": 9866266.0,
"step": 9690
},
{
"epoch": 2.1881344461989625,
"grad_norm": 0.8086346387863159,
"learning_rate": 0.00016872546808030677,
"loss": 0.0823,
"mean_token_accuracy": 0.9752463161945343,
"num_tokens": 9876451.0,
"step": 9700
},
{
"epoch": 2.190390254906384,
"grad_norm": 0.4422919452190399,
"learning_rate": 0.00016859011955786148,
"loss": 0.0863,
"mean_token_accuracy": 0.9748727321624756,
"num_tokens": 9886651.0,
"step": 9710
},
{
"epoch": 2.1926460636138057,
"grad_norm": 0.5345275402069092,
"learning_rate": 0.0001684547710354162,
"loss": 0.092,
"mean_token_accuracy": 0.9734846889972687,
"num_tokens": 9896883.0,
"step": 9720
},
{
"epoch": 2.194901872321227,
"grad_norm": 0.6745142340660095,
"learning_rate": 0.00016831942251297087,
"loss": 0.0965,
"mean_token_accuracy": 0.9711147665977478,
"num_tokens": 9907087.0,
"step": 9730
},
{
"epoch": 2.197157681028649,
"grad_norm": 1.496596097946167,
"learning_rate": 0.00016818407399052558,
"loss": 0.0876,
"mean_token_accuracy": 0.9734964728355407,
"num_tokens": 9917322.0,
"step": 9740
},
{
"epoch": 2.19941348973607,
"grad_norm": 0.5157150626182556,
"learning_rate": 0.0001680487254680803,
"loss": 0.0947,
"mean_token_accuracy": 0.9723085820674896,
"num_tokens": 9927490.0,
"step": 9750
},
{
"epoch": 2.201669298443492,
"grad_norm": 0.689978301525116,
"learning_rate": 0.000167913376945635,
"loss": 0.0835,
"mean_token_accuracy": 0.9742916345596313,
"num_tokens": 9937648.0,
"step": 9760
},
{
"epoch": 2.203925107150914,
"grad_norm": 0.5611025094985962,
"learning_rate": 0.0001677780284231897,
"loss": 0.0785,
"mean_token_accuracy": 0.9771084129810333,
"num_tokens": 9947834.0,
"step": 9770
},
{
"epoch": 2.206180915858335,
"grad_norm": 0.6031618714332581,
"learning_rate": 0.0001676426799007444,
"loss": 0.0954,
"mean_token_accuracy": 0.9708240628242493,
"num_tokens": 9958057.0,
"step": 9780
},
{
"epoch": 2.208436724565757,
"grad_norm": 0.40428003668785095,
"learning_rate": 0.0001675073313782991,
"loss": 0.1122,
"mean_token_accuracy": 0.9690653860569001,
"num_tokens": 9968222.0,
"step": 9790
},
{
"epoch": 2.2106925332731784,
"grad_norm": 0.3744548559188843,
"learning_rate": 0.0001673719828558538,
"loss": 0.1027,
"mean_token_accuracy": 0.9691601276397706,
"num_tokens": 9978423.0,
"step": 9800
},
{
"epoch": 2.2129483419806,
"grad_norm": 0.901164710521698,
"learning_rate": 0.00016723663433340852,
"loss": 0.0715,
"mean_token_accuracy": 0.9771885097026825,
"num_tokens": 9988644.0,
"step": 9810
},
{
"epoch": 2.2152041506880216,
"grad_norm": 0.5163309574127197,
"learning_rate": 0.0001671012858109632,
"loss": 0.0705,
"mean_token_accuracy": 0.9775005519390106,
"num_tokens": 9998878.0,
"step": 9820
},
{
"epoch": 2.2174599593954434,
"grad_norm": 0.6472623944282532,
"learning_rate": 0.00016696593728851792,
"loss": 0.0867,
"mean_token_accuracy": 0.9749352514743805,
"num_tokens": 10009100.0,
"step": 9830
},
{
"epoch": 2.2197157681028648,
"grad_norm": 0.6509614586830139,
"learning_rate": 0.00016683058876607263,
"loss": 0.0922,
"mean_token_accuracy": 0.9715542435646057,
"num_tokens": 10019340.0,
"step": 9840
},
{
"epoch": 2.2219715768102866,
"grad_norm": 0.6951918005943298,
"learning_rate": 0.00016669524024362734,
"loss": 0.1022,
"mean_token_accuracy": 0.9700810790061951,
"num_tokens": 10029577.0,
"step": 9850
},
{
"epoch": 2.224227385517708,
"grad_norm": 0.894263505935669,
"learning_rate": 0.00016655989172118202,
"loss": 0.1011,
"mean_token_accuracy": 0.9722074806690216,
"num_tokens": 10039785.0,
"step": 9860
},
{
"epoch": 2.2264831942251297,
"grad_norm": 0.611517071723938,
"learning_rate": 0.00016642454319873673,
"loss": 0.092,
"mean_token_accuracy": 0.9712512075901032,
"num_tokens": 10049965.0,
"step": 9870
},
{
"epoch": 2.228739002932551,
"grad_norm": 0.3283737301826477,
"learning_rate": 0.00016628919467629144,
"loss": 0.1159,
"mean_token_accuracy": 0.9696214973926545,
"num_tokens": 10060161.0,
"step": 9880
},
{
"epoch": 2.230994811639973,
"grad_norm": 0.5803564786911011,
"learning_rate": 0.00016615384615384615,
"loss": 0.0653,
"mean_token_accuracy": 0.9800610899925232,
"num_tokens": 10070313.0,
"step": 9890
},
{
"epoch": 2.2332506203473947,
"grad_norm": 0.7790770530700684,
"learning_rate": 0.00016601849763140083,
"loss": 0.1069,
"mean_token_accuracy": 0.9694907486438751,
"num_tokens": 10080514.0,
"step": 9900
},
{
"epoch": 2.235506429054816,
"grad_norm": 0.31714528799057007,
"learning_rate": 0.00016588314910895554,
"loss": 0.0821,
"mean_token_accuracy": 0.9785817444324494,
"num_tokens": 10090403.0,
"step": 9910
},
{
"epoch": 2.237762237762238,
"grad_norm": 0.547499418258667,
"learning_rate": 0.00016574780058651025,
"loss": 0.1475,
"mean_token_accuracy": 0.9605782866477967,
"num_tokens": 10100605.0,
"step": 9920
},
{
"epoch": 2.2400180464696593,
"grad_norm": 0.2478630244731903,
"learning_rate": 0.00016561245206406496,
"loss": 0.0844,
"mean_token_accuracy": 0.9756044149398804,
"num_tokens": 10110816.0,
"step": 9930
},
{
"epoch": 2.242273855177081,
"grad_norm": 0.4540363550186157,
"learning_rate": 0.00016547710354161967,
"loss": 0.0754,
"mean_token_accuracy": 0.9743645370006562,
"num_tokens": 10120970.0,
"step": 9940
},
{
"epoch": 2.2445296638845025,
"grad_norm": 0.5862691402435303,
"learning_rate": 0.00016534175501917435,
"loss": 0.115,
"mean_token_accuracy": 0.9685929179191589,
"num_tokens": 10131187.0,
"step": 9950
},
{
"epoch": 2.2467854725919243,
"grad_norm": 0.892983078956604,
"learning_rate": 0.00016520640649672906,
"loss": 0.1041,
"mean_token_accuracy": 0.9705977141857147,
"num_tokens": 10141375.0,
"step": 9960
},
{
"epoch": 2.2490412812993457,
"grad_norm": 0.5582557320594788,
"learning_rate": 0.00016507105797428377,
"loss": 0.0623,
"mean_token_accuracy": 0.9798148095607757,
"num_tokens": 10151567.0,
"step": 9970
},
{
"epoch": 2.2512970900067675,
"grad_norm": 0.6146891117095947,
"learning_rate": 0.00016493570945183848,
"loss": 0.1045,
"mean_token_accuracy": 0.9701870501041412,
"num_tokens": 10161709.0,
"step": 9980
},
{
"epoch": 2.253552898714189,
"grad_norm": 0.6650478839874268,
"learning_rate": 0.00016480036092939316,
"loss": 0.1306,
"mean_token_accuracy": 0.9658924698829651,
"num_tokens": 10171850.0,
"step": 9990
},
{
"epoch": 2.2558087074216107,
"grad_norm": 0.6161482930183411,
"learning_rate": 0.00016466501240694787,
"loss": 0.0957,
"mean_token_accuracy": 0.9716996252536774,
"num_tokens": 10182072.0,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 22165,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.239786402987008e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}