{
"best_global_step": 10000,
"best_metric": 0.6085147261619568,
"best_model_checkpoint": "/workspace/rails-finetune/adapters-qwen3-8b/checkpoint-10000",
"epoch": 1.7793594306049823,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017793594306049821,
"grad_norm": 1.1857829093933105,
"learning_rate": 9.000000000000001e-07,
"loss": 1.7382530212402343,
"step": 10
},
{
"epoch": 0.0035587188612099642,
"grad_norm": 1.1971900463104248,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.7001752853393555,
"step": 20
},
{
"epoch": 0.005338078291814947,
"grad_norm": 1.2064718008041382,
"learning_rate": 2.9e-06,
"loss": 1.7585294723510743,
"step": 30
},
{
"epoch": 0.0071174377224199285,
"grad_norm": 1.1466728448867798,
"learning_rate": 3.900000000000001e-06,
"loss": 1.6992141723632812,
"step": 40
},
{
"epoch": 0.008896797153024912,
"grad_norm": 1.0449944734573364,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.7329919815063477,
"step": 50
},
{
"epoch": 0.010676156583629894,
"grad_norm": 1.25763738155365,
"learning_rate": 5.9e-06,
"loss": 1.5657649040222168,
"step": 60
},
{
"epoch": 0.012455516014234875,
"grad_norm": 0.9314267039299011,
"learning_rate": 6.9e-06,
"loss": 1.5626873970031738,
"step": 70
},
{
"epoch": 0.014234875444839857,
"grad_norm": 0.8502155542373657,
"learning_rate": 7.9e-06,
"loss": 1.3948446273803712,
"step": 80
},
{
"epoch": 0.01601423487544484,
"grad_norm": 0.505824625492096,
"learning_rate": 8.900000000000001e-06,
"loss": 1.318073558807373,
"step": 90
},
{
"epoch": 0.017793594306049824,
"grad_norm": 0.4828319251537323,
"learning_rate": 9.9e-06,
"loss": 1.2034348487854003,
"step": 100
},
{
"epoch": 0.019572953736654804,
"grad_norm": 0.4351405203342438,
"learning_rate": 1.0900000000000002e-05,
"loss": 1.2164586067199707,
"step": 110
},
{
"epoch": 0.021352313167259787,
"grad_norm": 0.3060351014137268,
"learning_rate": 1.1900000000000001e-05,
"loss": 1.1403413772583009,
"step": 120
},
{
"epoch": 0.023131672597864767,
"grad_norm": 0.30274662375450134,
"learning_rate": 1.2900000000000002e-05,
"loss": 1.0329069137573241,
"step": 130
},
{
"epoch": 0.02491103202846975,
"grad_norm": 0.27452540397644043,
"learning_rate": 1.39e-05,
"loss": 1.0137288093566894,
"step": 140
},
{
"epoch": 0.026690391459074734,
"grad_norm": 0.22874200344085693,
"learning_rate": 1.4900000000000001e-05,
"loss": 0.9552822113037109,
"step": 150
},
{
"epoch": 0.028469750889679714,
"grad_norm": 0.20503273606300354,
"learning_rate": 1.5900000000000004e-05,
"loss": 0.9580234527587891,
"step": 160
},
{
"epoch": 0.030249110320284697,
"grad_norm": 0.20734967291355133,
"learning_rate": 1.69e-05,
"loss": 0.9269493103027344,
"step": 170
},
{
"epoch": 0.03202846975088968,
"grad_norm": 0.2046293467283249,
"learning_rate": 1.79e-05,
"loss": 0.898008155822754,
"step": 180
},
{
"epoch": 0.033807829181494664,
"grad_norm": 0.20902672410011292,
"learning_rate": 1.8900000000000002e-05,
"loss": 0.8822259902954102,
"step": 190
},
{
"epoch": 0.03558718861209965,
"grad_norm": 0.265747606754303,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.9367627143859864,
"step": 200
},
{
"epoch": 0.037366548042704624,
"grad_norm": 0.21941998600959778,
"learning_rate": 1.9999967204339314e-05,
"loss": 0.8784577369689941,
"step": 210
},
{
"epoch": 0.03914590747330961,
"grad_norm": 0.23806284368038177,
"learning_rate": 1.999985383689953e-05,
"loss": 0.8498974800109863,
"step": 220
},
{
"epoch": 0.04092526690391459,
"grad_norm": 0.2539118826389313,
"learning_rate": 1.9999659493713742e-05,
"loss": 0.8517349243164063,
"step": 230
},
{
"epoch": 0.042704626334519574,
"grad_norm": 0.279153436422348,
"learning_rate": 1.9999384176355685e-05,
"loss": 0.8303974151611329,
"step": 240
},
{
"epoch": 0.04448398576512456,
"grad_norm": 0.2661009132862091,
"learning_rate": 1.9999027887054793e-05,
"loss": 0.8362269401550293,
"step": 250
},
{
"epoch": 0.046263345195729534,
"grad_norm": 0.2643781006336212,
"learning_rate": 1.9998590628696186e-05,
"loss": 0.7995978832244873,
"step": 260
},
{
"epoch": 0.04804270462633452,
"grad_norm": 0.22854603826999664,
"learning_rate": 1.9998072404820648e-05,
"loss": 0.8313385009765625,
"step": 270
},
{
"epoch": 0.0498220640569395,
"grad_norm": 0.25225695967674255,
"learning_rate": 1.99974732196246e-05,
"loss": 0.8484455108642578,
"step": 280
},
{
"epoch": 0.051601423487544484,
"grad_norm": 0.2538894712924957,
"learning_rate": 1.999679307796006e-05,
"loss": 0.8153322219848633,
"step": 290
},
{
"epoch": 0.05338078291814947,
"grad_norm": 0.257697194814682,
"learning_rate": 1.9996031985334604e-05,
"loss": 0.8145216941833496,
"step": 300
},
{
"epoch": 0.05516014234875445,
"grad_norm": 0.2465839385986328,
"learning_rate": 1.9995189947911323e-05,
"loss": 0.8290293693542481,
"step": 310
},
{
"epoch": 0.05693950177935943,
"grad_norm": 0.2330036759376526,
"learning_rate": 1.9994266972508786e-05,
"loss": 0.8094453811645508,
"step": 320
},
{
"epoch": 0.05871886120996441,
"grad_norm": 0.24966399371623993,
"learning_rate": 1.9993263066600956e-05,
"loss": 0.7964043140411377,
"step": 330
},
{
"epoch": 0.060498220640569395,
"grad_norm": 0.2663300335407257,
"learning_rate": 1.9992178238317156e-05,
"loss": 0.7674037456512451,
"step": 340
},
{
"epoch": 0.06227758007117438,
"grad_norm": 0.29889997839927673,
"learning_rate": 1.9991012496442e-05,
"loss": 0.7757882118225098,
"step": 350
},
{
"epoch": 0.06405693950177936,
"grad_norm": 0.2283693104982376,
"learning_rate": 1.9989765850415303e-05,
"loss": 0.7695906162261963,
"step": 360
},
{
"epoch": 0.06583629893238434,
"grad_norm": 0.2109132707118988,
"learning_rate": 1.9988438310332015e-05,
"loss": 0.7833895683288574,
"step": 370
},
{
"epoch": 0.06761565836298933,
"grad_norm": 0.2546916604042053,
"learning_rate": 1.998702988694216e-05,
"loss": 0.8128045082092286,
"step": 380
},
{
"epoch": 0.0693950177935943,
"grad_norm": 0.2531549632549286,
"learning_rate": 1.998554059165071e-05,
"loss": 0.7902643203735351,
"step": 390
},
{
"epoch": 0.0711743772241993,
"grad_norm": 0.21756362915039062,
"learning_rate": 1.9983970436517523e-05,
"loss": 0.7454084873199462,
"step": 400
},
{
"epoch": 0.07295373665480427,
"grad_norm": 0.2294779121875763,
"learning_rate": 1.9982319434257236e-05,
"loss": 0.7487451553344726,
"step": 410
},
{
"epoch": 0.07473309608540925,
"grad_norm": 0.26356714963912964,
"learning_rate": 1.9980587598239155e-05,
"loss": 0.748570442199707,
"step": 420
},
{
"epoch": 0.07651245551601424,
"grad_norm": 0.22382904589176178,
"learning_rate": 1.9978774942487155e-05,
"loss": 0.7261887073516846,
"step": 430
},
{
"epoch": 0.07829181494661921,
"grad_norm": 0.22544927895069122,
"learning_rate": 1.997688148167957e-05,
"loss": 0.7761906623840332,
"step": 440
},
{
"epoch": 0.0800711743772242,
"grad_norm": 0.2679840922355652,
"learning_rate": 1.997490723114906e-05,
"loss": 0.7872249603271484,
"step": 450
},
{
"epoch": 0.08185053380782918,
"grad_norm": 0.20914621651172638,
"learning_rate": 1.9972852206882504e-05,
"loss": 0.7876029014587402,
"step": 460
},
{
"epoch": 0.08362989323843416,
"grad_norm": 0.36782026290893555,
"learning_rate": 1.9970716425520854e-05,
"loss": 0.7812703132629395,
"step": 470
},
{
"epoch": 0.08540925266903915,
"grad_norm": 0.2317405343055725,
"learning_rate": 1.9968499904359017e-05,
"loss": 0.7694793701171875,
"step": 480
},
{
"epoch": 0.08718861209964412,
"grad_norm": 0.23587286472320557,
"learning_rate": 1.9966202661345694e-05,
"loss": 0.7593471050262451,
"step": 490
},
{
"epoch": 0.08896797153024912,
"grad_norm": 0.3179067075252533,
"learning_rate": 1.9963824715083255e-05,
"loss": 0.804302978515625,
"step": 500
},
{
"epoch": 0.08896797153024912,
"eval_loss": 0.7566477656364441,
"eval_runtime": 406.3608,
"eval_samples_per_second": 12.376,
"eval_steps_per_second": 6.189,
"step": 500
},
{
"epoch": 0.09074733096085409,
"grad_norm": 0.2583659291267395,
"learning_rate": 1.9961366084827584e-05,
"loss": 0.7231699466705322,
"step": 510
},
{
"epoch": 0.09252669039145907,
"grad_norm": 0.3117372393608093,
"learning_rate": 1.9958826790487905e-05,
"loss": 0.7830834865570069,
"step": 520
},
{
"epoch": 0.09430604982206406,
"grad_norm": 0.26633507013320923,
"learning_rate": 1.995620685262665e-05,
"loss": 0.7565378189086914,
"step": 530
},
{
"epoch": 0.09608540925266904,
"grad_norm": 0.2491220384836197,
"learning_rate": 1.9953506292459275e-05,
"loss": 0.770139503479004,
"step": 540
},
{
"epoch": 0.09786476868327403,
"grad_norm": 0.33897528052330017,
"learning_rate": 1.9950725131854082e-05,
"loss": 0.8139609336853028,
"step": 550
},
{
"epoch": 0.099644128113879,
"grad_norm": 0.27655312418937683,
"learning_rate": 1.9947863393332053e-05,
"loss": 0.7632327556610108,
"step": 560
},
{
"epoch": 0.10142348754448399,
"grad_norm": 0.2421133816242218,
"learning_rate": 1.994492110006667e-05,
"loss": 0.7782410621643067,
"step": 570
},
{
"epoch": 0.10320284697508897,
"grad_norm": 0.26968705654144287,
"learning_rate": 1.994189827588372e-05,
"loss": 0.7438684940338135,
"step": 580
},
{
"epoch": 0.10498220640569395,
"grad_norm": 0.28697794675827026,
"learning_rate": 1.993879494526111e-05,
"loss": 0.7320491313934326,
"step": 590
},
{
"epoch": 0.10676156583629894,
"grad_norm": 0.327286034822464,
"learning_rate": 1.9935611133328657e-05,
"loss": 0.7248147010803223,
"step": 600
},
{
"epoch": 0.10854092526690391,
"grad_norm": 0.3679325580596924,
"learning_rate": 1.9932346865867885e-05,
"loss": 0.7461765766143799,
"step": 610
},
{
"epoch": 0.1103202846975089,
"grad_norm": 0.3118029832839966,
"learning_rate": 1.992900216931184e-05,
"loss": 0.764728593826294,
"step": 620
},
{
"epoch": 0.11209964412811388,
"grad_norm": 0.2683190107345581,
"learning_rate": 1.992557707074484e-05,
"loss": 0.7256179809570312,
"step": 630
},
{
"epoch": 0.11387900355871886,
"grad_norm": 0.32056325674057007,
"learning_rate": 1.9922071597902286e-05,
"loss": 0.7553198337554932,
"step": 640
},
{
"epoch": 0.11565836298932385,
"grad_norm": 0.4004197418689728,
"learning_rate": 1.9918485779170417e-05,
"loss": 0.7440505027770996,
"step": 650
},
{
"epoch": 0.11743772241992882,
"grad_norm": 0.3497128486633301,
"learning_rate": 1.9914819643586096e-05,
"loss": 0.7672629833221436,
"step": 660
},
{
"epoch": 0.11921708185053381,
"grad_norm": 0.3187069594860077,
"learning_rate": 1.9911073220836562e-05,
"loss": 0.7420164585113526,
"step": 670
},
{
"epoch": 0.12099644128113879,
"grad_norm": 0.29900017380714417,
"learning_rate": 1.9907246541259194e-05,
"loss": 0.7458691120147705,
"step": 680
},
{
"epoch": 0.12277580071174377,
"grad_norm": 0.298501193523407,
"learning_rate": 1.9903339635841274e-05,
"loss": 0.7346842765808106,
"step": 690
},
{
"epoch": 0.12455516014234876,
"grad_norm": 0.38077300786972046,
"learning_rate": 1.9899352536219713e-05,
"loss": 0.7980701446533203,
"step": 700
},
{
"epoch": 0.12633451957295375,
"grad_norm": 0.2950842082500458,
"learning_rate": 1.9895285274680826e-05,
"loss": 0.7282920837402344,
"step": 710
},
{
"epoch": 0.12811387900355872,
"grad_norm": 0.2746928632259369,
"learning_rate": 1.989113788416005e-05,
"loss": 0.7119527816772461,
"step": 720
},
{
"epoch": 0.1298932384341637,
"grad_norm": 0.30003389716148376,
"learning_rate": 1.9886910398241673e-05,
"loss": 0.7120148181915283,
"step": 730
},
{
"epoch": 0.13167259786476868,
"grad_norm": 0.3238595128059387,
"learning_rate": 1.9882602851158584e-05,
"loss": 0.7219894886016845,
"step": 740
},
{
"epoch": 0.13345195729537365,
"grad_norm": 0.4421483278274536,
"learning_rate": 1.9878215277791977e-05,
"loss": 0.6878085613250733,
"step": 750
},
{
"epoch": 0.13523131672597866,
"grad_norm": 0.3009251058101654,
"learning_rate": 1.9873747713671073e-05,
"loss": 0.777537488937378,
"step": 760
},
{
"epoch": 0.13701067615658363,
"grad_norm": 0.3649790287017822,
"learning_rate": 1.9869200194972828e-05,
"loss": 0.7608931541442872,
"step": 770
},
{
"epoch": 0.1387900355871886,
"grad_norm": 0.38479843735694885,
"learning_rate": 1.986457275852166e-05,
"loss": 0.7644641876220704,
"step": 780
},
{
"epoch": 0.14056939501779359,
"grad_norm": 0.3267346918582916,
"learning_rate": 1.9859865441789126e-05,
"loss": 0.7437977313995361,
"step": 790
},
{
"epoch": 0.1423487544483986,
"grad_norm": 0.3238297402858734,
"learning_rate": 1.985507828289363e-05,
"loss": 0.7596290111541748,
"step": 800
},
{
"epoch": 0.14412811387900357,
"grad_norm": 0.4067242443561554,
"learning_rate": 1.985021132060012e-05,
"loss": 0.723486328125,
"step": 810
},
{
"epoch": 0.14590747330960854,
"grad_norm": 0.3294743299484253,
"learning_rate": 1.9845264594319755e-05,
"loss": 0.8035991668701172,
"step": 820
},
{
"epoch": 0.14768683274021352,
"grad_norm": 0.2861204445362091,
"learning_rate": 1.9840238144109613e-05,
"loss": 0.7118996620178223,
"step": 830
},
{
"epoch": 0.1494661921708185,
"grad_norm": 0.3281143605709076,
"learning_rate": 1.9835132010672334e-05,
"loss": 0.7610855102539062,
"step": 840
},
{
"epoch": 0.1512455516014235,
"grad_norm": 0.3176390826702118,
"learning_rate": 1.982994623535583e-05,
"loss": 0.7489484310150146,
"step": 850
},
{
"epoch": 0.15302491103202848,
"grad_norm": 0.34338897466659546,
"learning_rate": 1.9824680860152914e-05,
"loss": 0.7180755615234375,
"step": 860
},
{
"epoch": 0.15480427046263345,
"grad_norm": 0.41952571272850037,
"learning_rate": 1.9819335927700975e-05,
"loss": 0.6884951591491699,
"step": 870
},
{
"epoch": 0.15658362989323843,
"grad_norm": 0.29155367612838745,
"learning_rate": 1.9813911481281637e-05,
"loss": 0.683270263671875,
"step": 880
},
{
"epoch": 0.1583629893238434,
"grad_norm": 0.3815101981163025,
"learning_rate": 1.98084075648204e-05,
"loss": 0.714734411239624,
"step": 890
},
{
"epoch": 0.1601423487544484,
"grad_norm": 0.364044189453125,
"learning_rate": 1.980282422288629e-05,
"loss": 0.7451518535614013,
"step": 900
},
{
"epoch": 0.1619217081850534,
"grad_norm": 0.4181061387062073,
"learning_rate": 1.9797161500691496e-05,
"loss": 0.7484359741210938,
"step": 910
},
{
"epoch": 0.16370106761565836,
"grad_norm": 0.3803650140762329,
"learning_rate": 1.9791419444091006e-05,
"loss": 0.7413453578948974,
"step": 920
},
{
"epoch": 0.16548042704626334,
"grad_norm": 0.3450170159339905,
"learning_rate": 1.9785598099582225e-05,
"loss": 0.732274341583252,
"step": 930
},
{
"epoch": 0.16725978647686832,
"grad_norm": 0.31141409277915955,
"learning_rate": 1.9779697514304624e-05,
"loss": 0.768674898147583,
"step": 940
},
{
"epoch": 0.16903914590747332,
"grad_norm": 0.33961221575737,
"learning_rate": 1.977371773603932e-05,
"loss": 0.7539153099060059,
"step": 950
},
{
"epoch": 0.1708185053380783,
"grad_norm": 0.3468526303768158,
"learning_rate": 1.9767658813208725e-05,
"loss": 0.6995216369628906,
"step": 960
},
{
"epoch": 0.17259786476868327,
"grad_norm": 0.3776821196079254,
"learning_rate": 1.976152079487614e-05,
"loss": 0.7344133853912354,
"step": 970
},
{
"epoch": 0.17437722419928825,
"grad_norm": 0.43669673800468445,
"learning_rate": 1.9755303730745344e-05,
"loss": 0.7037209510803223,
"step": 980
},
{
"epoch": 0.17615658362989323,
"grad_norm": 0.43197065591812134,
"learning_rate": 1.9749007671160223e-05,
"loss": 0.7632620334625244,
"step": 990
},
{
"epoch": 0.17793594306049823,
"grad_norm": 0.36352500319480896,
"learning_rate": 1.9742632667104332e-05,
"loss": 0.7493629455566406,
"step": 1000
},
{
"epoch": 0.17793594306049823,
"eval_loss": 0.7127183079719543,
"eval_runtime": 407.5587,
"eval_samples_per_second": 12.339,
"eval_steps_per_second": 6.171,
"step": 1000
},
{
"epoch": 0.1797153024911032,
"grad_norm": 0.4177298843860626,
"learning_rate": 1.9736178770200492e-05,
"loss": 0.7822850227355957,
"step": 1010
},
{
"epoch": 0.18149466192170818,
"grad_norm": 0.4624420404434204,
"learning_rate": 1.972964603271038e-05,
"loss": 0.7450732707977294,
"step": 1020
},
{
"epoch": 0.18327402135231316,
"grad_norm": 0.35002920031547546,
"learning_rate": 1.97230345075341e-05,
"loss": 0.7509373188018799,
"step": 1030
},
{
"epoch": 0.18505338078291814,
"grad_norm": 0.4136241674423218,
"learning_rate": 1.9716344248209754e-05,
"loss": 0.6727419853210449,
"step": 1040
},
{
"epoch": 0.18683274021352314,
"grad_norm": 0.41807156801223755,
"learning_rate": 1.9709575308913004e-05,
"loss": 0.7258425712585449,
"step": 1050
},
{
"epoch": 0.18861209964412812,
"grad_norm": 0.4383244514465332,
"learning_rate": 1.9702727744456645e-05,
"loss": 0.7470430374145508,
"step": 1060
},
{
"epoch": 0.1903914590747331,
"grad_norm": 0.44305190443992615,
"learning_rate": 1.969580161029015e-05,
"loss": 0.6925637722015381,
"step": 1070
},
{
"epoch": 0.19217081850533807,
"grad_norm": 0.45914319157600403,
"learning_rate": 1.9688796962499228e-05,
"loss": 0.7320804595947266,
"step": 1080
},
{
"epoch": 0.19395017793594305,
"grad_norm": 0.37220245599746704,
"learning_rate": 1.9681713857805367e-05,
"loss": 0.6863605499267578,
"step": 1090
},
{
"epoch": 0.19572953736654805,
"grad_norm": 0.473320871591568,
"learning_rate": 1.9674552353565374e-05,
"loss": 0.7069521427154541,
"step": 1100
},
{
"epoch": 0.19750889679715303,
"grad_norm": 0.34435564279556274,
"learning_rate": 1.9667312507770905e-05,
"loss": 0.7488323211669922,
"step": 1110
},
{
"epoch": 0.199288256227758,
"grad_norm": 0.4495692551136017,
"learning_rate": 1.9659994379048015e-05,
"loss": 0.7306941032409668,
"step": 1120
},
{
"epoch": 0.20106761565836298,
"grad_norm": 0.37990689277648926,
"learning_rate": 1.9652598026656666e-05,
"loss": 0.6781065464019775,
"step": 1130
},
{
"epoch": 0.20284697508896798,
"grad_norm": 0.374970942735672,
"learning_rate": 1.9645123510490242e-05,
"loss": 0.732900333404541,
"step": 1140
},
{
"epoch": 0.20462633451957296,
"grad_norm": 0.42331087589263916,
"learning_rate": 1.963757089107508e-05,
"loss": 0.7150296211242676,
"step": 1150
},
{
"epoch": 0.20640569395017794,
"grad_norm": 0.39357990026474,
"learning_rate": 1.962994022956998e-05,
"loss": 0.70610032081604,
"step": 1160
},
{
"epoch": 0.20818505338078291,
"grad_norm": 0.3616408407688141,
"learning_rate": 1.9622231587765688e-05,
"loss": 0.6808771610260009,
"step": 1170
},
{
"epoch": 0.2099644128113879,
"grad_norm": 0.4579455554485321,
"learning_rate": 1.9614445028084424e-05,
"loss": 0.7024923801422119,
"step": 1180
},
{
"epoch": 0.2117437722419929,
"grad_norm": 0.47354868054389954,
"learning_rate": 1.9606580613579352e-05,
"loss": 0.7523046493530273,
"step": 1190
},
{
"epoch": 0.21352313167259787,
"grad_norm": 0.3599710166454315,
"learning_rate": 1.9598638407934096e-05,
"loss": 0.7396236419677734,
"step": 1200
},
{
"epoch": 0.21530249110320285,
"grad_norm": 0.37311187386512756,
"learning_rate": 1.959061847546219e-05,
"loss": 0.6676182746887207,
"step": 1210
},
{
"epoch": 0.21708185053380782,
"grad_norm": 0.4571494460105896,
"learning_rate": 1.9582520881106585e-05,
"loss": 0.7176971435546875,
"step": 1220
},
{
"epoch": 0.2188612099644128,
"grad_norm": 0.39556118845939636,
"learning_rate": 1.9574345690439113e-05,
"loss": 0.6899125576019287,
"step": 1230
},
{
"epoch": 0.2206405693950178,
"grad_norm": 0.4152670204639435,
"learning_rate": 1.9566092969659964e-05,
"loss": 0.7056239128112793,
"step": 1240
},
{
"epoch": 0.22241992882562278,
"grad_norm": 0.5533052086830139,
"learning_rate": 1.9557762785597133e-05,
"loss": 0.7325549602508545,
"step": 1250
},
{
"epoch": 0.22419928825622776,
"grad_norm": 0.457738995552063,
"learning_rate": 1.9549355205705895e-05,
"loss": 0.7097938060760498,
"step": 1260
},
{
"epoch": 0.22597864768683273,
"grad_norm": 0.4812074303627014,
"learning_rate": 1.9540870298068247e-05,
"loss": 0.6999053478240966,
"step": 1270
},
{
"epoch": 0.2277580071174377,
"grad_norm": 0.42034661769866943,
"learning_rate": 1.9532308131392365e-05,
"loss": 0.7124747276306153,
"step": 1280
},
{
"epoch": 0.22953736654804271,
"grad_norm": 0.44799792766571045,
"learning_rate": 1.9523668775012053e-05,
"loss": 0.7096034049987793,
"step": 1290
},
{
"epoch": 0.2313167259786477,
"grad_norm": 0.4658997654914856,
"learning_rate": 1.9514952298886157e-05,
"loss": 0.7080921649932861,
"step": 1300
},
{
"epoch": 0.23309608540925267,
"grad_norm": 0.5269479751586914,
"learning_rate": 1.9506158773598035e-05,
"loss": 0.6962251663208008,
"step": 1310
},
{
"epoch": 0.23487544483985764,
"grad_norm": 0.45521607995033264,
"learning_rate": 1.9497288270354944e-05,
"loss": 0.7323726177215576,
"step": 1320
},
{
"epoch": 0.23665480427046262,
"grad_norm": 0.4322509169578552,
"learning_rate": 1.9488340860987504e-05,
"loss": 0.7227589607238769,
"step": 1330
},
{
"epoch": 0.23843416370106763,
"grad_norm": 0.47730758786201477,
"learning_rate": 1.9479316617949084e-05,
"loss": 0.702051305770874,
"step": 1340
},
{
"epoch": 0.2402135231316726,
"grad_norm": 0.42763814330101013,
"learning_rate": 1.9470215614315232e-05,
"loss": 0.7380130767822266,
"step": 1350
},
{
"epoch": 0.24199288256227758,
"grad_norm": 0.4630064070224762,
"learning_rate": 1.9461037923783087e-05,
"loss": 0.7470481395721436,
"step": 1360
},
{
"epoch": 0.24377224199288255,
"grad_norm": 0.367767870426178,
"learning_rate": 1.9451783620670767e-05,
"loss": 0.7009376049041748,
"step": 1370
},
{
"epoch": 0.24555160142348753,
"grad_norm": 0.38925161957740784,
"learning_rate": 1.9442452779916775e-05,
"loss": 0.7043869972229004,
"step": 1380
},
{
"epoch": 0.24733096085409254,
"grad_norm": 0.37959718704223633,
"learning_rate": 1.943304547707939e-05,
"loss": 0.7024062633514404,
"step": 1390
},
{
"epoch": 0.2491103202846975,
"grad_norm": 0.3639119267463684,
"learning_rate": 1.9423561788336073e-05,
"loss": 0.7025011539459228,
"step": 1400
},
{
"epoch": 0.2508896797153025,
"grad_norm": 0.5123258233070374,
"learning_rate": 1.9414001790482815e-05,
"loss": 0.7173181533813476,
"step": 1410
},
{
"epoch": 0.2526690391459075,
"grad_norm": 0.5145444273948669,
"learning_rate": 1.940436556093355e-05,
"loss": 0.6975203514099121,
"step": 1420
},
{
"epoch": 0.25444839857651247,
"grad_norm": 0.41073641180992126,
"learning_rate": 1.93946531777195e-05,
"loss": 0.7116940021514893,
"step": 1430
},
{
"epoch": 0.25622775800711745,
"grad_norm": 0.5241482853889465,
"learning_rate": 1.9384864719488562e-05,
"loss": 0.7322525024414063,
"step": 1440
},
{
"epoch": 0.2580071174377224,
"grad_norm": 0.41522547602653503,
"learning_rate": 1.9375000265504673e-05,
"loss": 0.7099958419799804,
"step": 1450
},
{
"epoch": 0.2597864768683274,
"grad_norm": 0.4306512773036957,
"learning_rate": 1.9365059895647146e-05,
"loss": 0.6963861465454102,
"step": 1460
},
{
"epoch": 0.2615658362989324,
"grad_norm": 0.4515567421913147,
"learning_rate": 1.935504369041004e-05,
"loss": 0.7317886829376221,
"step": 1470
},
{
"epoch": 0.26334519572953735,
"grad_norm": 0.46133843064308167,
"learning_rate": 1.9344951730901523e-05,
"loss": 0.6936720371246338,
"step": 1480
},
{
"epoch": 0.26512455516014233,
"grad_norm": 0.4332071840763092,
"learning_rate": 1.933478409884317e-05,
"loss": 0.7092292308807373,
"step": 1490
},
{
"epoch": 0.2669039145907473,
"grad_norm": 0.44672346115112305,
"learning_rate": 1.9324540876569356e-05,
"loss": 0.7138745784759521,
"step": 1500
},
{
"epoch": 0.2669039145907473,
"eval_loss": 0.6928849816322327,
"eval_runtime": 409.2608,
"eval_samples_per_second": 12.288,
"eval_steps_per_second": 6.145,
"step": 1500
},
{
"epoch": 0.26868327402135234,
"grad_norm": 0.4207611680030823,
"learning_rate": 1.9314222147026538e-05,
"loss": 0.682267141342163,
"step": 1510
},
{
"epoch": 0.2704626334519573,
"grad_norm": 0.4715620279312134,
"learning_rate": 1.9303827993772627e-05,
"loss": 0.6928095340728759,
"step": 1520
},
{
"epoch": 0.2722419928825623,
"grad_norm": 0.5061793327331543,
"learning_rate": 1.9293358500976284e-05,
"loss": 0.7393241405487061,
"step": 1530
},
{
"epoch": 0.27402135231316727,
"grad_norm": 0.46995067596435547,
"learning_rate": 1.9282813753416247e-05,
"loss": 0.7240311622619628,
"step": 1540
},
{
"epoch": 0.27580071174377224,
"grad_norm": 0.35846948623657227,
"learning_rate": 1.927219383648064e-05,
"loss": 0.6961266994476318,
"step": 1550
},
{
"epoch": 0.2775800711743772,
"grad_norm": 0.4667574465274811,
"learning_rate": 1.9261498836166297e-05,
"loss": 0.7295796871185303,
"step": 1560
},
{
"epoch": 0.2793594306049822,
"grad_norm": 0.5029377341270447,
"learning_rate": 1.9250728839078043e-05,
"loss": 0.7046424865722656,
"step": 1570
},
{
"epoch": 0.28113879003558717,
"grad_norm": 0.46402570605278015,
"learning_rate": 1.9239883932428002e-05,
"loss": 0.7341272830963135,
"step": 1580
},
{
"epoch": 0.28291814946619215,
"grad_norm": 0.45273569226264954,
"learning_rate": 1.9228964204034906e-05,
"loss": 0.7231084823608398,
"step": 1590
},
{
"epoch": 0.2846975088967972,
"grad_norm": 0.44853758811950684,
"learning_rate": 1.9217969742323358e-05,
"loss": 0.7228631019592285,
"step": 1600
},
{
"epoch": 0.28647686832740216,
"grad_norm": 0.4073372483253479,
"learning_rate": 1.9206900636323138e-05,
"loss": 0.7120136737823486,
"step": 1610
},
{
"epoch": 0.28825622775800713,
"grad_norm": 0.4837400019168854,
"learning_rate": 1.9195756975668463e-05,
"loss": 0.7246061325073242,
"step": 1620
},
{
"epoch": 0.2900355871886121,
"grad_norm": 0.48374027013778687,
"learning_rate": 1.918453885059728e-05,
"loss": 0.6975275039672851,
"step": 1630
},
{
"epoch": 0.2918149466192171,
"grad_norm": 0.40771543979644775,
"learning_rate": 1.9173246351950515e-05,
"loss": 0.7376579284667969,
"step": 1640
},
{
"epoch": 0.29359430604982206,
"grad_norm": 0.47605007886886597,
"learning_rate": 1.916187957117136e-05,
"loss": 0.7509649753570556,
"step": 1650
},
{
"epoch": 0.29537366548042704,
"grad_norm": 0.4246625006198883,
"learning_rate": 1.9150438600304514e-05,
"loss": 0.7086214065551758,
"step": 1660
},
{
"epoch": 0.297153024911032,
"grad_norm": 0.4986574053764343,
"learning_rate": 1.9138923531995448e-05,
"loss": 0.6780657768249512,
"step": 1670
},
{
"epoch": 0.298932384341637,
"grad_norm": 0.5241477489471436,
"learning_rate": 1.912733445948965e-05,
"loss": 0.7178135395050049,
"step": 1680
},
{
"epoch": 0.30071174377224197,
"grad_norm": 0.48822927474975586,
"learning_rate": 1.9115671476631865e-05,
"loss": 0.6914261817932129,
"step": 1690
},
{
"epoch": 0.302491103202847,
"grad_norm": 0.4603191018104553,
"learning_rate": 1.910393467786535e-05,
"loss": 0.7189798355102539,
"step": 1700
},
{
"epoch": 0.304270462633452,
"grad_norm": 0.5957316160202026,
"learning_rate": 1.90921241582311e-05,
"loss": 0.6929316520690918,
"step": 1710
},
{
"epoch": 0.30604982206405695,
"grad_norm": 0.4563639163970947,
"learning_rate": 1.9080240013367075e-05,
"loss": 0.6874090194702148,
"step": 1720
},
{
"epoch": 0.30782918149466193,
"grad_norm": 0.4933296740055084,
"learning_rate": 1.9068282339507433e-05,
"loss": 0.7047487258911133,
"step": 1730
},
{
"epoch": 0.3096085409252669,
"grad_norm": 0.46199893951416016,
"learning_rate": 1.9056251233481747e-05,
"loss": 0.6585260391235351,
"step": 1740
},
{
"epoch": 0.3113879003558719,
"grad_norm": 0.4224775433540344,
"learning_rate": 1.904414679271421e-05,
"loss": 0.7062453269958496,
"step": 1750
},
{
"epoch": 0.31316725978647686,
"grad_norm": 0.5592194199562073,
"learning_rate": 1.9031969115222876e-05,
"loss": 0.7372538089752197,
"step": 1760
},
{
"epoch": 0.31494661921708184,
"grad_norm": 0.447518527507782,
"learning_rate": 1.9019718299618836e-05,
"loss": 0.6815076351165772,
"step": 1770
},
{
"epoch": 0.3167259786476868,
"grad_norm": 0.4872969686985016,
"learning_rate": 1.9007394445105433e-05,
"loss": 0.6925329685211181,
"step": 1780
},
{
"epoch": 0.3185053380782918,
"grad_norm": 0.5796515941619873,
"learning_rate": 1.8994997651477457e-05,
"loss": 0.6911862850189209,
"step": 1790
},
{
"epoch": 0.3202846975088968,
"grad_norm": 0.45499077439308167,
"learning_rate": 1.8982528019120335e-05,
"loss": 0.689606761932373,
"step": 1800
},
{
"epoch": 0.3220640569395018,
"grad_norm": 0.5108110308647156,
"learning_rate": 1.8969985649009325e-05,
"loss": 0.698862886428833,
"step": 1810
},
{
"epoch": 0.3238434163701068,
"grad_norm": 0.5635261535644531,
"learning_rate": 1.8957370642708682e-05,
"loss": 0.7093265533447266,
"step": 1820
},
{
"epoch": 0.32562277580071175,
"grad_norm": 0.5035433769226074,
"learning_rate": 1.8944683102370862e-05,
"loss": 0.6829179763793946,
"step": 1830
},
{
"epoch": 0.3274021352313167,
"grad_norm": 0.5030480623245239,
"learning_rate": 1.8931923130735667e-05,
"loss": 0.6931506633758545,
"step": 1840
},
{
"epoch": 0.3291814946619217,
"grad_norm": 0.42200684547424316,
"learning_rate": 1.891909083112943e-05,
"loss": 0.7182873725891114,
"step": 1850
},
{
"epoch": 0.3309608540925267,
"grad_norm": 0.5390200614929199,
"learning_rate": 1.8906186307464168e-05,
"loss": 0.7040542602539063,
"step": 1860
},
{
"epoch": 0.33274021352313166,
"grad_norm": 0.4840771555900574,
"learning_rate": 1.889320966423676e-05,
"loss": 0.71949143409729,
"step": 1870
},
{
"epoch": 0.33451957295373663,
"grad_norm": 0.46363523602485657,
"learning_rate": 1.8880161006528075e-05,
"loss": 0.7095215797424317,
"step": 1880
},
{
"epoch": 0.33629893238434166,
"grad_norm": 0.5143831372261047,
"learning_rate": 1.8867040440002137e-05,
"loss": 0.6844244003295898,
"step": 1890
},
{
"epoch": 0.33807829181494664,
"grad_norm": 0.42344361543655396,
"learning_rate": 1.8853848070905264e-05,
"loss": 0.7266733169555664,
"step": 1900
},
{
"epoch": 0.3398576512455516,
"grad_norm": 0.5283271074295044,
"learning_rate": 1.884058400606521e-05,
"loss": 0.6759650707244873,
"step": 1910
},
{
"epoch": 0.3416370106761566,
"grad_norm": 0.6041154861450195,
"learning_rate": 1.88272483528903e-05,
"loss": 0.7138604164123535,
"step": 1920
},
{
"epoch": 0.34341637010676157,
"grad_norm": 0.4607132077217102,
"learning_rate": 1.8813841219368562e-05,
"loss": 0.7108243465423584,
"step": 1930
},
{
"epoch": 0.34519572953736655,
"grad_norm": 0.4770022928714752,
"learning_rate": 1.880036271406684e-05,
"loss": 0.700477123260498,
"step": 1940
},
{
"epoch": 0.3469750889679715,
"grad_norm": 0.5347304940223694,
"learning_rate": 1.8786812946129934e-05,
"loss": 0.7176999092102051,
"step": 1950
},
{
"epoch": 0.3487544483985765,
"grad_norm": 0.4638780355453491,
"learning_rate": 1.8773192025279712e-05,
"loss": 0.6859623908996582,
"step": 1960
},
{
"epoch": 0.3505338078291815,
"grad_norm": 0.4621883034706116,
"learning_rate": 1.87595000618142e-05,
"loss": 0.6809545516967773,
"step": 1970
},
{
"epoch": 0.35231316725978645,
"grad_norm": 0.4492979943752289,
"learning_rate": 1.8745737166606716e-05,
"loss": 0.7484791278839111,
"step": 1980
},
{
"epoch": 0.3540925266903915,
"grad_norm": 0.5058820843696594,
"learning_rate": 1.873190345110496e-05,
"loss": 0.6860589027404785,
"step": 1990
},
{
"epoch": 0.35587188612099646,
"grad_norm": 0.467847615480423,
"learning_rate": 1.8717999027330114e-05,
"loss": 0.6946381092071533,
"step": 2000
},
{
"epoch": 0.35587188612099646,
"eval_loss": 0.6786053776741028,
"eval_runtime": 408.2053,
"eval_samples_per_second": 12.32,
"eval_steps_per_second": 6.161,
"step": 2000
},
{
"epoch": 0.35765124555160144,
"grad_norm": 0.4924924373626709,
"learning_rate": 1.870402400787593e-05,
"loss": 0.7328177452087402,
"step": 2010
},
{
"epoch": 0.3594306049822064,
"grad_norm": 0.529029369354248,
"learning_rate": 1.8689978505907828e-05,
"loss": 0.6788232803344727,
"step": 2020
},
{
"epoch": 0.3612099644128114,
"grad_norm": 0.5123940110206604,
"learning_rate": 1.8675862635161968e-05,
"loss": 0.6493151664733887,
"step": 2030
},
{
"epoch": 0.36298932384341637,
"grad_norm": 0.4411087930202484,
"learning_rate": 1.866167650994434e-05,
"loss": 0.6652609825134277,
"step": 2040
},
{
"epoch": 0.36476868327402134,
"grad_norm": 0.5237643122673035,
"learning_rate": 1.8647420245129822e-05,
"loss": 0.6707428932189942,
"step": 2050
},
{
"epoch": 0.3665480427046263,
"grad_norm": 0.5926516056060791,
"learning_rate": 1.863309395616128e-05,
"loss": 0.7144505023956299,
"step": 2060
},
{
"epoch": 0.3683274021352313,
"grad_norm": 0.5331065058708191,
"learning_rate": 1.86186977590486e-05,
"loss": 0.6472876071929932,
"step": 2070
},
{
"epoch": 0.3701067615658363,
"grad_norm": 0.6008846163749695,
"learning_rate": 1.860423177036776e-05,
"loss": 0.6628296852111817,
"step": 2080
},
{
"epoch": 0.3718861209964413,
"grad_norm": 0.49397945404052734,
"learning_rate": 1.85896961072599e-05,
"loss": 0.7032846927642822,
"step": 2090
},
{
"epoch": 0.3736654804270463,
"grad_norm": 0.5322824716567993,
"learning_rate": 1.8575090887430354e-05,
"loss": 0.6903145790100098,
"step": 2100
},
{
"epoch": 0.37544483985765126,
"grad_norm": 0.45118311047554016,
"learning_rate": 1.8560416229147718e-05,
"loss": 0.7186954975128174,
"step": 2110
},
{
"epoch": 0.37722419928825623,
"grad_norm": 0.47973230481147766,
"learning_rate": 1.8545672251242855e-05,
"loss": 0.677478837966919,
"step": 2120
},
{
"epoch": 0.3790035587188612,
"grad_norm": 0.5959491729736328,
"learning_rate": 1.8530859073107973e-05,
"loss": 0.7285795211791992,
"step": 2130
},
{
"epoch": 0.3807829181494662,
"grad_norm": 0.5627938508987427,
"learning_rate": 1.851597681469565e-05,
"loss": 0.6971286773681641,
"step": 2140
},
{
"epoch": 0.38256227758007116,
"grad_norm": 0.5175175666809082,
"learning_rate": 1.850102559651784e-05,
"loss": 0.7243900775909424,
"step": 2150
},
{
"epoch": 0.38434163701067614,
"grad_norm": 0.5537509918212891,
"learning_rate": 1.848600553964491e-05,
"loss": 0.6653794765472412,
"step": 2160
},
{
"epoch": 0.3861209964412811,
"grad_norm": 0.6098501086235046,
"learning_rate": 1.847091676570468e-05,
"loss": 0.6803691864013672,
"step": 2170
},
{
"epoch": 0.3879003558718861,
"grad_norm": 0.5026776790618896,
"learning_rate": 1.8455759396881402e-05,
"loss": 0.7138524055480957,
"step": 2180
},
{
"epoch": 0.3896797153024911,
"grad_norm": 0.5244454741477966,
"learning_rate": 1.8440533555914795e-05,
"loss": 0.7204444885253907,
"step": 2190
},
{
"epoch": 0.3914590747330961,
"grad_norm": 0.5315225124359131,
"learning_rate": 1.842523936609905e-05,
"loss": 0.7181321144104004,
"step": 2200
},
{
"epoch": 0.3932384341637011,
"grad_norm": 0.5039063096046448,
"learning_rate": 1.8409876951281814e-05,
"loss": 0.7282841205596924,
"step": 2210
},
{
"epoch": 0.39501779359430605,
"grad_norm": 0.47619393467903137,
"learning_rate": 1.8394446435863206e-05,
"loss": 0.7016836643218994,
"step": 2220
},
{
"epoch": 0.39679715302491103,
"grad_norm": 0.5733043551445007,
"learning_rate": 1.8378947944794806e-05,
"loss": 0.6978562355041504,
"step": 2230
},
{
"epoch": 0.398576512455516,
"grad_norm": 0.5004534125328064,
"learning_rate": 1.8363381603578628e-05,
"loss": 0.6713067531585694,
"step": 2240
},
{
"epoch": 0.400355871886121,
"grad_norm": 0.6110662817955017,
"learning_rate": 1.8347747538266133e-05,
"loss": 0.6862231254577636,
"step": 2250
},
{
"epoch": 0.40213523131672596,
"grad_norm": 0.5308374166488647,
"learning_rate": 1.8332045875457174e-05,
"loss": 0.6602601528167724,
"step": 2260
},
{
"epoch": 0.40391459074733094,
"grad_norm": 0.6257572174072266,
"learning_rate": 1.8316276742299e-05,
"loss": 0.6580804347991943,
"step": 2270
},
{
"epoch": 0.40569395017793597,
"grad_norm": 0.6403529047966003,
"learning_rate": 1.830044026648521e-05,
"loss": 0.6985883712768555,
"step": 2280
},
{
"epoch": 0.40747330960854095,
"grad_norm": 0.5609093308448792,
"learning_rate": 1.828453657625472e-05,
"loss": 0.7306273937225342,
"step": 2290
},
{
"epoch": 0.4092526690391459,
"grad_norm": 0.5089350342750549,
"learning_rate": 1.8268565800390733e-05,
"loss": 0.6552363395690918,
"step": 2300
},
{
"epoch": 0.4110320284697509,
"grad_norm": 0.5988994240760803,
"learning_rate": 1.8252528068219683e-05,
"loss": 0.6836632251739502,
"step": 2310
},
{
"epoch": 0.4128113879003559,
"grad_norm": 0.558757483959198,
"learning_rate": 1.8236423509610207e-05,
"loss": 0.6945361137390137,
"step": 2320
},
{
"epoch": 0.41459074733096085,
"grad_norm": 0.5903414487838745,
"learning_rate": 1.8220252254972077e-05,
"loss": 0.6987195014953613,
"step": 2330
},
{
"epoch": 0.41637010676156583,
"grad_norm": 0.5187668800354004,
"learning_rate": 1.8204014435255136e-05,
"loss": 0.7381620407104492,
"step": 2340
},
{
"epoch": 0.4181494661921708,
"grad_norm": 0.5883176922798157,
"learning_rate": 1.8187710181948274e-05,
"loss": 0.6803225994110107,
"step": 2350
},
{
"epoch": 0.4199288256227758,
"grad_norm": 0.5465121269226074,
"learning_rate": 1.817133962707833e-05,
"loss": 0.6955878257751464,
"step": 2360
},
{
"epoch": 0.42170818505338076,
"grad_norm": 0.5035154223442078,
"learning_rate": 1.815490290320902e-05,
"loss": 0.6446046829223633,
"step": 2370
},
{
"epoch": 0.4234875444839858,
"grad_norm": 0.561638593673706,
"learning_rate": 1.8138400143439892e-05,
"loss": 0.6994437217712403,
"step": 2380
},
{
"epoch": 0.42526690391459077,
"grad_norm": 0.547978937625885,
"learning_rate": 1.812183148140523e-05,
"loss": 0.6748724937438965,
"step": 2390
},
{
"epoch": 0.42704626334519574,
"grad_norm": 0.7230744957923889,
"learning_rate": 1.8105197051272974e-05,
"loss": 0.6960064888000488,
"step": 2400
},
{
"epoch": 0.4288256227758007,
"grad_norm": 0.6158634424209595,
"learning_rate": 1.8088496987743623e-05,
"loss": 0.6599089622497558,
"step": 2410
},
{
"epoch": 0.4306049822064057,
"grad_norm": 0.5255537629127502,
"learning_rate": 1.807173142604917e-05,
"loss": 0.7354787349700928,
"step": 2420
},
{
"epoch": 0.43238434163701067,
"grad_norm": 0.5891067385673523,
"learning_rate": 1.8054900501951988e-05,
"loss": 0.6725128650665283,
"step": 2430
},
{
"epoch": 0.43416370106761565,
"grad_norm": 0.6183168888092041,
"learning_rate": 1.8038004351743726e-05,
"loss": 0.6672306537628174,
"step": 2440
},
{
"epoch": 0.4359430604982206,
"grad_norm": 0.5241208672523499,
"learning_rate": 1.8021043112244222e-05,
"loss": 0.6435032367706299,
"step": 2450
},
{
"epoch": 0.4377224199288256,
"grad_norm": 0.47406890988349915,
"learning_rate": 1.8004016920800392e-05,
"loss": 0.7117865085601807,
"step": 2460
},
{
"epoch": 0.4395017793594306,
"grad_norm": 0.45818889141082764,
"learning_rate": 1.7986925915285098e-05,
"loss": 0.7240960597991943,
"step": 2470
},
{
"epoch": 0.4412811387900356,
"grad_norm": 0.5167953372001648,
"learning_rate": 1.796977023409606e-05,
"loss": 0.6828316211700439,
"step": 2480
},
{
"epoch": 0.4430604982206406,
"grad_norm": 0.48911231756210327,
"learning_rate": 1.795255001615472e-05,
"loss": 0.6907057285308837,
"step": 2490
},
{
"epoch": 0.44483985765124556,
"grad_norm": 0.4384164810180664,
"learning_rate": 1.7935265400905107e-05,
"loss": 0.6647283554077148,
"step": 2500
},
{
"epoch": 0.44483985765124556,
"eval_loss": 0.667682409286499,
"eval_runtime": 408.4893,
"eval_samples_per_second": 12.311,
"eval_steps_per_second": 6.157,
"step": 2500
},
{
"epoch": 0.44661921708185054,
"grad_norm": 0.5545853972434998,
"learning_rate": 1.7917916528312735e-05,
"loss": 0.6821264743804931,
"step": 2510
},
{
"epoch": 0.4483985765124555,
"grad_norm": 0.5834780335426331,
"learning_rate": 1.7900503538863446e-05,
"loss": 0.6818623542785645,
"step": 2520
},
{
"epoch": 0.4501779359430605,
"grad_norm": 0.6038497090339661,
"learning_rate": 1.7883026573562278e-05,
"loss": 0.6842663764953614,
"step": 2530
},
{
"epoch": 0.45195729537366547,
"grad_norm": 0.5673312544822693,
"learning_rate": 1.7865485773932336e-05,
"loss": 0.6788026809692382,
"step": 2540
},
{
"epoch": 0.45373665480427045,
"grad_norm": 0.6115579605102539,
"learning_rate": 1.7847881282013623e-05,
"loss": 0.724776029586792,
"step": 2550
},
{
"epoch": 0.4555160142348754,
"grad_norm": 0.6251904964447021,
"learning_rate": 1.7830213240361916e-05,
"loss": 0.6907171249389649,
"step": 2560
},
{
"epoch": 0.45729537366548045,
"grad_norm": 0.5565618872642517,
"learning_rate": 1.7812481792047587e-05,
"loss": 0.6535940647125245,
"step": 2570
},
{
"epoch": 0.45907473309608543,
"grad_norm": 0.5508270263671875,
"learning_rate": 1.7794687080654462e-05,
"loss": 0.6779204845428467,
"step": 2580
},
{
"epoch": 0.4608540925266904,
"grad_norm": 0.6121755242347717,
"learning_rate": 1.777682925027865e-05,
"loss": 0.6833740234375,
"step": 2590
},
{
"epoch": 0.4626334519572954,
"grad_norm": 0.5710318684577942,
"learning_rate": 1.7758908445527376e-05,
"loss": 0.6820470333099365,
"step": 2600
},
{
"epoch": 0.46441281138790036,
"grad_norm": 0.624721884727478,
"learning_rate": 1.774092481151782e-05,
"loss": 0.7131676197052002,
"step": 2610
},
{
"epoch": 0.46619217081850534,
"grad_norm": 0.5539907813072205,
"learning_rate": 1.7722878493875922e-05,
"loss": 0.6834908962249756,
"step": 2620
},
{
"epoch": 0.4679715302491103,
"grad_norm": 0.7775362730026245,
"learning_rate": 1.7704769638735225e-05,
"loss": 0.7060842990875245,
"step": 2630
},
{
"epoch": 0.4697508896797153,
"grad_norm": 0.5198112726211548,
"learning_rate": 1.7686598392735678e-05,
"loss": 0.6912449836730957,
"step": 2640
},
{
"epoch": 0.47153024911032027,
"grad_norm": 0.5231760144233704,
"learning_rate": 1.766836490302245e-05,
"loss": 0.7053235054016114,
"step": 2650
},
{
"epoch": 0.47330960854092524,
"grad_norm": 0.7802064418792725,
"learning_rate": 1.7650069317244743e-05,
"loss": 0.6698042392730713,
"step": 2660
},
{
"epoch": 0.4750889679715303,
"grad_norm": 0.5244280099868774,
"learning_rate": 1.763171178355459e-05,
"loss": 0.6572854042053222,
"step": 2670
},
{
"epoch": 0.47686832740213525,
"grad_norm": 0.6323163509368896,
"learning_rate": 1.761329245060567e-05,
"loss": 0.6909306526184082,
"step": 2680
},
{
"epoch": 0.4786476868327402,
"grad_norm": 0.636985182762146,
"learning_rate": 1.7594811467552076e-05,
"loss": 0.7432591915130615,
"step": 2690
},
{
"epoch": 0.4804270462633452,
"grad_norm": 0.5935755372047424,
"learning_rate": 1.7576268984047146e-05,
"loss": 0.6589434146881104,
"step": 2700
},
{
"epoch": 0.4822064056939502,
"grad_norm": 0.5581539869308472,
"learning_rate": 1.7557665150242205e-05,
"loss": 0.6235795974731445,
"step": 2710
},
{
"epoch": 0.48398576512455516,
"grad_norm": 0.5456061363220215,
"learning_rate": 1.7539000116785402e-05,
"loss": 0.6965986251831054,
"step": 2720
},
{
"epoch": 0.48576512455516013,
"grad_norm": 0.6723548769950867,
"learning_rate": 1.752027403482043e-05,
"loss": 0.6792643070220947,
"step": 2730
},
{
"epoch": 0.4875444839857651,
"grad_norm": 0.5704858899116516,
"learning_rate": 1.7501487055985364e-05,
"loss": 0.6697192192077637,
"step": 2740
},
{
"epoch": 0.4893238434163701,
"grad_norm": 0.559579074382782,
"learning_rate": 1.7482639332411386e-05,
"loss": 0.6839393615722656,
"step": 2750
},
{
"epoch": 0.49110320284697506,
"grad_norm": 0.6328978538513184,
"learning_rate": 1.7463731016721574e-05,
"loss": 0.6757444858551025,
"step": 2760
},
{
"epoch": 0.4928825622775801,
"grad_norm": 0.5843029022216797,
"learning_rate": 1.744476226202966e-05,
"loss": 0.6616747379302979,
"step": 2770
},
{
"epoch": 0.49466192170818507,
"grad_norm": 0.593262791633606,
"learning_rate": 1.7425733221938802e-05,
"loss": 0.6773699283599853,
"step": 2780
},
{
"epoch": 0.49644128113879005,
"grad_norm": 0.4597082734107971,
"learning_rate": 1.740664405054032e-05,
"loss": 0.6922731876373291,
"step": 2790
},
{
"epoch": 0.498220640569395,
"grad_norm": 0.5115553140640259,
"learning_rate": 1.7387494902412462e-05,
"loss": 0.6650360107421875,
"step": 2800
},
{
"epoch": 0.5,
"grad_norm": 0.5256341695785522,
"learning_rate": 1.7368285932619152e-05,
"loss": 0.6744340896606446,
"step": 2810
},
{
"epoch": 0.501779359430605,
"grad_norm": 0.5476647019386292,
"learning_rate": 1.734901729670873e-05,
"loss": 0.6778338432312012,
"step": 2820
},
{
"epoch": 0.50355871886121,
"grad_norm": 0.5267303586006165,
"learning_rate": 1.7329689150712692e-05,
"loss": 0.6595091342926025,
"step": 2830
},
{
"epoch": 0.505338078291815,
"grad_norm": 0.49511632323265076,
"learning_rate": 1.7310301651144427e-05,
"loss": 0.6996967315673828,
"step": 2840
},
{
"epoch": 0.5071174377224199,
"grad_norm": 0.5894142985343933,
"learning_rate": 1.729085495499796e-05,
"loss": 0.7025831699371338,
"step": 2850
},
{
"epoch": 0.5088967971530249,
"grad_norm": 0.5248763561248779,
"learning_rate": 1.727134921974666e-05,
"loss": 0.6990334510803222,
"step": 2860
},
{
"epoch": 0.5106761565836299,
"grad_norm": 0.6369486451148987,
"learning_rate": 1.7251784603341984e-05,
"loss": 0.6775143146514893,
"step": 2870
},
{
"epoch": 0.5124555160142349,
"grad_norm": 0.7855329513549805,
"learning_rate": 1.723216126421219e-05,
"loss": 0.6654418468475342,
"step": 2880
},
{
"epoch": 0.5142348754448398,
"grad_norm": 0.5383808016777039,
"learning_rate": 1.7212479361261047e-05,
"loss": 0.7224104404449463,
"step": 2890
},
{
"epoch": 0.5160142348754448,
"grad_norm": 0.5875473618507385,
"learning_rate": 1.7192739053866568e-05,
"loss": 0.6993866920471191,
"step": 2900
},
{
"epoch": 0.5177935943060499,
"grad_norm": 0.5973880290985107,
"learning_rate": 1.7172940501879702e-05,
"loss": 0.6883405685424805,
"step": 2910
},
{
"epoch": 0.5195729537366548,
"grad_norm": 0.7815128564834595,
"learning_rate": 1.715308386562304e-05,
"loss": 0.6863636493682861,
"step": 2920
},
{
"epoch": 0.5213523131672598,
"grad_norm": 0.5661780834197998,
"learning_rate": 1.7133169305889526e-05,
"loss": 0.6976628303527832,
"step": 2930
},
{
"epoch": 0.5231316725978647,
"grad_norm": 0.6116606593132019,
"learning_rate": 1.7113196983941152e-05,
"loss": 0.7353427410125732,
"step": 2940
},
{
"epoch": 0.5249110320284698,
"grad_norm": 0.5908172726631165,
"learning_rate": 1.709316706150765e-05,
"loss": 0.7118365287780761,
"step": 2950
},
{
"epoch": 0.5266903914590747,
"grad_norm": 0.5671530365943909,
"learning_rate": 1.707307970078518e-05,
"loss": 0.6777832508087158,
"step": 2960
},
{
"epoch": 0.5284697508896797,
"grad_norm": 0.635502815246582,
"learning_rate": 1.7052935064435023e-05,
"loss": 0.6588366985321045,
"step": 2970
},
{
"epoch": 0.5302491103202847,
"grad_norm": 0.674394428730011,
"learning_rate": 1.7032733315582254e-05,
"loss": 0.6987817764282227,
"step": 2980
},
{
"epoch": 0.5320284697508897,
"grad_norm": 0.6007933020591736,
"learning_rate": 1.7012474617814433e-05,
"loss": 0.6640087604522705,
"step": 2990
},
{
"epoch": 0.5338078291814946,
"grad_norm": 0.5124489665031433,
"learning_rate": 1.6992159135180283e-05,
"loss": 0.642765474319458,
"step": 3000
},
{
"epoch": 0.5338078291814946,
"eval_loss": 0.6583885550498962,
"eval_runtime": 409.5482,
"eval_samples_per_second": 12.279,
"eval_steps_per_second": 6.141,
"step": 3000
},
{
"epoch": 0.5355871886120996,
"grad_norm": 0.6412521004676819,
"learning_rate": 1.6971787032188336e-05,
"loss": 0.6574789047241211,
"step": 3010
},
{
"epoch": 0.5373665480427047,
"grad_norm": 0.5690603852272034,
"learning_rate": 1.6951358473805633e-05,
"loss": 0.7070611476898193,
"step": 3020
},
{
"epoch": 0.5391459074733096,
"grad_norm": 0.5345459580421448,
"learning_rate": 1.6930873625456362e-05,
"loss": 0.6888082027435303,
"step": 3030
},
{
"epoch": 0.5409252669039146,
"grad_norm": 0.67616868019104,
"learning_rate": 1.6910332653020536e-05,
"loss": 0.6946770668029785,
"step": 3040
},
{
"epoch": 0.5427046263345195,
"grad_norm": 0.6721035838127136,
"learning_rate": 1.6889735722832643e-05,
"loss": 0.6662865161895752,
"step": 3050
},
{
"epoch": 0.5444839857651246,
"grad_norm": 0.6241431832313538,
"learning_rate": 1.6869083001680304e-05,
"loss": 0.6311937808990479,
"step": 3060
},
{
"epoch": 0.5462633451957295,
"grad_norm": 0.6454595923423767,
"learning_rate": 1.6848374656802912e-05,
"loss": 0.6600630283355713,
"step": 3070
},
{
"epoch": 0.5480427046263345,
"grad_norm": 0.5625023245811462,
"learning_rate": 1.6827610855890278e-05,
"loss": 0.7191053867340088,
"step": 3080
},
{
"epoch": 0.5498220640569395,
"grad_norm": 0.6803082823753357,
"learning_rate": 1.6806791767081296e-05,
"loss": 0.6553170680999756,
"step": 3090
},
{
"epoch": 0.5516014234875445,
"grad_norm": 0.6465727686882019,
"learning_rate": 1.6785917558962552e-05,
"loss": 0.6733019351959229,
"step": 3100
},
{
"epoch": 0.5533807829181495,
"grad_norm": 0.5595579743385315,
"learning_rate": 1.6764988400566973e-05,
"loss": 0.6531811714172363,
"step": 3110
},
{
"epoch": 0.5551601423487544,
"grad_norm": 0.6368974447250366,
"learning_rate": 1.6744004461372455e-05,
"loss": 0.6765477657318115,
"step": 3120
},
{
"epoch": 0.5569395017793595,
"grad_norm": 0.5315082669258118,
"learning_rate": 1.67229659113005e-05,
"loss": 0.6876490116119385,
"step": 3130
},
{
"epoch": 0.5587188612099644,
"grad_norm": 0.6011325716972351,
"learning_rate": 1.6701872920714822e-05,
"loss": 0.671757698059082,
"step": 3140
},
{
"epoch": 0.5604982206405694,
"grad_norm": 0.5254577994346619,
"learning_rate": 1.6680725660419987e-05,
"loss": 0.6709373950958252,
"step": 3150
},
{
"epoch": 0.5622775800711743,
"grad_norm": 0.5257652401924133,
"learning_rate": 1.6659524301660014e-05,
"loss": 0.6314177513122559,
"step": 3160
},
{
"epoch": 0.5640569395017794,
"grad_norm": 0.577984094619751,
"learning_rate": 1.6638269016116995e-05,
"loss": 0.6948809623718262,
"step": 3170
},
{
"epoch": 0.5658362989323843,
"grad_norm": 0.5697906613349915,
"learning_rate": 1.6616959975909706e-05,
"loss": 0.6561762809753418,
"step": 3180
},
{
"epoch": 0.5676156583629893,
"grad_norm": 0.5052287578582764,
"learning_rate": 1.6595597353592216e-05,
"loss": 0.6577863693237305,
"step": 3190
},
{
"epoch": 0.5693950177935944,
"grad_norm": 0.5690486431121826,
"learning_rate": 1.6574181322152477e-05,
"loss": 0.6753673553466797,
"step": 3200
},
{
"epoch": 0.5711743772241993,
"grad_norm": 0.6763755679130554,
"learning_rate": 1.6552712055010935e-05,
"loss": 0.6881022453308105,
"step": 3210
},
{
"epoch": 0.5729537366548043,
"grad_norm": 0.7030187249183655,
"learning_rate": 1.6531189726019127e-05,
"loss": 0.6971624851226806,
"step": 3220
},
{
"epoch": 0.5747330960854092,
"grad_norm": 0.5229400992393494,
"learning_rate": 1.6509614509458263e-05,
"loss": 0.6652966976165772,
"step": 3230
},
{
"epoch": 0.5765124555160143,
"grad_norm": 0.5338436961174011,
"learning_rate": 1.6487986580037822e-05,
"loss": 0.6946625709533691,
"step": 3240
},
{
"epoch": 0.5782918149466192,
"grad_norm": 0.5740528702735901,
"learning_rate": 1.646630611289414e-05,
"loss": 0.6570149421691894,
"step": 3250
},
{
"epoch": 0.5800711743772242,
"grad_norm": 0.5782309174537659,
"learning_rate": 1.6444573283588977e-05,
"loss": 0.657336950302124,
"step": 3260
},
{
"epoch": 0.5818505338078291,
"grad_norm": 0.5372493863105774,
"learning_rate": 1.6422788268108112e-05,
"loss": 0.7015529632568359,
"step": 3270
},
{
"epoch": 0.5836298932384342,
"grad_norm": 0.734725832939148,
"learning_rate": 1.6400951242859915e-05,
"loss": 0.7008133888244629,
"step": 3280
},
{
"epoch": 0.5854092526690391,
"grad_norm": 0.6092361211776733,
"learning_rate": 1.6379062384673914e-05,
"loss": 0.6785021305084229,
"step": 3290
},
{
"epoch": 0.5871886120996441,
"grad_norm": 0.5179940462112427,
"learning_rate": 1.6357121870799357e-05,
"loss": 0.6846163749694825,
"step": 3300
},
{
"epoch": 0.5889679715302492,
"grad_norm": 0.6822018027305603,
"learning_rate": 1.633512987890379e-05,
"loss": 0.6979034423828125,
"step": 3310
},
{
"epoch": 0.5907473309608541,
"grad_norm": 0.6708216667175293,
"learning_rate": 1.631308658707161e-05,
"loss": 0.616365623474121,
"step": 3320
},
{
"epoch": 0.5925266903914591,
"grad_norm": 0.5766979455947876,
"learning_rate": 1.6290992173802628e-05,
"loss": 0.6930478096008301,
"step": 3330
},
{
"epoch": 0.594306049822064,
"grad_norm": 0.5902812480926514,
"learning_rate": 1.6268846818010615e-05,
"loss": 0.6487626552581787,
"step": 3340
},
{
"epoch": 0.5960854092526691,
"grad_norm": 0.45683762431144714,
"learning_rate": 1.6246650699021866e-05,
"loss": 0.670246696472168,
"step": 3350
},
{
"epoch": 0.597864768683274,
"grad_norm": 0.4878489077091217,
"learning_rate": 1.6224403996573743e-05,
"loss": 0.6784106254577636,
"step": 3360
},
{
"epoch": 0.599644128113879,
"grad_norm": 0.6697226166725159,
"learning_rate": 1.62021068908132e-05,
"loss": 0.7080551147460937,
"step": 3370
},
{
"epoch": 0.6014234875444839,
"grad_norm": 0.6667253971099854,
"learning_rate": 1.6179759562295356e-05,
"loss": 0.6681561470031738,
"step": 3380
},
{
"epoch": 0.603202846975089,
"grad_norm": 0.560409426689148,
"learning_rate": 1.6157362191982025e-05,
"loss": 0.7013211727142334,
"step": 3390
},
{
"epoch": 0.604982206405694,
"grad_norm": 0.6660729050636292,
"learning_rate": 1.6134914961240224e-05,
"loss": 0.6561143398284912,
"step": 3400
},
{
"epoch": 0.6067615658362989,
"grad_norm": 0.5322085618972778,
"learning_rate": 1.6112418051840745e-05,
"loss": 0.6231056213378906,
"step": 3410
},
{
"epoch": 0.608540925266904,
"grad_norm": 0.6110619902610779,
"learning_rate": 1.6089871645956644e-05,
"loss": 0.696910810470581,
"step": 3420
},
{
"epoch": 0.6103202846975089,
"grad_norm": 0.7193135619163513,
"learning_rate": 1.6067275926161792e-05,
"loss": 0.6709835052490234,
"step": 3430
},
{
"epoch": 0.6120996441281139,
"grad_norm": 0.5821454524993896,
"learning_rate": 1.6044631075429406e-05,
"loss": 0.6750143527984619,
"step": 3440
},
{
"epoch": 0.6138790035587188,
"grad_norm": 0.6266823410987854,
"learning_rate": 1.6021937277130516e-05,
"loss": 0.6660190105438233,
"step": 3450
},
{
"epoch": 0.6156583629893239,
"grad_norm": 0.6361811757087708,
"learning_rate": 1.5999194715032543e-05,
"loss": 0.6241551399230957,
"step": 3460
},
{
"epoch": 0.6174377224199288,
"grad_norm": 0.5690405964851379,
"learning_rate": 1.5976403573297767e-05,
"loss": 0.6768513679504394,
"step": 3470
},
{
"epoch": 0.6192170818505338,
"grad_norm": 0.6438109278678894,
"learning_rate": 1.595356403648186e-05,
"loss": 0.6852968215942383,
"step": 3480
},
{
"epoch": 0.6209964412811388,
"grad_norm": 0.5949437022209167,
"learning_rate": 1.5930676289532373e-05,
"loss": 0.6737981319427491,
"step": 3490
},
{
"epoch": 0.6227758007117438,
"grad_norm": 0.6436291933059692,
"learning_rate": 1.590774051778726e-05,
"loss": 0.6780194282531739,
"step": 3500
},
{
"epoch": 0.6227758007117438,
"eval_loss": 0.6505803465843201,
"eval_runtime": 412.7337,
"eval_samples_per_second": 12.185,
"eval_steps_per_second": 6.094,
"step": 3500
},
{
"epoch": 0.6245551601423488,
"grad_norm": 0.5366589426994324,
"learning_rate": 1.588475690697335e-05,
"loss": 0.6628384590148926,
"step": 3510
},
{
"epoch": 0.6263345195729537,
"grad_norm": 0.5512357950210571,
"learning_rate": 1.5861725643204876e-05,
"loss": 0.6656132221221924,
"step": 3520
},
{
"epoch": 0.6281138790035588,
"grad_norm": 0.5239390134811401,
"learning_rate": 1.5838646912981937e-05,
"loss": 0.6585372447967529,
"step": 3530
},
{
"epoch": 0.6298932384341637,
"grad_norm": 0.611584484577179,
"learning_rate": 1.5815520903188998e-05,
"loss": 0.6700050354003906,
"step": 3540
},
{
"epoch": 0.6316725978647687,
"grad_norm": 0.5967355966567993,
"learning_rate": 1.5792347801093393e-05,
"loss": 0.6763735294342041,
"step": 3550
},
{
"epoch": 0.6334519572953736,
"grad_norm": 0.5486634373664856,
"learning_rate": 1.576912779434379e-05,
"loss": 0.6555093765258789,
"step": 3560
},
{
"epoch": 0.6352313167259787,
"grad_norm": 0.6155397891998291,
"learning_rate": 1.5745861070968667e-05,
"loss": 0.660044813156128,
"step": 3570
},
{
"epoch": 0.6370106761565836,
"grad_norm": 0.6666173338890076,
"learning_rate": 1.5722547819374807e-05,
"loss": 0.6919036388397217,
"step": 3580
},
{
"epoch": 0.6387900355871886,
"grad_norm": 0.628685474395752,
"learning_rate": 1.5699188228345765e-05,
"loss": 0.6886429786682129,
"step": 3590
},
{
"epoch": 0.6405693950177936,
"grad_norm": 0.5950825810432434,
"learning_rate": 1.5675782487040337e-05,
"loss": 0.6522699356079101,
"step": 3600
},
{
"epoch": 0.6423487544483986,
"grad_norm": 0.6900691390037537,
"learning_rate": 1.565233078499103e-05,
"loss": 0.6545799255371094,
"step": 3610
},
{
"epoch": 0.6441281138790036,
"grad_norm": 0.6600697040557861,
"learning_rate": 1.5628833312102526e-05,
"loss": 0.690045976638794,
"step": 3620
},
{
"epoch": 0.6459074733096085,
"grad_norm": 0.7045279145240784,
"learning_rate": 1.5605290258650144e-05,
"loss": 0.6638887405395508,
"step": 3630
},
{
"epoch": 0.6476868327402135,
"grad_norm": 0.6415700316429138,
"learning_rate": 1.5581701815278302e-05,
"loss": 0.6451498508453369,
"step": 3640
},
{
"epoch": 0.6494661921708185,
"grad_norm": 0.6846213340759277,
"learning_rate": 1.5558068172998977e-05,
"loss": 0.6113666534423828,
"step": 3650
},
{
"epoch": 0.6512455516014235,
"grad_norm": 0.584511935710907,
"learning_rate": 1.5534389523190142e-05,
"loss": 0.7053666591644288,
"step": 3660
},
{
"epoch": 0.6530249110320284,
"grad_norm": 0.7102410793304443,
"learning_rate": 1.551066605759424e-05,
"loss": 0.6806889533996582,
"step": 3670
},
{
"epoch": 0.6548042704626335,
"grad_norm": 0.5693709254264832,
"learning_rate": 1.5486897968316604e-05,
"loss": 0.6717594623565674,
"step": 3680
},
{
"epoch": 0.6565836298932385,
"grad_norm": 0.6451858878135681,
"learning_rate": 1.546308544782392e-05,
"loss": 0.69324951171875,
"step": 3690
},
{
"epoch": 0.6583629893238434,
"grad_norm": 0.5983235836029053,
"learning_rate": 1.543922868894268e-05,
"loss": 0.6728285312652588,
"step": 3700
},
{
"epoch": 0.6601423487544484,
"grad_norm": 0.5725272297859192,
"learning_rate": 1.541532788485758e-05,
"loss": 0.6560508728027343,
"step": 3710
},
{
"epoch": 0.6619217081850534,
"grad_norm": 0.6461522579193115,
"learning_rate": 1.5391383229110005e-05,
"loss": 0.6883309364318848,
"step": 3720
},
{
"epoch": 0.6637010676156584,
"grad_norm": 0.6512270569801331,
"learning_rate": 1.5367394915596414e-05,
"loss": 0.6622300624847413,
"step": 3730
},
{
"epoch": 0.6654804270462633,
"grad_norm": 0.6106224656105042,
"learning_rate": 1.534336313856681e-05,
"loss": 0.6878883361816406,
"step": 3740
},
{
"epoch": 0.6672597864768683,
"grad_norm": 0.6902608275413513,
"learning_rate": 1.5319288092623142e-05,
"loss": 0.6434782981872559,
"step": 3750
},
{
"epoch": 0.6690391459074733,
"grad_norm": 0.5784608721733093,
"learning_rate": 1.5295169972717743e-05,
"loss": 0.6367124557495117,
"step": 3760
},
{
"epoch": 0.6708185053380783,
"grad_norm": 0.6913280487060547,
"learning_rate": 1.5271008974151744e-05,
"loss": 0.630396556854248,
"step": 3770
},
{
"epoch": 0.6725978647686833,
"grad_norm": 0.7483834624290466,
"learning_rate": 1.5246805292573487e-05,
"loss": 0.6295557975769043,
"step": 3780
},
{
"epoch": 0.6743772241992882,
"grad_norm": 0.7149993777275085,
"learning_rate": 1.5222559123976962e-05,
"loss": 0.6769547939300538,
"step": 3790
},
{
"epoch": 0.6761565836298933,
"grad_norm": 0.6534271836280823,
"learning_rate": 1.5198270664700187e-05,
"loss": 0.7023432731628418,
"step": 3800
},
{
"epoch": 0.6779359430604982,
"grad_norm": 0.6507487297058105,
"learning_rate": 1.5173940111423657e-05,
"loss": 0.6808289527893067,
"step": 3810
},
{
"epoch": 0.6797153024911032,
"grad_norm": 0.5977747440338135,
"learning_rate": 1.5149567661168715e-05,
"loss": 0.6398194313049317,
"step": 3820
},
{
"epoch": 0.6814946619217082,
"grad_norm": 0.6036016941070557,
"learning_rate": 1.5125153511295989e-05,
"loss": 0.6317630767822265,
"step": 3830
},
{
"epoch": 0.6832740213523132,
"grad_norm": 0.8481062054634094,
"learning_rate": 1.5100697859503762e-05,
"loss": 0.6741732597351074,
"step": 3840
},
{
"epoch": 0.6850533807829181,
"grad_norm": 0.5988038182258606,
"learning_rate": 1.5076200903826391e-05,
"loss": 0.6638431072235107,
"step": 3850
},
{
"epoch": 0.6868327402135231,
"grad_norm": 0.6785585880279541,
"learning_rate": 1.5051662842632709e-05,
"loss": 0.6523032665252686,
"step": 3860
},
{
"epoch": 0.6886120996441281,
"grad_norm": 0.6749284863471985,
"learning_rate": 1.5027083874624392e-05,
"loss": 0.6517146110534668,
"step": 3870
},
{
"epoch": 0.6903914590747331,
"grad_norm": 0.6880838871002197,
"learning_rate": 1.5002464198834383e-05,
"loss": 0.6895311355590821,
"step": 3880
},
{
"epoch": 0.6921708185053381,
"grad_norm": 0.6296641826629639,
"learning_rate": 1.4977804014625257e-05,
"loss": 0.691303300857544,
"step": 3890
},
{
"epoch": 0.693950177935943,
"grad_norm": 0.62867671251297,
"learning_rate": 1.4953103521687612e-05,
"loss": 0.6549241065979003,
"step": 3900
},
{
"epoch": 0.6957295373665481,
"grad_norm": 0.6810404658317566,
"learning_rate": 1.4928362920038455e-05,
"loss": 0.6266399383544922,
"step": 3910
},
{
"epoch": 0.697508896797153,
"grad_norm": 0.7017929553985596,
"learning_rate": 1.4903582410019586e-05,
"loss": 0.6477387905120849,
"step": 3920
},
{
"epoch": 0.699288256227758,
"grad_norm": 0.5611311197280884,
"learning_rate": 1.487876219229596e-05,
"loss": 0.6562673568725585,
"step": 3930
},
{
"epoch": 0.701067615658363,
"grad_norm": 0.5652351975440979,
"learning_rate": 1.4853902467854084e-05,
"loss": 0.6768126487731934,
"step": 3940
},
{
"epoch": 0.702846975088968,
"grad_norm": 0.7303992509841919,
"learning_rate": 1.4829003438000374e-05,
"loss": 0.6467350959777832,
"step": 3950
},
{
"epoch": 0.7046263345195729,
"grad_norm": 0.5587636232376099,
"learning_rate": 1.4804065304359525e-05,
"loss": 0.6700319766998291,
"step": 3960
},
{
"epoch": 0.7064056939501779,
"grad_norm": 0.7100324630737305,
"learning_rate": 1.477908826887289e-05,
"loss": 0.6796034812927246,
"step": 3970
},
{
"epoch": 0.708185053380783,
"grad_norm": 0.6912830471992493,
"learning_rate": 1.4754072533796833e-05,
"loss": 0.6907623291015625,
"step": 3980
},
{
"epoch": 0.7099644128113879,
"grad_norm": 0.744030773639679,
"learning_rate": 1.4729018301701093e-05,
"loss": 0.6747735023498536,
"step": 3990
},
{
"epoch": 0.7117437722419929,
"grad_norm": 0.6915676593780518,
"learning_rate": 1.4703925775467149e-05,
"loss": 0.6172840595245361,
"step": 4000
},
{
"epoch": 0.7117437722419929,
"eval_loss": 0.6436223387718201,
"eval_runtime": 412.5315,
"eval_samples_per_second": 12.191,
"eval_steps_per_second": 6.097,
"step": 4000
},
{
"epoch": 0.7135231316725978,
"grad_norm": 0.5881261229515076,
"learning_rate": 1.4678795158286579e-05,
"loss": 0.6607324123382569,
"step": 4010
},
{
"epoch": 0.7153024911032029,
"grad_norm": 0.7214511036872864,
"learning_rate": 1.4653626653659399e-05,
"loss": 0.6573444366455078,
"step": 4020
},
{
"epoch": 0.7170818505338078,
"grad_norm": 0.6598398089408875,
"learning_rate": 1.4628420465392432e-05,
"loss": 0.6589923858642578,
"step": 4030
},
{
"epoch": 0.7188612099644128,
"grad_norm": 0.6202049255371094,
"learning_rate": 1.4603176797597654e-05,
"loss": 0.6494176387786865,
"step": 4040
},
{
"epoch": 0.7206405693950177,
"grad_norm": 0.6085755825042725,
"learning_rate": 1.4577895854690536e-05,
"loss": 0.6625244140625,
"step": 4050
},
{
"epoch": 0.7224199288256228,
"grad_norm": 0.799647331237793,
"learning_rate": 1.4552577841388388e-05,
"loss": 0.6426272392272949,
"step": 4060
},
{
"epoch": 0.7241992882562278,
"grad_norm": 0.5650002956390381,
"learning_rate": 1.4527222962708714e-05,
"loss": 0.6512536525726318,
"step": 4070
},
{
"epoch": 0.7259786476868327,
"grad_norm": 0.7105417251586914,
"learning_rate": 1.4501831423967531e-05,
"loss": 0.639466667175293,
"step": 4080
},
{
"epoch": 0.7277580071174378,
"grad_norm": 0.6997768878936768,
"learning_rate": 1.4476403430777729e-05,
"loss": 0.6274962425231934,
"step": 4090
},
{
"epoch": 0.7295373665480427,
"grad_norm": 0.6622691750526428,
"learning_rate": 1.4450939189047379e-05,
"loss": 0.6450656890869141,
"step": 4100
},
{
"epoch": 0.7313167259786477,
"grad_norm": 0.669937789440155,
"learning_rate": 1.4425438904978103e-05,
"loss": 0.6467487335205078,
"step": 4110
},
{
"epoch": 0.7330960854092526,
"grad_norm": 0.6928410530090332,
"learning_rate": 1.4399902785063366e-05,
"loss": 0.6678302764892579,
"step": 4120
},
{
"epoch": 0.7348754448398577,
"grad_norm": 0.7608679533004761,
"learning_rate": 1.4374331036086831e-05,
"loss": 0.6973666191101074,
"step": 4130
},
{
"epoch": 0.7366548042704626,
"grad_norm": 0.7009332180023193,
"learning_rate": 1.4348723865120663e-05,
"loss": 0.6456516742706299,
"step": 4140
},
{
"epoch": 0.7384341637010676,
"grad_norm": 0.7158817648887634,
"learning_rate": 1.4323081479523878e-05,
"loss": 0.6287565708160401,
"step": 4150
},
{
"epoch": 0.7402135231316725,
"grad_norm": 0.5314562916755676,
"learning_rate": 1.4297404086940635e-05,
"loss": 0.6649733543395996,
"step": 4160
},
{
"epoch": 0.7419928825622776,
"grad_norm": 0.6606214642524719,
"learning_rate": 1.4271691895298573e-05,
"loss": 0.6175446033477783,
"step": 4170
},
{
"epoch": 0.7437722419928826,
"grad_norm": 0.618217945098877,
"learning_rate": 1.4245945112807133e-05,
"loss": 0.6604631423950196,
"step": 4180
},
{
"epoch": 0.7455516014234875,
"grad_norm": 0.5399601459503174,
"learning_rate": 1.422016394795585e-05,
"loss": 0.6682997226715088,
"step": 4190
},
{
"epoch": 0.7473309608540926,
"grad_norm": 0.5808553695678711,
"learning_rate": 1.419434860951268e-05,
"loss": 0.6625009059906006,
"step": 4200
},
{
"epoch": 0.7491103202846975,
"grad_norm": 0.670625627040863,
"learning_rate": 1.416849930652231e-05,
"loss": 0.6778800010681152,
"step": 4210
},
{
"epoch": 0.7508896797153025,
"grad_norm": 0.6508112549781799,
"learning_rate": 1.4142616248304459e-05,
"loss": 0.6265085220336915,
"step": 4220
},
{
"epoch": 0.7526690391459074,
"grad_norm": 0.5993587970733643,
"learning_rate": 1.4116699644452182e-05,
"loss": 0.656840181350708,
"step": 4230
},
{
"epoch": 0.7544483985765125,
"grad_norm": 0.6819363236427307,
"learning_rate": 1.4090749704830184e-05,
"loss": 0.6575029373168946,
"step": 4240
},
{
"epoch": 0.7562277580071174,
"grad_norm": 0.6625942587852478,
"learning_rate": 1.4064766639573104e-05,
"loss": 0.6340457916259765,
"step": 4250
},
{
"epoch": 0.7580071174377224,
"grad_norm": 0.8185866475105286,
"learning_rate": 1.4038750659083831e-05,
"loss": 0.6835246086120605,
"step": 4260
},
{
"epoch": 0.7597864768683275,
"grad_norm": 0.8232684135437012,
"learning_rate": 1.4012701974031782e-05,
"loss": 0.6450761795043946,
"step": 4270
},
{
"epoch": 0.7615658362989324,
"grad_norm": 0.6913644671440125,
"learning_rate": 1.3986620795351214e-05,
"loss": 0.6313485145568848,
"step": 4280
},
{
"epoch": 0.7633451957295374,
"grad_norm": 0.641167163848877,
"learning_rate": 1.3960507334239501e-05,
"loss": 0.6450904846191406,
"step": 4290
},
{
"epoch": 0.7651245551601423,
"grad_norm": 0.5932332873344421,
"learning_rate": 1.3934361802155436e-05,
"loss": 0.6665386199951172,
"step": 4300
},
{
"epoch": 0.7669039145907474,
"grad_norm": 0.5669364929199219,
"learning_rate": 1.3908184410817511e-05,
"loss": 0.6564301013946533,
"step": 4310
},
{
"epoch": 0.7686832740213523,
"grad_norm": 0.6267620325088501,
"learning_rate": 1.3881975372202201e-05,
"loss": 0.6448751449584961,
"step": 4320
},
{
"epoch": 0.7704626334519573,
"grad_norm": 0.8893764615058899,
"learning_rate": 1.3855734898542252e-05,
"loss": 0.6395359516143799,
"step": 4330
},
{
"epoch": 0.7722419928825622,
"grad_norm": 0.6805179119110107,
"learning_rate": 1.3829463202324967e-05,
"loss": 0.6458981990814209,
"step": 4340
},
{
"epoch": 0.7740213523131673,
"grad_norm": 0.6416231393814087,
"learning_rate": 1.3803160496290472e-05,
"loss": 0.6462121963500976,
"step": 4350
},
{
"epoch": 0.7758007117437722,
"grad_norm": 0.7718709707260132,
"learning_rate": 1.3776826993430006e-05,
"loss": 0.6271074295043946,
"step": 4360
},
{
"epoch": 0.7775800711743772,
"grad_norm": 0.6111568212509155,
"learning_rate": 1.375046290698419e-05,
"loss": 0.6282791137695313,
"step": 4370
},
{
"epoch": 0.7793594306049823,
"grad_norm": 0.7178627252578735,
"learning_rate": 1.3724068450441303e-05,
"loss": 0.6567965507507324,
"step": 4380
},
{
"epoch": 0.7811387900355872,
"grad_norm": 0.6303468346595764,
"learning_rate": 1.3697643837535546e-05,
"loss": 0.6168845653533935,
"step": 4390
},
{
"epoch": 0.7829181494661922,
"grad_norm": 0.6654033660888672,
"learning_rate": 1.3671189282245326e-05,
"loss": 0.6331443309783935,
"step": 4400
},
{
"epoch": 0.7846975088967971,
"grad_norm": 0.7051156163215637,
"learning_rate": 1.3644704998791501e-05,
"loss": 0.6948952198028564,
"step": 4410
},
{
"epoch": 0.7864768683274022,
"grad_norm": 0.5967740416526794,
"learning_rate": 1.361819120163567e-05,
"loss": 0.6140963077545166,
"step": 4420
},
{
"epoch": 0.7882562277580071,
"grad_norm": 0.688831090927124,
"learning_rate": 1.3591648105478423e-05,
"loss": 0.6627942085266113,
"step": 4430
},
{
"epoch": 0.7900355871886121,
"grad_norm": 0.5357785820960999,
"learning_rate": 1.3565075925257605e-05,
"loss": 0.6558830261230468,
"step": 4440
},
{
"epoch": 0.791814946619217,
"grad_norm": 0.6449471712112427,
"learning_rate": 1.3538474876146567e-05,
"loss": 0.6528484344482421,
"step": 4450
},
{
"epoch": 0.7935943060498221,
"grad_norm": 0.6996321678161621,
"learning_rate": 1.3511845173552446e-05,
"loss": 0.6519684314727783,
"step": 4460
},
{
"epoch": 0.7953736654804271,
"grad_norm": 0.6798763871192932,
"learning_rate": 1.348518703311439e-05,
"loss": 0.6224774360656739,
"step": 4470
},
{
"epoch": 0.797153024911032,
"grad_norm": 0.698722243309021,
"learning_rate": 1.3458500670701833e-05,
"loss": 0.6481215953826904,
"step": 4480
},
{
"epoch": 0.798932384341637,
"grad_norm": 0.6421968340873718,
"learning_rate": 1.3431786302412749e-05,
"loss": 0.6016243934631348,
"step": 4490
},
{
"epoch": 0.800711743772242,
"grad_norm": 0.6529412269592285,
"learning_rate": 1.3405044144571888e-05,
"loss": 0.6958633899688721,
"step": 4500
},
{
"epoch": 0.800711743772242,
"eval_loss": 0.6375713348388672,
"eval_runtime": 411.392,
"eval_samples_per_second": 12.224,
"eval_steps_per_second": 6.113,
"step": 4500
},
{
"epoch": 0.802491103202847,
"grad_norm": 0.7270268797874451,
"learning_rate": 1.3378274413729036e-05,
"loss": 0.6834945201873779,
"step": 4510
},
{
"epoch": 0.8042704626334519,
"grad_norm": 0.6442169547080994,
"learning_rate": 1.335147732665725e-05,
"loss": 0.6535075187683106,
"step": 4520
},
{
"epoch": 0.806049822064057,
"grad_norm": 0.7817319631576538,
"learning_rate": 1.3324653100351117e-05,
"loss": 0.6588070869445801,
"step": 4530
},
{
"epoch": 0.8078291814946619,
"grad_norm": 0.6025936603546143,
"learning_rate": 1.3297801952024983e-05,
"loss": 0.6654253482818604,
"step": 4540
},
{
"epoch": 0.8096085409252669,
"grad_norm": 0.6981809735298157,
"learning_rate": 1.3270924099111204e-05,
"loss": 0.6565302848815918,
"step": 4550
},
{
"epoch": 0.8113879003558719,
"grad_norm": 0.7100082635879517,
"learning_rate": 1.3244019759258378e-05,
"loss": 0.6451606273651123,
"step": 4560
},
{
"epoch": 0.8131672597864769,
"grad_norm": 0.7197165489196777,
"learning_rate": 1.3217089150329589e-05,
"loss": 0.6707216739654541,
"step": 4570
},
{
"epoch": 0.8149466192170819,
"grad_norm": 0.6494991183280945,
"learning_rate": 1.3190132490400642e-05,
"loss": 0.6538206577301026,
"step": 4580
},
{
"epoch": 0.8167259786476868,
"grad_norm": 0.610467255115509,
"learning_rate": 1.316314999775829e-05,
"loss": 0.6370253562927246,
"step": 4590
},
{
"epoch": 0.8185053380782918,
"grad_norm": 0.765133261680603,
"learning_rate": 1.3136141890898473e-05,
"loss": 0.6969471454620362,
"step": 4600
},
{
"epoch": 0.8202846975088968,
"grad_norm": 0.6949911117553711,
"learning_rate": 1.3109108388524551e-05,
"loss": 0.6275270462036133,
"step": 4610
},
{
"epoch": 0.8220640569395018,
"grad_norm": 0.6720005869865417,
"learning_rate": 1.3082049709545524e-05,
"loss": 0.6380300521850586,
"step": 4620
},
{
"epoch": 0.8238434163701067,
"grad_norm": 0.7357544898986816,
"learning_rate": 1.3054966073074264e-05,
"loss": 0.6391136169433593,
"step": 4630
},
{
"epoch": 0.8256227758007118,
"grad_norm": 0.6352887749671936,
"learning_rate": 1.3027857698425748e-05,
"loss": 0.6527080059051513,
"step": 4640
},
{
"epoch": 0.8274021352313167,
"grad_norm": 0.6615159511566162,
"learning_rate": 1.3000724805115265e-05,
"loss": 0.6786240100860595,
"step": 4650
},
{
"epoch": 0.8291814946619217,
"grad_norm": 0.7004484534263611,
"learning_rate": 1.2973567612856659e-05,
"loss": 0.6390516281127929,
"step": 4660
},
{
"epoch": 0.8309608540925267,
"grad_norm": 0.7463746070861816,
"learning_rate": 1.294638634156053e-05,
"loss": 0.6174628734588623,
"step": 4670
},
{
"epoch": 0.8327402135231317,
"grad_norm": 0.7666842341423035,
"learning_rate": 1.2919181211332474e-05,
"loss": 0.6169525623321533,
"step": 4680
},
{
"epoch": 0.8345195729537367,
"grad_norm": 0.6636411547660828,
"learning_rate": 1.2891952442471274e-05,
"loss": 0.6446310043334961,
"step": 4690
},
{
"epoch": 0.8362989323843416,
"grad_norm": 0.9001480937004089,
"learning_rate": 1.2864700255467148e-05,
"loss": 0.6689016342163085,
"step": 4700
},
{
"epoch": 0.8380782918149466,
"grad_norm": 0.551397442817688,
"learning_rate": 1.2837424870999933e-05,
"loss": 0.6616122245788574,
"step": 4710
},
{
"epoch": 0.8398576512455516,
"grad_norm": 0.7142363786697388,
"learning_rate": 1.281012650993732e-05,
"loss": 0.690334415435791,
"step": 4720
},
{
"epoch": 0.8416370106761566,
"grad_norm": 0.701836347579956,
"learning_rate": 1.2782805393333054e-05,
"loss": 0.6755175113677978,
"step": 4730
},
{
"epoch": 0.8434163701067615,
"grad_norm": 0.6752656102180481,
"learning_rate": 1.2755461742425147e-05,
"loss": 0.6132485866546631,
"step": 4740
},
{
"epoch": 0.8451957295373665,
"grad_norm": 0.6739411354064941,
"learning_rate": 1.2728095778634094e-05,
"loss": 0.6823519706726074,
"step": 4750
},
{
"epoch": 0.8469750889679716,
"grad_norm": 0.6728504300117493,
"learning_rate": 1.2700707723561064e-05,
"loss": 0.6264513492584228,
"step": 4760
},
{
"epoch": 0.8487544483985765,
"grad_norm": 0.6379041075706482,
"learning_rate": 1.2673297798986118e-05,
"loss": 0.6372091770172119,
"step": 4770
},
{
"epoch": 0.8505338078291815,
"grad_norm": 0.6896198987960815,
"learning_rate": 1.2645866226866405e-05,
"loss": 0.6795585632324219,
"step": 4780
},
{
"epoch": 0.8523131672597865,
"grad_norm": 0.6732060313224792,
"learning_rate": 1.261841322933438e-05,
"loss": 0.6782153129577637,
"step": 4790
},
{
"epoch": 0.8540925266903915,
"grad_norm": 0.7170758247375488,
"learning_rate": 1.2590939028695987e-05,
"loss": 0.6489102840423584,
"step": 4800
},
{
"epoch": 0.8558718861209964,
"grad_norm": 0.7446131110191345,
"learning_rate": 1.2563443847428862e-05,
"loss": 0.6556113719940185,
"step": 4810
},
{
"epoch": 0.8576512455516014,
"grad_norm": 0.5690045952796936,
"learning_rate": 1.2535927908180547e-05,
"loss": 0.6207235813140869,
"step": 4820
},
{
"epoch": 0.8594306049822064,
"grad_norm": 0.7088342308998108,
"learning_rate": 1.2508391433766667e-05,
"loss": 0.6439788341522217,
"step": 4830
},
{
"epoch": 0.8612099644128114,
"grad_norm": 0.573747992515564,
"learning_rate": 1.2480834647169134e-05,
"loss": 0.6493591785430908,
"step": 4840
},
{
"epoch": 0.8629893238434164,
"grad_norm": 0.6367696523666382,
"learning_rate": 1.2453257771534348e-05,
"loss": 0.6573034286499023,
"step": 4850
},
{
"epoch": 0.8647686832740213,
"grad_norm": 0.6127105355262756,
"learning_rate": 1.2425661030171382e-05,
"loss": 0.6419090747833252,
"step": 4860
},
{
"epoch": 0.8665480427046264,
"grad_norm": 0.6644669771194458,
"learning_rate": 1.2398044646550167e-05,
"loss": 0.6521550178527832,
"step": 4870
},
{
"epoch": 0.8683274021352313,
"grad_norm": 0.6144851446151733,
"learning_rate": 1.2370408844299705e-05,
"loss": 0.6388635158538818,
"step": 4880
},
{
"epoch": 0.8701067615658363,
"grad_norm": 0.748590886592865,
"learning_rate": 1.2342753847206236e-05,
"loss": 0.6553171157836915,
"step": 4890
},
{
"epoch": 0.8718861209964412,
"grad_norm": 0.7603781819343567,
"learning_rate": 1.2315079879211435e-05,
"loss": 0.639424467086792,
"step": 4900
},
{
"epoch": 0.8736654804270463,
"grad_norm": 0.60029137134552,
"learning_rate": 1.2287387164410597e-05,
"loss": 0.6580222606658935,
"step": 4910
},
{
"epoch": 0.8754448398576512,
"grad_norm": 0.5586308836936951,
"learning_rate": 1.2259675927050829e-05,
"loss": 0.6166585445404053,
"step": 4920
},
{
"epoch": 0.8772241992882562,
"grad_norm": 0.6883022785186768,
"learning_rate": 1.2231946391529213e-05,
"loss": 0.651360559463501,
"step": 4930
},
{
"epoch": 0.8790035587188612,
"grad_norm": 0.7347468733787537,
"learning_rate": 1.2204198782391018e-05,
"loss": 0.6587865352630615,
"step": 4940
},
{
"epoch": 0.8807829181494662,
"grad_norm": 0.8330339193344116,
"learning_rate": 1.2176433324327868e-05,
"loss": 0.6858234405517578,
"step": 4950
},
{
"epoch": 0.8825622775800712,
"grad_norm": 0.6569383144378662,
"learning_rate": 1.2148650242175908e-05,
"loss": 0.7256248950958252,
"step": 4960
},
{
"epoch": 0.8843416370106761,
"grad_norm": 0.7147039771080017,
"learning_rate": 1.2120849760914013e-05,
"loss": 0.6601557254791259,
"step": 4970
},
{
"epoch": 0.8861209964412812,
"grad_norm": 0.7715662717819214,
"learning_rate": 1.2093032105661944e-05,
"loss": 0.6112511634826661,
"step": 4980
},
{
"epoch": 0.8879003558718861,
"grad_norm": 0.7267486453056335,
"learning_rate": 1.2065197501678529e-05,
"loss": 0.6230842590332031,
"step": 4990
},
{
"epoch": 0.8896797153024911,
"grad_norm": 0.7366806268692017,
"learning_rate": 1.203734617435985e-05,
"loss": 0.6849907875061035,
"step": 5000
},
{
"epoch": 0.8896797153024911,
"eval_loss": 0.6321616172790527,
"eval_runtime": 411.8441,
"eval_samples_per_second": 12.211,
"eval_steps_per_second": 6.107,
"step": 5000
},
{
"epoch": 0.891459074733096,
"grad_norm": 0.7812427282333374,
"learning_rate": 1.2009478349237397e-05,
"loss": 0.6636211395263671,
"step": 5010
},
{
"epoch": 0.8932384341637011,
"grad_norm": 0.6625553965568542,
"learning_rate": 1.1981594251976265e-05,
"loss": 0.6543920040130615,
"step": 5020
},
{
"epoch": 0.895017793594306,
"grad_norm": 0.6391006112098694,
"learning_rate": 1.1953694108373313e-05,
"loss": 0.653505516052246,
"step": 5030
},
{
"epoch": 0.896797153024911,
"grad_norm": 0.763852059841156,
"learning_rate": 1.1925778144355338e-05,
"loss": 0.6871216773986817,
"step": 5040
},
{
"epoch": 0.8985765124555161,
"grad_norm": 0.7798473834991455,
"learning_rate": 1.189784658597724e-05,
"loss": 0.6243946552276611,
"step": 5050
},
{
"epoch": 0.900355871886121,
"grad_norm": 0.6348104476928711,
"learning_rate": 1.1869899659420208e-05,
"loss": 0.6567151069641113,
"step": 5060
},
{
"epoch": 0.902135231316726,
"grad_norm": 0.6433530449867249,
"learning_rate": 1.1841937590989873e-05,
"loss": 0.677039909362793,
"step": 5070
},
{
"epoch": 0.9039145907473309,
"grad_norm": 0.8366072177886963,
"learning_rate": 1.1813960607114476e-05,
"loss": 0.6381283760070801,
"step": 5080
},
{
"epoch": 0.905693950177936,
"grad_norm": 0.7176342606544495,
"learning_rate": 1.1785968934343045e-05,
"loss": 0.6601722717285157,
"step": 5090
},
{
"epoch": 0.9074733096085409,
"grad_norm": 0.6531491279602051,
"learning_rate": 1.1757962799343548e-05,
"loss": 0.6662145137786866,
"step": 5100
},
{
"epoch": 0.9092526690391459,
"grad_norm": 0.6918512582778931,
"learning_rate": 1.1729942428901068e-05,
"loss": 0.6182730674743653,
"step": 5110
},
{
"epoch": 0.9110320284697508,
"grad_norm": 0.5597010850906372,
"learning_rate": 1.1701908049915964e-05,
"loss": 0.6443261623382568,
"step": 5120
},
{
"epoch": 0.9128113879003559,
"grad_norm": 0.6807442307472229,
"learning_rate": 1.1673859889402028e-05,
"loss": 0.6334492206573487,
"step": 5130
},
{
"epoch": 0.9145907473309609,
"grad_norm": 0.6249061226844788,
"learning_rate": 1.1645798174484653e-05,
"loss": 0.614483642578125,
"step": 5140
},
{
"epoch": 0.9163701067615658,
"grad_norm": 0.5527177453041077,
"learning_rate": 1.161772313239899e-05,
"loss": 0.650807237625122,
"step": 5150
},
{
"epoch": 0.9181494661921709,
"grad_norm": 0.7590207457542419,
"learning_rate": 1.1589634990488107e-05,
"loss": 0.6382132053375245,
"step": 5160
},
{
"epoch": 0.9199288256227758,
"grad_norm": 0.7238802909851074,
"learning_rate": 1.1561533976201157e-05,
"loss": 0.6336176872253418,
"step": 5170
},
{
"epoch": 0.9217081850533808,
"grad_norm": 0.8075314164161682,
"learning_rate": 1.1533420317091519e-05,
"loss": 0.6544758796691894,
"step": 5180
},
{
"epoch": 0.9234875444839857,
"grad_norm": 0.7770307064056396,
"learning_rate": 1.1505294240814979e-05,
"loss": 0.6220970153808594,
"step": 5190
},
{
"epoch": 0.9252669039145908,
"grad_norm": 0.5794429779052734,
"learning_rate": 1.1477155975127866e-05,
"loss": 0.6260251045227051,
"step": 5200
},
{
"epoch": 0.9270462633451957,
"grad_norm": 0.6863322257995605,
"learning_rate": 1.1449005747885212e-05,
"loss": 0.6177238464355469,
"step": 5210
},
{
"epoch": 0.9288256227758007,
"grad_norm": 0.6806472539901733,
"learning_rate": 1.142084378703892e-05,
"loss": 0.6731130599975585,
"step": 5220
},
{
"epoch": 0.9306049822064056,
"grad_norm": 0.6430651545524597,
"learning_rate": 1.1392670320635894e-05,
"loss": 0.6535952091217041,
"step": 5230
},
{
"epoch": 0.9323843416370107,
"grad_norm": 0.6920055150985718,
"learning_rate": 1.1364485576816225e-05,
"loss": 0.6301285266876221,
"step": 5240
},
{
"epoch": 0.9341637010676157,
"grad_norm": 0.7043541669845581,
"learning_rate": 1.1336289783811303e-05,
"loss": 0.6545778751373291,
"step": 5250
},
{
"epoch": 0.9359430604982206,
"grad_norm": 0.7749147415161133,
"learning_rate": 1.130808316994201e-05,
"loss": 0.6474496841430664,
"step": 5260
},
{
"epoch": 0.9377224199288257,
"grad_norm": 0.6907658576965332,
"learning_rate": 1.127986596361684e-05,
"loss": 0.6155929565429688,
"step": 5270
},
{
"epoch": 0.9395017793594306,
"grad_norm": 0.6255580186843872,
"learning_rate": 1.125163839333007e-05,
"loss": 0.6644204139709473,
"step": 5280
},
{
"epoch": 0.9412811387900356,
"grad_norm": 0.5674806833267212,
"learning_rate": 1.1223400687659898e-05,
"loss": 0.6196834564208984,
"step": 5290
},
{
"epoch": 0.9430604982206405,
"grad_norm": 0.7296446561813354,
"learning_rate": 1.1195153075266591e-05,
"loss": 0.6685366153717041,
"step": 5300
},
{
"epoch": 0.9448398576512456,
"grad_norm": 0.5960925817489624,
"learning_rate": 1.1166895784890644e-05,
"loss": 0.6431320190429688,
"step": 5310
},
{
"epoch": 0.9466192170818505,
"grad_norm": 0.6691383719444275,
"learning_rate": 1.1138629045350911e-05,
"loss": 0.6147652626037597,
"step": 5320
},
{
"epoch": 0.9483985765124555,
"grad_norm": 0.7819799184799194,
"learning_rate": 1.1110353085542778e-05,
"loss": 0.6021285057067871,
"step": 5330
},
{
"epoch": 0.9501779359430605,
"grad_norm": 0.690168023109436,
"learning_rate": 1.1082068134436281e-05,
"loss": 0.6405185699462891,
"step": 5340
},
{
"epoch": 0.9519572953736655,
"grad_norm": 0.6121358871459961,
"learning_rate": 1.1053774421074272e-05,
"loss": 0.6638952255249023,
"step": 5350
},
{
"epoch": 0.9537366548042705,
"grad_norm": 0.6227392554283142,
"learning_rate": 1.1025472174570554e-05,
"loss": 0.6735183715820312,
"step": 5360
},
{
"epoch": 0.9555160142348754,
"grad_norm": 0.7254964113235474,
"learning_rate": 1.099716162410803e-05,
"loss": 0.6874561309814453,
"step": 5370
},
{
"epoch": 0.9572953736654805,
"grad_norm": 0.916182816028595,
"learning_rate": 1.0968842998936843e-05,
"loss": 0.6294379234313965,
"step": 5380
},
{
"epoch": 0.9590747330960854,
"grad_norm": 0.7955939769744873,
"learning_rate": 1.0940516528372527e-05,
"loss": 0.648458194732666,
"step": 5390
},
{
"epoch": 0.9608540925266904,
"grad_norm": 0.5914123058319092,
"learning_rate": 1.0912182441794141e-05,
"loss": 0.641319465637207,
"step": 5400
},
{
"epoch": 0.9626334519572953,
"grad_norm": 0.8009176850318909,
"learning_rate": 1.0883840968642416e-05,
"loss": 0.6294963836669922,
"step": 5410
},
{
"epoch": 0.9644128113879004,
"grad_norm": 0.8268579244613647,
"learning_rate": 1.0855492338417905e-05,
"loss": 0.6227912425994873,
"step": 5420
},
{
"epoch": 0.9661921708185054,
"grad_norm": 0.7227009534835815,
"learning_rate": 1.0827136780679109e-05,
"loss": 0.6420284271240234,
"step": 5430
},
{
"epoch": 0.9679715302491103,
"grad_norm": 0.7250238060951233,
"learning_rate": 1.0798774525040628e-05,
"loss": 0.6643415927886963,
"step": 5440
},
{
"epoch": 0.9697508896797153,
"grad_norm": 0.6804770827293396,
"learning_rate": 1.07704058011713e-05,
"loss": 0.6299617290496826,
"step": 5450
},
{
"epoch": 0.9715302491103203,
"grad_norm": 0.8592752814292908,
"learning_rate": 1.0742030838792343e-05,
"loss": 0.6287535667419434,
"step": 5460
},
{
"epoch": 0.9733096085409253,
"grad_norm": 0.6754792928695679,
"learning_rate": 1.0713649867675483e-05,
"loss": 0.6633755207061768,
"step": 5470
},
{
"epoch": 0.9750889679715302,
"grad_norm": 0.8593308925628662,
"learning_rate": 1.0685263117641117e-05,
"loss": 0.6187657833099365,
"step": 5480
},
{
"epoch": 0.9768683274021353,
"grad_norm": 0.6328932642936707,
"learning_rate": 1.065687081855643e-05,
"loss": 0.618121862411499,
"step": 5490
},
{
"epoch": 0.9786476868327402,
"grad_norm": 0.7381909489631653,
"learning_rate": 1.062847320033354e-05,
"loss": 0.6680717945098877,
"step": 5500
},
{
"epoch": 0.9786476868327402,
"eval_loss": 0.6279275417327881,
"eval_runtime": 413.1424,
"eval_samples_per_second": 12.173,
"eval_steps_per_second": 6.087,
"step": 5500
},
{
"epoch": 0.9804270462633452,
"grad_norm": 0.8082507848739624,
"learning_rate": 1.0600070492927644e-05,
"loss": 0.619424295425415,
"step": 5510
},
{
"epoch": 0.9822064056939501,
"grad_norm": 0.782515823841095,
"learning_rate": 1.0571662926335138e-05,
"loss": 0.6681380271911621,
"step": 5520
},
{
"epoch": 0.9839857651245552,
"grad_norm": 0.756851851940155,
"learning_rate": 1.0543250730591778e-05,
"loss": 0.648841381072998,
"step": 5530
},
{
"epoch": 0.9857651245551602,
"grad_norm": 0.6018344759941101,
"learning_rate": 1.0514834135770805e-05,
"loss": 0.6303011417388916,
"step": 5540
},
{
"epoch": 0.9875444839857651,
"grad_norm": 0.6782673597335815,
"learning_rate": 1.0486413371981077e-05,
"loss": 0.6756137371063232,
"step": 5550
},
{
"epoch": 0.9893238434163701,
"grad_norm": 0.680172860622406,
"learning_rate": 1.045798866936521e-05,
"loss": 0.62680082321167,
"step": 5560
},
{
"epoch": 0.9911032028469751,
"grad_norm": 0.7947545647621155,
"learning_rate": 1.042956025809772e-05,
"loss": 0.6698834419250488,
"step": 5570
},
{
"epoch": 0.9928825622775801,
"grad_norm": 0.6484361290931702,
"learning_rate": 1.0401128368383151e-05,
"loss": 0.6242072582244873,
"step": 5580
},
{
"epoch": 0.994661921708185,
"grad_norm": 0.8095009326934814,
"learning_rate": 1.037269323045422e-05,
"loss": 0.6368332862854004,
"step": 5590
},
{
"epoch": 0.99644128113879,
"grad_norm": 0.7750351428985596,
"learning_rate": 1.034425507456994e-05,
"loss": 0.6159787654876709,
"step": 5600
},
{
"epoch": 0.998220640569395,
"grad_norm": 0.6969480514526367,
"learning_rate": 1.0315814131013768e-05,
"loss": 0.6518408298492432,
"step": 5610
},
{
"epoch": 1.0,
"grad_norm": 0.7439318299293518,
"learning_rate": 1.0287370630091731e-05,
"loss": 0.6812664031982422,
"step": 5620
},
{
"epoch": 1.001779359430605,
"grad_norm": 0.7261248826980591,
"learning_rate": 1.0258924802130565e-05,
"loss": 0.6570511817932129,
"step": 5630
},
{
"epoch": 1.00355871886121,
"grad_norm": 0.6743627190589905,
"learning_rate": 1.0230476877475854e-05,
"loss": 0.6013021469116211,
"step": 5640
},
{
"epoch": 1.0053380782918149,
"grad_norm": 0.698407769203186,
"learning_rate": 1.0202027086490154e-05,
"loss": 0.6084145069122314,
"step": 5650
},
{
"epoch": 1.00711743772242,
"grad_norm": 0.7206895351409912,
"learning_rate": 1.0173575659551137e-05,
"loss": 0.642551326751709,
"step": 5660
},
{
"epoch": 1.008896797153025,
"grad_norm": 0.8103647232055664,
"learning_rate": 1.0145122827049725e-05,
"loss": 0.5881889820098877,
"step": 5670
},
{
"epoch": 1.01067615658363,
"grad_norm": 0.7496983408927917,
"learning_rate": 1.0116668819388218e-05,
"loss": 0.6021871089935302,
"step": 5680
},
{
"epoch": 1.0124555160142348,
"grad_norm": 0.6843361258506775,
"learning_rate": 1.0088213866978435e-05,
"loss": 0.6468405723571777,
"step": 5690
},
{
"epoch": 1.0142348754448398,
"grad_norm": 0.6858223676681519,
"learning_rate": 1.0059758200239842e-05,
"loss": 0.6358915328979492,
"step": 5700
},
{
"epoch": 1.0160142348754448,
"grad_norm": 0.9082915186882019,
"learning_rate": 1.0031302049597691e-05,
"loss": 0.6340798377990723,
"step": 5710
},
{
"epoch": 1.0177935943060499,
"grad_norm": 0.6842049360275269,
"learning_rate": 1.0002845645481152e-05,
"loss": 0.6286153793334961,
"step": 5720
},
{
"epoch": 1.019572953736655,
"grad_norm": 0.7649215459823608,
"learning_rate": 9.974389218321453e-06,
"loss": 0.6225019931793213,
"step": 5730
},
{
"epoch": 1.0213523131672597,
"grad_norm": 0.6515229344367981,
"learning_rate": 9.945932998549996e-06,
"loss": 0.6120016574859619,
"step": 5740
},
{
"epoch": 1.0231316725978647,
"grad_norm": 0.5917989015579224,
"learning_rate": 9.917477216596521e-06,
"loss": 0.6404437065124512,
"step": 5750
},
{
"epoch": 1.0249110320284698,
"grad_norm": 0.7759416699409485,
"learning_rate": 9.889022102887205e-06,
"loss": 0.6269901275634766,
"step": 5760
},
{
"epoch": 1.0266903914590748,
"grad_norm": 0.7149049043655396,
"learning_rate": 9.860567887842827e-06,
"loss": 0.6032210350036621,
"step": 5770
},
{
"epoch": 1.0284697508896796,
"grad_norm": 0.6775475144386292,
"learning_rate": 9.832114801876877e-06,
"loss": 0.6163185119628907,
"step": 5780
},
{
"epoch": 1.0302491103202847,
"grad_norm": 0.7221093773841858,
"learning_rate": 9.80366307539372e-06,
"loss": 0.5833635330200195,
"step": 5790
},
{
"epoch": 1.0320284697508897,
"grad_norm": 0.7127135396003723,
"learning_rate": 9.775212938786689e-06,
"loss": 0.6052863597869873,
"step": 5800
},
{
"epoch": 1.0338078291814947,
"grad_norm": 0.7285045981407166,
"learning_rate": 9.746764622436265e-06,
"loss": 0.6380964756011963,
"step": 5810
},
{
"epoch": 1.0355871886120998,
"grad_norm": 0.7396540641784668,
"learning_rate": 9.718318356708172e-06,
"loss": 0.6616352081298829,
"step": 5820
},
{
"epoch": 1.0373665480427046,
"grad_norm": 0.6225027441978455,
"learning_rate": 9.689874371951541e-06,
"loss": 0.6689383029937744,
"step": 5830
},
{
"epoch": 1.0391459074733096,
"grad_norm": 0.8353135585784912,
"learning_rate": 9.661432898497024e-06,
"loss": 0.6042938232421875,
"step": 5840
},
{
"epoch": 1.0409252669039146,
"grad_norm": 0.6610475778579712,
"learning_rate": 9.632994166654941e-06,
"loss": 0.6156399726867676,
"step": 5850
},
{
"epoch": 1.0427046263345197,
"grad_norm": 0.7322497963905334,
"learning_rate": 9.604558406713413e-06,
"loss": 0.6352502822875976,
"step": 5860
},
{
"epoch": 1.0444839857651245,
"grad_norm": 0.7838056087493896,
"learning_rate": 9.576125848936484e-06,
"loss": 0.6298691272735596,
"step": 5870
},
{
"epoch": 1.0462633451957295,
"grad_norm": 0.81569504737854,
"learning_rate": 9.547696723562289e-06,
"loss": 0.6307301998138428,
"step": 5880
},
{
"epoch": 1.0480427046263345,
"grad_norm": 0.8897931575775146,
"learning_rate": 9.519271260801146e-06,
"loss": 0.6494285106658936,
"step": 5890
},
{
"epoch": 1.0498220640569396,
"grad_norm": 0.7175611257553101,
"learning_rate": 9.490849690833732e-06,
"loss": 0.5930293083190918,
"step": 5900
},
{
"epoch": 1.0516014234875444,
"grad_norm": 0.6708109378814697,
"learning_rate": 9.462432243809191e-06,
"loss": 0.595799732208252,
"step": 5910
},
{
"epoch": 1.0533807829181494,
"grad_norm": 0.6808570623397827,
"learning_rate": 9.434019149843285e-06,
"loss": 0.6564432621002197,
"step": 5920
},
{
"epoch": 1.0551601423487544,
"grad_norm": 0.8115124106407166,
"learning_rate": 9.405610639016522e-06,
"loss": 0.6025516033172608,
"step": 5930
},
{
"epoch": 1.0569395017793595,
"grad_norm": 0.7574615478515625,
"learning_rate": 9.377206941372306e-06,
"loss": 0.6670190811157226,
"step": 5940
},
{
"epoch": 1.0587188612099645,
"grad_norm": 0.8795053362846375,
"learning_rate": 9.34880828691505e-06,
"loss": 0.6606448650360107,
"step": 5950
},
{
"epoch": 1.0604982206405693,
"grad_norm": 0.7287918925285339,
"learning_rate": 9.320414905608348e-06,
"loss": 0.6295819759368897,
"step": 5960
},
{
"epoch": 1.0622775800711743,
"grad_norm": 0.7272006869316101,
"learning_rate": 9.292027027373075e-06,
"loss": 0.6425396919250488,
"step": 5970
},
{
"epoch": 1.0640569395017794,
"grad_norm": 0.6797536015510559,
"learning_rate": 9.263644882085564e-06,
"loss": 0.5847621917724609,
"step": 5980
},
{
"epoch": 1.0658362989323844,
"grad_norm": 0.6698694229125977,
"learning_rate": 9.235268699575704e-06,
"loss": 0.6509596824645996,
"step": 5990
},
{
"epoch": 1.0676156583629894,
"grad_norm": 0.7406882643699646,
"learning_rate": 9.206898709625109e-06,
"loss": 0.6330312252044678,
"step": 6000
},
{
"epoch": 1.0676156583629894,
"eval_loss": 0.6239920854568481,
"eval_runtime": 411.471,
"eval_samples_per_second": 12.222,
"eval_steps_per_second": 6.112,
"step": 6000
},
{
"epoch": 1.0693950177935942,
"grad_norm": 0.695398211479187,
"learning_rate": 9.17853514196525e-06,
"loss": 0.6093502998352051,
"step": 6010
},
{
"epoch": 1.0711743772241993,
"grad_norm": 0.839644730091095,
"learning_rate": 9.150178226275584e-06,
"loss": 0.6351033687591553,
"step": 6020
},
{
"epoch": 1.0729537366548043,
"grad_norm": 0.6793309450149536,
"learning_rate": 9.121828192181716e-06,
"loss": 0.6527360916137696,
"step": 6030
},
{
"epoch": 1.0747330960854093,
"grad_norm": 0.8601770401000977,
"learning_rate": 9.093485269253508e-06,
"loss": 0.6375271320343018,
"step": 6040
},
{
"epoch": 1.0765124555160142,
"grad_norm": 0.8268119692802429,
"learning_rate": 9.065149687003256e-06,
"loss": 0.6043989181518554,
"step": 6050
},
{
"epoch": 1.0782918149466192,
"grad_norm": 0.6872331500053406,
"learning_rate": 9.0368216748838e-06,
"loss": 0.5880636215209961,
"step": 6060
},
{
"epoch": 1.0800711743772242,
"grad_norm": 0.7637034058570862,
"learning_rate": 9.008501462286682e-06,
"loss": 0.6080229759216309,
"step": 6070
},
{
"epoch": 1.0818505338078293,
"grad_norm": 0.7357282042503357,
"learning_rate": 8.980189278540294e-06,
"loss": 0.6133236408233642,
"step": 6080
},
{
"epoch": 1.083629893238434,
"grad_norm": 0.7250421643257141,
"learning_rate": 8.951885352908006e-06,
"loss": 0.6319203853607178,
"step": 6090
},
{
"epoch": 1.085409252669039,
"grad_norm": 0.6590055227279663,
"learning_rate": 8.92358991458631e-06,
"loss": 0.649744987487793,
"step": 6100
},
{
"epoch": 1.0871886120996441,
"grad_norm": 0.7239564061164856,
"learning_rate": 8.895303192702988e-06,
"loss": 0.6277432918548584,
"step": 6110
},
{
"epoch": 1.0889679715302492,
"grad_norm": 0.8234860301017761,
"learning_rate": 8.867025416315221e-06,
"loss": 0.6343496799468994,
"step": 6120
},
{
"epoch": 1.0907473309608542,
"grad_norm": 0.8464725613594055,
"learning_rate": 8.838756814407766e-06,
"loss": 0.6384105205535888,
"step": 6130
},
{
"epoch": 1.092526690391459,
"grad_norm": 0.8043434619903564,
"learning_rate": 8.810497615891078e-06,
"loss": 0.6212152957916259,
"step": 6140
},
{
"epoch": 1.094306049822064,
"grad_norm": 0.7087031602859497,
"learning_rate": 8.782248049599468e-06,
"loss": 0.5984238624572754,
"step": 6150
},
{
"epoch": 1.096085409252669,
"grad_norm": 0.7631204724311829,
"learning_rate": 8.754008344289253e-06,
"loss": 0.6375543594360351,
"step": 6160
},
{
"epoch": 1.097864768683274,
"grad_norm": 0.6403396725654602,
"learning_rate": 8.725778728636893e-06,
"loss": 0.6312360763549805,
"step": 6170
},
{
"epoch": 1.099644128113879,
"grad_norm": 0.8257510662078857,
"learning_rate": 8.697559431237153e-06,
"loss": 0.6133918762207031,
"step": 6180
},
{
"epoch": 1.101423487544484,
"grad_norm": 0.6782218813896179,
"learning_rate": 8.66935068060123e-06,
"loss": 0.6308522701263428,
"step": 6190
},
{
"epoch": 1.103202846975089,
"grad_norm": 0.7291736602783203,
"learning_rate": 8.641152705154935e-06,
"loss": 0.6390894889831543,
"step": 6200
},
{
"epoch": 1.104982206405694,
"grad_norm": 0.7014942169189453,
"learning_rate": 8.612965733236811e-06,
"loss": 0.6089354038238526,
"step": 6210
},
{
"epoch": 1.106761565836299,
"grad_norm": 0.6617055535316467,
"learning_rate": 8.584789993096303e-06,
"loss": 0.6288234710693359,
"step": 6220
},
{
"epoch": 1.1085409252669038,
"grad_norm": 0.7303082942962646,
"learning_rate": 8.5566257128919e-06,
"loss": 0.6383206367492675,
"step": 6230
},
{
"epoch": 1.1103202846975089,
"grad_norm": 0.7110517024993896,
"learning_rate": 8.528473120689302e-06,
"loss": 0.6590532779693603,
"step": 6240
},
{
"epoch": 1.112099644128114,
"grad_norm": 0.779214084148407,
"learning_rate": 8.50033244445955e-06,
"loss": 0.6261368274688721,
"step": 6250
},
{
"epoch": 1.113879003558719,
"grad_norm": 0.819240152835846,
"learning_rate": 8.472203912077205e-06,
"loss": 0.6361266136169433,
"step": 6260
},
{
"epoch": 1.1156583629893237,
"grad_norm": 0.7131257653236389,
"learning_rate": 8.444087751318484e-06,
"loss": 0.6407057285308838,
"step": 6270
},
{
"epoch": 1.1174377224199288,
"grad_norm": 0.9261246919631958,
"learning_rate": 8.415984189859418e-06,
"loss": 0.5832745552062988,
"step": 6280
},
{
"epoch": 1.1192170818505338,
"grad_norm": 0.8466585278511047,
"learning_rate": 8.38789345527403e-06,
"loss": 0.6238195419311523,
"step": 6290
},
{
"epoch": 1.1209964412811388,
"grad_norm": 0.7208281755447388,
"learning_rate": 8.359815775032457e-06,
"loss": 0.6571403980255127,
"step": 6300
},
{
"epoch": 1.1227758007117439,
"grad_norm": 0.6916828155517578,
"learning_rate": 8.331751376499131e-06,
"loss": 0.6353542327880859,
"step": 6310
},
{
"epoch": 1.1245551601423487,
"grad_norm": 0.804423987865448,
"learning_rate": 8.303700486930935e-06,
"loss": 0.6343297481536865,
"step": 6320
},
{
"epoch": 1.1263345195729537,
"grad_norm": 0.8437130451202393,
"learning_rate": 8.275663333475365e-06,
"loss": 0.6212164878845214,
"step": 6330
},
{
"epoch": 1.1281138790035588,
"grad_norm": 0.8472525477409363,
"learning_rate": 8.247640143168675e-06,
"loss": 0.6239662647247315,
"step": 6340
},
{
"epoch": 1.1298932384341638,
"grad_norm": 0.696539044380188,
"learning_rate": 8.219631142934062e-06,
"loss": 0.6344510078430176,
"step": 6350
},
{
"epoch": 1.1316725978647686,
"grad_norm": 0.7422542572021484,
"learning_rate": 8.191636559579802e-06,
"loss": 0.6504097938537597,
"step": 6360
},
{
"epoch": 1.1334519572953736,
"grad_norm": 0.801693856716156,
"learning_rate": 8.163656619797444e-06,
"loss": 0.6314446449279785,
"step": 6370
},
{
"epoch": 1.1352313167259787,
"grad_norm": 0.786159098148346,
"learning_rate": 8.135691550159943e-06,
"loss": 0.6034214973449707,
"step": 6380
},
{
"epoch": 1.1370106761565837,
"grad_norm": 0.7948549389839172,
"learning_rate": 8.107741577119853e-06,
"loss": 0.653587532043457,
"step": 6390
},
{
"epoch": 1.1387900355871885,
"grad_norm": 0.6635374426841736,
"learning_rate": 8.079806927007469e-06,
"loss": 0.6300637722015381,
"step": 6400
},
{
"epoch": 1.1405693950177935,
"grad_norm": 0.7938548922538757,
"learning_rate": 8.051887826029006e-06,
"loss": 0.6417815685272217,
"step": 6410
},
{
"epoch": 1.1423487544483986,
"grad_norm": 0.7719388008117676,
"learning_rate": 8.023984500264782e-06,
"loss": 0.624653434753418,
"step": 6420
},
{
"epoch": 1.1441281138790036,
"grad_norm": 0.8489981293678284,
"learning_rate": 7.996097175667352e-06,
"loss": 0.6066908836364746,
"step": 6430
},
{
"epoch": 1.1459074733096086,
"grad_norm": 0.7810853123664856,
"learning_rate": 7.968226078059716e-06,
"loss": 0.5920337677001953,
"step": 6440
},
{
"epoch": 1.1476868327402134,
"grad_norm": 0.7887650728225708,
"learning_rate": 7.940371433133459e-06,
"loss": 0.6641497135162353,
"step": 6450
},
{
"epoch": 1.1494661921708185,
"grad_norm": 0.7652443051338196,
"learning_rate": 7.912533466446947e-06,
"loss": 0.6473179340362549,
"step": 6460
},
{
"epoch": 1.1512455516014235,
"grad_norm": 0.7206646800041199,
"learning_rate": 7.88471240342348e-06,
"loss": 0.5841953754425049,
"step": 6470
},
{
"epoch": 1.1530249110320285,
"grad_norm": 0.7867498993873596,
"learning_rate": 7.856908469349495e-06,
"loss": 0.6486004829406739,
"step": 6480
},
{
"epoch": 1.1548042704626336,
"grad_norm": 0.8296671509742737,
"learning_rate": 7.829121889372702e-06,
"loss": 0.6224873065948486,
"step": 6490
},
{
"epoch": 1.1565836298932384,
"grad_norm": 0.8746348023414612,
"learning_rate": 7.801352888500304e-06,
"loss": 0.640509843826294,
"step": 6500
},
{
"epoch": 1.1565836298932384,
"eval_loss": 0.620580792427063,
"eval_runtime": 411.917,
"eval_samples_per_second": 12.209,
"eval_steps_per_second": 6.106,
"step": 6500
},
{
"epoch": 1.1583629893238434,
"grad_norm": 0.7378506660461426,
"learning_rate": 7.773601691597135e-06,
"loss": 0.6328612327575683,
"step": 6510
},
{
"epoch": 1.1601423487544484,
"grad_norm": 0.7263290882110596,
"learning_rate": 7.745868523383876e-06,
"loss": 0.6071421146392822,
"step": 6520
},
{
"epoch": 1.1619217081850535,
"grad_norm": 0.8653813004493713,
"learning_rate": 7.7181536084352e-06,
"loss": 0.6067211627960205,
"step": 6530
},
{
"epoch": 1.1637010676156583,
"grad_norm": 0.9452764987945557,
"learning_rate": 7.690457171177984e-06,
"loss": 0.6291831970214844,
"step": 6540
},
{
"epoch": 1.1654804270462633,
"grad_norm": 0.65375155210495,
"learning_rate": 7.66277943588947e-06,
"loss": 0.6606705665588379,
"step": 6550
},
{
"epoch": 1.1672597864768683,
"grad_norm": 0.978212833404541,
"learning_rate": 7.635120626695456e-06,
"loss": 0.5956210613250732,
"step": 6560
},
{
"epoch": 1.1690391459074734,
"grad_norm": 0.7400839328765869,
"learning_rate": 7.607480967568497e-06,
"loss": 0.6242622852325439,
"step": 6570
},
{
"epoch": 1.1708185053380782,
"grad_norm": 1.0911694765090942,
"learning_rate": 7.579860682326055e-06,
"loss": 0.6239947319030762,
"step": 6580
},
{
"epoch": 1.1725978647686832,
"grad_norm": 0.7858565449714661,
"learning_rate": 7.552259994628728e-06,
"loss": 0.6366580963134766,
"step": 6590
},
{
"epoch": 1.1743772241992882,
"grad_norm": 0.8578958511352539,
"learning_rate": 7.5246791279784056e-06,
"loss": 0.6067252635955811,
"step": 6600
},
{
"epoch": 1.1761565836298933,
"grad_norm": 0.7615208029747009,
"learning_rate": 7.4971183057164785e-06,
"loss": 0.6032675266265869,
"step": 6610
},
{
"epoch": 1.1779359430604983,
"grad_norm": 0.8318284749984741,
"learning_rate": 7.469577751022024e-06,
"loss": 0.5974441528320312,
"step": 6620
},
{
"epoch": 1.1797153024911031,
"grad_norm": 0.882125198841095,
"learning_rate": 7.442057686909998e-06,
"loss": 0.6573843002319336,
"step": 6630
},
{
"epoch": 1.1814946619217082,
"grad_norm": 0.7975313663482666,
"learning_rate": 7.414558336229426e-06,
"loss": 0.6807239055633545,
"step": 6640
},
{
"epoch": 1.1832740213523132,
"grad_norm": 0.7822281122207642,
"learning_rate": 7.387079921661614e-06,
"loss": 0.6663013458251953,
"step": 6650
},
{
"epoch": 1.1850533807829182,
"grad_norm": 0.8564392328262329,
"learning_rate": 7.359622665718317e-06,
"loss": 0.6218353271484375,
"step": 6660
},
{
"epoch": 1.1868327402135233,
"grad_norm": 0.7377424836158752,
"learning_rate": 7.332186790739973e-06,
"loss": 0.5968504905700683,
"step": 6670
},
{
"epoch": 1.188612099644128,
"grad_norm": 0.730351984500885,
"learning_rate": 7.304772518893866e-06,
"loss": 0.6158394813537598,
"step": 6680
},
{
"epoch": 1.190391459074733,
"grad_norm": 0.7883412837982178,
"learning_rate": 7.277380072172354e-06,
"loss": 0.6515108585357666,
"step": 6690
},
{
"epoch": 1.1921708185053381,
"grad_norm": 0.9434972405433655,
"learning_rate": 7.250009672391063e-06,
"loss": 0.6276324272155762,
"step": 6700
},
{
"epoch": 1.193950177935943,
"grad_norm": 0.8042780756950378,
"learning_rate": 7.2226615411870796e-06,
"loss": 0.6224194526672363,
"step": 6710
},
{
"epoch": 1.195729537366548,
"grad_norm": 0.8094464540481567,
"learning_rate": 7.195335900017181e-06,
"loss": 0.6250015735626221,
"step": 6720
},
{
"epoch": 1.197508896797153,
"grad_norm": 0.8359760046005249,
"learning_rate": 7.168032970156011e-06,
"loss": 0.6454795837402344,
"step": 6730
},
{
"epoch": 1.199288256227758,
"grad_norm": 0.728487491607666,
"learning_rate": 7.140752972694325e-06,
"loss": 0.6341996192932129,
"step": 6740
},
{
"epoch": 1.201067615658363,
"grad_norm": 0.8493318557739258,
"learning_rate": 7.113496128537154e-06,
"loss": 0.6078832626342774,
"step": 6750
},
{
"epoch": 1.2028469750889679,
"grad_norm": 0.7177495360374451,
"learning_rate": 7.086262658402059e-06,
"loss": 0.6244680881500244,
"step": 6760
},
{
"epoch": 1.204626334519573,
"grad_norm": 0.7511991858482361,
"learning_rate": 7.059052782817317e-06,
"loss": 0.6233652591705322,
"step": 6770
},
{
"epoch": 1.206405693950178,
"grad_norm": 1.0159603357315063,
"learning_rate": 7.0318667221201505e-06,
"loss": 0.6203540802001953,
"step": 6780
},
{
"epoch": 1.208185053380783,
"grad_norm": 0.9800230264663696,
"learning_rate": 7.004704696454924e-06,
"loss": 0.621929931640625,
"step": 6790
},
{
"epoch": 1.209964412811388,
"grad_norm": 0.740306556224823,
"learning_rate": 6.97756692577139e-06,
"loss": 0.5935471534729004,
"step": 6800
},
{
"epoch": 1.2117437722419928,
"grad_norm": 0.7026015520095825,
"learning_rate": 6.950453629822874e-06,
"loss": 0.5996420383453369,
"step": 6810
},
{
"epoch": 1.2135231316725978,
"grad_norm": 0.7063615918159485,
"learning_rate": 6.923365028164532e-06,
"loss": 0.6140129089355468,
"step": 6820
},
{
"epoch": 1.2153024911032029,
"grad_norm": 0.6547632217407227,
"learning_rate": 6.8963013401515365e-06,
"loss": 0.6896812915802002,
"step": 6830
},
{
"epoch": 1.217081850533808,
"grad_norm": 0.7407922744750977,
"learning_rate": 6.8692627849373226e-06,
"loss": 0.6203756332397461,
"step": 6840
},
{
"epoch": 1.2188612099644127,
"grad_norm": 0.7740945219993591,
"learning_rate": 6.842249581471814e-06,
"loss": 0.6261481761932373,
"step": 6850
},
{
"epoch": 1.2206405693950177,
"grad_norm": 0.8180769085884094,
"learning_rate": 6.815261948499628e-06,
"loss": 0.6112605094909668,
"step": 6860
},
{
"epoch": 1.2224199288256228,
"grad_norm": 0.7909367084503174,
"learning_rate": 6.788300104558341e-06,
"loss": 0.6290336132049561,
"step": 6870
},
{
"epoch": 1.2241992882562278,
"grad_norm": 0.7958800792694092,
"learning_rate": 6.76136426797668e-06,
"loss": 0.6461628913879395,
"step": 6880
},
{
"epoch": 1.2259786476868326,
"grad_norm": 0.83876633644104,
"learning_rate": 6.734454656872784e-06,
"loss": 0.6532453060150146,
"step": 6890
},
{
"epoch": 1.2277580071174377,
"grad_norm": 0.8222100138664246,
"learning_rate": 6.707571489152418e-06,
"loss": 0.6066458225250244,
"step": 6900
},
{
"epoch": 1.2295373665480427,
"grad_norm": 0.7338365316390991,
"learning_rate": 6.680714982507219e-06,
"loss": 0.6187913417816162,
"step": 6910
},
{
"epoch": 1.2313167259786477,
"grad_norm": 1.0070669651031494,
"learning_rate": 6.653885354412935e-06,
"loss": 0.638453197479248,
"step": 6920
},
{
"epoch": 1.2330960854092528,
"grad_norm": 0.8759623169898987,
"learning_rate": 6.627082822127657e-06,
"loss": 0.6360817909240722,
"step": 6930
},
{
"epoch": 1.2348754448398576,
"grad_norm": 0.7884645462036133,
"learning_rate": 6.600307602690057e-06,
"loss": 0.6332939147949219,
"step": 6940
},
{
"epoch": 1.2366548042704626,
"grad_norm": 0.7928617596626282,
"learning_rate": 6.573559912917648e-06,
"loss": 0.6071663856506347,
"step": 6950
},
{
"epoch": 1.2384341637010676,
"grad_norm": 0.8969078063964844,
"learning_rate": 6.546839969405003e-06,
"loss": 0.6088360786437989,
"step": 6960
},
{
"epoch": 1.2402135231316727,
"grad_norm": 0.6923132538795471,
"learning_rate": 6.520147988522021e-06,
"loss": 0.602042293548584,
"step": 6970
},
{
"epoch": 1.2419928825622777,
"grad_norm": 0.8759524822235107,
"learning_rate": 6.493484186412172e-06,
"loss": 0.6699591636657715,
"step": 6980
},
{
"epoch": 1.2437722419928825,
"grad_norm": 0.9501714706420898,
"learning_rate": 6.466848778990734e-06,
"loss": 0.631440258026123,
"step": 6990
},
{
"epoch": 1.2455516014234875,
"grad_norm": 0.8564425706863403,
"learning_rate": 6.440241981943063e-06,
"loss": 0.6175348281860351,
"step": 7000
},
{
"epoch": 1.2455516014234875,
"eval_loss": 0.6177652478218079,
"eval_runtime": 410.1105,
"eval_samples_per_second": 12.263,
"eval_steps_per_second": 6.132,
"step": 7000
},
{
"epoch": 1.2473309608540926,
"grad_norm": 0.8425670266151428,
"learning_rate": 6.413664010722825e-06,
"loss": 0.6334220886230468,
"step": 7010
},
{
"epoch": 1.2491103202846976,
"grad_norm": 0.6265377998352051,
"learning_rate": 6.3871150805502765e-06,
"loss": 0.6206889629364014,
"step": 7020
},
{
"epoch": 1.2508896797153026,
"grad_norm": 0.7120670676231384,
"learning_rate": 6.360595406410497e-06,
"loss": 0.609261417388916,
"step": 7030
},
{
"epoch": 1.2526690391459074,
"grad_norm": 0.7951711416244507,
"learning_rate": 6.334105203051673e-06,
"loss": 0.6361230850219727,
"step": 7040
},
{
"epoch": 1.2544483985765125,
"grad_norm": 0.9487454891204834,
"learning_rate": 6.307644684983329e-06,
"loss": 0.5716474533081055,
"step": 7050
},
{
"epoch": 1.2562277580071175,
"grad_norm": 0.7447927594184875,
"learning_rate": 6.281214066474623e-06,
"loss": 0.6536383152008056,
"step": 7060
},
{
"epoch": 1.2580071174377223,
"grad_norm": 0.8137961626052856,
"learning_rate": 6.254813561552585e-06,
"loss": 0.6361209869384765,
"step": 7070
},
{
"epoch": 1.2597864768683273,
"grad_norm": 0.7065114378929138,
"learning_rate": 6.2284433840004e-06,
"loss": 0.6489431858062744,
"step": 7080
},
{
"epoch": 1.2615658362989324,
"grad_norm": 0.7409806847572327,
"learning_rate": 6.202103747355667e-06,
"loss": 0.6485908031463623,
"step": 7090
},
{
"epoch": 1.2633451957295374,
"grad_norm": 0.662041962146759,
"learning_rate": 6.175794864908672e-06,
"loss": 0.6204521179199218,
"step": 7100
},
{
"epoch": 1.2651245551601424,
"grad_norm": 0.7687442898750305,
"learning_rate": 6.1495169497006755e-06,
"loss": 0.6361071109771729,
"step": 7110
},
{
"epoch": 1.2669039145907472,
"grad_norm": 0.9222472906112671,
"learning_rate": 6.123270214522159e-06,
"loss": 0.6162978172302246,
"step": 7120
},
{
"epoch": 1.2686832740213523,
"grad_norm": 0.819275975227356,
"learning_rate": 6.097054871911132e-06,
"loss": 0.6023138523101806,
"step": 7130
},
{
"epoch": 1.2704626334519573,
"grad_norm": 0.8842992782592773,
"learning_rate": 6.070871134151386e-06,
"loss": 0.658526086807251,
"step": 7140
},
{
"epoch": 1.2722419928825623,
"grad_norm": 0.7101578116416931,
"learning_rate": 6.044719213270791e-06,
"loss": 0.6004268169403076,
"step": 7150
},
{
"epoch": 1.2740213523131674,
"grad_norm": 0.7537800669670105,
"learning_rate": 6.018599321039569e-06,
"loss": 0.6223325729370117,
"step": 7160
},
{
"epoch": 1.2758007117437722,
"grad_norm": 0.6810758709907532,
"learning_rate": 5.9925116689685925e-06,
"loss": 0.6302705764770508,
"step": 7170
},
{
"epoch": 1.2775800711743772,
"grad_norm": 0.8179675936698914,
"learning_rate": 5.966456468307653e-06,
"loss": 0.6177227020263671,
"step": 7180
},
{
"epoch": 1.2793594306049823,
"grad_norm": 0.7833004593849182,
"learning_rate": 5.940433930043772e-06,
"loss": 0.6414045333862305,
"step": 7190
},
{
"epoch": 1.281138790035587,
"grad_norm": 0.9323675036430359,
"learning_rate": 5.914444264899466e-06,
"loss": 0.603922176361084,
"step": 7200
},
{
"epoch": 1.282918149466192,
"grad_norm": 0.844142735004425,
"learning_rate": 5.888487683331072e-06,
"loss": 0.6308831691741943,
"step": 7210
},
{
"epoch": 1.2846975088967971,
"grad_norm": 0.8182924389839172,
"learning_rate": 5.862564395527013e-06,
"loss": 0.6259811401367188,
"step": 7220
},
{
"epoch": 1.2864768683274022,
"grad_norm": 0.8091204166412354,
"learning_rate": 5.836674611406117e-06,
"loss": 0.6158743858337402,
"step": 7230
},
{
"epoch": 1.2882562277580072,
"grad_norm": 0.9385861754417419,
"learning_rate": 5.810818540615903e-06,
"loss": 0.6304144382476806,
"step": 7240
},
{
"epoch": 1.290035587188612,
"grad_norm": 1.0828267335891724,
"learning_rate": 5.784996392530892e-06,
"loss": 0.6548801898956299,
"step": 7250
},
{
"epoch": 1.291814946619217,
"grad_norm": 0.7489603161811829,
"learning_rate": 5.759208376250916e-06,
"loss": 0.6102576732635498,
"step": 7260
},
{
"epoch": 1.293594306049822,
"grad_norm": 0.7216728329658508,
"learning_rate": 5.733454700599408e-06,
"loss": 0.6362377166748047,
"step": 7270
},
{
"epoch": 1.295373665480427,
"grad_norm": 0.8442836403846741,
"learning_rate": 5.707735574121732e-06,
"loss": 0.6299904346466064,
"step": 7280
},
{
"epoch": 1.2971530249110321,
"grad_norm": 0.6905636787414551,
"learning_rate": 5.68205120508347e-06,
"loss": 0.6162694454193115,
"step": 7290
},
{
"epoch": 1.298932384341637,
"grad_norm": 0.8710734844207764,
"learning_rate": 5.656401801468764e-06,
"loss": 0.5905096054077148,
"step": 7300
},
{
"epoch": 1.300711743772242,
"grad_norm": 0.7497642040252686,
"learning_rate": 5.630787570978601e-06,
"loss": 0.6081714630126953,
"step": 7310
},
{
"epoch": 1.302491103202847,
"grad_norm": 0.7930024266242981,
"learning_rate": 5.605208721029162e-06,
"loss": 0.6393218517303467,
"step": 7320
},
{
"epoch": 1.304270462633452,
"grad_norm": 0.622921347618103,
"learning_rate": 5.579665458750119e-06,
"loss": 0.6169804096221924,
"step": 7330
},
{
"epoch": 1.306049822064057,
"grad_norm": 0.7385269403457642,
"learning_rate": 5.554157990982964e-06,
"loss": 0.6091341495513916,
"step": 7340
},
{
"epoch": 1.3078291814946619,
"grad_norm": 0.839229941368103,
"learning_rate": 5.528686524279336e-06,
"loss": 0.6216320037841797,
"step": 7350
},
{
"epoch": 1.309608540925267,
"grad_norm": 0.8395034670829773,
"learning_rate": 5.503251264899361e-06,
"loss": 0.6268163204193116,
"step": 7360
},
{
"epoch": 1.311387900355872,
"grad_norm": 0.7301527261734009,
"learning_rate": 5.477852418809955e-06,
"loss": 0.6226423740386963,
"step": 7370
},
{
"epoch": 1.3131672597864767,
"grad_norm": 0.9010869860649109,
"learning_rate": 5.452490191683171e-06,
"loss": 0.6155654907226562,
"step": 7380
},
{
"epoch": 1.3149466192170818,
"grad_norm": 0.7506121397018433,
"learning_rate": 5.427164788894543e-06,
"loss": 0.6015159130096436,
"step": 7390
},
{
"epoch": 1.3167259786476868,
"grad_norm": 0.8243475556373596,
"learning_rate": 5.401876415521402e-06,
"loss": 0.6120582103729248,
"step": 7400
},
{
"epoch": 1.3185053380782918,
"grad_norm": 0.877004086971283,
"learning_rate": 5.3766252763412375e-06,
"loss": 0.6368375301361084,
"step": 7410
},
{
"epoch": 1.3202846975088969,
"grad_norm": 0.6776864528656006,
"learning_rate": 5.351411575830015e-06,
"loss": 0.5798074722290039,
"step": 7420
},
{
"epoch": 1.3220640569395017,
"grad_norm": 0.7413604259490967,
"learning_rate": 5.326235518160543e-06,
"loss": 0.6417413711547851,
"step": 7430
},
{
"epoch": 1.3238434163701067,
"grad_norm": 0.680547297000885,
"learning_rate": 5.301097307200804e-06,
"loss": 0.6709810256958008,
"step": 7440
},
{
"epoch": 1.3256227758007118,
"grad_norm": 0.8982633352279663,
"learning_rate": 5.275997146512317e-06,
"loss": 0.6244979858398437,
"step": 7450
},
{
"epoch": 1.3274021352313168,
"grad_norm": 0.7878574728965759,
"learning_rate": 5.250935239348469e-06,
"loss": 0.6555228233337402,
"step": 7460
},
{
"epoch": 1.3291814946619218,
"grad_norm": 0.7958076000213623,
"learning_rate": 5.2259117886528974e-06,
"loss": 0.6336334228515625,
"step": 7470
},
{
"epoch": 1.3309608540925266,
"grad_norm": 0.8020284175872803,
"learning_rate": 5.200926997057818e-06,
"loss": 0.6250798225402832,
"step": 7480
},
{
"epoch": 1.3327402135231317,
"grad_norm": 0.9492243528366089,
"learning_rate": 5.175981066882403e-06,
"loss": 0.5951490879058838,
"step": 7490
},
{
"epoch": 1.3345195729537367,
"grad_norm": 0.8778184056282043,
"learning_rate": 5.15107420013113e-06,
"loss": 0.5949514865875244,
"step": 7500
},
{
"epoch": 1.3345195729537367,
"eval_loss": 0.6148350238800049,
"eval_runtime": 411.3736,
"eval_samples_per_second": 12.225,
"eval_steps_per_second": 6.114,
"step": 7500
},
{
"epoch": 1.3362989323843417,
"grad_norm": 0.888278603553772,
"learning_rate": 5.126206598492167e-06,
"loss": 0.6134849071502686,
"step": 7510
},
{
"epoch": 1.3380782918149468,
"grad_norm": 0.8123595714569092,
"learning_rate": 5.101378463335713e-06,
"loss": 0.6279965877532959,
"step": 7520
},
{
"epoch": 1.3398576512455516,
"grad_norm": 0.7082033753395081,
"learning_rate": 5.07658999571238e-06,
"loss": 0.6239530086517334,
"step": 7530
},
{
"epoch": 1.3416370106761566,
"grad_norm": 0.8400927782058716,
"learning_rate": 5.051841396351574e-06,
"loss": 0.5917630195617676,
"step": 7540
},
{
"epoch": 1.3434163701067616,
"grad_norm": 0.8962631225585938,
"learning_rate": 5.027132865659847e-06,
"loss": 0.6083420753479004,
"step": 7550
},
{
"epoch": 1.3451957295373664,
"grad_norm": 0.9693284630775452,
"learning_rate": 5.0024646037193025e-06,
"loss": 0.606035566329956,
"step": 7560
},
{
"epoch": 1.3469750889679715,
"grad_norm": 0.8629393577575684,
"learning_rate": 4.977836810285942e-06,
"loss": 0.6264309406280517,
"step": 7570
},
{
"epoch": 1.3487544483985765,
"grad_norm": 0.7961446642875671,
"learning_rate": 4.953249684788084e-06,
"loss": 0.6090545654296875,
"step": 7580
},
{
"epoch": 1.3505338078291815,
"grad_norm": 0.7653380036354065,
"learning_rate": 4.928703426324712e-06,
"loss": 0.6106627941131592,
"step": 7590
},
{
"epoch": 1.3523131672597866,
"grad_norm": 0.8642345070838928,
"learning_rate": 4.9041982336639004e-06,
"loss": 0.5813540458679199,
"step": 7600
},
{
"epoch": 1.3540925266903914,
"grad_norm": 0.7263400554656982,
"learning_rate": 4.8797343052411675e-06,
"loss": 0.685100507736206,
"step": 7610
},
{
"epoch": 1.3558718861209964,
"grad_norm": 0.8232746720314026,
"learning_rate": 4.855311839157899e-06,
"loss": 0.6377671718597412,
"step": 7620
},
{
"epoch": 1.3576512455516014,
"grad_norm": 0.73633873462677,
"learning_rate": 4.830931033179725e-06,
"loss": 0.6779125690460205,
"step": 7630
},
{
"epoch": 1.3594306049822065,
"grad_norm": 0.8550395965576172,
"learning_rate": 4.806592084734928e-06,
"loss": 0.6058492660522461,
"step": 7640
},
{
"epoch": 1.3612099644128115,
"grad_norm": 0.7828453779220581,
"learning_rate": 4.782295190912831e-06,
"loss": 0.6215229034423828,
"step": 7650
},
{
"epoch": 1.3629893238434163,
"grad_norm": 0.8796820640563965,
"learning_rate": 4.758040548462233e-06,
"loss": 0.5914856433868408,
"step": 7660
},
{
"epoch": 1.3647686832740213,
"grad_norm": 0.7559328675270081,
"learning_rate": 4.733828353789772e-06,
"loss": 0.6324697017669678,
"step": 7670
},
{
"epoch": 1.3665480427046264,
"grad_norm": 0.9165831804275513,
"learning_rate": 4.709658802958366e-06,
"loss": 0.6462049961090088,
"step": 7680
},
{
"epoch": 1.3683274021352312,
"grad_norm": 0.9339064359664917,
"learning_rate": 4.685532091685625e-06,
"loss": 0.6461282253265381,
"step": 7690
},
{
"epoch": 1.3701067615658362,
"grad_norm": 0.8191850781440735,
"learning_rate": 4.66144841534224e-06,
"loss": 0.6682465553283692,
"step": 7700
},
{
"epoch": 1.3718861209964412,
"grad_norm": 0.9715204238891602,
"learning_rate": 4.637407968950434e-06,
"loss": 0.5913180351257324,
"step": 7710
},
{
"epoch": 1.3736654804270463,
"grad_norm": 0.7584412693977356,
"learning_rate": 4.613410947182354e-06,
"loss": 0.5695419311523438,
"step": 7720
},
{
"epoch": 1.3754448398576513,
"grad_norm": 0.6991924047470093,
"learning_rate": 4.589457544358521e-06,
"loss": 0.6356719493865967,
"step": 7730
},
{
"epoch": 1.3772241992882561,
"grad_norm": 0.8555135726928711,
"learning_rate": 4.56554795444623e-06,
"loss": 0.5793665409088135,
"step": 7740
},
{
"epoch": 1.3790035587188612,
"grad_norm": 0.9497565031051636,
"learning_rate": 4.5416823710580046e-06,
"loss": 0.6186023712158203,
"step": 7750
},
{
"epoch": 1.3807829181494662,
"grad_norm": 0.8282408118247986,
"learning_rate": 4.517860987450007e-06,
"loss": 0.6287422180175781,
"step": 7760
},
{
"epoch": 1.3825622775800712,
"grad_norm": 0.7512138485908508,
"learning_rate": 4.494083996520492e-06,
"loss": 0.6511450290679932,
"step": 7770
},
{
"epoch": 1.3843416370106763,
"grad_norm": 0.8342724442481995,
"learning_rate": 4.470351590808228e-06,
"loss": 0.6355900764465332,
"step": 7780
},
{
"epoch": 1.386120996441281,
"grad_norm": 0.9951474666595459,
"learning_rate": 4.446663962490951e-06,
"loss": 0.6075921058654785,
"step": 7790
},
{
"epoch": 1.387900355871886,
"grad_norm": 0.7905099391937256,
"learning_rate": 4.423021303383799e-06,
"loss": 0.5925492286682129,
"step": 7800
},
{
"epoch": 1.3896797153024911,
"grad_norm": 0.8992406129837036,
"learning_rate": 4.3994238049377715e-06,
"loss": 0.6204987049102784,
"step": 7810
},
{
"epoch": 1.3914590747330962,
"grad_norm": 1.124568223953247,
"learning_rate": 4.375871658238163e-06,
"loss": 0.6461961746215821,
"step": 7820
},
{
"epoch": 1.3932384341637012,
"grad_norm": 0.7916883230209351,
"learning_rate": 4.352365054003022e-06,
"loss": 0.6402833461761475,
"step": 7830
},
{
"epoch": 1.395017793594306,
"grad_norm": 1.0332368612289429,
"learning_rate": 4.328904182581619e-06,
"loss": 0.6191961288452148,
"step": 7840
},
{
"epoch": 1.396797153024911,
"grad_norm": 1.0284537076950073,
"learning_rate": 4.305489233952881e-06,
"loss": 0.634841012954712,
"step": 7850
},
{
"epoch": 1.398576512455516,
"grad_norm": 0.8202362656593323,
"learning_rate": 4.282120397723879e-06,
"loss": 0.5904129028320313,
"step": 7860
},
{
"epoch": 1.4003558718861209,
"grad_norm": 0.8149864077568054,
"learning_rate": 4.258797863128266e-06,
"loss": 0.6212067127227783,
"step": 7870
},
{
"epoch": 1.402135231316726,
"grad_norm": 0.750307023525238,
"learning_rate": 4.235521819024776e-06,
"loss": 0.6193465232849121,
"step": 7880
},
{
"epoch": 1.403914590747331,
"grad_norm": 0.719935417175293,
"learning_rate": 4.212292453895658e-06,
"loss": 0.6318532466888428,
"step": 7890
},
{
"epoch": 1.405693950177936,
"grad_norm": 0.8568369150161743,
"learning_rate": 4.189109955845186e-06,
"loss": 0.6267857074737548,
"step": 7900
},
{
"epoch": 1.407473309608541,
"grad_norm": 0.8335803151130676,
"learning_rate": 4.165974512598102e-06,
"loss": 0.6253261089324951,
"step": 7910
},
{
"epoch": 1.4092526690391458,
"grad_norm": 0.9222790598869324,
"learning_rate": 4.142886311498133e-06,
"loss": 0.6295660972595215,
"step": 7920
},
{
"epoch": 1.4110320284697508,
"grad_norm": 0.7769973874092102,
"learning_rate": 4.119845539506436e-06,
"loss": 0.6295949935913085,
"step": 7930
},
{
"epoch": 1.4128113879003559,
"grad_norm": 0.8693393468856812,
"learning_rate": 4.096852383200106e-06,
"loss": 0.5903539180755615,
"step": 7940
},
{
"epoch": 1.414590747330961,
"grad_norm": 0.8232077956199646,
"learning_rate": 4.073907028770671e-06,
"loss": 0.5986703395843506,
"step": 7950
},
{
"epoch": 1.416370106761566,
"grad_norm": 0.9507026672363281,
"learning_rate": 4.051009662022559e-06,
"loss": 0.6164099693298339,
"step": 7960
},
{
"epoch": 1.4181494661921707,
"grad_norm": 0.9032939672470093,
"learning_rate": 4.028160468371618e-06,
"loss": 0.5873546600341797,
"step": 7970
},
{
"epoch": 1.4199288256227758,
"grad_norm": 0.7442892789840698,
"learning_rate": 4.005359632843598e-06,
"loss": 0.6300751209259033,
"step": 7980
},
{
"epoch": 1.4217081850533808,
"grad_norm": 0.870611310005188,
"learning_rate": 3.982607340072673e-06,
"loss": 0.6464887142181397,
"step": 7990
},
{
"epoch": 1.4234875444839858,
"grad_norm": 0.9346739053726196,
"learning_rate": 3.959903774299914e-06,
"loss": 0.5978512763977051,
"step": 8000
},
{
"epoch": 1.4234875444839858,
"eval_loss": 0.6126887202262878,
"eval_runtime": 408.0866,
"eval_samples_per_second": 12.323,
"eval_steps_per_second": 6.163,
"step": 8000
},
{
"epoch": 1.4252669039145909,
"grad_norm": 0.7768172025680542,
"learning_rate": 3.937249119371837e-06,
"loss": 0.6360022544860839,
"step": 8010
},
{
"epoch": 1.4270462633451957,
"grad_norm": 0.9232711791992188,
"learning_rate": 3.914643558738871e-06,
"loss": 0.6255642414093018,
"step": 8020
},
{
"epoch": 1.4288256227758007,
"grad_norm": 0.6983018517494202,
"learning_rate": 3.892087275453913e-06,
"loss": 0.5994945526123047,
"step": 8030
},
{
"epoch": 1.4306049822064058,
"grad_norm": 1.073339819908142,
"learning_rate": 3.869580452170813e-06,
"loss": 0.6356189250946045,
"step": 8040
},
{
"epoch": 1.4323843416370106,
"grad_norm": 0.6503099799156189,
"learning_rate": 3.847123271142921e-06,
"loss": 0.6280940055847168,
"step": 8050
},
{
"epoch": 1.4341637010676156,
"grad_norm": 0.7652536630630493,
"learning_rate": 3.824715914221593e-06,
"loss": 0.633913803100586,
"step": 8060
},
{
"epoch": 1.4359430604982206,
"grad_norm": 0.7776927351951599,
"learning_rate": 3.802358562854719e-06,
"loss": 0.6014345169067383,
"step": 8070
},
{
"epoch": 1.4377224199288257,
"grad_norm": 0.9608566164970398,
"learning_rate": 3.780051398085274e-06,
"loss": 0.6504060745239257,
"step": 8080
},
{
"epoch": 1.4395017793594307,
"grad_norm": 1.0493940114974976,
"learning_rate": 3.7577946005498224e-06,
"loss": 0.6023014545440674,
"step": 8090
},
{
"epoch": 1.4412811387900355,
"grad_norm": 0.6732901930809021,
"learning_rate": 3.735588350477083e-06,
"loss": 0.618662166595459,
"step": 8100
},
{
"epoch": 1.4430604982206405,
"grad_norm": 0.7948004007339478,
"learning_rate": 3.7134328276864474e-06,
"loss": 0.6384446144104003,
"step": 8110
},
{
"epoch": 1.4448398576512456,
"grad_norm": 0.8854271769523621,
"learning_rate": 3.691328211586537e-06,
"loss": 0.6317539215087891,
"step": 8120
},
{
"epoch": 1.4466192170818506,
"grad_norm": 0.849312961101532,
"learning_rate": 3.669274681173741e-06,
"loss": 0.632611894607544,
"step": 8130
},
{
"epoch": 1.4483985765124556,
"grad_norm": 0.8737841844558716,
"learning_rate": 3.647272415030787e-06,
"loss": 0.6378211498260498,
"step": 8140
},
{
"epoch": 1.4501779359430604,
"grad_norm": 0.9041767716407776,
"learning_rate": 3.6253215913252614e-06,
"loss": 0.6225557804107666,
"step": 8150
},
{
"epoch": 1.4519572953736655,
"grad_norm": 0.9240919947624207,
"learning_rate": 3.603422387808203e-06,
"loss": 0.6011790752410888,
"step": 8160
},
{
"epoch": 1.4537366548042705,
"grad_norm": 0.7381271719932556,
"learning_rate": 3.5815749818126298e-06,
"loss": 0.6100322723388671,
"step": 8170
},
{
"epoch": 1.4555160142348753,
"grad_norm": 0.766522228717804,
"learning_rate": 3.559779550252135e-06,
"loss": 0.6223373413085938,
"step": 8180
},
{
"epoch": 1.4572953736654806,
"grad_norm": 0.722597599029541,
"learning_rate": 3.5380362696194246e-06,
"loss": 0.6096580982208252,
"step": 8190
},
{
"epoch": 1.4590747330960854,
"grad_norm": 0.848345935344696,
"learning_rate": 3.5163453159849158e-06,
"loss": 0.5932037830352783,
"step": 8200
},
{
"epoch": 1.4608540925266904,
"grad_norm": 0.8649978637695312,
"learning_rate": 3.4947068649952875e-06,
"loss": 0.616939926147461,
"step": 8210
},
{
"epoch": 1.4626334519572954,
"grad_norm": 1.0290734767913818,
"learning_rate": 3.473121091872068e-06,
"loss": 0.5814998149871826,
"step": 8220
},
{
"epoch": 1.4644128113879002,
"grad_norm": 1.0387070178985596,
"learning_rate": 3.4515881714102283e-06,
"loss": 0.602289867401123,
"step": 8230
},
{
"epoch": 1.4661921708185053,
"grad_norm": 0.7532253861427307,
"learning_rate": 3.4301082779767394e-06,
"loss": 0.6335249423980713,
"step": 8240
},
{
"epoch": 1.4679715302491103,
"grad_norm": 0.8629032373428345,
"learning_rate": 3.4086815855091906e-06,
"loss": 0.6227278709411621,
"step": 8250
},
{
"epoch": 1.4697508896797153,
"grad_norm": 0.9435595870018005,
"learning_rate": 3.3873082675143533e-06,
"loss": 0.6437982559204102,
"step": 8260
},
{
"epoch": 1.4715302491103204,
"grad_norm": 0.9069509506225586,
"learning_rate": 3.3659884970667955e-06,
"loss": 0.5948871612548828,
"step": 8270
},
{
"epoch": 1.4733096085409252,
"grad_norm": 0.7571467757225037,
"learning_rate": 3.344722446807469e-06,
"loss": 0.6302636623382568,
"step": 8280
},
{
"epoch": 1.4750889679715302,
"grad_norm": 0.7709717154502869,
"learning_rate": 3.3235102889423263e-06,
"loss": 0.6287346839904785,
"step": 8290
},
{
"epoch": 1.4768683274021353,
"grad_norm": 0.9050244688987732,
"learning_rate": 3.302352195240901e-06,
"loss": 0.6431692123413086,
"step": 8300
},
{
"epoch": 1.4786476868327403,
"grad_norm": 0.8429788947105408,
"learning_rate": 3.281248337034947e-06,
"loss": 0.6204410076141358,
"step": 8310
},
{
"epoch": 1.4804270462633453,
"grad_norm": 0.7680916786193848,
"learning_rate": 3.2601988852170207e-06,
"loss": 0.6185726642608642,
"step": 8320
},
{
"epoch": 1.4822064056939501,
"grad_norm": 0.7400604486465454,
"learning_rate": 3.2392040102391278e-06,
"loss": 0.711566686630249,
"step": 8330
},
{
"epoch": 1.4839857651245552,
"grad_norm": 0.7004432082176208,
"learning_rate": 3.2182638821113156e-06,
"loss": 0.5973163604736328,
"step": 8340
},
{
"epoch": 1.4857651245551602,
"grad_norm": 0.8438174724578857,
"learning_rate": 3.1973786704003086e-06,
"loss": 0.6273365497589112,
"step": 8350
},
{
"epoch": 1.487544483985765,
"grad_norm": 0.9420303702354431,
"learning_rate": 3.1765485442281453e-06,
"loss": 0.6080061435699463,
"step": 8360
},
{
"epoch": 1.48932384341637,
"grad_norm": 0.8402919769287109,
"learning_rate": 3.1557736722707843e-06,
"loss": 0.6027824401855468,
"step": 8370
},
{
"epoch": 1.491103202846975,
"grad_norm": 1.0150177478790283,
"learning_rate": 3.1350542227567693e-06,
"loss": 0.629277515411377,
"step": 8380
},
{
"epoch": 1.49288256227758,
"grad_norm": 0.8253095149993896,
"learning_rate": 3.1143903634658314e-06,
"loss": 0.6055563926696778,
"step": 8390
},
{
"epoch": 1.4946619217081851,
"grad_norm": 0.7219249606132507,
"learning_rate": 3.093782261727567e-06,
"loss": 0.6305870532989502,
"step": 8400
},
{
"epoch": 1.49644128113879,
"grad_norm": 0.7794970870018005,
"learning_rate": 3.073230084420051e-06,
"loss": 0.6157556533813476,
"step": 8410
},
{
"epoch": 1.498220640569395,
"grad_norm": 0.8257132172584534,
"learning_rate": 3.0527339979685068e-06,
"loss": 0.628666877746582,
"step": 8420
},
{
"epoch": 1.5,
"grad_norm": 0.7853142619132996,
"learning_rate": 3.0322941683439455e-06,
"loss": 0.6387495040893555,
"step": 8430
},
{
"epoch": 1.501779359430605,
"grad_norm": 0.9834029674530029,
"learning_rate": 3.011910761061837e-06,
"loss": 0.6235998153686524,
"step": 8440
},
{
"epoch": 1.50355871886121,
"grad_norm": 0.7815040349960327,
"learning_rate": 2.99158394118075e-06,
"loss": 0.6229785919189453,
"step": 8450
},
{
"epoch": 1.5053380782918149,
"grad_norm": 0.6993287801742554,
"learning_rate": 2.9713138733010373e-06,
"loss": 0.6498080730438233,
"step": 8460
},
{
"epoch": 1.50711743772242,
"grad_norm": 0.8505419492721558,
"learning_rate": 2.951100721563479e-06,
"loss": 0.6163151264190674,
"step": 8470
},
{
"epoch": 1.508896797153025,
"grad_norm": 0.7740962505340576,
"learning_rate": 2.930944649647971e-06,
"loss": 0.6463754177093506,
"step": 8480
},
{
"epoch": 1.5106761565836297,
"grad_norm": 0.9081360101699829,
"learning_rate": 2.9108458207722013e-06,
"loss": 0.5963332653045654,
"step": 8490
},
{
"epoch": 1.512455516014235,
"grad_norm": 0.7486013770103455,
"learning_rate": 2.8908043976903065e-06,
"loss": 0.5907905101776123,
"step": 8500
},
{
"epoch": 1.512455516014235,
"eval_loss": 0.6111557483673096,
"eval_runtime": 408.1089,
"eval_samples_per_second": 12.323,
"eval_steps_per_second": 6.163,
"step": 8500
},
{
"epoch": 1.5142348754448398,
"grad_norm": 0.6754533648490906,
"learning_rate": 2.8708205426915858e-06,
"loss": 0.5735606670379638,
"step": 8510
},
{
"epoch": 1.5160142348754448,
"grad_norm": 0.9227625727653503,
"learning_rate": 2.850894417599154e-06,
"loss": 0.586240005493164,
"step": 8520
},
{
"epoch": 1.5177935943060499,
"grad_norm": 1.1501375436782837,
"learning_rate": 2.8310261837686594e-06,
"loss": 0.5986839771270752,
"step": 8530
},
{
"epoch": 1.5195729537366547,
"grad_norm": 0.8052434325218201,
"learning_rate": 2.811216002086954e-06,
"loss": 0.6387444972991944,
"step": 8540
},
{
"epoch": 1.52135231316726,
"grad_norm": 0.9873703718185425,
"learning_rate": 2.791464032970812e-06,
"loss": 0.6114506244659423,
"step": 8550
},
{
"epoch": 1.5231316725978647,
"grad_norm": 0.8305763006210327,
"learning_rate": 2.771770436365612e-06,
"loss": 0.65102219581604,
"step": 8560
},
{
"epoch": 1.5249110320284698,
"grad_norm": 0.9540684223175049,
"learning_rate": 2.7521353717440523e-06,
"loss": 0.6242643833160401,
"step": 8570
},
{
"epoch": 1.5266903914590748,
"grad_norm": 0.764707624912262,
"learning_rate": 2.732558998104855e-06,
"loss": 0.6340816020965576,
"step": 8580
},
{
"epoch": 1.5284697508896796,
"grad_norm": 0.8800462484359741,
"learning_rate": 2.7130414739714884e-06,
"loss": 0.6269487857818603,
"step": 8590
},
{
"epoch": 1.5302491103202847,
"grad_norm": 0.7068182826042175,
"learning_rate": 2.6935829573908645e-06,
"loss": 0.6376915454864502,
"step": 8600
},
{
"epoch": 1.5320284697508897,
"grad_norm": 0.9243912100791931,
"learning_rate": 2.6741836059320813e-06,
"loss": 0.6650017261505127,
"step": 8610
},
{
"epoch": 1.5338078291814945,
"grad_norm": 0.8034710884094238,
"learning_rate": 2.654843576685129e-06,
"loss": 0.6140787601470947,
"step": 8620
},
{
"epoch": 1.5355871886120998,
"grad_norm": 0.6924305558204651,
"learning_rate": 2.635563026259622e-06,
"loss": 0.592177438735962,
"step": 8630
},
{
"epoch": 1.5373665480427046,
"grad_norm": 0.9960291385650635,
"learning_rate": 2.616342110783544e-06,
"loss": 0.620767879486084,
"step": 8640
},
{
"epoch": 1.5391459074733096,
"grad_norm": 0.9700925350189209,
"learning_rate": 2.5971809859019616e-06,
"loss": 0.6059544563293457,
"step": 8650
},
{
"epoch": 1.5409252669039146,
"grad_norm": 0.8979527354240417,
"learning_rate": 2.578079806775786e-06,
"loss": 0.6165768146514893,
"step": 8660
},
{
"epoch": 1.5427046263345194,
"grad_norm": 0.7694889903068542,
"learning_rate": 2.559038728080495e-06,
"loss": 0.6269434452056885,
"step": 8670
},
{
"epoch": 1.5444839857651247,
"grad_norm": 0.7812192440032959,
"learning_rate": 2.5400579040049045e-06,
"loss": 0.6156674385070801,
"step": 8680
},
{
"epoch": 1.5462633451957295,
"grad_norm": 0.8593633770942688,
"learning_rate": 2.521137488249892e-06,
"loss": 0.5995303630828858,
"step": 8690
},
{
"epoch": 1.5480427046263345,
"grad_norm": 0.7315478920936584,
"learning_rate": 2.5022776340271827e-06,
"loss": 0.6255430698394775,
"step": 8700
},
{
"epoch": 1.5498220640569396,
"grad_norm": 0.7979263663291931,
"learning_rate": 2.483478494058081e-06,
"loss": 0.617695426940918,
"step": 8710
},
{
"epoch": 1.5516014234875444,
"grad_norm": 0.791521430015564,
"learning_rate": 2.4647402205722513e-06,
"loss": 0.5985545158386231,
"step": 8720
},
{
"epoch": 1.5533807829181496,
"grad_norm": 0.8115281462669373,
"learning_rate": 2.446062965306476e-06,
"loss": 0.6137414932250976,
"step": 8730
},
{
"epoch": 1.5551601423487544,
"grad_norm": 0.73805832862854,
"learning_rate": 2.427446879503439e-06,
"loss": 0.6292818069458008,
"step": 8740
},
{
"epoch": 1.5569395017793595,
"grad_norm": 0.7681689262390137,
"learning_rate": 2.408892113910485e-06,
"loss": 0.6184986114501954,
"step": 8750
},
{
"epoch": 1.5587188612099645,
"grad_norm": 0.8392589688301086,
"learning_rate": 2.390398818778403e-06,
"loss": 0.6432001113891601,
"step": 8760
},
{
"epoch": 1.5604982206405693,
"grad_norm": 0.7140183448791504,
"learning_rate": 2.3719671438602287e-06,
"loss": 0.5931034088134766,
"step": 8770
},
{
"epoch": 1.5622775800711743,
"grad_norm": 1.0340561866760254,
"learning_rate": 2.353597238409997e-06,
"loss": 0.6184981346130372,
"step": 8780
},
{
"epoch": 1.5640569395017794,
"grad_norm": 1.3788442611694336,
"learning_rate": 2.3352892511815695e-06,
"loss": 0.6254217147827148,
"step": 8790
},
{
"epoch": 1.5658362989323842,
"grad_norm": 0.8685413599014282,
"learning_rate": 2.3170433304274e-06,
"loss": 0.6440535545349121,
"step": 8800
},
{
"epoch": 1.5676156583629894,
"grad_norm": 0.8243607878684998,
"learning_rate": 2.298859623897357e-06,
"loss": 0.6024580001831055,
"step": 8810
},
{
"epoch": 1.5693950177935942,
"grad_norm": 0.9618055820465088,
"learning_rate": 2.280738278837508e-06,
"loss": 0.6213048934936524,
"step": 8820
},
{
"epoch": 1.5711743772241993,
"grad_norm": 0.8591263294219971,
"learning_rate": 2.2626794419889463e-06,
"loss": 0.6279497146606445,
"step": 8830
},
{
"epoch": 1.5729537366548043,
"grad_norm": 0.8621962070465088,
"learning_rate": 2.2446832595865833e-06,
"loss": 0.5932358741760254,
"step": 8840
},
{
"epoch": 1.5747330960854091,
"grad_norm": 0.9073888659477234,
"learning_rate": 2.226749877357983e-06,
"loss": 0.6451629161834717,
"step": 8850
},
{
"epoch": 1.5765124555160144,
"grad_norm": 0.9106960892677307,
"learning_rate": 2.208879440522167e-06,
"loss": 0.6058315753936767,
"step": 8860
},
{
"epoch": 1.5782918149466192,
"grad_norm": 0.7805183529853821,
"learning_rate": 2.1910720937884432e-06,
"loss": 0.6308177947998047,
"step": 8870
},
{
"epoch": 1.5800711743772242,
"grad_norm": 0.8798645734786987,
"learning_rate": 2.1733279813552386e-06,
"loss": 0.6120688438415527,
"step": 8880
},
{
"epoch": 1.5818505338078293,
"grad_norm": 0.7852058410644531,
"learning_rate": 2.1556472469089305e-06,
"loss": 0.6294644832611084,
"step": 8890
},
{
"epoch": 1.583629893238434,
"grad_norm": 1.0219141244888306,
"learning_rate": 2.1380300336226756e-06,
"loss": 0.6379860877990723,
"step": 8900
},
{
"epoch": 1.585409252669039,
"grad_norm": 0.8301982879638672,
"learning_rate": 2.120476484155255e-06,
"loss": 0.5950196743011474,
"step": 8910
},
{
"epoch": 1.5871886120996441,
"grad_norm": 0.781911313533783,
"learning_rate": 2.102986740649928e-06,
"loss": 0.6155390739440918,
"step": 8920
},
{
"epoch": 1.5889679715302492,
"grad_norm": 0.8553647994995117,
"learning_rate": 2.0855609447332635e-06,
"loss": 0.6030520439147949,
"step": 8930
},
{
"epoch": 1.5907473309608542,
"grad_norm": 0.8010233640670776,
"learning_rate": 2.06819923751401e-06,
"loss": 0.5782717227935791,
"step": 8940
},
{
"epoch": 1.592526690391459,
"grad_norm": 0.7767868041992188,
"learning_rate": 2.050901759581937e-06,
"loss": 0.5862733364105225,
"step": 8950
},
{
"epoch": 1.594306049822064,
"grad_norm": 0.894256591796875,
"learning_rate": 2.033668651006715e-06,
"loss": 0.6170249938964844,
"step": 8960
},
{
"epoch": 1.596085409252669,
"grad_norm": 0.8816112875938416,
"learning_rate": 2.0165000513367604e-06,
"loss": 0.5960803508758545,
"step": 8970
},
{
"epoch": 1.5978647686832739,
"grad_norm": 0.7646706700325012,
"learning_rate": 1.9993960995981287e-06,
"loss": 0.609787130355835,
"step": 8980
},
{
"epoch": 1.5996441281138791,
"grad_norm": 0.8380435109138489,
"learning_rate": 1.9823569342933624e-06,
"loss": 0.5809425354003906,
"step": 8990
},
{
"epoch": 1.601423487544484,
"grad_norm": 0.7763661742210388,
"learning_rate": 1.965382693400396e-06,
"loss": 0.6282608032226562,
"step": 9000
},
{
"epoch": 1.601423487544484,
"eval_loss": 0.6098406314849854,
"eval_runtime": 407.9599,
"eval_samples_per_second": 12.327,
"eval_steps_per_second": 6.165,
"step": 9000
},
{
"epoch": 1.603202846975089,
"grad_norm": 0.7980552315711975,
"learning_rate": 1.9484735143714184e-06,
"loss": 0.5892675876617431,
"step": 9010
},
{
"epoch": 1.604982206405694,
"grad_norm": 0.7350971102714539,
"learning_rate": 1.931629534131769e-06,
"loss": 0.615026330947876,
"step": 9020
},
{
"epoch": 1.6067615658362988,
"grad_norm": 0.9350213408470154,
"learning_rate": 1.9148508890788263e-06,
"loss": 0.5945257663726806,
"step": 9030
},
{
"epoch": 1.608540925266904,
"grad_norm": 0.8562370538711548,
"learning_rate": 1.8981377150809111e-06,
"loss": 0.6524335384368897,
"step": 9040
},
{
"epoch": 1.6103202846975089,
"grad_norm": 0.8481477499008179,
"learning_rate": 1.8814901474761704e-06,
"loss": 0.572668981552124,
"step": 9050
},
{
"epoch": 1.612099644128114,
"grad_norm": 0.8405110239982605,
"learning_rate": 1.8649083210714946e-06,
"loss": 0.6025730609893799,
"step": 9060
},
{
"epoch": 1.613879003558719,
"grad_norm": 0.6442246437072754,
"learning_rate": 1.8483923701414274e-06,
"loss": 0.619115161895752,
"step": 9070
},
{
"epoch": 1.6156583629893237,
"grad_norm": 0.9033611416816711,
"learning_rate": 1.8319424284270638e-06,
"loss": 0.6313360214233399,
"step": 9080
},
{
"epoch": 1.6174377224199288,
"grad_norm": 0.8982630968093872,
"learning_rate": 1.8155586291349836e-06,
"loss": 0.6707229137420654,
"step": 9090
},
{
"epoch": 1.6192170818505338,
"grad_norm": 0.8548517823219299,
"learning_rate": 1.7992411049361612e-06,
"loss": 0.5913959503173828,
"step": 9100
},
{
"epoch": 1.6209964412811388,
"grad_norm": 0.8158797025680542,
"learning_rate": 1.7829899879649005e-06,
"loss": 0.6455716133117676,
"step": 9110
},
{
"epoch": 1.6227758007117439,
"grad_norm": 1.0375958681106567,
"learning_rate": 1.7668054098177512e-06,
"loss": 0.6378812789916992,
"step": 9120
},
{
"epoch": 1.6245551601423487,
"grad_norm": 0.9441247582435608,
"learning_rate": 1.7506875015524649e-06,
"loss": 0.6197398662567138,
"step": 9130
},
{
"epoch": 1.6263345195729537,
"grad_norm": 0.7956865429878235,
"learning_rate": 1.7346363936869082e-06,
"loss": 0.6085912704467773,
"step": 9140
},
{
"epoch": 1.6281138790035588,
"grad_norm": 0.7922634482383728,
"learning_rate": 1.7186522161980279e-06,
"loss": 0.6055495738983154,
"step": 9150
},
{
"epoch": 1.6298932384341636,
"grad_norm": 0.9479956030845642,
"learning_rate": 1.7027350985207847e-06,
"loss": 0.5977864265441895,
"step": 9160
},
{
"epoch": 1.6316725978647688,
"grad_norm": 0.8339085578918457,
"learning_rate": 1.68688516954711e-06,
"loss": 0.6118191242218017,
"step": 9170
},
{
"epoch": 1.6334519572953736,
"grad_norm": 0.8879317045211792,
"learning_rate": 1.6711025576248586e-06,
"loss": 0.607643461227417,
"step": 9180
},
{
"epoch": 1.6352313167259787,
"grad_norm": 0.7563620805740356,
"learning_rate": 1.655387390556782e-06,
"loss": 0.566606092453003,
"step": 9190
},
{
"epoch": 1.6370106761565837,
"grad_norm": 0.8990421295166016,
"learning_rate": 1.6397397955994742e-06,
"loss": 0.6038555145263672,
"step": 9200
},
{
"epoch": 1.6387900355871885,
"grad_norm": 0.8692913055419922,
"learning_rate": 1.624159899462353e-06,
"loss": 0.6045926570892334,
"step": 9210
},
{
"epoch": 1.6405693950177938,
"grad_norm": 0.7604990005493164,
"learning_rate": 1.6086478283066386e-06,
"loss": 0.6289362907409668,
"step": 9220
},
{
"epoch": 1.6423487544483986,
"grad_norm": 0.8552067875862122,
"learning_rate": 1.5932037077443163e-06,
"loss": 0.5995774269104004,
"step": 9230
},
{
"epoch": 1.6441281138790036,
"grad_norm": 0.9507073760032654,
"learning_rate": 1.577827662837136e-06,
"loss": 0.6592823028564453,
"step": 9240
},
{
"epoch": 1.6459074733096086,
"grad_norm": 0.7741204500198364,
"learning_rate": 1.5625198180955836e-06,
"loss": 0.5817698001861572,
"step": 9250
},
{
"epoch": 1.6476868327402134,
"grad_norm": 0.906514048576355,
"learning_rate": 1.5472802974778911e-06,
"loss": 0.6105581760406494,
"step": 9260
},
{
"epoch": 1.6494661921708185,
"grad_norm": 1.1253999471664429,
"learning_rate": 1.5321092243890112e-06,
"loss": 0.6198730945587159,
"step": 9270
},
{
"epoch": 1.6512455516014235,
"grad_norm": 0.7494739294052124,
"learning_rate": 1.5170067216796425e-06,
"loss": 0.5921574115753174,
"step": 9280
},
{
"epoch": 1.6530249110320283,
"grad_norm": 0.674996018409729,
"learning_rate": 1.5019729116452086e-06,
"loss": 0.6231479644775391,
"step": 9290
},
{
"epoch": 1.6548042704626336,
"grad_norm": 0.8674888610839844,
"learning_rate": 1.4870079160248918e-06,
"loss": 0.5954656600952148,
"step": 9300
},
{
"epoch": 1.6565836298932384,
"grad_norm": 0.8911182284355164,
"learning_rate": 1.4721118560006298e-06,
"loss": 0.5914658069610595,
"step": 9310
},
{
"epoch": 1.6583629893238434,
"grad_norm": 0.9357933402061462,
"learning_rate": 1.4572848521961414e-06,
"loss": 0.6310736656188964,
"step": 9320
},
{
"epoch": 1.6601423487544484,
"grad_norm": 0.936082124710083,
"learning_rate": 1.4425270246759549e-06,
"loss": 0.5978594303131104,
"step": 9330
},
{
"epoch": 1.6619217081850532,
"grad_norm": 0.8404752016067505,
"learning_rate": 1.4278384929444233e-06,
"loss": 0.5916398048400879,
"step": 9340
},
{
"epoch": 1.6637010676156585,
"grad_norm": 0.6900134086608887,
"learning_rate": 1.4132193759447665e-06,
"loss": 0.6098292350769043,
"step": 9350
},
{
"epoch": 1.6654804270462633,
"grad_norm": 0.8345168828964233,
"learning_rate": 1.398669792058105e-06,
"loss": 0.6169853687286377,
"step": 9360
},
{
"epoch": 1.6672597864768683,
"grad_norm": 0.8186418414115906,
"learning_rate": 1.384189859102505e-06,
"loss": 0.6554735660552978,
"step": 9370
},
{
"epoch": 1.6690391459074734,
"grad_norm": 0.8094502687454224,
"learning_rate": 1.369779694332012e-06,
"loss": 0.57339506149292,
"step": 9380
},
{
"epoch": 1.6708185053380782,
"grad_norm": 0.8150178790092468,
"learning_rate": 1.3554394144357218e-06,
"loss": 0.5624317169189453,
"step": 9390
},
{
"epoch": 1.6725978647686834,
"grad_norm": 0.7492024898529053,
"learning_rate": 1.341169135536814e-06,
"loss": 0.619264554977417,
"step": 9400
},
{
"epoch": 1.6743772241992882,
"grad_norm": 0.7049137949943542,
"learning_rate": 1.326968973191628e-06,
"loss": 0.5918323040008545,
"step": 9410
},
{
"epoch": 1.6761565836298933,
"grad_norm": 0.8279508948326111,
"learning_rate": 1.312839042388716e-06,
"loss": 0.6143953800201416,
"step": 9420
},
{
"epoch": 1.6779359430604983,
"grad_norm": 1.0093584060668945,
"learning_rate": 1.2987794575479218e-06,
"loss": 0.6140541076660156,
"step": 9430
},
{
"epoch": 1.6797153024911031,
"grad_norm": 0.9923004508018494,
"learning_rate": 1.2847903325194455e-06,
"loss": 0.6000154495239258,
"step": 9440
},
{
"epoch": 1.6814946619217082,
"grad_norm": 0.9289199113845825,
"learning_rate": 1.270871780582924e-06,
"loss": 0.6113490104675293,
"step": 9450
},
{
"epoch": 1.6832740213523132,
"grad_norm": 0.6759699583053589,
"learning_rate": 1.2570239144465212e-06,
"loss": 0.6019508838653564,
"step": 9460
},
{
"epoch": 1.685053380782918,
"grad_norm": 0.8999039530754089,
"learning_rate": 1.2432468462460024e-06,
"loss": 0.6267662048339844,
"step": 9470
},
{
"epoch": 1.6868327402135233,
"grad_norm": 0.9530540704727173,
"learning_rate": 1.2295406875438377e-06,
"loss": 0.5737581729888916,
"step": 9480
},
{
"epoch": 1.688612099644128,
"grad_norm": 0.9010872840881348,
"learning_rate": 1.2159055493282911e-06,
"loss": 0.6142421722412109,
"step": 9490
},
{
"epoch": 1.690391459074733,
"grad_norm": 0.9528436660766602,
"learning_rate": 1.2023415420125262e-06,
"loss": 0.5961336612701416,
"step": 9500
},
{
"epoch": 1.690391459074733,
"eval_loss": 0.6090449094772339,
"eval_runtime": 409.4467,
"eval_samples_per_second": 12.282,
"eval_steps_per_second": 6.142,
"step": 9500
},
{
"epoch": 1.6921708185053381,
"grad_norm": 0.863777220249176,
"learning_rate": 1.1888487754337052e-06,
"loss": 0.6345890045166016,
"step": 9510
},
{
"epoch": 1.693950177935943,
"grad_norm": 1.0439646244049072,
"learning_rate": 1.1754273588521149e-06,
"loss": 0.6450316429138183,
"step": 9520
},
{
"epoch": 1.6957295373665482,
"grad_norm": 0.8258497714996338,
"learning_rate": 1.162077400950261e-06,
"loss": 0.6218976497650146,
"step": 9530
},
{
"epoch": 1.697508896797153,
"grad_norm": 0.8564954996109009,
"learning_rate": 1.148799009832009e-06,
"loss": 0.6231951713562012,
"step": 9540
},
{
"epoch": 1.699288256227758,
"grad_norm": 0.7925037741661072,
"learning_rate": 1.1355922930216867e-06,
"loss": 0.5838134765625,
"step": 9550
},
{
"epoch": 1.701067615658363,
"grad_norm": 0.8268294930458069,
"learning_rate": 1.122457357463238e-06,
"loss": 0.6027390480041503,
"step": 9560
},
{
"epoch": 1.7028469750889679,
"grad_norm": 0.9756558537483215,
"learning_rate": 1.1093943095193328e-06,
"loss": 0.6181758403778076,
"step": 9570
},
{
"epoch": 1.704626334519573,
"grad_norm": 0.8576071262359619,
"learning_rate": 1.096403254970526e-06,
"loss": 0.6374680519104003,
"step": 9580
},
{
"epoch": 1.706405693950178,
"grad_norm": 0.9196457266807556,
"learning_rate": 1.083484299014389e-06,
"loss": 0.6039985179901123,
"step": 9590
},
{
"epoch": 1.708185053380783,
"grad_norm": 0.9652523994445801,
"learning_rate": 1.0706375462646557e-06,
"loss": 0.6314795970916748,
"step": 9600
},
{
"epoch": 1.709964412811388,
"grad_norm": 0.9042948484420776,
"learning_rate": 1.0578631007503914e-06,
"loss": 0.6144063472747803,
"step": 9610
},
{
"epoch": 1.7117437722419928,
"grad_norm": 1.0348018407821655,
"learning_rate": 1.045161065915129e-06,
"loss": 0.6448927879333496,
"step": 9620
},
{
"epoch": 1.7135231316725978,
"grad_norm": 0.9222016334533691,
"learning_rate": 1.03253154461605e-06,
"loss": 0.6264961719512939,
"step": 9630
},
{
"epoch": 1.7153024911032029,
"grad_norm": 0.8399068117141724,
"learning_rate": 1.019974639123138e-06,
"loss": 0.6070163726806641,
"step": 9640
},
{
"epoch": 1.7170818505338077,
"grad_norm": 0.8186811208724976,
"learning_rate": 1.0074904511183592e-06,
"loss": 0.6530916690826416,
"step": 9650
},
{
"epoch": 1.718861209964413,
"grad_norm": 0.8317114114761353,
"learning_rate": 9.95079081694832e-07,
"loss": 0.6258822917938233,
"step": 9660
},
{
"epoch": 1.7206405693950177,
"grad_norm": 0.8472376465797424,
"learning_rate": 9.827406313560194e-07,
"loss": 0.6343019485473633,
"step": 9670
},
{
"epoch": 1.7224199288256228,
"grad_norm": 0.7907516956329346,
"learning_rate": 9.704752000148998e-07,
"loss": 0.5924992561340332,
"step": 9680
},
{
"epoch": 1.7241992882562278,
"grad_norm": 0.8266814351081848,
"learning_rate": 9.582828869931749e-07,
"loss": 0.63353590965271,
"step": 9690
},
{
"epoch": 1.7259786476868326,
"grad_norm": 0.8828052878379822,
"learning_rate": 9.461637910204468e-07,
"loss": 0.5913454532623291,
"step": 9700
},
{
"epoch": 1.7277580071174379,
"grad_norm": 0.795305609703064,
"learning_rate": 9.341180102334391e-07,
"loss": 0.625618314743042,
"step": 9710
},
{
"epoch": 1.7295373665480427,
"grad_norm": 0.8134214878082275,
"learning_rate": 9.221456421751828e-07,
"loss": 0.6241415023803711,
"step": 9720
},
{
"epoch": 1.7313167259786477,
"grad_norm": 0.8993518948554993,
"learning_rate": 9.102467837942364e-07,
"loss": 0.6320163249969483,
"step": 9730
},
{
"epoch": 1.7330960854092528,
"grad_norm": 0.8220566511154175,
"learning_rate": 8.984215314439071e-07,
"loss": 0.5945559501647949,
"step": 9740
},
{
"epoch": 1.7348754448398576,
"grad_norm": 0.8112704753875732,
"learning_rate": 8.866699808814516e-07,
"loss": 0.5934808731079102,
"step": 9750
},
{
"epoch": 1.7366548042704626,
"grad_norm": 0.9084371328353882,
"learning_rate": 8.749922272673228e-07,
"loss": 0.614768123626709,
"step": 9760
},
{
"epoch": 1.7384341637010676,
"grad_norm": 1.0116448402404785,
"learning_rate": 8.633883651643815e-07,
"loss": 0.6284510612487793,
"step": 9770
},
{
"epoch": 1.7402135231316724,
"grad_norm": 0.7356001734733582,
"learning_rate": 8.518584885371417e-07,
"loss": 0.5880512237548828,
"step": 9780
},
{
"epoch": 1.7419928825622777,
"grad_norm": 1.018543004989624,
"learning_rate": 8.404026907510043e-07,
"loss": 0.6093903064727784,
"step": 9790
},
{
"epoch": 1.7437722419928825,
"grad_norm": 0.7327563166618347,
"learning_rate": 8.290210645715002e-07,
"loss": 0.6004554748535156,
"step": 9800
},
{
"epoch": 1.7455516014234875,
"grad_norm": 0.8729904294013977,
"learning_rate": 8.177137021635439e-07,
"loss": 0.6066318035125733,
"step": 9810
},
{
"epoch": 1.7473309608540926,
"grad_norm": 0.9604983925819397,
"learning_rate": 8.064806950906867e-07,
"loss": 0.617271900177002,
"step": 9820
},
{
"epoch": 1.7491103202846974,
"grad_norm": 0.8511675000190735,
"learning_rate": 7.95322134314368e-07,
"loss": 0.6143304824829101,
"step": 9830
},
{
"epoch": 1.7508896797153026,
"grad_norm": 1.240357518196106,
"learning_rate": 7.842381101931895e-07,
"loss": 0.5943353652954102,
"step": 9840
},
{
"epoch": 1.7526690391459074,
"grad_norm": 0.7550404667854309,
"learning_rate": 7.732287124821747e-07,
"loss": 0.6033238887786865,
"step": 9850
},
{
"epoch": 1.7544483985765125,
"grad_norm": 0.8071495294570923,
"learning_rate": 7.622940303320426e-07,
"loss": 0.6051639080047607,
"step": 9860
},
{
"epoch": 1.7562277580071175,
"grad_norm": 1.0281059741973877,
"learning_rate": 7.514341522884971e-07,
"loss": 0.6654407501220703,
"step": 9870
},
{
"epoch": 1.7580071174377223,
"grad_norm": 0.8573912978172302,
"learning_rate": 7.406491662914917e-07,
"loss": 0.5752899646759033,
"step": 9880
},
{
"epoch": 1.7597864768683276,
"grad_norm": 0.9136864542961121,
"learning_rate": 7.299391596745342e-07,
"loss": 0.5918805599212646,
"step": 9890
},
{
"epoch": 1.7615658362989324,
"grad_norm": 0.7273184657096863,
"learning_rate": 7.193042191639677e-07,
"loss": 0.6129786491394043,
"step": 9900
},
{
"epoch": 1.7633451957295374,
"grad_norm": 0.8380533456802368,
"learning_rate": 7.087444308782787e-07,
"loss": 0.6275459289550781,
"step": 9910
},
{
"epoch": 1.7651245551601424,
"grad_norm": 0.8067348003387451,
"learning_rate": 6.982598803273854e-07,
"loss": 0.6274401187896729,
"step": 9920
},
{
"epoch": 1.7669039145907472,
"grad_norm": 0.9172580242156982,
"learning_rate": 6.878506524119644e-07,
"loss": 0.6016909122467041,
"step": 9930
},
{
"epoch": 1.7686832740213523,
"grad_norm": 0.8141267895698547,
"learning_rate": 6.775168314227442e-07,
"loss": 0.5785222053527832,
"step": 9940
},
{
"epoch": 1.7704626334519573,
"grad_norm": 0.8267232179641724,
"learning_rate": 6.672585010398347e-07,
"loss": 0.59950852394104,
"step": 9950
},
{
"epoch": 1.7722419928825621,
"grad_norm": 1.015816569328308,
"learning_rate": 6.570757443320441e-07,
"loss": 0.5815055847167969,
"step": 9960
},
{
"epoch": 1.7740213523131674,
"grad_norm": 0.9419827461242676,
"learning_rate": 6.469686437562117e-07,
"loss": 0.6218266963958741,
"step": 9970
},
{
"epoch": 1.7758007117437722,
"grad_norm": 0.8901230692863464,
"learning_rate": 6.369372811565333e-07,
"loss": 0.6288963317871094,
"step": 9980
},
{
"epoch": 1.7775800711743772,
"grad_norm": 0.8378164172172546,
"learning_rate": 6.269817377639054e-07,
"loss": 0.5956534385681153,
"step": 9990
},
{
"epoch": 1.7793594306049823,
"grad_norm": 0.6490439772605896,
"learning_rate": 6.171020941952611e-07,
"loss": 0.6209733486175537,
"step": 10000
},
{
"epoch": 1.7793594306049823,
"eval_loss": 0.6085147261619568,
"eval_runtime": 413.8517,
"eval_samples_per_second": 12.152,
"eval_steps_per_second": 6.077,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 11240,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.602790940448358e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}