{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997185477061638,
"eval_steps": 500,
"global_step": 1776,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005629045876723895,
"grad_norm": 0.8329170942306519,
"learning_rate": 1.1235955056179775e-06,
"loss": 0.8354,
"step": 1
},
{
"epoch": 0.0028145229383619475,
"grad_norm": 1.3410223722457886,
"learning_rate": 5.617977528089888e-06,
"loss": 1.725,
"step": 5
},
{
"epoch": 0.005629045876723895,
"grad_norm": 3.013509511947632,
"learning_rate": 1.1235955056179776e-05,
"loss": 1.6574,
"step": 10
},
{
"epoch": 0.008443568815085843,
"grad_norm": 2.5674707889556885,
"learning_rate": 1.6853932584269665e-05,
"loss": 1.2345,
"step": 15
},
{
"epoch": 0.01125809175344779,
"grad_norm": 2.4031896591186523,
"learning_rate": 2.2471910112359552e-05,
"loss": 1.5218,
"step": 20
},
{
"epoch": 0.014072614691809739,
"grad_norm": 1.1180998086929321,
"learning_rate": 2.8089887640449443e-05,
"loss": 1.2468,
"step": 25
},
{
"epoch": 0.016887137630171686,
"grad_norm": 3.0892934799194336,
"learning_rate": 3.370786516853933e-05,
"loss": 1.2287,
"step": 30
},
{
"epoch": 0.019701660568533633,
"grad_norm": 1.6239148378372192,
"learning_rate": 3.9325842696629214e-05,
"loss": 1.1948,
"step": 35
},
{
"epoch": 0.02251618350689558,
"grad_norm": 1.0267276763916016,
"learning_rate": 4.4943820224719104e-05,
"loss": 1.0779,
"step": 40
},
{
"epoch": 0.02533070644525753,
"grad_norm": 0.9497658610343933,
"learning_rate": 5.0561797752808995e-05,
"loss": 0.5863,
"step": 45
},
{
"epoch": 0.028145229383619477,
"grad_norm": 1.321252703666687,
"learning_rate": 5.6179775280898885e-05,
"loss": 0.9368,
"step": 50
},
{
"epoch": 0.030959752321981424,
"grad_norm": 1.583485722541809,
"learning_rate": 6.179775280898876e-05,
"loss": 0.7446,
"step": 55
},
{
"epoch": 0.03377427526034337,
"grad_norm": 1.6845484972000122,
"learning_rate": 6.741573033707866e-05,
"loss": 0.6362,
"step": 60
},
{
"epoch": 0.03658879819870532,
"grad_norm": 1.6922656297683716,
"learning_rate": 7.303370786516854e-05,
"loss": 0.6806,
"step": 65
},
{
"epoch": 0.039403321137067265,
"grad_norm": 1.3192064762115479,
"learning_rate": 7.865168539325843e-05,
"loss": 0.6754,
"step": 70
},
{
"epoch": 0.042217844075429216,
"grad_norm": 1.6908262968063354,
"learning_rate": 8.426966292134831e-05,
"loss": 0.7982,
"step": 75
},
{
"epoch": 0.04503236701379116,
"grad_norm": 1.8348135948181152,
"learning_rate": 8.988764044943821e-05,
"loss": 0.5052,
"step": 80
},
{
"epoch": 0.04784688995215311,
"grad_norm": 1.3342186212539673,
"learning_rate": 9.550561797752809e-05,
"loss": 0.8893,
"step": 85
},
{
"epoch": 0.05066141289051506,
"grad_norm": 1.4185314178466797,
"learning_rate": 0.00010112359550561799,
"loss": 0.8553,
"step": 90
},
{
"epoch": 0.053475935828877004,
"grad_norm": 2.145559310913086,
"learning_rate": 0.00010674157303370786,
"loss": 0.4912,
"step": 95
},
{
"epoch": 0.056290458767238954,
"grad_norm": 0.9930405020713806,
"learning_rate": 0.00011235955056179777,
"loss": 0.5233,
"step": 100
},
{
"epoch": 0.0591049817056009,
"grad_norm": 1.733180046081543,
"learning_rate": 0.00011797752808988764,
"loss": 0.8077,
"step": 105
},
{
"epoch": 0.06191950464396285,
"grad_norm": 2.0119330883026123,
"learning_rate": 0.00012359550561797752,
"loss": 0.3477,
"step": 110
},
{
"epoch": 0.0647340275823248,
"grad_norm": 0.9684231281280518,
"learning_rate": 0.00012921348314606744,
"loss": 0.6151,
"step": 115
},
{
"epoch": 0.06754855052068674,
"grad_norm": 0.8802971839904785,
"learning_rate": 0.00013483146067415732,
"loss": 0.7356,
"step": 120
},
{
"epoch": 0.07036307345904869,
"grad_norm": 1.386469841003418,
"learning_rate": 0.0001404494382022472,
"loss": 0.6414,
"step": 125
},
{
"epoch": 0.07317759639741064,
"grad_norm": 1.002143144607544,
"learning_rate": 0.0001460674157303371,
"loss": 0.5492,
"step": 130
},
{
"epoch": 0.07599211933577259,
"grad_norm": 1.686691164970398,
"learning_rate": 0.00015168539325842697,
"loss": 0.5085,
"step": 135
},
{
"epoch": 0.07880664227413453,
"grad_norm": 1.0069544315338135,
"learning_rate": 0.00015730337078651685,
"loss": 0.7826,
"step": 140
},
{
"epoch": 0.08162116521249649,
"grad_norm": 1.2447969913482666,
"learning_rate": 0.00016292134831460674,
"loss": 0.648,
"step": 145
},
{
"epoch": 0.08443568815085843,
"grad_norm": 1.3154926300048828,
"learning_rate": 0.00016853932584269662,
"loss": 0.5664,
"step": 150
},
{
"epoch": 0.08725021108922038,
"grad_norm": 0.6880718469619751,
"learning_rate": 0.00017415730337078653,
"loss": 0.7669,
"step": 155
},
{
"epoch": 0.09006473402758232,
"grad_norm": 2.1054015159606934,
"learning_rate": 0.00017977528089887642,
"loss": 0.2233,
"step": 160
},
{
"epoch": 0.09287925696594428,
"grad_norm": 2.41121506690979,
"learning_rate": 0.0001853932584269663,
"loss": 0.496,
"step": 165
},
{
"epoch": 0.09569377990430622,
"grad_norm": 1.5123809576034546,
"learning_rate": 0.00019101123595505618,
"loss": 0.834,
"step": 170
},
{
"epoch": 0.09850830284266816,
"grad_norm": 0.9215847849845886,
"learning_rate": 0.00019662921348314607,
"loss": 0.5,
"step": 175
},
{
"epoch": 0.10132282578103012,
"grad_norm": 1.3919601440429688,
"learning_rate": 0.00019999922700687455,
"loss": 0.4441,
"step": 180
},
{
"epoch": 0.10413734871939206,
"grad_norm": 0.9048366546630859,
"learning_rate": 0.00019999053097145492,
"loss": 0.6331,
"step": 185
},
{
"epoch": 0.10695187165775401,
"grad_norm": 0.9864197969436646,
"learning_rate": 0.000199972173502251,
"loss": 0.7634,
"step": 190
},
{
"epoch": 0.10976639459611595,
"grad_norm": 1.0270127058029175,
"learning_rate": 0.00019994415637302547,
"loss": 0.5543,
"step": 195
},
{
"epoch": 0.11258091753447791,
"grad_norm": 1.1281390190124512,
"learning_rate": 0.00019990648229089103,
"loss": 0.7303,
"step": 200
},
{
"epoch": 0.11539544047283985,
"grad_norm": 0.7723137140274048,
"learning_rate": 0.0001998591548960489,
"loss": 0.5619,
"step": 205
},
{
"epoch": 0.1182099634112018,
"grad_norm": 0.5689103603363037,
"learning_rate": 0.00019980217876143698,
"loss": 0.6684,
"step": 210
},
{
"epoch": 0.12102448634956375,
"grad_norm": 2.17149019241333,
"learning_rate": 0.0001997355593922881,
"loss": 0.6625,
"step": 215
},
{
"epoch": 0.1238390092879257,
"grad_norm": 1.15530526638031,
"learning_rate": 0.000199659303225598,
"loss": 0.5027,
"step": 220
},
{
"epoch": 0.12665353222628764,
"grad_norm": 0.933122456073761,
"learning_rate": 0.00019957341762950344,
"loss": 0.4687,
"step": 225
},
{
"epoch": 0.1294680551646496,
"grad_norm": 0.9144341945648193,
"learning_rate": 0.0001994779109025702,
"loss": 0.855,
"step": 230
},
{
"epoch": 0.13228257810301153,
"grad_norm": 0.7820045351982117,
"learning_rate": 0.00019937279227299131,
"loss": 0.5393,
"step": 235
},
{
"epoch": 0.13509710104137349,
"grad_norm": 1.2164809703826904,
"learning_rate": 0.00019925807189769533,
"loss": 0.8503,
"step": 240
},
{
"epoch": 0.13791162397973544,
"grad_norm": 0.8118152022361755,
"learning_rate": 0.0001991337608613649,
"loss": 0.6439,
"step": 245
},
{
"epoch": 0.14072614691809737,
"grad_norm": 1.0747621059417725,
"learning_rate": 0.00019899987117536587,
"loss": 0.5872,
"step": 250
},
{
"epoch": 0.14354066985645933,
"grad_norm": 2.370304822921753,
"learning_rate": 0.00019885641577658666,
"loss": 0.5854,
"step": 255
},
{
"epoch": 0.1463551927948213,
"grad_norm": 0.8851784467697144,
"learning_rate": 0.00019870340852618803,
"loss": 0.6521,
"step": 260
},
{
"epoch": 0.14916971573318322,
"grad_norm": 1.0250051021575928,
"learning_rate": 0.00019854086420826418,
"loss": 0.7139,
"step": 265
},
{
"epoch": 0.15198423867154517,
"grad_norm": 0.45537930727005005,
"learning_rate": 0.00019836879852841387,
"loss": 0.5634,
"step": 270
},
{
"epoch": 0.15479876160990713,
"grad_norm": 0.8726896047592163,
"learning_rate": 0.0001981872281122231,
"loss": 0.4579,
"step": 275
},
{
"epoch": 0.15761328454826906,
"grad_norm": 1.0558394193649292,
"learning_rate": 0.0001979961705036587,
"loss": 0.7129,
"step": 280
},
{
"epoch": 0.16042780748663102,
"grad_norm": 0.9016023874282837,
"learning_rate": 0.000197795644163373,
"loss": 0.67,
"step": 285
},
{
"epoch": 0.16324233042499298,
"grad_norm": 1.2795695066452026,
"learning_rate": 0.00019758566846692029,
"loss": 0.6221,
"step": 290
},
{
"epoch": 0.1660568533633549,
"grad_norm": 0.9220066666603088,
"learning_rate": 0.00019736626370288457,
"loss": 0.6101,
"step": 295
},
{
"epoch": 0.16887137630171686,
"grad_norm": 0.8552654385566711,
"learning_rate": 0.00019713745107091923,
"loss": 0.5985,
"step": 300
},
{
"epoch": 0.1716858992400788,
"grad_norm": 0.9308118224143982,
"learning_rate": 0.0001968992526796987,
"loss": 0.7646,
"step": 305
},
{
"epoch": 0.17450042217844075,
"grad_norm": 1.162402868270874,
"learning_rate": 0.00019665169154478213,
"loss": 0.8352,
"step": 310
},
{
"epoch": 0.1773149451168027,
"grad_norm": 0.7268826961517334,
"learning_rate": 0.00019639479158638972,
"loss": 0.5086,
"step": 315
},
{
"epoch": 0.18012946805516464,
"grad_norm": 0.9068782329559326,
"learning_rate": 0.00019612857762709124,
"loss": 0.7053,
"step": 320
},
{
"epoch": 0.1829439909935266,
"grad_norm": 1.0525766611099243,
"learning_rate": 0.0001958530753894078,
"loss": 0.8175,
"step": 325
},
{
"epoch": 0.18575851393188855,
"grad_norm": 0.8571164608001709,
"learning_rate": 0.0001955683114933263,
"loss": 0.6137,
"step": 330
},
{
"epoch": 0.18857303687025048,
"grad_norm": 1.1700630187988281,
"learning_rate": 0.00019527431345372738,
"loss": 0.4982,
"step": 335
},
{
"epoch": 0.19138755980861244,
"grad_norm": 0.9724971055984497,
"learning_rate": 0.00019497110967772692,
"loss": 0.603,
"step": 340
},
{
"epoch": 0.1942020827469744,
"grad_norm": 1.852655053138733,
"learning_rate": 0.000194658729461931,
"loss": 0.5137,
"step": 345
},
{
"epoch": 0.19701660568533633,
"grad_norm": 1.018691062927246,
"learning_rate": 0.00019433720298960537,
"loss": 0.7321,
"step": 350
},
{
"epoch": 0.19983112862369828,
"grad_norm": 1.1386247873306274,
"learning_rate": 0.00019400656132775908,
"loss": 0.6923,
"step": 355
},
{
"epoch": 0.20264565156206024,
"grad_norm": 0.7724803686141968,
"learning_rate": 0.0001936668364241424,
"loss": 0.4945,
"step": 360
},
{
"epoch": 0.20546017450042217,
"grad_norm": 0.49107155203819275,
"learning_rate": 0.00019331806110416027,
"loss": 0.6003,
"step": 365
},
{
"epoch": 0.20827469743878413,
"grad_norm": 0.6642640829086304,
"learning_rate": 0.00019296026906770027,
"loss": 0.2862,
"step": 370
},
{
"epoch": 0.2110892203771461,
"grad_norm": 1.6255823373794556,
"learning_rate": 0.0001925934948858767,
"loss": 0.6002,
"step": 375
},
{
"epoch": 0.21390374331550802,
"grad_norm": 1.1237847805023193,
"learning_rate": 0.00019221777399768998,
"loss": 0.6876,
"step": 380
},
{
"epoch": 0.21671826625386997,
"grad_norm": 1.4545127153396606,
"learning_rate": 0.00019183314270660248,
"loss": 0.648,
"step": 385
},
{
"epoch": 0.2195327891922319,
"grad_norm": 1.7229652404785156,
"learning_rate": 0.00019143963817703087,
"loss": 0.4523,
"step": 390
},
{
"epoch": 0.22234731213059386,
"grad_norm": 1.0194050073623657,
"learning_rate": 0.00019103729843075498,
"loss": 0.5688,
"step": 395
},
{
"epoch": 0.22516183506895582,
"grad_norm": 1.1462711095809937,
"learning_rate": 0.0001906261623432441,
"loss": 0.608,
"step": 400
},
{
"epoch": 0.22797635800731775,
"grad_norm": 0.7898936867713928,
"learning_rate": 0.00019020626963990074,
"loss": 0.5458,
"step": 405
},
{
"epoch": 0.2307908809456797,
"grad_norm": 3.30757474899292,
"learning_rate": 0.00018977766089222208,
"loss": 0.491,
"step": 410
},
{
"epoch": 0.23360540388404166,
"grad_norm": 1.2833960056304932,
"learning_rate": 0.00018934037751387997,
"loss": 0.5706,
"step": 415
},
{
"epoch": 0.2364199268224036,
"grad_norm": 1.2525941133499146,
"learning_rate": 0.00018889446175671926,
"loss": 0.7208,
"step": 420
},
{
"epoch": 0.23923444976076555,
"grad_norm": 0.9608438014984131,
"learning_rate": 0.00018843995670667543,
"loss": 0.5394,
"step": 425
},
{
"epoch": 0.2420489726991275,
"grad_norm": 0.8090662360191345,
"learning_rate": 0.00018797690627961132,
"loss": 0.4402,
"step": 430
},
{
"epoch": 0.24486349563748944,
"grad_norm": 0.8819811344146729,
"learning_rate": 0.00018750535521707396,
"loss": 0.4728,
"step": 435
},
{
"epoch": 0.2476780185758514,
"grad_norm": 0.9342750906944275,
"learning_rate": 0.0001870253490819713,
"loss": 0.5013,
"step": 440
},
{
"epoch": 0.25049254151421335,
"grad_norm": 0.7526805400848389,
"learning_rate": 0.00018653693425417002,
"loss": 0.4606,
"step": 445
},
{
"epoch": 0.2533070644525753,
"grad_norm": 1.2073928117752075,
"learning_rate": 0.00018604015792601396,
"loss": 0.7602,
"step": 450
},
{
"epoch": 0.2561215873909372,
"grad_norm": 0.48888304829597473,
"learning_rate": 0.00018553506809776424,
"loss": 0.8642,
"step": 455
},
{
"epoch": 0.2589361103292992,
"grad_norm": 1.424832820892334,
"learning_rate": 0.00018502171357296144,
"loss": 0.7515,
"step": 460
},
{
"epoch": 0.2617506332676611,
"grad_norm": 0.4335147738456726,
"learning_rate": 0.00018450014395370983,
"loss": 0.6242,
"step": 465
},
{
"epoch": 0.26456515620602306,
"grad_norm": 1.4052842855453491,
"learning_rate": 0.00018397040963588488,
"loss": 0.405,
"step": 470
},
{
"epoch": 0.26737967914438504,
"grad_norm": 0.8757149577140808,
"learning_rate": 0.0001834325618042636,
"loss": 0.6849,
"step": 475
},
{
"epoch": 0.27019420208274697,
"grad_norm": 2.241838216781616,
"learning_rate": 0.00018288665242757903,
"loss": 0.5653,
"step": 480
},
{
"epoch": 0.2730087250211089,
"grad_norm": 1.2316606044769287,
"learning_rate": 0.00018233273425349885,
"loss": 0.3658,
"step": 485
},
{
"epoch": 0.2758232479594709,
"grad_norm": 1.174371361732483,
"learning_rate": 0.0001817708608035286,
"loss": 0.6299,
"step": 490
},
{
"epoch": 0.2786377708978328,
"grad_norm": 0.7687806487083435,
"learning_rate": 0.00018120108636784034,
"loss": 0.4896,
"step": 495
},
{
"epoch": 0.28145229383619474,
"grad_norm": 1.2060645818710327,
"learning_rate": 0.00018062346600002699,
"loss": 0.6195,
"step": 500
},
{
"epoch": 0.28426681677455673,
"grad_norm": 0.9059650897979736,
"learning_rate": 0.0001800380555117827,
"loss": 0.7028,
"step": 505
},
{
"epoch": 0.28708133971291866,
"grad_norm": 1.1394221782684326,
"learning_rate": 0.00017944491146751026,
"loss": 0.5942,
"step": 510
},
{
"epoch": 0.2898958626512806,
"grad_norm": 0.8699471354484558,
"learning_rate": 0.0001788440911788556,
"loss": 0.336,
"step": 515
},
{
"epoch": 0.2927103855896426,
"grad_norm": 1.5057145357131958,
"learning_rate": 0.0001782356526991702,
"loss": 0.5421,
"step": 520
},
{
"epoch": 0.2955249085280045,
"grad_norm": 1.7533260583877563,
"learning_rate": 0.00017761965481790162,
"loss": 0.5161,
"step": 525
},
{
"epoch": 0.29833943146636643,
"grad_norm": 0.7779251933097839,
"learning_rate": 0.00017699615705491325,
"loss": 0.7673,
"step": 530
},
{
"epoch": 0.3011539544047284,
"grad_norm": 1.0537793636322021,
"learning_rate": 0.00017636521965473323,
"loss": 0.5704,
"step": 535
},
{
"epoch": 0.30396847734309035,
"grad_norm": 1.4870874881744385,
"learning_rate": 0.00017572690358073326,
"loss": 0.5846,
"step": 540
},
{
"epoch": 0.3067830002814523,
"grad_norm": 0.5922363996505737,
"learning_rate": 0.00017508127050923835,
"loss": 0.7014,
"step": 545
},
{
"epoch": 0.30959752321981426,
"grad_norm": 0.8268614411354065,
"learning_rate": 0.00017442838282356727,
"loss": 0.7332,
"step": 550
},
{
"epoch": 0.3124120461581762,
"grad_norm": 0.6626672148704529,
"learning_rate": 0.00017376830360800498,
"loss": 0.6285,
"step": 555
},
{
"epoch": 0.3152265690965381,
"grad_norm": 0.7634074091911316,
"learning_rate": 0.00017310109664170703,
"loss": 0.5794,
"step": 560
},
{
"epoch": 0.3180410920349001,
"grad_norm": 0.6927458047866821,
"learning_rate": 0.00017242682639253718,
"loss": 0.3854,
"step": 565
},
{
"epoch": 0.32085561497326204,
"grad_norm": 0.7036980390548706,
"learning_rate": 0.00017174555801083814,
"loss": 0.4077,
"step": 570
},
{
"epoch": 0.32367013791162397,
"grad_norm": 0.5988078117370605,
"learning_rate": 0.00017105735732313667,
"loss": 0.4527,
"step": 575
},
{
"epoch": 0.32648466084998595,
"grad_norm": 1.1813197135925293,
"learning_rate": 0.00017036229082578307,
"loss": 0.8273,
"step": 580
},
{
"epoch": 0.3292991837883479,
"grad_norm": 0.7539934515953064,
"learning_rate": 0.00016966042567852615,
"loss": 0.6084,
"step": 585
},
{
"epoch": 0.3321137067267098,
"grad_norm": 1.3654534816741943,
"learning_rate": 0.00016895182969802386,
"loss": 0.8049,
"step": 590
},
{
"epoch": 0.3349282296650718,
"grad_norm": 1.3293468952178955,
"learning_rate": 0.00016823657135129087,
"loss": 0.5769,
"step": 595
},
{
"epoch": 0.3377427526034337,
"grad_norm": 0.7219970226287842,
"learning_rate": 0.00016751471974908288,
"loss": 0.6883,
"step": 600
},
{
"epoch": 0.34055727554179566,
"grad_norm": 0.3910175561904907,
"learning_rate": 0.00016678634463921884,
"loss": 0.4406,
"step": 605
},
{
"epoch": 0.3433717984801576,
"grad_norm": 0.6810513138771057,
"learning_rate": 0.00016605151639984187,
"loss": 0.532,
"step": 610
},
{
"epoch": 0.34618632141851957,
"grad_norm": 1.6218785047531128,
"learning_rate": 0.00016531030603261884,
"loss": 0.8189,
"step": 615
},
{
"epoch": 0.3490008443568815,
"grad_norm": 1.138554334640503,
"learning_rate": 0.00016456278515588024,
"loss": 0.5942,
"step": 620
},
{
"epoch": 0.35181536729524343,
"grad_norm": 0.9244788289070129,
"learning_rate": 0.00016380902599769982,
"loss": 0.7253,
"step": 625
},
{
"epoch": 0.3546298902336054,
"grad_norm": 1.2546799182891846,
"learning_rate": 0.00016304910138891597,
"loss": 0.6356,
"step": 630
},
{
"epoch": 0.35744441317196735,
"grad_norm": 0.9498732089996338,
"learning_rate": 0.00016228308475609433,
"loss": 0.6773,
"step": 635
},
{
"epoch": 0.3602589361103293,
"grad_norm": 0.605629026889801,
"learning_rate": 0.00016151105011443314,
"loss": 0.5681,
"step": 640
},
{
"epoch": 0.36307345904869126,
"grad_norm": 0.8649378418922424,
"learning_rate": 0.00016073307206061177,
"loss": 0.657,
"step": 645
},
{
"epoch": 0.3658879819870532,
"grad_norm": 0.9192129969596863,
"learning_rate": 0.00015994922576558263,
"loss": 0.501,
"step": 650
},
{
"epoch": 0.3687025049254151,
"grad_norm": 0.9517626166343689,
"learning_rate": 0.00015915958696730814,
"loss": 0.4914,
"step": 655
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.7139463424682617,
"learning_rate": 0.0001583642319634426,
"loss": 0.6748,
"step": 660
},
{
"epoch": 0.37433155080213903,
"grad_norm": 0.7017699480056763,
"learning_rate": 0.00015756323760396002,
"loss": 0.4724,
"step": 665
},
{
"epoch": 0.37714607374050096,
"grad_norm": 0.69412761926651,
"learning_rate": 0.00015675668128372854,
"loss": 0.3082,
"step": 670
},
{
"epoch": 0.37996059667886295,
"grad_norm": 0.9961652755737305,
"learning_rate": 0.00015594464093503246,
"loss": 0.5198,
"step": 675
},
{
"epoch": 0.3827751196172249,
"grad_norm": 0.8897234797477722,
"learning_rate": 0.00015512719502004197,
"loss": 0.6707,
"step": 680
},
{
"epoch": 0.3855896425555868,
"grad_norm": 1.5628478527069092,
"learning_rate": 0.0001543044225232319,
"loss": 0.7418,
"step": 685
},
{
"epoch": 0.3884041654939488,
"grad_norm": 0.98553866147995,
"learning_rate": 0.00015347640294375005,
"loss": 0.6553,
"step": 690
},
{
"epoch": 0.3912186884323107,
"grad_norm": 1.6320457458496094,
"learning_rate": 0.0001526432162877356,
"loss": 0.7794,
"step": 695
},
{
"epoch": 0.39403321137067265,
"grad_norm": 0.7922711372375488,
"learning_rate": 0.0001518049430605887,
"loss": 0.4737,
"step": 700
},
{
"epoch": 0.39684773430903464,
"grad_norm": 2.2716257572174072,
"learning_rate": 0.00015096166425919175,
"loss": 0.6215,
"step": 705
},
{
"epoch": 0.39966225724739657,
"grad_norm": 0.6248972415924072,
"learning_rate": 0.0001501134613640832,
"loss": 0.431,
"step": 710
},
{
"epoch": 0.4024767801857585,
"grad_norm": 0.7897845506668091,
"learning_rate": 0.00014926041633158454,
"loss": 0.6622,
"step": 715
},
{
"epoch": 0.4052913031241205,
"grad_norm": 0.35586240887641907,
"learning_rate": 0.0001484026115858815,
"loss": 0.2137,
"step": 720
},
{
"epoch": 0.4081058260624824,
"grad_norm": 1.1606733798980713,
"learning_rate": 0.00014754013001105998,
"loss": 0.5948,
"step": 725
},
{
"epoch": 0.41092034900084434,
"grad_norm": 1.2706083059310913,
"learning_rate": 0.00014667305494309727,
"loss": 0.9048,
"step": 730
},
{
"epoch": 0.41373487193920633,
"grad_norm": 0.4990655779838562,
"learning_rate": 0.00014580147016181005,
"loss": 0.6926,
"step": 735
},
{
"epoch": 0.41654939487756826,
"grad_norm": 0.9936563968658447,
"learning_rate": 0.00014492545988275933,
"loss": 0.8315,
"step": 740
},
{
"epoch": 0.4193639178159302,
"grad_norm": 0.9066222906112671,
"learning_rate": 0.0001440451087491129,
"loss": 0.4171,
"step": 745
},
{
"epoch": 0.4221784407542922,
"grad_norm": 1.0959652662277222,
"learning_rate": 0.00014316050182346733,
"loss": 0.586,
"step": 750
},
{
"epoch": 0.4249929636926541,
"grad_norm": 0.726209819316864,
"learning_rate": 0.0001422717245796285,
"loss": 0.485,
"step": 755
},
{
"epoch": 0.42780748663101603,
"grad_norm": 1.6239256858825684,
"learning_rate": 0.00014137886289435295,
"loss": 0.4708,
"step": 760
},
{
"epoch": 0.430622009569378,
"grad_norm": 1.3807324171066284,
"learning_rate": 0.00014048200303905034,
"loss": 0.4916,
"step": 765
},
{
"epoch": 0.43343653250773995,
"grad_norm": 0.9176377654075623,
"learning_rate": 0.00013958123167144733,
"loss": 0.5487,
"step": 770
},
{
"epoch": 0.4362510554461019,
"grad_norm": 1.0780225992202759,
"learning_rate": 0.0001386766358272146,
"loss": 0.6175,
"step": 775
},
{
"epoch": 0.4390655783844638,
"grad_norm": 2.6700491905212402,
"learning_rate": 0.00013776830291155703,
"loss": 0.7033,
"step": 780
},
{
"epoch": 0.4418801013228258,
"grad_norm": 0.9618711471557617,
"learning_rate": 0.00013685632069076846,
"loss": 0.6698,
"step": 785
},
{
"epoch": 0.4446946242611877,
"grad_norm": 1.3910859823226929,
"learning_rate": 0.00013594077728375128,
"loss": 0.8375,
"step": 790
},
{
"epoch": 0.44750914719954965,
"grad_norm": 0.755133867263794,
"learning_rate": 0.00013502176115350213,
"loss": 0.3719,
"step": 795
},
{
"epoch": 0.45032367013791164,
"grad_norm": 0.5240243077278137,
"learning_rate": 0.00013409936109856424,
"loss": 0.3539,
"step": 800
},
{
"epoch": 0.45313819307627357,
"grad_norm": 1.1372967958450317,
"learning_rate": 0.00013317366624444744,
"loss": 0.7051,
"step": 805
},
{
"epoch": 0.4559527160146355,
"grad_norm": 1.4525517225265503,
"learning_rate": 0.00013224476603501662,
"loss": 0.6504,
"step": 810
},
{
"epoch": 0.4587672389529975,
"grad_norm": 0.658799946308136,
"learning_rate": 0.00013131275022384918,
"loss": 0.4961,
"step": 815
},
{
"epoch": 0.4615817618913594,
"grad_norm": 0.8785876631736755,
"learning_rate": 0.00013037770886556294,
"loss": 0.4614,
"step": 820
},
{
"epoch": 0.46439628482972134,
"grad_norm": 1.465197205543518,
"learning_rate": 0.0001294397323071145,
"loss": 0.4032,
"step": 825
},
{
"epoch": 0.4672108077680833,
"grad_norm": 1.070615291595459,
"learning_rate": 0.00012849891117906978,
"loss": 0.9079,
"step": 830
},
{
"epoch": 0.47002533070644525,
"grad_norm": 0.5417959690093994,
"learning_rate": 0.00012755533638684704,
"loss": 0.33,
"step": 835
},
{
"epoch": 0.4728398536448072,
"grad_norm": 0.9827715754508972,
"learning_rate": 0.00012660909910193303,
"loss": 0.5105,
"step": 840
},
{
"epoch": 0.47565437658316917,
"grad_norm": 0.5091266632080078,
"learning_rate": 0.0001256602907530739,
"loss": 0.2981,
"step": 845
},
{
"epoch": 0.4784688995215311,
"grad_norm": 0.7054405212402344,
"learning_rate": 0.000124709003017441,
"loss": 0.3981,
"step": 850
},
{
"epoch": 0.48128342245989303,
"grad_norm": 0.7525174617767334,
"learning_rate": 0.00012375532781177257,
"loss": 0.6399,
"step": 855
},
{
"epoch": 0.484097945398255,
"grad_norm": 1.1248599290847778,
"learning_rate": 0.0001227993572834926,
"loss": 0.624,
"step": 860
},
{
"epoch": 0.48691246833661694,
"grad_norm": 0.6768050193786621,
"learning_rate": 0.00012184118380180716,
"loss": 0.6998,
"step": 865
},
{
"epoch": 0.4897269912749789,
"grad_norm": 0.8580127954483032,
"learning_rate": 0.0001208808999487793,
"loss": 0.6036,
"step": 870
},
{
"epoch": 0.49254151421334086,
"grad_norm": 1.0916856527328491,
"learning_rate": 0.0001199185985103836,
"loss": 0.6118,
"step": 875
},
{
"epoch": 0.4953560371517028,
"grad_norm": 0.9223160147666931,
"learning_rate": 0.00011895437246754074,
"loss": 0.5047,
"step": 880
},
{
"epoch": 0.4981705600900647,
"grad_norm": 2.015151262283325,
"learning_rate": 0.00011798831498713334,
"loss": 0.4731,
"step": 885
},
{
"epoch": 0.5009850830284267,
"grad_norm": 1.1331337690353394,
"learning_rate": 0.00011702051941300396,
"loss": 0.5136,
"step": 890
},
{
"epoch": 0.5037996059667886,
"grad_norm": 1.0576162338256836,
"learning_rate": 0.00011605107925693582,
"loss": 0.4992,
"step": 895
},
{
"epoch": 0.5066141289051506,
"grad_norm": 0.8101834654808044,
"learning_rate": 0.00011508008818961731,
"loss": 0.4718,
"step": 900
},
{
"epoch": 0.5094286518435125,
"grad_norm": 0.922261655330658,
"learning_rate": 0.00011410764003159147,
"loss": 0.3622,
"step": 905
},
{
"epoch": 0.5122431747818744,
"grad_norm": 0.6725795865058899,
"learning_rate": 0.00011313382874419031,
"loss": 0.4803,
"step": 910
},
{
"epoch": 0.5150576977202365,
"grad_norm": 1.7679307460784912,
"learning_rate": 0.00011215874842045631,
"loss": 0.4316,
"step": 915
},
{
"epoch": 0.5178722206585984,
"grad_norm": 0.7769971489906311,
"learning_rate": 0.00011118249327605055,
"loss": 0.3043,
"step": 920
},
{
"epoch": 0.5206867435969603,
"grad_norm": 1.3879740238189697,
"learning_rate": 0.00011020515764014942,
"loss": 0.4857,
"step": 925
},
{
"epoch": 0.5235012665353223,
"grad_norm": 0.9753875136375427,
"learning_rate": 0.00010922683594633021,
"loss": 0.5712,
"step": 930
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.2720959186553955,
"learning_rate": 0.00010824762272344651,
"loss": 0.509,
"step": 935
},
{
"epoch": 0.5291303124120461,
"grad_norm": 0.8127233386039734,
"learning_rate": 0.00010726761258649461,
"loss": 0.57,
"step": 940
},
{
"epoch": 0.5319448353504082,
"grad_norm": 1.2257153987884521,
"learning_rate": 0.00010628690022747132,
"loss": 0.5516,
"step": 945
},
{
"epoch": 0.5347593582887701,
"grad_norm": 0.7144739627838135,
"learning_rate": 0.00010530558040622472,
"loss": 0.6066,
"step": 950
},
{
"epoch": 0.537573881227132,
"grad_norm": 0.7518823146820068,
"learning_rate": 0.00010432374794129791,
"loss": 0.6092,
"step": 955
},
{
"epoch": 0.5403884041654939,
"grad_norm": 0.9559532403945923,
"learning_rate": 0.00010334149770076747,
"loss": 0.5019,
"step": 960
},
{
"epoch": 0.5432029271038559,
"grad_norm": 1.0719863176345825,
"learning_rate": 0.00010235892459307688,
"loss": 0.4403,
"step": 965
},
{
"epoch": 0.5460174500422178,
"grad_norm": 0.7807120084762573,
"learning_rate": 0.00010137612355786618,
"loss": 0.5249,
"step": 970
},
{
"epoch": 0.5488319729805798,
"grad_norm": 1.4584418535232544,
"learning_rate": 0.00010039318955679857,
"loss": 0.475,
"step": 975
},
{
"epoch": 0.5516464959189418,
"grad_norm": 0.7352472543716431,
"learning_rate": 9.941021756438488e-05,
"loss": 0.64,
"step": 980
},
{
"epoch": 0.5544610188573037,
"grad_norm": 1.0791490077972412,
"learning_rate": 9.842730255880678e-05,
"loss": 0.5755,
"step": 985
},
{
"epoch": 0.5572755417956656,
"grad_norm": 0.9303123354911804,
"learning_rate": 9.744453951273968e-05,
"loss": 0.6485,
"step": 990
},
{
"epoch": 0.5600900647340276,
"grad_norm": 0.6764497756958008,
"learning_rate": 9.646202338417613e-05,
"loss": 0.4572,
"step": 995
},
{
"epoch": 0.5629045876723895,
"grad_norm": 0.9425994157791138,
"learning_rate": 9.547984910725064e-05,
"loss": 0.733,
"step": 1000
},
{
"epoch": 0.5657191106107515,
"grad_norm": 0.9432514905929565,
"learning_rate": 9.449811158306684e-05,
"loss": 0.4557,
"step": 1005
},
{
"epoch": 0.5685336335491135,
"grad_norm": 0.999247133731842,
"learning_rate": 9.35169056705278e-05,
"loss": 0.6724,
"step": 1010
},
{
"epoch": 0.5713481564874754,
"grad_norm": 1.0435142517089844,
"learning_rate": 9.253632617717038e-05,
"loss": 0.5394,
"step": 1015
},
{
"epoch": 0.5741626794258373,
"grad_norm": 1.8879817724227905,
"learning_rate": 9.155646785000467e-05,
"loss": 0.5066,
"step": 1020
},
{
"epoch": 0.5769772023641992,
"grad_norm": 0.7503070831298828,
"learning_rate": 9.057742536635913e-05,
"loss": 0.6197,
"step": 1025
},
{
"epoch": 0.5797917253025612,
"grad_norm": 0.9366083741188049,
"learning_rate": 8.959929332473262e-05,
"loss": 0.5373,
"step": 1030
},
{
"epoch": 0.5826062482409231,
"grad_norm": 0.5995301008224487,
"learning_rate": 8.86221662356539e-05,
"loss": 0.4054,
"step": 1035
},
{
"epoch": 0.5854207711792851,
"grad_norm": 0.5817670226097107,
"learning_rate": 8.764613851254968e-05,
"loss": 0.5262,
"step": 1040
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.5300823450088501,
"learning_rate": 8.667130446262214e-05,
"loss": 0.678,
"step": 1045
},
{
"epoch": 0.591049817056009,
"grad_norm": 0.9397202134132385,
"learning_rate": 8.569775827773656e-05,
"loss": 0.6118,
"step": 1050
},
{
"epoch": 0.5938643399943709,
"grad_norm": 0.8151072859764099,
"learning_rate": 8.472559402532021e-05,
"loss": 0.5034,
"step": 1055
},
{
"epoch": 0.5966788629327329,
"grad_norm": 0.7164279818534851,
"learning_rate": 8.375490563927328e-05,
"loss": 0.2909,
"step": 1060
},
{
"epoch": 0.5994933858710948,
"grad_norm": 0.7771260738372803,
"learning_rate": 8.278578691089249e-05,
"loss": 0.5102,
"step": 1065
},
{
"epoch": 0.6023079088094568,
"grad_norm": 0.5859299302101135,
"learning_rate": 8.181833147980894e-05,
"loss": 0.3361,
"step": 1070
},
{
"epoch": 0.6051224317478188,
"grad_norm": 1.3162269592285156,
"learning_rate": 8.085263282493998e-05,
"loss": 0.5598,
"step": 1075
},
{
"epoch": 0.6079369546861807,
"grad_norm": 1.9372239112854004,
"learning_rate": 7.98887842554572e-05,
"loss": 0.5755,
"step": 1080
},
{
"epoch": 0.6107514776245426,
"grad_norm": 1.0530599355697632,
"learning_rate": 7.892687890177044e-05,
"loss": 0.6622,
"step": 1085
},
{
"epoch": 0.6135660005629046,
"grad_norm": 0.8324519991874695,
"learning_rate": 7.796700970652932e-05,
"loss": 0.4768,
"step": 1090
},
{
"epoch": 0.6163805235012665,
"grad_norm": 1.280251145362854,
"learning_rate": 7.700926941564262e-05,
"loss": 0.3465,
"step": 1095
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.4136282205581665,
"learning_rate": 7.605375056931712e-05,
"loss": 0.4388,
"step": 1100
},
{
"epoch": 0.6220095693779905,
"grad_norm": 0.9467165470123291,
"learning_rate": 7.510054549311573e-05,
"loss": 0.4631,
"step": 1105
},
{
"epoch": 0.6248240923163524,
"grad_norm": 0.4942507743835449,
"learning_rate": 7.41497462890369e-05,
"loss": 0.5125,
"step": 1110
},
{
"epoch": 0.6276386152547143,
"grad_norm": 0.5631272196769714,
"learning_rate": 7.320144482661533e-05,
"loss": 0.5205,
"step": 1115
},
{
"epoch": 0.6304531381930762,
"grad_norm": 1.3057935237884521,
"learning_rate": 7.225573273404513e-05,
"loss": 0.7294,
"step": 1120
},
{
"epoch": 0.6332676611314382,
"grad_norm": 0.9623622298240662,
"learning_rate": 7.131270138932655e-05,
"loss": 0.5844,
"step": 1125
},
{
"epoch": 0.6360821840698002,
"grad_norm": 1.0017409324645996,
"learning_rate": 7.037244191143661e-05,
"loss": 0.3777,
"step": 1130
},
{
"epoch": 0.6388967070081621,
"grad_norm": 0.9390413165092468,
"learning_rate": 6.943504515152491e-05,
"loss": 0.6049,
"step": 1135
},
{
"epoch": 0.6417112299465241,
"grad_norm": 0.5731076598167419,
"learning_rate": 6.850060168413518e-05,
"loss": 0.4316,
"step": 1140
},
{
"epoch": 0.644525752884886,
"grad_norm": 0.42738592624664307,
"learning_rate": 6.756920179845383e-05,
"loss": 0.4463,
"step": 1145
},
{
"epoch": 0.6473402758232479,
"grad_norm": 1.3607089519500732,
"learning_rate": 6.66409354895857e-05,
"loss": 0.5199,
"step": 1150
},
{
"epoch": 0.6501547987616099,
"grad_norm": 0.9413872361183167,
"learning_rate": 6.57158924498586e-05,
"loss": 0.37,
"step": 1155
},
{
"epoch": 0.6529693216999719,
"grad_norm": 0.8918866515159607,
"learning_rate": 6.479416206015679e-05,
"loss": 0.595,
"step": 1160
},
{
"epoch": 0.6557838446383338,
"grad_norm": 1.254056453704834,
"learning_rate": 6.387583338128471e-05,
"loss": 0.7506,
"step": 1165
},
{
"epoch": 0.6585983675766958,
"grad_norm": 0.7493451833724976,
"learning_rate": 6.296099514536167e-05,
"loss": 0.4956,
"step": 1170
},
{
"epoch": 0.6614128905150577,
"grad_norm": 0.6410611271858215,
"learning_rate": 6.20497357472482e-05,
"loss": 0.5979,
"step": 1175
},
{
"epoch": 0.6642274134534196,
"grad_norm": 0.6682597994804382,
"learning_rate": 6.114214323600504e-05,
"loss": 0.4579,
"step": 1180
},
{
"epoch": 0.6670419363917816,
"grad_norm": 1.117984652519226,
"learning_rate": 6.023830530638559e-05,
"loss": 0.4828,
"step": 1185
},
{
"epoch": 0.6698564593301436,
"grad_norm": 0.8667401075363159,
"learning_rate": 5.9338309290362324e-05,
"loss": 0.6108,
"step": 1190
},
{
"epoch": 0.6726709822685055,
"grad_norm": 0.5072731971740723,
"learning_rate": 5.844224214868881e-05,
"loss": 0.4543,
"step": 1195
},
{
"epoch": 0.6754855052068675,
"grad_norm": 1.5477889776229858,
"learning_rate": 5.7550190462496946e-05,
"loss": 0.4118,
"step": 1200
},
{
"epoch": 0.6783000281452294,
"grad_norm": 1.0128791332244873,
"learning_rate": 5.66622404249314e-05,
"loss": 0.5121,
"step": 1205
},
{
"epoch": 0.6811145510835913,
"grad_norm": 0.8333756327629089,
"learning_rate": 5.577847783282122e-05,
"loss": 0.3744,
"step": 1210
},
{
"epoch": 0.6839290740219532,
"grad_norm": 0.6459574103355408,
"learning_rate": 5.48989880783898e-05,
"loss": 0.5707,
"step": 1215
},
{
"epoch": 0.6867435969603152,
"grad_norm": 1.1848655939102173,
"learning_rate": 5.4023856141004236e-05,
"loss": 0.6197,
"step": 1220
},
{
"epoch": 0.6895581198986772,
"grad_norm": 1.0240199565887451,
"learning_rate": 5.3153166578963965e-05,
"loss": 0.544,
"step": 1225
},
{
"epoch": 0.6923726428370391,
"grad_norm": 0.6922760605812073,
"learning_rate": 5.228700352133071e-05,
"loss": 0.4769,
"step": 1230
},
{
"epoch": 0.6951871657754011,
"grad_norm": 0.7377022504806519,
"learning_rate": 5.142545065979955e-05,
"loss": 0.5942,
"step": 1235
},
{
"epoch": 0.698001688713763,
"grad_norm": 0.8629385828971863,
"learning_rate": 5.05685912406123e-05,
"loss": 0.7816,
"step": 1240
},
{
"epoch": 0.7008162116521249,
"grad_norm": 0.9569929242134094,
"learning_rate": 4.971650805651406e-05,
"loss": 0.6106,
"step": 1245
},
{
"epoch": 0.7036307345904869,
"grad_norm": 0.49353519082069397,
"learning_rate": 4.886928343875341e-05,
"loss": 0.4153,
"step": 1250
},
{
"epoch": 0.7064452575288489,
"grad_norm": 0.9434248805046082,
"learning_rate": 4.8026999249127315e-05,
"loss": 0.4643,
"step": 1255
},
{
"epoch": 0.7092597804672108,
"grad_norm": 0.5096941590309143,
"learning_rate": 4.71897368720714e-05,
"loss": 0.9337,
"step": 1260
},
{
"epoch": 0.7120743034055728,
"grad_norm": 0.9566155672073364,
"learning_rate": 4.6357577206796096e-05,
"loss": 0.7062,
"step": 1265
},
{
"epoch": 0.7148888263439347,
"grad_norm": 0.6990448832511902,
"learning_rate": 4.553060065947013e-05,
"loss": 0.4033,
"step": 1270
},
{
"epoch": 0.7177033492822966,
"grad_norm": 0.9153978228569031,
"learning_rate": 4.4708887135451396e-05,
"loss": 0.5575,
"step": 1275
},
{
"epoch": 0.7205178722206586,
"grad_norm": 1.2567604780197144,
"learning_rate": 4.3892516031565954e-05,
"loss": 0.45,
"step": 1280
},
{
"epoch": 0.7233323951590206,
"grad_norm": 1.2241476774215698,
"learning_rate": 4.3081566228436686e-05,
"loss": 0.7022,
"step": 1285
},
{
"epoch": 0.7261469180973825,
"grad_norm": 1.0247058868408203,
"learning_rate": 4.227611608286147e-05,
"loss": 0.5987,
"step": 1290
},
{
"epoch": 0.7289614410357445,
"grad_norm": 0.9229673147201538,
"learning_rate": 4.147624342024209e-05,
"loss": 0.5189,
"step": 1295
},
{
"epoch": 0.7317759639741064,
"grad_norm": 0.6648756861686707,
"learning_rate": 4.0682025527064486e-05,
"loss": 0.4758,
"step": 1300
},
{
"epoch": 0.7345904869124683,
"grad_norm": 0.40781381726264954,
"learning_rate": 3.9893539143431044e-05,
"loss": 0.451,
"step": 1305
},
{
"epoch": 0.7374050098508302,
"grad_norm": 0.7853880524635315,
"learning_rate": 3.911086045564575e-05,
"loss": 0.6077,
"step": 1310
},
{
"epoch": 0.7402195327891923,
"grad_norm": 0.7318276166915894,
"learning_rate": 3.83340650888527e-05,
"loss": 0.4472,
"step": 1315
},
{
"epoch": 0.7430340557275542,
"grad_norm": 1.5267870426177979,
"learning_rate": 3.756322809972905e-05,
"loss": 0.8982,
"step": 1320
},
{
"epoch": 0.7458485786659161,
"grad_norm": 0.5616324543952942,
"learning_rate": 3.679842396923271e-05,
"loss": 0.5635,
"step": 1325
},
{
"epoch": 0.7486631016042781,
"grad_norm": 0.7878595590591431,
"learning_rate": 3.6039726595405755e-05,
"loss": 0.5973,
"step": 1330
},
{
"epoch": 0.75147762454264,
"grad_norm": 0.3910659849643707,
"learning_rate": 3.528720928623414e-05,
"loss": 0.462,
"step": 1335
},
{
"epoch": 0.7542921474810019,
"grad_norm": 0.7616758346557617,
"learning_rate": 3.4540944752564406e-05,
"loss": 0.4222,
"step": 1340
},
{
"epoch": 0.757106670419364,
"grad_norm": 1.3086707592010498,
"learning_rate": 3.380100510107814e-05,
"loss": 0.6267,
"step": 1345
},
{
"epoch": 0.7599211933577259,
"grad_norm": 0.7841249108314514,
"learning_rate": 3.3067461827324755e-05,
"loss": 0.4136,
"step": 1350
},
{
"epoch": 0.7627357162960878,
"grad_norm": 0.47329601645469666,
"learning_rate": 3.2340385808813315e-05,
"loss": 0.485,
"step": 1355
},
{
"epoch": 0.7655502392344498,
"grad_norm": 0.719576895236969,
"learning_rate": 3.161984729816415e-05,
"loss": 0.4311,
"step": 1360
},
{
"epoch": 0.7683647621728117,
"grad_norm": 1.036814570426941,
"learning_rate": 3.090591591632082e-05,
"loss": 0.4341,
"step": 1365
},
{
"epoch": 0.7711792851111736,
"grad_norm": 1.327804684638977,
"learning_rate": 3.0198660645822985e-05,
"loss": 0.4251,
"step": 1370
},
{
"epoch": 0.7739938080495357,
"grad_norm": 0.880226731300354,
"learning_rate": 2.9498149824141196e-05,
"loss": 0.3617,
"step": 1375
},
{
"epoch": 0.7768083309878976,
"grad_norm": 0.7395775318145752,
"learning_rate": 2.880445113707384e-05,
"loss": 0.6637,
"step": 1380
},
{
"epoch": 0.7796228539262595,
"grad_norm": 1.0488626956939697,
"learning_rate": 2.8117631612207084e-05,
"loss": 0.4081,
"step": 1385
},
{
"epoch": 0.7824373768646214,
"grad_norm": 0.8878235220909119,
"learning_rate": 2.743775761243843e-05,
"loss": 0.5031,
"step": 1390
},
{
"epoch": 0.7852518998029834,
"grad_norm": 0.6816940903663635,
"learning_rate": 2.6764894829564613e-05,
"loss": 0.4063,
"step": 1395
},
{
"epoch": 0.7880664227413453,
"grad_norm": 0.4741825759410858,
"learning_rate": 2.6099108277934103e-05,
"loss": 0.3073,
"step": 1400
},
{
"epoch": 0.7908809456797072,
"grad_norm": 0.8777612447738647,
"learning_rate": 2.5440462288165146e-05,
"loss": 0.6016,
"step": 1405
},
{
"epoch": 0.7936954686180693,
"grad_norm": 0.6074641346931458,
"learning_rate": 2.4789020500930095e-05,
"loss": 0.5282,
"step": 1410
},
{
"epoch": 0.7965099915564312,
"grad_norm": 0.8305478692054749,
"learning_rate": 2.414484586080612e-05,
"loss": 0.6746,
"step": 1415
},
{
"epoch": 0.7993245144947931,
"grad_norm": 1.8334358930587769,
"learning_rate": 2.3508000610193258e-05,
"loss": 0.3076,
"step": 1420
},
{
"epoch": 0.8021390374331551,
"grad_norm": 0.8415083885192871,
"learning_rate": 2.287854628330043e-05,
"loss": 0.4482,
"step": 1425
},
{
"epoch": 0.804953560371517,
"grad_norm": 1.1256593465805054,
"learning_rate": 2.2256543700199685e-05,
"loss": 0.4828,
"step": 1430
},
{
"epoch": 0.8077680833098789,
"grad_norm": 0.6625217795372009,
"learning_rate": 2.164205296094961e-05,
"loss": 0.2526,
"step": 1435
},
{
"epoch": 0.810582606248241,
"grad_norm": 1.2664328813552856,
"learning_rate": 2.1035133439788236e-05,
"loss": 0.5837,
"step": 1440
},
{
"epoch": 0.8133971291866029,
"grad_norm": 0.6024655699729919,
"learning_rate": 2.0435843779396156e-05,
"loss": 0.5919,
"step": 1445
},
{
"epoch": 0.8162116521249648,
"grad_norm": 0.660294234752655,
"learning_rate": 1.9844241885230163e-05,
"loss": 0.4167,
"step": 1450
},
{
"epoch": 0.8190261750633268,
"grad_norm": 0.7786351442337036,
"learning_rate": 1.9260384919928266e-05,
"loss": 0.5695,
"step": 1455
},
{
"epoch": 0.8218406980016887,
"grad_norm": 0.7518659830093384,
"learning_rate": 1.8684329297786453e-05,
"loss": 0.7747,
"step": 1460
},
{
"epoch": 0.8246552209400506,
"grad_norm": 0.7769446969032288,
"learning_rate": 1.8116130679307708e-05,
"loss": 0.6442,
"step": 1465
},
{
"epoch": 0.8274697438784127,
"grad_norm": 0.49770888686180115,
"learning_rate": 1.7555843965823992e-05,
"loss": 0.4515,
"step": 1470
},
{
"epoch": 0.8302842668167746,
"grad_norm": 0.9085186719894409,
"learning_rate": 1.7003523294191294e-05,
"loss": 0.419,
"step": 1475
},
{
"epoch": 0.8330987897551365,
"grad_norm": 1.536907434463501,
"learning_rate": 1.6459222031558974e-05,
"loss": 0.5198,
"step": 1480
},
{
"epoch": 0.8359133126934984,
"grad_norm": 0.6355032920837402,
"learning_rate": 1.5922992770213064e-05,
"loss": 0.524,
"step": 1485
},
{
"epoch": 0.8387278356318604,
"grad_norm": 0.7401185631752014,
"learning_rate": 1.5394887322494732e-05,
"loss": 0.362,
"step": 1490
},
{
"epoch": 0.8415423585702223,
"grad_norm": 0.7638711333274841,
"learning_rate": 1.4874956715793886e-05,
"loss": 0.363,
"step": 1495
},
{
"epoch": 0.8443568815085843,
"grad_norm": 1.4240124225616455,
"learning_rate": 1.4363251187618854e-05,
"loss": 0.7827,
"step": 1500
},
{
"epoch": 0.8471714044469463,
"grad_norm": 0.8892093896865845,
"learning_rate": 1.3859820180742156e-05,
"loss": 0.3835,
"step": 1505
},
{
"epoch": 0.8499859273853082,
"grad_norm": 0.8796232342720032,
"learning_rate": 1.3364712338423214e-05,
"loss": 0.6749,
"step": 1510
},
{
"epoch": 0.8528004503236701,
"grad_norm": 0.7944719195365906,
"learning_rate": 1.287797549970826e-05,
"loss": 0.5354,
"step": 1515
},
{
"epoch": 0.8556149732620321,
"grad_norm": 0.6994941830635071,
"learning_rate": 1.2399656694807971e-05,
"loss": 0.5961,
"step": 1520
},
{
"epoch": 0.858429496200394,
"grad_norm": 0.8955370187759399,
"learning_rate": 1.1929802140553258e-05,
"loss": 0.2639,
"step": 1525
},
{
"epoch": 0.861244019138756,
"grad_norm": 0.9208924174308777,
"learning_rate": 1.1468457235929597e-05,
"loss": 0.5161,
"step": 1530
},
{
"epoch": 0.864058542077118,
"grad_norm": 1.6513868570327759,
"learning_rate": 1.1015666557690452e-05,
"loss": 0.3097,
"step": 1535
},
{
"epoch": 0.8668730650154799,
"grad_norm": 0.9617025256156921,
"learning_rate": 1.0571473856050107e-05,
"loss": 0.5886,
"step": 1540
},
{
"epoch": 0.8696875879538418,
"grad_norm": 1.428849458694458,
"learning_rate": 1.0135922050456347e-05,
"loss": 0.5518,
"step": 1545
},
{
"epoch": 0.8725021108922038,
"grad_norm": 0.9242206811904907,
"learning_rate": 9.709053225443487e-06,
"loss": 0.6922,
"step": 1550
},
{
"epoch": 0.8753166338305657,
"grad_norm": 0.9717757701873779,
"learning_rate": 9.29090862656593e-06,
"loss": 0.3653,
"step": 1555
},
{
"epoch": 0.8781311567689276,
"grad_norm": 0.48359620571136475,
"learning_rate": 8.881528656412963e-06,
"loss": 0.2516,
"step": 1560
},
{
"epoch": 0.8809456797072897,
"grad_norm": 1.1804348230361938,
"learning_rate": 8.480952870704873e-06,
"loss": 0.3171,
"step": 1565
},
{
"epoch": 0.8837602026456516,
"grad_norm": 0.8391767740249634,
"learning_rate": 8.08921997447094e-06,
"loss": 0.4603,
"step": 1570
},
{
"epoch": 0.8865747255840135,
"grad_norm": 0.37848615646362305,
"learning_rate": 7.706367818309624e-06,
"loss": 0.3601,
"step": 1575
},
{
"epoch": 0.8893892485223754,
"grad_norm": 0.328762412071228,
"learning_rate": 7.332433394731331e-06,
"loss": 0.4663,
"step": 1580
},
{
"epoch": 0.8922037714607374,
"grad_norm": 1.1124379634857178,
"learning_rate": 6.967452834584009e-06,
"loss": 0.7712,
"step": 1585
},
{
"epoch": 0.8950182943990993,
"grad_norm": 0.946675181388855,
"learning_rate": 6.611461403562147e-06,
"loss": 0.5769,
"step": 1590
},
{
"epoch": 0.8978328173374613,
"grad_norm": 1.3897684812545776,
"learning_rate": 6.264493498799185e-06,
"loss": 0.4381,
"step": 1595
},
{
"epoch": 0.9006473402758233,
"grad_norm": 0.5598315596580505,
"learning_rate": 5.92658264554401e-06,
"loss": 0.4289,
"step": 1600
},
{
"epoch": 0.9034618632141852,
"grad_norm": 0.6429581046104431,
"learning_rate": 5.597761493921627e-06,
"loss": 0.7875,
"step": 1605
},
{
"epoch": 0.9062763861525471,
"grad_norm": 0.8496813178062439,
"learning_rate": 5.278061815778313e-06,
"loss": 0.4269,
"step": 1610
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.9900699257850647,
"learning_rate": 4.967514501611881e-06,
"loss": 0.2818,
"step": 1615
},
{
"epoch": 0.911905432029271,
"grad_norm": 0.8361634016036987,
"learning_rate": 4.666149557586697e-06,
"loss": 0.4749,
"step": 1620
},
{
"epoch": 0.914719954967633,
"grad_norm": 0.5384243130683899,
"learning_rate": 4.3739961026345586e-06,
"loss": 0.5043,
"step": 1625
},
{
"epoch": 0.917534477905995,
"grad_norm": 0.8283894658088684,
"learning_rate": 4.091082365641085e-06,
"loss": 0.7093,
"step": 1630
},
{
"epoch": 0.9203490008443569,
"grad_norm": 0.5737767219543457,
"learning_rate": 3.817435682718096e-06,
"loss": 0.4823,
"step": 1635
},
{
"epoch": 0.9231635237827188,
"grad_norm": 0.4942363500595093,
"learning_rate": 3.5530824945623542e-06,
"loss": 0.6501,
"step": 1640
},
{
"epoch": 0.9259780467210807,
"grad_norm": 0.9362422823905945,
"learning_rate": 3.298048343900717e-06,
"loss": 0.2883,
"step": 1645
},
{
"epoch": 0.9287925696594427,
"grad_norm": 0.9241194725036621,
"learning_rate": 3.0523578730221713e-06,
"loss": 0.6112,
"step": 1650
},
{
"epoch": 0.9316070925978047,
"grad_norm": 1.3847359418869019,
"learning_rate": 2.8160348213967848e-06,
"loss": 0.5209,
"step": 1655
},
{
"epoch": 0.9344216155361666,
"grad_norm": 1.0031473636627197,
"learning_rate": 2.589102023381895e-06,
"loss": 0.3663,
"step": 1660
},
{
"epoch": 0.9372361384745286,
"grad_norm": 0.616346538066864,
"learning_rate": 2.3715814060157772e-06,
"loss": 0.4261,
"step": 1665
},
{
"epoch": 0.9400506614128905,
"grad_norm": 0.8073914647102356,
"learning_rate": 2.1634939868990235e-06,
"loss": 0.3541,
"step": 1670
},
{
"epoch": 0.9428651843512524,
"grad_norm": 0.64414381980896,
"learning_rate": 1.9648598721637045e-06,
"loss": 0.5996,
"step": 1675
},
{
"epoch": 0.9456797072896144,
"grad_norm": 0.7799990177154541,
"learning_rate": 1.7756982545306443e-06,
"loss": 0.4095,
"step": 1680
},
{
"epoch": 0.9484942302279764,
"grad_norm": 0.6466760039329529,
"learning_rate": 1.596027411454981e-06,
"loss": 0.3666,
"step": 1685
},
{
"epoch": 0.9513087531663383,
"grad_norm": 0.728364884853363,
"learning_rate": 1.4258647033601024e-06,
"loss": 0.6216,
"step": 1690
},
{
"epoch": 0.9541232761047003,
"grad_norm": 0.9482190012931824,
"learning_rate": 1.265226571960254e-06,
"loss": 0.5574,
"step": 1695
},
{
"epoch": 0.9569377990430622,
"grad_norm": 0.9594746232032776,
"learning_rate": 1.1141285386718437e-06,
"loss": 0.6466,
"step": 1700
},
{
"epoch": 0.9597523219814241,
"grad_norm": 1.3607548475265503,
"learning_rate": 9.72585203113774e-07,
"loss": 0.5807,
"step": 1705
},
{
"epoch": 0.9625668449197861,
"grad_norm": 2.761012554168701,
"learning_rate": 8.406102416967043e-07,
"loss": 0.4588,
"step": 1710
},
{
"epoch": 0.9653813678581481,
"grad_norm": 0.7540304660797119,
"learning_rate": 7.182164063015973e-07,
"loss": 0.5414,
"step": 1715
},
{
"epoch": 0.96819589079651,
"grad_norm": 1.1836090087890625,
"learning_rate": 6.054155230476699e-07,
"loss": 0.4274,
"step": 1720
},
{
"epoch": 0.971010413734872,
"grad_norm": 0.6484697461128235,
"learning_rate": 5.022184911495864e-07,
"loss": 0.4756,
"step": 1725
},
{
"epoch": 0.9738249366732339,
"grad_norm": 0.72726970911026,
"learning_rate": 4.0863528186445564e-07,
"loss": 0.5541,
"step": 1730
},
{
"epoch": 0.9766394596115958,
"grad_norm": 0.7867249846458435,
"learning_rate": 3.246749375282909e-07,
"loss": 0.4513,
"step": 1735
},
{
"epoch": 0.9794539825499577,
"grad_norm": 0.7862501740455627,
"learning_rate": 2.50345570682331e-07,
"loss": 0.4277,
"step": 1740
},
{
"epoch": 0.9822685054883197,
"grad_norm": 0.9948648810386658,
"learning_rate": 1.856543632892116e-07,
"loss": 0.5665,
"step": 1745
},
{
"epoch": 0.9850830284266817,
"grad_norm": 0.7089385390281677,
"learning_rate": 1.3060756603897605e-07,
"loss": 0.4658,
"step": 1750
},
{
"epoch": 0.9878975513650436,
"grad_norm": 0.5250968933105469,
"learning_rate": 8.521049774512513e-08,
"loss": 0.4507,
"step": 1755
},
{
"epoch": 0.9907120743034056,
"grad_norm": 0.6175426244735718,
"learning_rate": 4.946754483071692e-08,
"loss": 0.3314,
"step": 1760
},
{
"epoch": 0.9935265972417675,
"grad_norm": 0.6678758263587952,
"learning_rate": 2.3382160904483753e-08,
"loss": 0.3931,
"step": 1765
},
{
"epoch": 0.9963411201801294,
"grad_norm": 0.7605099081993103,
"learning_rate": 6.95686642719906e-09,
"loss": 0.3326,
"step": 1770
},
{
"epoch": 0.9991556431184914,
"grad_norm": 0.5569049119949341,
"learning_rate": 1.932484680944313e-10,
"loss": 0.4035,
"step": 1775
},
{
"epoch": 0.9997185477061638,
"step": 1776,
"total_flos": 3.866808531592151e+17,
"train_loss": 0.5711438672759713,
"train_runtime": 4148.7499,
"train_samples_per_second": 3.425,
"train_steps_per_second": 0.428
}
],
"logging_steps": 5,
"max_steps": 1776,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.866808531592151e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}