SykoLLM-v6.7 / trainer_state.json
SykoSLM's picture
SykoLLM v6.7
7121069 verified
Raw
History Blame Contribute Delete
51.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.118375,
"eval_steps": 500,
"global_step": 2900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00125,
"grad_norm": 0.36102330684661865,
"learning_rate": 5.5665e-06,
"loss": 2.681707572937012,
"step": 10
},
{
"epoch": 0.0025,
"grad_norm": 0.34577861428260803,
"learning_rate": 1.17515e-05,
"loss": 2.6720260620117187,
"step": 20
},
{
"epoch": 0.00375,
"grad_norm": 0.3295978009700775,
"learning_rate": 1.79365e-05,
"loss": 2.672147750854492,
"step": 30
},
{
"epoch": 0.005,
"grad_norm": 0.32688695192337036,
"learning_rate": 2.41215e-05,
"loss": 2.675041389465332,
"step": 40
},
{
"epoch": 0.00625,
"grad_norm": 0.3257655203342438,
"learning_rate": 3.03065e-05,
"loss": 2.675174522399902,
"step": 50
},
{
"epoch": 0.0075,
"grad_norm": 0.336309552192688,
"learning_rate": 3.6491499999999994e-05,
"loss": 2.6966915130615234,
"step": 60
},
{
"epoch": 0.00875,
"grad_norm": 0.3346744179725647,
"learning_rate": 4.26765e-05,
"loss": 2.6632720947265627,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 0.32752859592437744,
"learning_rate": 4.88615e-05,
"loss": 2.695608139038086,
"step": 80
},
{
"epoch": 0.01125,
"grad_norm": 0.32975664734840393,
"learning_rate": 5.50465e-05,
"loss": 2.6731294631958007,
"step": 90
},
{
"epoch": 0.0125,
"grad_norm": 0.33192330598831177,
"learning_rate": 6.12315e-05,
"loss": 2.6482282638549806,
"step": 100
},
{
"epoch": 0.01375,
"grad_norm": 0.3244248032569885,
"learning_rate": 6.74165e-05,
"loss": 2.700460433959961,
"step": 110
},
{
"epoch": 0.015,
"grad_norm": 0.32452520728111267,
"learning_rate": 7.36015e-05,
"loss": 2.6673652648925783,
"step": 120
},
{
"epoch": 0.01625,
"grad_norm": 0.32950156927108765,
"learning_rate": 7.97865e-05,
"loss": 2.66015510559082,
"step": 130
},
{
"epoch": 0.0175,
"grad_norm": 0.3157300651073456,
"learning_rate": 8.597149999999999e-05,
"loss": 2.653401184082031,
"step": 140
},
{
"epoch": 0.01875,
"grad_norm": 0.3447306156158447,
"learning_rate": 9.21565e-05,
"loss": 2.638433837890625,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 0.33060336112976074,
"learning_rate": 9.834150000000001e-05,
"loss": 2.6599313735961916,
"step": 160
},
{
"epoch": 0.02125,
"grad_norm": 0.33370116353034973,
"learning_rate": 0.00010452649999999999,
"loss": 2.675436019897461,
"step": 170
},
{
"epoch": 0.0225,
"grad_norm": 0.32309311628341675,
"learning_rate": 0.0001107115,
"loss": 2.682134246826172,
"step": 180
},
{
"epoch": 0.02375,
"grad_norm": 0.3298942446708679,
"learning_rate": 0.0001168965,
"loss": 2.6672037124633787,
"step": 190
},
{
"epoch": 0.025,
"grad_norm": 0.3257051408290863,
"learning_rate": 0.0001230815,
"loss": 2.6710464477539064,
"step": 200
},
{
"epoch": 0.02625,
"grad_norm": 0.32734546065330505,
"learning_rate": 0.00012369959364576377,
"loss": 2.6694522857666017,
"step": 210
},
{
"epoch": 0.0275,
"grad_norm": 0.3286871016025543,
"learning_rate": 0.00012369818897130838,
"loss": 2.67569580078125,
"step": 220
},
{
"epoch": 0.02875,
"grad_norm": 0.3206029534339905,
"learning_rate": 0.0001236957809826964,
"loss": 2.671968460083008,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 0.32244956493377686,
"learning_rate": 0.0001236923697189907,
"loss": 2.6653528213500977,
"step": 240
},
{
"epoch": 0.03125,
"grad_norm": 0.3286353647708893,
"learning_rate": 0.00012368795523552952,
"loss": 2.644626998901367,
"step": 250
},
{
"epoch": 0.0325,
"grad_norm": 0.31846532225608826,
"learning_rate": 0.00012368253760392556,
"loss": 2.6661434173583984,
"step": 260
},
{
"epoch": 0.03375,
"grad_norm": 0.34063664078712463,
"learning_rate": 0.00012367611691206466,
"loss": 2.658544921875,
"step": 270
},
{
"epoch": 0.035,
"grad_norm": 0.3394038677215576,
"learning_rate": 0.00012366869326410474,
"loss": 2.671076202392578,
"step": 280
},
{
"epoch": 0.03625,
"grad_norm": 0.3454046845436096,
"learning_rate": 0.00012366026678047368,
"loss": 2.690570068359375,
"step": 290
},
{
"epoch": 0.0375,
"grad_norm": 0.32945406436920166,
"learning_rate": 0.00012365083759786766,
"loss": 2.6626564025878907,
"step": 300
},
{
"epoch": 0.03875,
"grad_norm": 0.3266613483428955,
"learning_rate": 0.00012364040586924886,
"loss": 2.6811601638793947,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 0.32137027382850647,
"learning_rate": 0.0001236289717638429,
"loss": 2.656772422790527,
"step": 320
},
{
"epoch": 0.04125,
"grad_norm": 0.31430286169052124,
"learning_rate": 0.00012361653546713627,
"loss": 2.667566680908203,
"step": 330
},
{
"epoch": 0.0425,
"grad_norm": 0.3187640905380249,
"learning_rate": 0.00012360309718087312,
"loss": 2.6774127960205076,
"step": 340
},
{
"epoch": 0.04375,
"grad_norm": 0.3238705098628998,
"learning_rate": 0.00012358865712305212,
"loss": 2.650909423828125,
"step": 350
},
{
"epoch": 0.045,
"grad_norm": 0.3178948163986206,
"learning_rate": 0.00012357321552792288,
"loss": 2.6466007232666016,
"step": 360
},
{
"epoch": 0.04625,
"grad_norm": 0.3393631875514984,
"learning_rate": 0.0001235567726459822,
"loss": 2.6694786071777346,
"step": 370
},
{
"epoch": 0.0475,
"grad_norm": 0.33097463846206665,
"learning_rate": 0.00012353932874396988,
"loss": 2.6705909729003907,
"step": 380
},
{
"epoch": 0.04875,
"grad_norm": 0.3237457275390625,
"learning_rate": 0.00012352088410486452,
"loss": 2.666813087463379,
"step": 390
},
{
"epoch": 0.05,
"grad_norm": 0.32804396748542786,
"learning_rate": 0.0001235014390278789,
"loss": 2.6341262817382813,
"step": 400
},
{
"epoch": 0.05125,
"grad_norm": 0.3098997473716736,
"learning_rate": 0.0001234809938284551,
"loss": 2.653286361694336,
"step": 410
},
{
"epoch": 0.0525,
"grad_norm": 0.31869447231292725,
"learning_rate": 0.00012345954883825937,
"loss": 2.6676279067993165,
"step": 420
},
{
"epoch": 0.05375,
"grad_norm": 0.3462599813938141,
"learning_rate": 0.0001234371044051768,
"loss": 2.6937137603759767,
"step": 430
},
{
"epoch": 0.055,
"grad_norm": 0.33410680294036865,
"learning_rate": 0.00012341366089330566,
"loss": 2.6624752044677735,
"step": 440
},
{
"epoch": 0.05625,
"grad_norm": 0.3401891589164734,
"learning_rate": 0.00012338921868295142,
"loss": 2.6673324584960936,
"step": 450
},
{
"epoch": 0.0575,
"grad_norm": 0.3144513964653015,
"learning_rate": 0.00012336377817062075,
"loss": 2.6684280395507813,
"step": 460
},
{
"epoch": 0.05875,
"grad_norm": 0.31319352984428406,
"learning_rate": 0.00012333733976901485,
"loss": 2.6631874084472655,
"step": 470
},
{
"epoch": 0.06,
"grad_norm": 0.3231050670146942,
"learning_rate": 0.00012330990390702298,
"loss": 2.6671581268310547,
"step": 480
},
{
"epoch": 0.06125,
"grad_norm": 0.3283950686454773,
"learning_rate": 0.00012328147102971544,
"loss": 2.6682722091674806,
"step": 490
},
{
"epoch": 0.0625,
"grad_norm": 0.3203584849834442,
"learning_rate": 0.0001232520415983362,
"loss": 2.6619497299194337,
"step": 500
},
{
"epoch": 0.06375,
"grad_norm": 0.3314996063709259,
"learning_rate": 0.00012322161609029563,
"loss": 2.675333023071289,
"step": 510
},
{
"epoch": 0.065,
"grad_norm": 0.3124040961265564,
"learning_rate": 0.00012319019499916267,
"loss": 2.674266052246094,
"step": 520
},
{
"epoch": 0.06625,
"grad_norm": 0.334187775850296,
"learning_rate": 0.0001231577788346567,
"loss": 2.6644060134887697,
"step": 530
},
{
"epoch": 0.0675,
"grad_norm": 0.33853819966316223,
"learning_rate": 0.00012312436812263953,
"loss": 2.6285802841186525,
"step": 540
},
{
"epoch": 0.06875,
"grad_norm": 0.3226993680000305,
"learning_rate": 0.00012308996340510664,
"loss": 2.6620355606079102,
"step": 550
},
{
"epoch": 0.07,
"grad_norm": 0.31965890526771545,
"learning_rate": 0.0001230545652401785,
"loss": 2.669430160522461,
"step": 560
},
{
"epoch": 0.07125,
"grad_norm": 0.323632150888443,
"learning_rate": 0.00012301817420209152,
"loss": 2.6710559844970705,
"step": 570
},
{
"epoch": 0.0725,
"grad_norm": 0.3202168047428131,
"learning_rate": 0.00012298079088118863,
"loss": 2.6743343353271483,
"step": 580
},
{
"epoch": 0.07375,
"grad_norm": 0.3278695344924927,
"learning_rate": 0.00012294241588390982,
"loss": 2.643411636352539,
"step": 590
},
{
"epoch": 0.075,
"grad_norm": 0.3302673101425171,
"learning_rate": 0.0001229030498327823,
"loss": 2.7156848907470703,
"step": 600
},
{
"epoch": 0.07625,
"grad_norm": 0.31964629888534546,
"learning_rate": 0.00012286269336641027,
"loss": 2.6369789123535154,
"step": 610
},
{
"epoch": 0.0775,
"grad_norm": 0.32528844475746155,
"learning_rate": 0.00012282134713946472,
"loss": 2.655129241943359,
"step": 620
},
{
"epoch": 0.07875,
"grad_norm": 0.3346538245677948,
"learning_rate": 0.00012277901182267275,
"loss": 2.6634849548339843,
"step": 630
},
{
"epoch": 0.08,
"grad_norm": 0.32035592198371887,
"learning_rate": 0.00012273568810280665,
"loss": 2.6622406005859376,
"step": 640
},
{
"epoch": 0.08125,
"grad_norm": 0.32753705978393555,
"learning_rate": 0.00012269137668267276,
"loss": 2.6673862457275392,
"step": 650
},
{
"epoch": 0.0825,
"grad_norm": 0.3323623538017273,
"learning_rate": 0.00012264607828110018,
"loss": 2.6660182952880858,
"step": 660
},
{
"epoch": 0.08375,
"grad_norm": 0.3228432238101959,
"learning_rate": 0.0001225997936329289,
"loss": 2.690377044677734,
"step": 670
},
{
"epoch": 0.085,
"grad_norm": 0.3340938687324524,
"learning_rate": 0.00012255252348899816,
"loss": 2.6579252243041993,
"step": 680
},
{
"epoch": 0.08625,
"grad_norm": 0.32717493176460266,
"learning_rate": 0.00012250426861613406,
"loss": 2.6669349670410156,
"step": 690
},
{
"epoch": 0.0875,
"grad_norm": 0.3213510513305664,
"learning_rate": 0.0001224550297971371,
"loss": 2.658818817138672,
"step": 700
},
{
"epoch": 0.08875,
"grad_norm": 0.3103785216808319,
"learning_rate": 0.00012240480783076967,
"loss": 2.64670467376709,
"step": 710
},
{
"epoch": 0.09,
"grad_norm": 0.3206445276737213,
"learning_rate": 0.00012235360353174288,
"loss": 2.649314117431641,
"step": 720
},
{
"epoch": 0.09125,
"grad_norm": 0.3210267722606659,
"learning_rate": 0.00012230141773070355,
"loss": 2.6637636184692384,
"step": 730
},
{
"epoch": 0.0925,
"grad_norm": 0.312549352645874,
"learning_rate": 0.00012224825127422055,
"loss": 2.6725765228271485,
"step": 740
},
{
"epoch": 0.09375,
"grad_norm": 0.32557615637779236,
"learning_rate": 0.00012219410502477114,
"loss": 2.6337608337402343,
"step": 750
},
{
"epoch": 0.095,
"grad_norm": 0.31713125109672546,
"learning_rate": 0.00012213897986072705,
"loss": 2.6361785888671876,
"step": 760
},
{
"epoch": 0.09625,
"grad_norm": 0.3173486590385437,
"learning_rate": 0.00012208287667634017,
"loss": 2.6491493225097655,
"step": 770
},
{
"epoch": 0.0975,
"grad_norm": 0.32202011346817017,
"learning_rate": 0.00012202579638172791,
"loss": 2.665495681762695,
"step": 780
},
{
"epoch": 0.09875,
"grad_norm": 0.31751731038093567,
"learning_rate": 0.0001219677399028587,
"loss": 2.670880889892578,
"step": 790
},
{
"epoch": 0.1,
"grad_norm": 0.3310263156890869,
"learning_rate": 0.00012190870818153682,
"loss": 2.6745986938476562,
"step": 800
},
{
"epoch": 0.10125,
"grad_norm": 0.3246520757675171,
"learning_rate": 0.00012184870217538704,
"loss": 2.6367824554443358,
"step": 810
},
{
"epoch": 0.1025,
"grad_norm": 0.31728002429008484,
"learning_rate": 0.0001217877228578393,
"loss": 2.657224655151367,
"step": 820
},
{
"epoch": 0.10375,
"grad_norm": 0.32666370272636414,
"learning_rate": 0.00012172577121811272,
"loss": 2.629240798950195,
"step": 830
},
{
"epoch": 0.105,
"grad_norm": 0.32864195108413696,
"learning_rate": 0.00012166284826119965,
"loss": 2.6314460754394533,
"step": 840
},
{
"epoch": 0.10625,
"grad_norm": 0.331391304731369,
"learning_rate": 0.00012159895500784936,
"loss": 2.6207229614257814,
"step": 850
},
{
"epoch": 0.1075,
"grad_norm": 0.32856595516204834,
"learning_rate": 0.00012153409249455148,
"loss": 2.6828586578369142,
"step": 860
},
{
"epoch": 0.10875,
"grad_norm": 0.3259557783603668,
"learning_rate": 0.00012146826177351913,
"loss": 2.6800840377807615,
"step": 870
},
{
"epoch": 0.11,
"grad_norm": 0.3368566930294037,
"learning_rate": 0.00012140146391267196,
"loss": 2.644548797607422,
"step": 880
},
{
"epoch": 0.11125,
"grad_norm": 0.3319634199142456,
"learning_rate": 0.00012133369999561872,
"loss": 2.6457305908203126,
"step": 890
},
{
"epoch": 0.1125,
"grad_norm": 0.31302639842033386,
"learning_rate": 0.00012126497112163972,
"loss": 2.6418832778930663,
"step": 900
},
{
"epoch": 0.11375,
"grad_norm": 0.32079464197158813,
"learning_rate": 0.00012119527840566905,
"loss": 2.6311697006225585,
"step": 910
},
{
"epoch": 0.115,
"grad_norm": 0.32719048857688904,
"learning_rate": 0.00012112462297827639,
"loss": 2.641567611694336,
"step": 920
},
{
"epoch": 0.11625,
"grad_norm": 0.32264548540115356,
"learning_rate": 0.00012105300598564874,
"loss": 2.6696403503417967,
"step": 930
},
{
"epoch": 0.1175,
"grad_norm": 0.3197903335094452,
"learning_rate": 0.00012098042858957183,
"loss": 2.6566593170166017,
"step": 940
},
{
"epoch": 0.11875,
"grad_norm": 0.3231068253517151,
"learning_rate": 0.00012090689196741124,
"loss": 2.63052978515625,
"step": 950
},
{
"epoch": 0.12,
"grad_norm": 0.3268223702907562,
"learning_rate": 0.00012083239731209331,
"loss": 2.6513845443725588,
"step": 960
},
{
"epoch": 0.12125,
"grad_norm": 0.3304605484008789,
"learning_rate": 0.00012075694583208578,
"loss": 2.6264434814453126,
"step": 970
},
{
"epoch": 0.1225,
"grad_norm": 0.3171931505203247,
"learning_rate": 0.00012068053875137824,
"loss": 2.636788558959961,
"step": 980
},
{
"epoch": 0.12375,
"grad_norm": 0.3341807425022125,
"learning_rate": 0.00012060317730946224,
"loss": 2.6531208038330076,
"step": 990
},
{
"epoch": 0.125,
"grad_norm": 0.3334127962589264,
"learning_rate": 0.00012052486276131108,
"loss": 2.6705049514770507,
"step": 1000
},
{
"epoch": 0.12625,
"grad_norm": 0.307980477809906,
"learning_rate": 0.00012044559637735965,
"loss": 2.6561138153076174,
"step": 1010
},
{
"epoch": 0.1275,
"grad_norm": 0.31699395179748535,
"learning_rate": 0.00012036537944348368,
"loss": 2.633596420288086,
"step": 1020
},
{
"epoch": 0.12875,
"grad_norm": 0.32349589467048645,
"learning_rate": 0.0001202842132609789,
"loss": 2.651826858520508,
"step": 1030
},
{
"epoch": 0.13,
"grad_norm": 0.3407875895500183,
"learning_rate": 0.00012020209914653999,
"loss": 2.6381755828857423,
"step": 1040
},
{
"epoch": 0.13125,
"grad_norm": 0.31691980361938477,
"learning_rate": 0.00012011903843223914,
"loss": 2.6360122680664064,
"step": 1050
},
{
"epoch": 0.1325,
"grad_norm": 0.31067660450935364,
"learning_rate": 0.0001200350324655045,
"loss": 2.6421882629394533,
"step": 1060
},
{
"epoch": 0.13375,
"grad_norm": 0.32634156942367554,
"learning_rate": 0.0001199500826090983,
"loss": 2.63830509185791,
"step": 1070
},
{
"epoch": 0.135,
"grad_norm": 0.3369225263595581,
"learning_rate": 0.00011986419024109472,
"loss": 2.63408203125,
"step": 1080
},
{
"epoch": 0.13625,
"grad_norm": 0.3302381932735443,
"learning_rate": 0.0001197773567548576,
"loss": 2.6358100891113283,
"step": 1090
},
{
"epoch": 0.1375,
"grad_norm": 0.33104801177978516,
"learning_rate": 0.00011968958355901778,
"loss": 2.6341053009033204,
"step": 1100
},
{
"epoch": 0.13875,
"grad_norm": 0.3302455544471741,
"learning_rate": 0.00011960087207745023,
"loss": 2.659340667724609,
"step": 1110
},
{
"epoch": 0.14,
"grad_norm": 0.318013995885849,
"learning_rate": 0.00011951122374925103,
"loss": 2.6539737701416017,
"step": 1120
},
{
"epoch": 0.14125,
"grad_norm": 0.31688031554222107,
"learning_rate": 0.00011942064002871398,
"loss": 2.650745391845703,
"step": 1130
},
{
"epoch": 0.1425,
"grad_norm": 0.3218444883823395,
"learning_rate": 0.00011932912238530696,
"loss": 2.6293779373168946,
"step": 1140
},
{
"epoch": 0.14375,
"grad_norm": 0.31668025255203247,
"learning_rate": 0.0001192366723036482,
"loss": 2.652189254760742,
"step": 1150
},
{
"epoch": 0.145,
"grad_norm": 0.32894524931907654,
"learning_rate": 0.0001191432912834821,
"loss": 2.6034008026123048,
"step": 1160
},
{
"epoch": 0.14625,
"grad_norm": 0.326031357049942,
"learning_rate": 0.00011904898083965494,
"loss": 2.6356990814208983,
"step": 1170
},
{
"epoch": 0.1475,
"grad_norm": 0.3148091733455658,
"learning_rate": 0.00011895374250209033,
"loss": 2.6438148498535154,
"step": 1180
},
{
"epoch": 0.14875,
"grad_norm": 0.3154153823852539,
"learning_rate": 0.00011885757781576434,
"loss": 2.653242301940918,
"step": 1190
},
{
"epoch": 0.15,
"grad_norm": 0.31809449195861816,
"learning_rate": 0.00011876048834068046,
"loss": 2.6228126525878905,
"step": 1200
},
{
"epoch": 0.15125,
"grad_norm": 0.32725268602371216,
"learning_rate": 0.0001186624756518443,
"loss": 2.6216796875,
"step": 1210
},
{
"epoch": 0.1525,
"grad_norm": 0.32540032267570496,
"learning_rate": 0.00011856354133923805,
"loss": 2.67537841796875,
"step": 1220
},
{
"epoch": 0.15375,
"grad_norm": 0.3263508975505829,
"learning_rate": 0.00011846368700779467,
"loss": 2.6610176086425783,
"step": 1230
},
{
"epoch": 0.155,
"grad_norm": 0.3205776512622833,
"learning_rate": 0.00011836291427737183,
"loss": 2.6613521575927734,
"step": 1240
},
{
"epoch": 0.15625,
"grad_norm": 0.31028124690055847,
"learning_rate": 0.00011826122478272567,
"loss": 2.633769416809082,
"step": 1250
},
{
"epoch": 0.1575,
"grad_norm": 0.31673797965049744,
"learning_rate": 0.00011815862017348429,
"loss": 2.624924087524414,
"step": 1260
},
{
"epoch": 0.15875,
"grad_norm": 0.32373106479644775,
"learning_rate": 0.00011805510211412097,
"loss": 2.6462501525878905,
"step": 1270
},
{
"epoch": 0.16,
"grad_norm": 0.31725797057151794,
"learning_rate": 0.0001179506722839271,
"loss": 2.6365428924560548,
"step": 1280
},
{
"epoch": 0.16125,
"grad_norm": 0.3195420205593109,
"learning_rate": 0.00011784533237698511,
"loss": 2.6311481475830076,
"step": 1290
},
{
"epoch": 0.1625,
"grad_norm": 0.3341420888900757,
"learning_rate": 0.00011773908410214081,
"loss": 2.642291450500488,
"step": 1300
},
{
"epoch": 0.16375,
"grad_norm": 0.3230491876602173,
"learning_rate": 0.00011763192918297575,
"loss": 2.638113594055176,
"step": 1310
},
{
"epoch": 0.165,
"grad_norm": 0.3223067820072174,
"learning_rate": 0.0001175238693577793,
"loss": 2.6444271087646483,
"step": 1320
},
{
"epoch": 0.16625,
"grad_norm": 0.31934627890586853,
"learning_rate": 0.00011741490637952035,
"loss": 2.6657215118408204,
"step": 1330
},
{
"epoch": 0.1675,
"grad_norm": 0.3097170889377594,
"learning_rate": 0.00011730504201581893,
"loss": 2.645807647705078,
"step": 1340
},
{
"epoch": 0.16875,
"grad_norm": 0.32414084672927856,
"learning_rate": 0.00011719427804891757,
"loss": 2.641864776611328,
"step": 1350
},
{
"epoch": 0.17,
"grad_norm": 0.31383687257766724,
"learning_rate": 0.00011708261627565232,
"loss": 2.662236785888672,
"step": 1360
},
{
"epoch": 0.17125,
"grad_norm": 0.31501343846321106,
"learning_rate": 0.00011697005850742364,
"loss": 2.6557693481445312,
"step": 1370
},
{
"epoch": 0.1725,
"grad_norm": 0.31809887290000916,
"learning_rate": 0.00011685660657016701,
"loss": 2.6280593872070312,
"step": 1380
},
{
"epoch": 0.17375,
"grad_norm": 0.31885311007499695,
"learning_rate": 0.0001167422623043233,
"loss": 2.6564004898071287,
"step": 1390
},
{
"epoch": 0.175,
"grad_norm": 0.3105798065662384,
"learning_rate": 0.00011662702756480891,
"loss": 2.64355354309082,
"step": 1400
},
{
"epoch": 0.17625,
"grad_norm": 0.3361447751522064,
"learning_rate": 0.00011651090422098569,
"loss": 2.6594215393066407,
"step": 1410
},
{
"epoch": 0.1775,
"grad_norm": 0.32253745198249817,
"learning_rate": 0.00011639389415663065,
"loss": 2.642239570617676,
"step": 1420
},
{
"epoch": 0.17875,
"grad_norm": 0.32338932156562805,
"learning_rate": 0.00011627599926990531,
"loss": 2.6702959060668947,
"step": 1430
},
{
"epoch": 0.18,
"grad_norm": 0.3116281032562256,
"learning_rate": 0.00011615722147332501,
"loss": 2.6370218276977537,
"step": 1440
},
{
"epoch": 0.18125,
"grad_norm": 0.3282069265842438,
"learning_rate": 0.00011603756269372781,
"loss": 2.589012336730957,
"step": 1450
},
{
"epoch": 0.1825,
"grad_norm": 0.32347872853279114,
"learning_rate": 0.00011591702487224326,
"loss": 2.638626480102539,
"step": 1460
},
{
"epoch": 0.18375,
"grad_norm": 0.31963029503822327,
"learning_rate": 0.0001157956099642609,
"loss": 2.6150590896606447,
"step": 1470
},
{
"epoch": 0.185,
"grad_norm": 0.31573331356048584,
"learning_rate": 0.00011567331993939861,
"loss": 2.6242300033569337,
"step": 1480
},
{
"epoch": 0.18625,
"grad_norm": 0.318210631608963,
"learning_rate": 0.00011555015678147051,
"loss": 2.6236839294433594,
"step": 1490
},
{
"epoch": 0.1875,
"grad_norm": 0.3299921751022339,
"learning_rate": 0.0001154261224884549,
"loss": 2.633551597595215,
"step": 1500
},
{
"epoch": 0.18875,
"grad_norm": 0.32802239060401917,
"learning_rate": 0.00011530121907246187,
"loss": 2.650678253173828,
"step": 1510
},
{
"epoch": 0.19,
"grad_norm": 0.3139156401157379,
"learning_rate": 0.0001151754485597005,
"loss": 2.6056631088256834,
"step": 1520
},
{
"epoch": 0.19125,
"grad_norm": 0.320236474275589,
"learning_rate": 0.00011504881299044619,
"loss": 2.6355617523193358,
"step": 1530
},
{
"epoch": 0.1925,
"grad_norm": 0.3379780054092407,
"learning_rate": 0.00011492131441900742,
"loss": 2.6405055999755858,
"step": 1540
},
{
"epoch": 0.19375,
"grad_norm": 0.3395773470401764,
"learning_rate": 0.00011479295491369245,
"loss": 2.6217134475708006,
"step": 1550
},
{
"epoch": 0.195,
"grad_norm": 0.33206456899642944,
"learning_rate": 0.00011466373655677584,
"loss": 2.6553268432617188,
"step": 1560
},
{
"epoch": 0.19625,
"grad_norm": 0.3266463577747345,
"learning_rate": 0.00011453366144446457,
"loss": 2.615655517578125,
"step": 1570
},
{
"epoch": 0.1975,
"grad_norm": 0.3166464567184448,
"learning_rate": 0.0001144027316868641,
"loss": 2.6240345001220704,
"step": 1580
},
{
"epoch": 0.19875,
"grad_norm": 0.31986290216445923,
"learning_rate": 0.00011427094940794416,
"loss": 2.6230613708496096,
"step": 1590
},
{
"epoch": 0.2,
"grad_norm": 0.3255802392959595,
"learning_rate": 0.00011413831674550421,
"loss": 2.6539276123046873,
"step": 1600
},
{
"epoch": 0.20125,
"grad_norm": 0.3255312144756317,
"learning_rate": 0.00011400483585113883,
"loss": 2.6217121124267577,
"step": 1610
},
{
"epoch": 0.2025,
"grad_norm": 0.3323643207550049,
"learning_rate": 0.0001138705088902028,
"loss": 2.652513885498047,
"step": 1620
},
{
"epoch": 0.20375,
"grad_norm": 0.3227868974208832,
"learning_rate": 0.00011373533804177592,
"loss": 2.630014991760254,
"step": 1630
},
{
"epoch": 0.205,
"grad_norm": 0.31701064109802246,
"learning_rate": 0.00011359932549862779,
"loss": 2.639967346191406,
"step": 1640
},
{
"epoch": 0.20625,
"grad_norm": 0.3187071681022644,
"learning_rate": 0.00011346247346718207,
"loss": 2.6362884521484373,
"step": 1650
},
{
"epoch": 0.2075,
"grad_norm": 0.31707099080085754,
"learning_rate": 0.00011332478416748083,
"loss": 2.649311065673828,
"step": 1660
},
{
"epoch": 0.20875,
"grad_norm": 0.3297825753688812,
"learning_rate": 0.00011318625983314848,
"loss": 2.6421716690063475,
"step": 1670
},
{
"epoch": 0.21,
"grad_norm": 0.3198815584182739,
"learning_rate": 0.00011304690271135548,
"loss": 2.633087730407715,
"step": 1680
},
{
"epoch": 0.21125,
"grad_norm": 0.3226505219936371,
"learning_rate": 0.00011290671506278205,
"loss": 2.6442310333251955,
"step": 1690
},
{
"epoch": 0.2125,
"grad_norm": 0.33370015025138855,
"learning_rate": 0.00011276569916158123,
"loss": 2.6304306030273437,
"step": 1700
},
{
"epoch": 0.21375,
"grad_norm": 0.3307320773601532,
"learning_rate": 0.0001126238572953423,
"loss": 2.6353145599365235,
"step": 1710
},
{
"epoch": 0.215,
"grad_norm": 0.31320619583129883,
"learning_rate": 0.00011248119176505343,
"loss": 2.6117172241210938,
"step": 1720
},
{
"epoch": 0.21625,
"grad_norm": 0.3411354422569275,
"learning_rate": 0.00011233770488506444,
"loss": 2.6199378967285156,
"step": 1730
},
{
"epoch": 0.2175,
"grad_norm": 0.3345658779144287,
"learning_rate": 0.0001121933989830493,
"loss": 2.617340850830078,
"step": 1740
},
{
"epoch": 0.21875,
"grad_norm": 0.328173965215683,
"learning_rate": 0.0001120482763999683,
"loss": 2.646270751953125,
"step": 1750
},
{
"epoch": 0.22,
"grad_norm": 0.31834596395492554,
"learning_rate": 0.00011190233949003007,
"loss": 2.6598697662353517,
"step": 1760
},
{
"epoch": 0.22125,
"grad_norm": 0.32211023569107056,
"learning_rate": 0.00011175559062065348,
"loss": 2.617197036743164,
"step": 1770
},
{
"epoch": 0.2225,
"grad_norm": 0.30770230293273926,
"learning_rate": 0.00011160803217242911,
"loss": 2.6376068115234377,
"step": 1780
},
{
"epoch": 0.22375,
"grad_norm": 0.3243764042854309,
"learning_rate": 0.00011145966653908078,
"loss": 2.606427764892578,
"step": 1790
},
{
"epoch": 0.225,
"grad_norm": 0.33548685908317566,
"learning_rate": 0.00011131049612742655,
"loss": 2.6384208679199217,
"step": 1800
},
{
"epoch": 0.22625,
"grad_norm": 0.3262486159801483,
"learning_rate": 0.00011116052335733979,
"loss": 2.658290672302246,
"step": 1810
},
{
"epoch": 0.2275,
"grad_norm": 0.31495559215545654,
"learning_rate": 0.00011100975066170992,
"loss": 2.662753105163574,
"step": 1820
},
{
"epoch": 0.22875,
"grad_norm": 0.3250574469566345,
"learning_rate": 0.00011085818048640288,
"loss": 2.6388259887695313,
"step": 1830
},
{
"epoch": 0.23,
"grad_norm": 0.34293144941329956,
"learning_rate": 0.00011070581529022152,
"loss": 2.6388187408447266,
"step": 1840
},
{
"epoch": 0.23125,
"grad_norm": 0.31609639525413513,
"learning_rate": 0.00011055265754486565,
"loss": 2.637576675415039,
"step": 1850
},
{
"epoch": 0.2325,
"grad_norm": 0.3181133270263672,
"learning_rate": 0.00011039870973489204,
"loss": 2.634903907775879,
"step": 1860
},
{
"epoch": 0.23375,
"grad_norm": 0.3416786193847656,
"learning_rate": 0.00011024397435767398,
"loss": 2.616485023498535,
"step": 1870
},
{
"epoch": 0.235,
"grad_norm": 0.315266489982605,
"learning_rate": 0.00011008845392336087,
"loss": 2.6373340606689455,
"step": 1880
},
{
"epoch": 0.23625,
"grad_norm": 0.3316870927810669,
"learning_rate": 0.0001099321509548375,
"loss": 2.6363605499267577,
"step": 1890
},
{
"epoch": 0.2375,
"grad_norm": 0.3230259418487549,
"learning_rate": 0.00010977506798768303,
"loss": 2.5958734512329102,
"step": 1900
},
{
"epoch": 0.23875,
"grad_norm": 0.32272425293922424,
"learning_rate": 0.00010961720757012995,
"loss": 2.608958435058594,
"step": 1910
},
{
"epoch": 0.24,
"grad_norm": 0.3131502568721771,
"learning_rate": 0.00010945857226302276,
"loss": 2.6321544647216797,
"step": 1920
},
{
"epoch": 0.24125,
"grad_norm": 0.32143065333366394,
"learning_rate": 0.00010929916463977628,
"loss": 2.613364410400391,
"step": 1930
},
{
"epoch": 0.2425,
"grad_norm": 0.3152971565723419,
"learning_rate": 0.00010913898728633408,
"loss": 2.613265800476074,
"step": 1940
},
{
"epoch": 0.24375,
"grad_norm": 0.32848265767097473,
"learning_rate": 0.00010897804280112643,
"loss": 2.6013004302978517,
"step": 1950
},
{
"epoch": 1.000875,
"grad_norm": 0.3237718939781189,
"learning_rate": 0.00010881633379502814,
"loss": 2.8611122131347657,
"step": 1960
},
{
"epoch": 1.002125,
"grad_norm": 0.3280915915966034,
"learning_rate": 0.00010865386289131632,
"loss": 2.5412445068359375,
"step": 1970
},
{
"epoch": 1.003375,
"grad_norm": 0.33189550042152405,
"learning_rate": 0.00010849063272562764,
"loss": 2.559256362915039,
"step": 1980
},
{
"epoch": 1.004625,
"grad_norm": 0.3265272378921509,
"learning_rate": 0.00010832664594591574,
"loss": 2.5583423614501952,
"step": 1990
},
{
"epoch": 1.005875,
"grad_norm": 0.3453090488910675,
"learning_rate": 0.00010816190521240819,
"loss": 2.5712684631347655,
"step": 2000
},
{
"epoch": 1.007125,
"grad_norm": 0.3423366844654083,
"learning_rate": 0.00010799641319756335,
"loss": 2.5412336349487306,
"step": 2010
},
{
"epoch": 1.008375,
"grad_norm": 0.32097378373146057,
"learning_rate": 0.00010783017258602704,
"loss": 2.5253084182739256,
"step": 2020
},
{
"epoch": 1.009625,
"grad_norm": 0.3252958357334137,
"learning_rate": 0.00010766318607458898,
"loss": 2.5738031387329103,
"step": 2030
},
{
"epoch": 1.010875,
"grad_norm": 0.3372173011302948,
"learning_rate": 0.00010749545637213897,
"loss": 2.54388370513916,
"step": 2040
},
{
"epoch": 1.012125,
"grad_norm": 0.33359599113464355,
"learning_rate": 0.00010732698619962306,
"loss": 2.55248908996582,
"step": 2050
},
{
"epoch": 1.013375,
"grad_norm": 0.34591928124427795,
"learning_rate": 0.00010715777828999937,
"loss": 2.5376352310180663,
"step": 2060
},
{
"epoch": 1.014625,
"grad_norm": 0.35322073101997375,
"learning_rate": 0.00010698783538819372,
"loss": 2.534122085571289,
"step": 2070
},
{
"epoch": 1.015875,
"grad_norm": 0.3539016544818878,
"learning_rate": 0.00010681716025105512,
"loss": 2.492664337158203,
"step": 2080
},
{
"epoch": 1.017125,
"grad_norm": 0.3429170548915863,
"learning_rate": 0.00010664575564731107,
"loss": 2.5008804321289064,
"step": 2090
},
{
"epoch": 1.018375,
"grad_norm": 0.3576091229915619,
"learning_rate": 0.00010647362435752263,
"loss": 2.5176633834838866,
"step": 2100
},
{
"epoch": 1.019625,
"grad_norm": 0.3297135829925537,
"learning_rate": 0.00010630076917403929,
"loss": 2.500911331176758,
"step": 2110
},
{
"epoch": 1.020875,
"grad_norm": 0.33164292573928833,
"learning_rate": 0.00010612719290095374,
"loss": 2.513214111328125,
"step": 2120
},
{
"epoch": 1.022125,
"grad_norm": 0.32680895924568176,
"learning_rate": 0.00010595289835405624,
"loss": 2.501193809509277,
"step": 2130
},
{
"epoch": 1.023375,
"grad_norm": 0.33465543389320374,
"learning_rate": 0.00010577788836078916,
"loss": 2.4999351501464844,
"step": 2140
},
{
"epoch": 1.024625,
"grad_norm": 0.3344171941280365,
"learning_rate": 0.00010560216576020092,
"loss": 2.4867813110351564,
"step": 2150
},
{
"epoch": 1.025875,
"grad_norm": 0.34607550501823425,
"learning_rate": 0.00010542573340289998,
"loss": 2.503824234008789,
"step": 2160
},
{
"epoch": 1.027125,
"grad_norm": 0.33892592787742615,
"learning_rate": 0.00010524859415100871,
"loss": 2.4990135192871095,
"step": 2170
},
{
"epoch": 1.028375,
"grad_norm": 0.3448082208633423,
"learning_rate": 0.00010507075087811677,
"loss": 2.4324840545654296,
"step": 2180
},
{
"epoch": 1.029625,
"grad_norm": 0.3321894407272339,
"learning_rate": 0.00010489220646923464,
"loss": 2.4842708587646483,
"step": 2190
},
{
"epoch": 1.030875,
"grad_norm": 0.3443576395511627,
"learning_rate": 0.0001047129638207468,
"loss": 2.485816764831543,
"step": 2200
},
{
"epoch": 1.032125,
"grad_norm": 0.3381134271621704,
"learning_rate": 0.00010453302584036468,
"loss": 2.4841537475585938,
"step": 2210
},
{
"epoch": 1.033375,
"grad_norm": 0.3401469588279724,
"learning_rate": 0.00010435239544707952,
"loss": 2.48382453918457,
"step": 2220
},
{
"epoch": 1.034625,
"grad_norm": 0.35364025831222534,
"learning_rate": 0.00010417107557111507,
"loss": 2.4872058868408202,
"step": 2230
},
{
"epoch": 1.035875,
"grad_norm": 0.3584776818752289,
"learning_rate": 0.00010398906915388,
"loss": 2.455089569091797,
"step": 2240
},
{
"epoch": 1.037125,
"grad_norm": 0.3385666608810425,
"learning_rate": 0.00010380637914792015,
"loss": 2.4457998275756836,
"step": 2250
},
{
"epoch": 1.038375,
"grad_norm": 0.3520835340023041,
"learning_rate": 0.00010362300851687071,
"loss": 2.479095458984375,
"step": 2260
},
{
"epoch": 1.039625,
"grad_norm": 0.34799060225486755,
"learning_rate": 0.00010343896023540814,
"loss": 2.4659198760986327,
"step": 2270
},
{
"epoch": 1.040875,
"grad_norm": 0.35186630487442017,
"learning_rate": 0.00010325423728920182,
"loss": 2.4467798233032227,
"step": 2280
},
{
"epoch": 1.042125,
"grad_norm": 0.3423445224761963,
"learning_rate": 0.00010306884267486574,
"loss": 2.4702438354492187,
"step": 2290
},
{
"epoch": 1.043375,
"grad_norm": 0.3398495018482208,
"learning_rate": 0.00010288277939990981,
"loss": 2.471152496337891,
"step": 2300
},
{
"epoch": 1.044625,
"grad_norm": 0.34717217087745667,
"learning_rate": 0.00010269605048269109,
"loss": 2.4720317840576174,
"step": 2310
},
{
"epoch": 1.045875,
"grad_norm": 0.34331125020980835,
"learning_rate": 0.00010250865895236482,
"loss": 2.4562469482421876,
"step": 2320
},
{
"epoch": 1.047125,
"grad_norm": 0.35022589564323425,
"learning_rate": 0.00010232060784883528,
"loss": 2.461803436279297,
"step": 2330
},
{
"epoch": 1.048375,
"grad_norm": 0.3725920617580414,
"learning_rate": 0.00010213190022270653,
"loss": 2.4350805282592773,
"step": 2340
},
{
"epoch": 1.049625,
"grad_norm": 0.3634240925312042,
"learning_rate": 0.00010194253913523282,
"loss": 2.454206848144531,
"step": 2350
},
{
"epoch": 1.050875,
"grad_norm": 0.35172227025032043,
"learning_rate": 0.000101752527658269,
"loss": 2.4318115234375,
"step": 2360
},
{
"epoch": 1.052125,
"grad_norm": 0.35827362537384033,
"learning_rate": 0.00010156186887422071,
"loss": 2.4692001342773438,
"step": 2370
},
{
"epoch": 1.053375,
"grad_norm": 0.36834755539894104,
"learning_rate": 0.00010137056587599428,
"loss": 2.4683910369873048,
"step": 2380
},
{
"epoch": 1.054625,
"grad_norm": 0.3573245108127594,
"learning_rate": 0.00010117862176694666,
"loss": 2.4428688049316407,
"step": 2390
},
{
"epoch": 1.055875,
"grad_norm": 0.33202221989631653,
"learning_rate": 0.00010098603966083503,
"loss": 2.4585454940795897,
"step": 2400
},
{
"epoch": 1.057125,
"grad_norm": 0.35598650574684143,
"learning_rate": 0.00010079282268176628,
"loss": 2.4740036010742186,
"step": 2410
},
{
"epoch": 1.058375,
"grad_norm": 0.36041730642318726,
"learning_rate": 0.00010059897396414633,
"loss": 2.4598981857299806,
"step": 2420
},
{
"epoch": 1.059625,
"grad_norm": 0.3481718599796295,
"learning_rate": 0.00010040449665262931,
"loss": 2.4539608001708983,
"step": 2430
},
{
"epoch": 1.060875,
"grad_norm": 0.3672044277191162,
"learning_rate": 0.00010020939390206654,
"loss": 2.433728790283203,
"step": 2440
},
{
"epoch": 1.062125,
"grad_norm": 0.35078802704811096,
"learning_rate": 0.00010001366887745531,
"loss": 2.454706573486328,
"step": 2450
},
{
"epoch": 1.063375,
"grad_norm": 0.36530086398124695,
"learning_rate": 9.981732475388758e-05,
"loss": 2.4748252868652343,
"step": 2460
},
{
"epoch": 1.064625,
"grad_norm": 0.3578907251358032,
"learning_rate": 9.962036471649851e-05,
"loss": 2.480423355102539,
"step": 2470
},
{
"epoch": 1.065875,
"grad_norm": 0.370403528213501,
"learning_rate": 9.942279196041466e-05,
"loss": 2.480521011352539,
"step": 2480
},
{
"epoch": 1.067125,
"grad_norm": 0.36263224482536316,
"learning_rate": 9.922460969070231e-05,
"loss": 2.4786655426025392,
"step": 2490
},
{
"epoch": 1.068375,
"grad_norm": 0.35858920216560364,
"learning_rate": 9.902582112231533e-05,
"loss": 2.461780548095703,
"step": 2500
},
{
"epoch": 1.069625,
"grad_norm": 0.35088691115379333,
"learning_rate": 9.882642948004314e-05,
"loss": 2.4797664642333985,
"step": 2510
},
{
"epoch": 1.070875,
"grad_norm": 0.36078205704689026,
"learning_rate": 9.862643799845839e-05,
"loss": 2.4529985427856444,
"step": 2520
},
{
"epoch": 1.072125,
"grad_norm": 0.35207876563072205,
"learning_rate": 9.842584992186434e-05,
"loss": 2.4753444671630858,
"step": 2530
},
{
"epoch": 1.073375,
"grad_norm": 0.36091098189353943,
"learning_rate": 9.822466850424243e-05,
"loss": 2.4327056884765623,
"step": 2540
},
{
"epoch": 1.074625,
"grad_norm": 0.3577967584133148,
"learning_rate": 9.802289700919933e-05,
"loss": 2.461964416503906,
"step": 2550
},
{
"epoch": 1.075875,
"grad_norm": 0.36178645491600037,
"learning_rate": 9.782053870991414e-05,
"loss": 2.4669708251953124,
"step": 2560
},
{
"epoch": 1.077125,
"grad_norm": 0.3427974581718445,
"learning_rate": 9.761759688908519e-05,
"loss": 2.4416053771972654,
"step": 2570
},
{
"epoch": 1.078375,
"grad_norm": 0.3656075894832611,
"learning_rate": 9.741407483887678e-05,
"loss": 2.4402462005615235,
"step": 2580
},
{
"epoch": 1.079625,
"grad_norm": 0.37002095580101013,
"learning_rate": 9.720997586086587e-05,
"loss": 2.451791191101074,
"step": 2590
},
{
"epoch": 1.080875,
"grad_norm": 0.3515098989009857,
"learning_rate": 9.700530326598842e-05,
"loss": 2.459187889099121,
"step": 2600
},
{
"epoch": 1.082125,
"grad_norm": 0.3765217959880829,
"learning_rate": 9.680006037448575e-05,
"loss": 2.4384769439697265,
"step": 2610
},
{
"epoch": 1.083375,
"grad_norm": 0.5566070675849915,
"learning_rate": 9.659425051585065e-05,
"loss": 2.4481531143188477,
"step": 2620
},
{
"epoch": 1.084625,
"grad_norm": 0.3657555878162384,
"learning_rate": 9.638787702877333e-05,
"loss": 2.470143508911133,
"step": 2630
},
{
"epoch": 1.085875,
"grad_norm": 0.35306495428085327,
"learning_rate": 9.618094326108734e-05,
"loss": 2.4623140335083007,
"step": 2640
},
{
"epoch": 1.087125,
"grad_norm": 0.35708507895469666,
"learning_rate": 9.597345256971521e-05,
"loss": 2.4393037796020507,
"step": 2650
},
{
"epoch": 1.088375,
"grad_norm": 0.36429449915885925,
"learning_rate": 9.576540832061398e-05,
"loss": 2.4460866928100584,
"step": 2660
},
{
"epoch": 1.089625,
"grad_norm": 0.3617342710494995,
"learning_rate": 9.555681388872065e-05,
"loss": 2.476423454284668,
"step": 2670
},
{
"epoch": 1.090875,
"grad_norm": 0.3526591360569,
"learning_rate": 9.534767265789737e-05,
"loss": 2.460892105102539,
"step": 2680
},
{
"epoch": 1.092125,
"grad_norm": 0.3697713613510132,
"learning_rate": 9.51379880208766e-05,
"loss": 2.46860294342041,
"step": 2690
},
{
"epoch": 1.093375,
"grad_norm": 0.37604451179504395,
"learning_rate": 9.492776337920603e-05,
"loss": 2.465809631347656,
"step": 2700
},
{
"epoch": 1.094625,
"grad_norm": 0.37269482016563416,
"learning_rate": 9.471700214319343e-05,
"loss": 2.4291683197021485,
"step": 2710
},
{
"epoch": 1.095875,
"grad_norm": 0.37273484468460083,
"learning_rate": 9.45057077318513e-05,
"loss": 2.447264862060547,
"step": 2720
},
{
"epoch": 1.097125,
"grad_norm": 0.3633696734905243,
"learning_rate": 9.429388357284143e-05,
"loss": 2.471749114990234,
"step": 2730
},
{
"epoch": 1.098375,
"grad_norm": 0.35682767629623413,
"learning_rate": 9.40815331024193e-05,
"loss": 2.42556209564209,
"step": 2740
},
{
"epoch": 1.099625,
"grad_norm": 0.3471936285495758,
"learning_rate": 9.386865976537827e-05,
"loss": 2.446389007568359,
"step": 2750
},
{
"epoch": 1.100875,
"grad_norm": 0.38089418411254883,
"learning_rate": 9.365526701499384e-05,
"loss": 2.4501571655273438,
"step": 2760
},
{
"epoch": 1.102125,
"grad_norm": 0.3654205799102783,
"learning_rate": 9.344135831296749e-05,
"loss": 2.439041519165039,
"step": 2770
},
{
"epoch": 1.103375,
"grad_norm": 0.3512708842754364,
"learning_rate": 9.322693712937054e-05,
"loss": 2.4336933135986327,
"step": 2780
},
{
"epoch": 1.104625,
"grad_norm": 0.36569294333457947,
"learning_rate": 9.301200694258795e-05,
"loss": 2.444048309326172,
"step": 2790
},
{
"epoch": 1.105875,
"grad_norm": 0.36901962757110596,
"learning_rate": 9.279657123926178e-05,
"loss": 2.4316547393798826,
"step": 2800
},
{
"epoch": 1.107125,
"grad_norm": 0.36593225598335266,
"learning_rate": 9.25806335142348e-05,
"loss": 2.4314062118530275,
"step": 2810
},
{
"epoch": 1.108375,
"grad_norm": 0.371039479970932,
"learning_rate": 9.236419727049352e-05,
"loss": 2.4478275299072267,
"step": 2820
},
{
"epoch": 1.109625,
"grad_norm": 0.3607841730117798,
"learning_rate": 9.214726601911162e-05,
"loss": 2.471347999572754,
"step": 2830
},
{
"epoch": 1.110875,
"grad_norm": 0.35733747482299805,
"learning_rate": 9.192984327919289e-05,
"loss": 2.4454570770263673,
"step": 2840
},
{
"epoch": 1.112125,
"grad_norm": 0.3512793183326721,
"learning_rate": 9.171193257781413e-05,
"loss": 2.4474578857421876,
"step": 2850
},
{
"epoch": 1.113375,
"grad_norm": 0.3591439127922058,
"learning_rate": 9.149353744996798e-05,
"loss": 2.3968666076660154,
"step": 2860
},
{
"epoch": 1.114625,
"grad_norm": 0.37512722611427307,
"learning_rate": 9.127466143850551e-05,
"loss": 2.4625476837158202,
"step": 2870
},
{
"epoch": 1.115875,
"grad_norm": 0.3683817982673645,
"learning_rate": 9.105530809407877e-05,
"loss": 2.4239782333374023,
"step": 2880
},
{
"epoch": 1.117125,
"grad_norm": 0.350392609834671,
"learning_rate": 9.08354809750833e-05,
"loss": 2.4604770660400392,
"step": 2890
},
{
"epoch": 1.118375,
"grad_norm": 0.3627133071422577,
"learning_rate": 9.061518364760018e-05,
"loss": 2.4404422760009767,
"step": 2900
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.8847708770441626e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}