{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 862,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011614401858304297,
"grad_norm": 1.2486553192138672,
"learning_rate": 1.111111111111111e-06,
"loss": 1.3051,
"step": 5
},
{
"epoch": 0.023228803716608595,
"grad_norm": 1.02664053440094,
"learning_rate": 2.4999999999999998e-06,
"loss": 1.3323,
"step": 10
},
{
"epoch": 0.03484320557491289,
"grad_norm": 0.8225151896476746,
"learning_rate": 3.888888888888889e-06,
"loss": 1.3047,
"step": 15
},
{
"epoch": 0.04645760743321719,
"grad_norm": 0.7819671630859375,
"learning_rate": 5.277777777777778e-06,
"loss": 1.2887,
"step": 20
},
{
"epoch": 0.05807200929152149,
"grad_norm": 0.6035093069076538,
"learning_rate": 6.666666666666667e-06,
"loss": 1.3248,
"step": 25
},
{
"epoch": 0.06968641114982578,
"grad_norm": 0.5274394750595093,
"learning_rate": 8.055555555555557e-06,
"loss": 1.2981,
"step": 30
},
{
"epoch": 0.08130081300813008,
"grad_norm": 0.4473659098148346,
"learning_rate": 9.444444444444445e-06,
"loss": 1.2253,
"step": 35
},
{
"epoch": 0.09291521486643438,
"grad_norm": 0.5344942808151245,
"learning_rate": 1.0833333333333334e-05,
"loss": 1.2101,
"step": 40
},
{
"epoch": 0.10452961672473868,
"grad_norm": 0.42955347895622253,
"learning_rate": 1.2222222222222222e-05,
"loss": 1.2136,
"step": 45
},
{
"epoch": 0.11614401858304298,
"grad_norm": 0.4606517553329468,
"learning_rate": 1.3611111111111111e-05,
"loss": 1.2501,
"step": 50
},
{
"epoch": 0.12775842044134728,
"grad_norm": 0.43243467807769775,
"learning_rate": 1.5e-05,
"loss": 1.2287,
"step": 55
},
{
"epoch": 0.13937282229965156,
"grad_norm": 0.45852982997894287,
"learning_rate": 1.638888888888889e-05,
"loss": 1.2272,
"step": 60
},
{
"epoch": 0.15098722415795587,
"grad_norm": 0.422735333442688,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.1591,
"step": 65
},
{
"epoch": 0.16260162601626016,
"grad_norm": 0.48696285486221313,
"learning_rate": 1.9166666666666667e-05,
"loss": 1.2238,
"step": 70
},
{
"epoch": 0.17421602787456447,
"grad_norm": 0.6145470142364502,
"learning_rate": 2.0555555555555558e-05,
"loss": 1.2375,
"step": 75
},
{
"epoch": 0.18583042973286876,
"grad_norm": 0.5468384623527527,
"learning_rate": 2.1944444444444445e-05,
"loss": 1.1484,
"step": 80
},
{
"epoch": 0.19744483159117304,
"grad_norm": 0.4397794008255005,
"learning_rate": 2.3333333333333336e-05,
"loss": 1.1567,
"step": 85
},
{
"epoch": 0.20905923344947736,
"grad_norm": 0.4396965801715851,
"learning_rate": 2.4722222222222223e-05,
"loss": 1.1665,
"step": 90
},
{
"epoch": 0.22067363530778164,
"grad_norm": 0.5894995927810669,
"learning_rate": 2.611111111111111e-05,
"loss": 1.1519,
"step": 95
},
{
"epoch": 0.23228803716608595,
"grad_norm": 0.4796619117259979,
"learning_rate": 2.75e-05,
"loss": 1.1242,
"step": 100
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.7274127006530762,
"learning_rate": 2.8888888888888888e-05,
"loss": 1.1197,
"step": 105
},
{
"epoch": 0.25551684088269455,
"grad_norm": 0.5538251996040344,
"learning_rate": 2.999998233452831e-05,
"loss": 1.09,
"step": 110
},
{
"epoch": 0.26713124274099886,
"grad_norm": 0.4963814616203308,
"learning_rate": 2.999936404738799e-05,
"loss": 1.14,
"step": 115
},
{
"epoch": 0.2787456445993031,
"grad_norm": 0.5045656561851501,
"learning_rate": 2.9997862528271754e-05,
"loss": 1.0846,
"step": 120
},
{
"epoch": 0.29036004645760743,
"grad_norm": 0.5694212913513184,
"learning_rate": 2.999547786559598e-05,
"loss": 1.0566,
"step": 125
},
{
"epoch": 0.30197444831591175,
"grad_norm": 0.5698084831237793,
"learning_rate": 2.9992210199780657e-05,
"loss": 1.0812,
"step": 130
},
{
"epoch": 0.313588850174216,
"grad_norm": 0.5170321464538574,
"learning_rate": 2.9988059723241064e-05,
"loss": 1.0414,
"step": 135
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.5222340226173401,
"learning_rate": 2.9983026680376472e-05,
"loss": 1.0631,
"step": 140
},
{
"epoch": 0.33681765389082463,
"grad_norm": 0.5456163883209229,
"learning_rate": 2.997711136755574e-05,
"loss": 1.0866,
"step": 145
},
{
"epoch": 0.34843205574912894,
"grad_norm": 0.5397904515266418,
"learning_rate": 2.9970314133099855e-05,
"loss": 1.1579,
"step": 150
},
{
"epoch": 0.3600464576074332,
"grad_norm": 0.6353335380554199,
"learning_rate": 2.9962635377261457e-05,
"loss": 0.9954,
"step": 155
},
{
"epoch": 0.3716608594657375,
"grad_norm": 0.6079632639884949,
"learning_rate": 2.9954075552201222e-05,
"loss": 1.0647,
"step": 160
},
{
"epoch": 0.3832752613240418,
"grad_norm": 0.6199746131896973,
"learning_rate": 2.994463516196126e-05,
"loss": 1.0637,
"step": 165
},
{
"epoch": 0.3948896631823461,
"grad_norm": 0.6312094926834106,
"learning_rate": 2.9934314762435444e-05,
"loss": 1.0242,
"step": 170
},
{
"epoch": 0.4065040650406504,
"grad_norm": 0.677851140499115,
"learning_rate": 2.9923114961336672e-05,
"loss": 0.9563,
"step": 175
},
{
"epoch": 0.4181184668989547,
"grad_norm": 0.6415532827377319,
"learning_rate": 2.9911036418161058e-05,
"loss": 0.9728,
"step": 180
},
{
"epoch": 0.429732868757259,
"grad_norm": 0.7620236873626709,
"learning_rate": 2.9898079844149132e-05,
"loss": 1.0066,
"step": 185
},
{
"epoch": 0.4413472706155633,
"grad_norm": 0.6720292568206787,
"learning_rate": 2.9884246002243936e-05,
"loss": 1.0059,
"step": 190
},
{
"epoch": 0.4529616724738676,
"grad_norm": 0.7387037873268127,
"learning_rate": 2.9869535707046104e-05,
"loss": 0.9314,
"step": 195
},
{
"epoch": 0.4645760743321719,
"grad_norm": 0.6071408987045288,
"learning_rate": 2.985394982476591e-05,
"loss": 0.9805,
"step": 200
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.7038552165031433,
"learning_rate": 2.9837489273172232e-05,
"loss": 0.9766,
"step": 205
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.7892246842384338,
"learning_rate": 2.9820155021538533e-05,
"loss": 0.9558,
"step": 210
},
{
"epoch": 0.4994192799070848,
"grad_norm": 0.7563413381576538,
"learning_rate": 2.980194809058577e-05,
"loss": 0.9455,
"step": 215
},
{
"epoch": 0.5110336817653891,
"grad_norm": 0.7255906462669373,
"learning_rate": 2.9782869552422316e-05,
"loss": 0.9183,
"step": 220
},
{
"epoch": 0.5226480836236934,
"grad_norm": 0.7455199956893921,
"learning_rate": 2.9762920530480788e-05,
"loss": 0.9363,
"step": 225
},
{
"epoch": 0.5342624854819977,
"grad_norm": 0.7163615822792053,
"learning_rate": 2.974210219945193e-05,
"loss": 0.9436,
"step": 230
},
{
"epoch": 0.5458768873403019,
"grad_norm": 0.6869027018547058,
"learning_rate": 2.9720415785215428e-05,
"loss": 0.8932,
"step": 235
},
{
"epoch": 0.5574912891986062,
"grad_norm": 0.776175320148468,
"learning_rate": 2.969786256476772e-05,
"loss": 0.9234,
"step": 240
},
{
"epoch": 0.5691056910569106,
"grad_norm": 0.687179446220398,
"learning_rate": 2.9674443866146807e-05,
"loss": 0.9165,
"step": 245
},
{
"epoch": 0.5807200929152149,
"grad_norm": 0.8298389911651611,
"learning_rate": 2.9650161068354054e-05,
"loss": 0.8863,
"step": 250
},
{
"epoch": 0.5923344947735192,
"grad_norm": 0.7793598175048828,
"learning_rate": 2.9625015601272974e-05,
"loss": 0.8693,
"step": 255
},
{
"epoch": 0.6039488966318235,
"grad_norm": 0.9237378835678101,
"learning_rate": 2.9599008945585066e-05,
"loss": 0.8435,
"step": 260
},
{
"epoch": 0.6155632984901278,
"grad_norm": 0.8277608752250671,
"learning_rate": 2.9572142632682562e-05,
"loss": 0.8683,
"step": 265
},
{
"epoch": 0.627177700348432,
"grad_norm": 0.7906745076179504,
"learning_rate": 2.954441824457832e-05,
"loss": 0.9144,
"step": 270
},
{
"epoch": 0.6387921022067363,
"grad_norm": 0.8107167482376099,
"learning_rate": 2.951583741381263e-05,
"loss": 0.8696,
"step": 275
},
{
"epoch": 0.6504065040650406,
"grad_norm": 0.734391987323761,
"learning_rate": 2.948640182335708e-05,
"loss": 0.871,
"step": 280
},
{
"epoch": 0.662020905923345,
"grad_norm": 0.8837676048278809,
"learning_rate": 2.9456113206515475e-05,
"loss": 0.8883,
"step": 285
},
{
"epoch": 0.6736353077816493,
"grad_norm": 0.8748791217803955,
"learning_rate": 2.942497334682176e-05,
"loss": 0.7801,
"step": 290
},
{
"epoch": 0.6852497096399536,
"grad_norm": 0.8546818494796753,
"learning_rate": 2.9392984077934987e-05,
"loss": 0.8724,
"step": 295
},
{
"epoch": 0.6968641114982579,
"grad_norm": 0.7864014506340027,
"learning_rate": 2.9360147283531373e-05,
"loss": 0.8214,
"step": 300
},
{
"epoch": 0.7084785133565621,
"grad_norm": 0.7984604835510254,
"learning_rate": 2.9326464897193343e-05,
"loss": 0.836,
"step": 305
},
{
"epoch": 0.7200929152148664,
"grad_norm": 0.9750844836235046,
"learning_rate": 2.9291938902295695e-05,
"loss": 0.7873,
"step": 310
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.8499948382377625,
"learning_rate": 2.925657133188881e-05,
"loss": 0.8412,
"step": 315
},
{
"epoch": 0.743321718931475,
"grad_norm": 0.86067134141922,
"learning_rate": 2.9220364268578922e-05,
"loss": 0.8256,
"step": 320
},
{
"epoch": 0.7549361207897793,
"grad_norm": 0.8807356357574463,
"learning_rate": 2.918331984440549e-05,
"loss": 0.8364,
"step": 325
},
{
"epoch": 0.7665505226480837,
"grad_norm": 0.8951946496963501,
"learning_rate": 2.9145440240715657e-05,
"loss": 0.8415,
"step": 330
},
{
"epoch": 0.778164924506388,
"grad_norm": 0.8705118894577026,
"learning_rate": 2.9106727688035814e-05,
"loss": 0.7762,
"step": 335
},
{
"epoch": 0.7897793263646922,
"grad_norm": 0.9147759675979614,
"learning_rate": 2.9067184465940225e-05,
"loss": 0.7983,
"step": 340
},
{
"epoch": 0.8013937282229965,
"grad_norm": 0.8624527454376221,
"learning_rate": 2.9026812902916834e-05,
"loss": 0.7434,
"step": 345
},
{
"epoch": 0.8130081300813008,
"grad_norm": 0.9535494446754456,
"learning_rate": 2.898561537623011e-05,
"loss": 0.8049,
"step": 350
},
{
"epoch": 0.8246225319396051,
"grad_norm": 0.836469292640686,
"learning_rate": 2.8943594311781104e-05,
"loss": 0.777,
"step": 355
},
{
"epoch": 0.8362369337979094,
"grad_norm": 0.9156211018562317,
"learning_rate": 2.8900752183964573e-05,
"loss": 0.7746,
"step": 360
},
{
"epoch": 0.8478513356562137,
"grad_norm": 0.9176764488220215,
"learning_rate": 2.8857091515523287e-05,
"loss": 0.7827,
"step": 365
},
{
"epoch": 0.859465737514518,
"grad_norm": 0.8853964805603027,
"learning_rate": 2.8812614877399476e-05,
"loss": 0.754,
"step": 370
},
{
"epoch": 0.8710801393728222,
"grad_norm": 0.8558771014213562,
"learning_rate": 2.876732488858344e-05,
"loss": 0.7039,
"step": 375
},
{
"epoch": 0.8826945412311266,
"grad_norm": 0.8176305294036865,
"learning_rate": 2.8721224215959335e-05,
"loss": 0.7892,
"step": 380
},
{
"epoch": 0.8943089430894309,
"grad_norm": 0.9530948996543884,
"learning_rate": 2.8674315574148126e-05,
"loss": 0.7049,
"step": 385
},
{
"epoch": 0.9059233449477352,
"grad_norm": 0.9136149287223816,
"learning_rate": 2.862660172534776e-05,
"loss": 0.7614,
"step": 390
},
{
"epoch": 0.9175377468060395,
"grad_norm": 0.9666270017623901,
"learning_rate": 2.8578085479170478e-05,
"loss": 0.7265,
"step": 395
},
{
"epoch": 0.9291521486643438,
"grad_norm": 0.9841073751449585,
"learning_rate": 2.85287696924774e-05,
"loss": 0.6903,
"step": 400
},
{
"epoch": 0.9407665505226481,
"grad_norm": 1.1007276773452759,
"learning_rate": 2.8478657269210294e-05,
"loss": 0.7114,
"step": 405
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.9525352120399475,
"learning_rate": 2.8427751160220573e-05,
"loss": 0.7115,
"step": 410
},
{
"epoch": 0.9639953542392566,
"grad_norm": 0.8143008351325989,
"learning_rate": 2.8376054363095545e-05,
"loss": 0.6784,
"step": 415
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.9916719198226929,
"learning_rate": 2.8323569921981885e-05,
"loss": 0.7366,
"step": 420
},
{
"epoch": 0.9872241579558653,
"grad_norm": 0.9247941374778748,
"learning_rate": 2.82703009274064e-05,
"loss": 0.6974,
"step": 425
},
{
"epoch": 0.9988385598141696,
"grad_norm": 0.9906225800514221,
"learning_rate": 2.8216250516094027e-05,
"loss": 0.6673,
"step": 430
},
{
"epoch": 1.0092915214866434,
"grad_norm": 0.9208662509918213,
"learning_rate": 2.816142187078315e-05,
"loss": 0.603,
"step": 435
},
{
"epoch": 1.0209059233449478,
"grad_norm": 1.0540852546691895,
"learning_rate": 2.8105818220038167e-05,
"loss": 0.6062,
"step": 440
},
{
"epoch": 1.032520325203252,
"grad_norm": 0.9971239566802979,
"learning_rate": 2.804944283805938e-05,
"loss": 0.5986,
"step": 445
},
{
"epoch": 1.0441347270615564,
"grad_norm": 1.0977699756622314,
"learning_rate": 2.7992299044490192e-05,
"loss": 0.5556,
"step": 450
},
{
"epoch": 1.0557491289198606,
"grad_norm": 1.0428180694580078,
"learning_rate": 2.793439020422165e-05,
"loss": 0.599,
"step": 455
},
{
"epoch": 1.0673635307781648,
"grad_norm": 1.2783362865447998,
"learning_rate": 2.787571972719429e-05,
"loss": 0.5589,
"step": 460
},
{
"epoch": 1.0789779326364692,
"grad_norm": 1.0532207489013672,
"learning_rate": 2.781629106819733e-05,
"loss": 0.5534,
"step": 465
},
{
"epoch": 1.0905923344947734,
"grad_norm": 1.0525319576263428,
"learning_rate": 2.775610772666527e-05,
"loss": 0.577,
"step": 470
},
{
"epoch": 1.1022067363530779,
"grad_norm": 0.9602555632591248,
"learning_rate": 2.7695173246471803e-05,
"loss": 0.5625,
"step": 475
},
{
"epoch": 1.113821138211382,
"grad_norm": 1.0547131299972534,
"learning_rate": 2.763349121572114e-05,
"loss": 0.6173,
"step": 480
},
{
"epoch": 1.1254355400696865,
"grad_norm": 0.963524341583252,
"learning_rate": 2.7571065266536737e-05,
"loss": 0.62,
"step": 485
},
{
"epoch": 1.1370499419279907,
"grad_norm": 1.066751480102539,
"learning_rate": 2.7507899074847394e-05,
"loss": 0.5829,
"step": 490
},
{
"epoch": 1.1486643437862951,
"grad_norm": 0.9866227507591248,
"learning_rate": 2.7443996360170836e-05,
"loss": 0.5729,
"step": 495
},
{
"epoch": 1.1602787456445993,
"grad_norm": 1.037983775138855,
"learning_rate": 2.7379360885394664e-05,
"loss": 0.5955,
"step": 500
},
{
"epoch": 1.1718931475029035,
"grad_norm": 1.2030766010284424,
"learning_rate": 2.731399645655477e-05,
"loss": 0.6062,
"step": 505
},
{
"epoch": 1.183507549361208,
"grad_norm": 1.0329967737197876,
"learning_rate": 2.7247906922611254e-05,
"loss": 0.5871,
"step": 510
},
{
"epoch": 1.1951219512195121,
"grad_norm": 1.1776123046875,
"learning_rate": 2.7181096175221757e-05,
"loss": 0.5755,
"step": 515
},
{
"epoch": 1.2067363530778166,
"grad_norm": 1.0537413358688354,
"learning_rate": 2.7113568148512296e-05,
"loss": 0.545,
"step": 520
},
{
"epoch": 1.2183507549361208,
"grad_norm": 0.9902351498603821,
"learning_rate": 2.704532681884562e-05,
"loss": 0.5564,
"step": 525
},
{
"epoch": 1.229965156794425,
"grad_norm": 1.0097850561141968,
"learning_rate": 2.697637620458706e-05,
"loss": 0.5056,
"step": 530
},
{
"epoch": 1.2415795586527294,
"grad_norm": 0.9843356013298035,
"learning_rate": 2.690672036586791e-05,
"loss": 0.5348,
"step": 535
},
{
"epoch": 1.2531939605110336,
"grad_norm": 0.9540740251541138,
"learning_rate": 2.6836363404346324e-05,
"loss": 0.5411,
"step": 540
},
{
"epoch": 1.264808362369338,
"grad_norm": 1.0271143913269043,
"learning_rate": 2.6765309462965845e-05,
"loss": 0.5065,
"step": 545
},
{
"epoch": 1.2764227642276422,
"grad_norm": 1.077792763710022,
"learning_rate": 2.669356272571138e-05,
"loss": 0.5307,
"step": 550
},
{
"epoch": 1.2880371660859466,
"grad_norm": 1.0701345205307007,
"learning_rate": 2.6621127417362886e-05,
"loss": 0.5587,
"step": 555
},
{
"epoch": 1.2996515679442509,
"grad_norm": 1.0792369842529297,
"learning_rate": 2.6548007803246575e-05,
"loss": 0.5264,
"step": 560
},
{
"epoch": 1.3112659698025553,
"grad_norm": 0.8799574375152588,
"learning_rate": 2.647420818898373e-05,
"loss": 0.4964,
"step": 565
},
{
"epoch": 1.3228803716608595,
"grad_norm": 1.1636409759521484,
"learning_rate": 2.6399732920237212e-05,
"loss": 0.5576,
"step": 570
},
{
"epoch": 1.3344947735191637,
"grad_norm": 1.055418610572815,
"learning_rate": 2.6324586382455525e-05,
"loss": 0.5149,
"step": 575
},
{
"epoch": 1.346109175377468,
"grad_norm": 1.0145419836044312,
"learning_rate": 2.624877300061462e-05,
"loss": 0.4917,
"step": 580
},
{
"epoch": 1.3577235772357723,
"grad_norm": 0.9634491801261902,
"learning_rate": 2.6172297238957297e-05,
"loss": 0.515,
"step": 585
},
{
"epoch": 1.3693379790940767,
"grad_norm": 0.9772712588310242,
"learning_rate": 2.6095163600730355e-05,
"loss": 0.5012,
"step": 590
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.9910968542098999,
"learning_rate": 2.6017376627919405e-05,
"loss": 0.4839,
"step": 595
},
{
"epoch": 1.3925667828106851,
"grad_norm": 1.103830099105835,
"learning_rate": 2.5938940900981424e-05,
"loss": 0.562,
"step": 600
},
{
"epoch": 1.4041811846689896,
"grad_norm": 1.1236923933029175,
"learning_rate": 2.5859861038575035e-05,
"loss": 0.493,
"step": 605
},
{
"epoch": 1.415795586527294,
"grad_norm": 0.9853049516677856,
"learning_rate": 2.5780141697288537e-05,
"loss": 0.4578,
"step": 610
},
{
"epoch": 1.4274099883855982,
"grad_norm": 1.02285635471344,
"learning_rate": 2.5699787571365704e-05,
"loss": 0.5141,
"step": 615
},
{
"epoch": 1.4390243902439024,
"grad_norm": 1.1389844417572021,
"learning_rate": 2.5618803392429373e-05,
"loss": 0.4684,
"step": 620
},
{
"epoch": 1.4506387921022068,
"grad_norm": 1.1375923156738281,
"learning_rate": 2.5537193929202815e-05,
"loss": 0.4761,
"step": 625
},
{
"epoch": 1.462253193960511,
"grad_norm": 1.046151876449585,
"learning_rate": 2.5454963987228926e-05,
"loss": 0.5179,
"step": 630
},
{
"epoch": 1.4738675958188154,
"grad_norm": 1.1057745218276978,
"learning_rate": 2.5372118408587284e-05,
"loss": 0.489,
"step": 635
},
{
"epoch": 1.4854819976771196,
"grad_norm": 1.032041311264038,
"learning_rate": 2.5288662071608975e-05,
"loss": 0.4871,
"step": 640
},
{
"epoch": 1.4970963995354238,
"grad_norm": 1.0814687013626099,
"learning_rate": 2.520459989058939e-05,
"loss": 0.5172,
"step": 645
},
{
"epoch": 1.5087108013937283,
"grad_norm": 1.0195306539535522,
"learning_rate": 2.5119936815498797e-05,
"loss": 0.5218,
"step": 650
},
{
"epoch": 1.5203252032520327,
"grad_norm": 0.9469033479690552,
"learning_rate": 2.503467783169091e-05,
"loss": 0.4883,
"step": 655
},
{
"epoch": 1.5319396051103369,
"grad_norm": 1.1030808687210083,
"learning_rate": 2.4948827959609285e-05,
"loss": 0.501,
"step": 660
},
{
"epoch": 1.543554006968641,
"grad_norm": 1.0832819938659668,
"learning_rate": 2.4862392254491736e-05,
"loss": 0.4813,
"step": 665
},
{
"epoch": 1.5551684088269453,
"grad_norm": 1.1702861785888672,
"learning_rate": 2.477537580607261e-05,
"loss": 0.4737,
"step": 670
},
{
"epoch": 1.5667828106852497,
"grad_norm": 1.0441266298294067,
"learning_rate": 2.4687783738283144e-05,
"loss": 0.454,
"step": 675
},
{
"epoch": 1.5783972125435541,
"grad_norm": 1.1927803754806519,
"learning_rate": 2.4599621208949674e-05,
"loss": 0.4328,
"step": 680
},
{
"epoch": 1.5900116144018583,
"grad_norm": 1.1129752397537231,
"learning_rate": 2.4510893409489967e-05,
"loss": 0.4024,
"step": 685
},
{
"epoch": 1.6016260162601625,
"grad_norm": 1.0736514329910278,
"learning_rate": 2.4421605564607514e-05,
"loss": 0.4499,
"step": 690
},
{
"epoch": 1.6132404181184667,
"grad_norm": 1.0657414197921753,
"learning_rate": 2.4331762931983866e-05,
"loss": 0.4649,
"step": 695
},
{
"epoch": 1.6248548199767712,
"grad_norm": 1.0618281364440918,
"learning_rate": 2.4241370801969045e-05,
"loss": 0.4337,
"step": 700
},
{
"epoch": 1.6364692218350756,
"grad_norm": 1.1562613248825073,
"learning_rate": 2.415043449727003e-05,
"loss": 0.4429,
"step": 705
},
{
"epoch": 1.6480836236933798,
"grad_norm": 1.1286054849624634,
"learning_rate": 2.4058959372637304e-05,
"loss": 0.4714,
"step": 710
},
{
"epoch": 1.659698025551684,
"grad_norm": 1.043707251548767,
"learning_rate": 2.396695081454959e-05,
"loss": 0.4549,
"step": 715
},
{
"epoch": 1.6713124274099884,
"grad_norm": 1.001025676727295,
"learning_rate": 2.387441424089662e-05,
"loss": 0.4843,
"step": 720
},
{
"epoch": 1.6829268292682928,
"grad_norm": 1.0033694505691528,
"learning_rate": 2.378135510066013e-05,
"loss": 0.4203,
"step": 725
},
{
"epoch": 1.694541231126597,
"grad_norm": 1.041979432106018,
"learning_rate": 2.3687778873593e-05,
"loss": 0.4609,
"step": 730
},
{
"epoch": 1.7061556329849012,
"grad_norm": 1.0614110231399536,
"learning_rate": 2.3593691069896582e-05,
"loss": 0.4328,
"step": 735
},
{
"epoch": 1.7177700348432055,
"grad_norm": 1.1184892654418945,
"learning_rate": 2.3499097229896213e-05,
"loss": 0.4323,
"step": 740
},
{
"epoch": 1.7293844367015099,
"grad_norm": 1.0571225881576538,
"learning_rate": 2.340400292371499e-05,
"loss": 0.3688,
"step": 745
},
{
"epoch": 1.7409988385598143,
"grad_norm": 0.9831321239471436,
"learning_rate": 2.3308413750945788e-05,
"loss": 0.4091,
"step": 750
},
{
"epoch": 1.7526132404181185,
"grad_norm": 1.334681749343872,
"learning_rate": 2.3212335340321518e-05,
"loss": 0.4363,
"step": 755
},
{
"epoch": 1.7642276422764227,
"grad_norm": 1.0677987337112427,
"learning_rate": 2.3115773349383658e-05,
"loss": 0.4202,
"step": 760
},
{
"epoch": 1.775842044134727,
"grad_norm": 1.1645525693893433,
"learning_rate": 2.3018733464149156e-05,
"loss": 0.4515,
"step": 765
},
{
"epoch": 1.7874564459930313,
"grad_norm": 1.0856503248214722,
"learning_rate": 2.292122139877558e-05,
"loss": 0.401,
"step": 770
},
{
"epoch": 1.7990708478513358,
"grad_norm": 1.0238492488861084,
"learning_rate": 2.2823242895224643e-05,
"loss": 0.3955,
"step": 775
},
{
"epoch": 1.81068524970964,
"grad_norm": 1.1932063102722168,
"learning_rate": 2.2724803722924106e-05,
"loss": 0.4154,
"step": 780
},
{
"epoch": 1.8222996515679442,
"grad_norm": 1.0874030590057373,
"learning_rate": 2.2625909678428038e-05,
"loss": 0.4082,
"step": 785
},
{
"epoch": 1.8339140534262486,
"grad_norm": 1.109732747077942,
"learning_rate": 2.2526566585075485e-05,
"loss": 0.3826,
"step": 790
},
{
"epoch": 1.845528455284553,
"grad_norm": 1.0591766834259033,
"learning_rate": 2.2426780292647568e-05,
"loss": 0.388,
"step": 795
},
{
"epoch": 1.8571428571428572,
"grad_norm": 1.1858235597610474,
"learning_rate": 2.2326556677023017e-05,
"loss": 0.4026,
"step": 800
},
{
"epoch": 1.8687572590011614,
"grad_norm": 1.0293341875076294,
"learning_rate": 2.2225901639832188e-05,
"loss": 0.419,
"step": 805
},
{
"epoch": 1.8803716608594656,
"grad_norm": 1.1112502813339233,
"learning_rate": 2.2124821108109515e-05,
"loss": 0.4005,
"step": 810
},
{
"epoch": 1.89198606271777,
"grad_norm": 0.9960114359855652,
"learning_rate": 2.2023321033944544e-05,
"loss": 0.328,
"step": 815
},
{
"epoch": 1.9036004645760745,
"grad_norm": 1.1106208562850952,
"learning_rate": 2.1921407394131406e-05,
"loss": 0.3667,
"step": 820
},
{
"epoch": 1.9152148664343787,
"grad_norm": 1.0423858165740967,
"learning_rate": 2.1819086189816893e-05,
"loss": 0.3773,
"step": 825
},
{
"epoch": 1.9268292682926829,
"grad_norm": 1.13150155544281,
"learning_rate": 2.171636344614708e-05,
"loss": 0.3491,
"step": 830
},
{
"epoch": 1.938443670150987,
"grad_norm": 1.0611273050308228,
"learning_rate": 2.1613245211912554e-05,
"loss": 0.3323,
"step": 835
},
{
"epoch": 1.9500580720092915,
"grad_norm": 0.9943997263908386,
"learning_rate": 2.1509737559192188e-05,
"loss": 0.3741,
"step": 840
},
{
"epoch": 1.961672473867596,
"grad_norm": 0.9484174847602844,
"learning_rate": 2.140584658299564e-05,
"loss": 0.3415,
"step": 845
},
{
"epoch": 1.9732868757259001,
"grad_norm": 1.0280919075012207,
"learning_rate": 2.1301578400904424e-05,
"loss": 0.3724,
"step": 850
},
{
"epoch": 1.9849012775842043,
"grad_norm": 1.1084401607513428,
"learning_rate": 2.119693915271168e-05,
"loss": 0.3867,
"step": 855
},
{
"epoch": 1.9965156794425087,
"grad_norm": 1.1195417642593384,
"learning_rate": 2.1091935000060637e-05,
"loss": 0.3641,
"step": 860
}
],
"logging_steps": 5,
"max_steps": 2155,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3574620943199764e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}