{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998870907038013,
"eval_steps": 500,
"global_step": 1992,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 0.2066432386636734,
"learning_rate": 9.999384369486675e-05,
"loss": 0.6539,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 0.1350802630186081,
"learning_rate": 9.997525241303441e-05,
"loss": 0.4242,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 0.12290512770414352,
"learning_rate": 9.994423062331178e-05,
"loss": 0.4085,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 0.10319065302610397,
"learning_rate": 9.990078604185e-05,
"loss": 0.3843,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 0.1067107692360878,
"learning_rate": 9.984492947476183e-05,
"loss": 0.3814,
"step": 50
},
{
"epoch": 0.09,
"grad_norm": 0.10464764386415482,
"learning_rate": 9.977667481543383e-05,
"loss": 0.3806,
"step": 60
},
{
"epoch": 0.11,
"grad_norm": 0.10096515715122223,
"learning_rate": 9.969603904107045e-05,
"loss": 0.3823,
"step": 70
},
{
"epoch": 0.12,
"grad_norm": 0.10181506723165512,
"learning_rate": 9.960304220847147e-05,
"loss": 0.3717,
"step": 80
},
{
"epoch": 0.14,
"grad_norm": 0.10328900068998337,
"learning_rate": 9.949770744904306e-05,
"loss": 0.3761,
"step": 90
},
{
"epoch": 0.15,
"grad_norm": 0.10094640403985977,
"learning_rate": 9.938006096304422e-05,
"loss": 0.3766,
"step": 100
},
{
"epoch": 0.17,
"grad_norm": 0.10436520725488663,
"learning_rate": 9.925013201306999e-05,
"loss": 0.3815,
"step": 110
},
{
"epoch": 0.18,
"grad_norm": 0.09791135042905807,
"learning_rate": 9.910795291677279e-05,
"loss": 0.3793,
"step": 120
},
{
"epoch": 0.2,
"grad_norm": 0.10530658811330795,
"learning_rate": 9.8953559038824e-05,
"loss": 0.3746,
"step": 130
},
{
"epoch": 0.21,
"grad_norm": 0.10106656700372696,
"learning_rate": 9.878698878211756e-05,
"loss": 0.3627,
"step": 140
},
{
"epoch": 0.23,
"grad_norm": 0.10589198768138885,
"learning_rate": 9.86082835782179e-05,
"loss": 0.3585,
"step": 150
},
{
"epoch": 0.24,
"grad_norm": 0.09950920939445496,
"learning_rate": 9.841748787705453e-05,
"loss": 0.3648,
"step": 160
},
{
"epoch": 0.26,
"grad_norm": 0.09534649550914764,
"learning_rate": 9.821464913586586e-05,
"loss": 0.3714,
"step": 170
},
{
"epoch": 0.27,
"grad_norm": 0.1069357767701149,
"learning_rate": 9.799981780739504e-05,
"loss": 0.3691,
"step": 180
},
{
"epoch": 0.29,
"grad_norm": 0.0956304594874382,
"learning_rate": 9.777304732734063e-05,
"loss": 0.3621,
"step": 190
},
{
"epoch": 0.3,
"grad_norm": 0.1080741360783577,
"learning_rate": 9.753439410106537e-05,
"loss": 0.3627,
"step": 200
},
{
"epoch": 0.32,
"grad_norm": 0.11919151246547699,
"learning_rate": 9.728391748956637e-05,
"loss": 0.358,
"step": 210
},
{
"epoch": 0.33,
"grad_norm": 0.09667570888996124,
"learning_rate": 9.702167979470994e-05,
"loss": 0.3587,
"step": 220
},
{
"epoch": 0.35,
"grad_norm": 0.09914813190698624,
"learning_rate": 9.67477462437351e-05,
"loss": 0.3593,
"step": 230
},
{
"epoch": 0.36,
"grad_norm": 0.09963402152061462,
"learning_rate": 9.646218497302945e-05,
"loss": 0.3619,
"step": 240
},
{
"epoch": 0.38,
"grad_norm": 0.0956585481762886,
"learning_rate": 9.616506701118124e-05,
"loss": 0.3592,
"step": 250
},
{
"epoch": 0.39,
"grad_norm": 0.08918160200119019,
"learning_rate": 9.585646626131237e-05,
"loss": 0.3572,
"step": 260
},
{
"epoch": 0.41,
"grad_norm": 0.10680545121431351,
"learning_rate": 9.553645948269607e-05,
"loss": 0.3584,
"step": 270
},
{
"epoch": 0.42,
"grad_norm": 0.09338073432445526,
"learning_rate": 9.520512627166445e-05,
"loss": 0.3569,
"step": 280
},
{
"epoch": 0.44,
"grad_norm": 0.09849222749471664,
"learning_rate": 9.48625490418101e-05,
"loss": 0.353,
"step": 290
},
{
"epoch": 0.45,
"grad_norm": 0.09319322556257248,
"learning_rate": 9.450881300348724e-05,
"loss": 0.3492,
"step": 300
},
{
"epoch": 0.47,
"grad_norm": 0.09313003718852997,
"learning_rate": 9.414400614261693e-05,
"loss": 0.3618,
"step": 310
},
{
"epoch": 0.48,
"grad_norm": 0.09641731530427933,
"learning_rate": 9.376821919880219e-05,
"loss": 0.3668,
"step": 320
},
{
"epoch": 0.5,
"grad_norm": 0.10052476078271866,
"learning_rate": 9.338154564275788e-05,
"loss": 0.3445,
"step": 330
},
{
"epoch": 0.51,
"grad_norm": 0.09882521629333496,
"learning_rate": 9.298408165306157e-05,
"loss": 0.3459,
"step": 340
},
{
"epoch": 0.53,
"grad_norm": 0.0998958945274353,
"learning_rate": 9.257592609223059e-05,
"loss": 0.3532,
"step": 350
},
{
"epoch": 0.54,
"grad_norm": 0.0996759906411171,
"learning_rate": 9.21571804821318e-05,
"loss": 0.3542,
"step": 360
},
{
"epoch": 0.56,
"grad_norm": 0.09207039326429367,
"learning_rate": 9.172794897872957e-05,
"loss": 0.3424,
"step": 370
},
{
"epoch": 0.57,
"grad_norm": 0.09934650361537933,
"learning_rate": 9.128833834617876e-05,
"loss": 0.35,
"step": 380
},
{
"epoch": 0.59,
"grad_norm": 0.09678385406732559,
"learning_rate": 9.083845793026905e-05,
"loss": 0.3461,
"step": 390
},
{
"epoch": 0.6,
"grad_norm": 0.09645042568445206,
"learning_rate": 9.037841963122682e-05,
"loss": 0.3367,
"step": 400
},
{
"epoch": 0.62,
"grad_norm": 0.09238140285015106,
"learning_rate": 8.990833787588194e-05,
"loss": 0.3504,
"step": 410
},
{
"epoch": 0.63,
"grad_norm": 0.09661918878555298,
"learning_rate": 8.942832958920602e-05,
"loss": 0.3496,
"step": 420
},
{
"epoch": 0.65,
"grad_norm": 0.09576547890901566,
"learning_rate": 8.893851416522925e-05,
"loss": 0.3513,
"step": 430
},
{
"epoch": 0.66,
"grad_norm": 0.091176837682724,
"learning_rate": 8.843901343734309e-05,
"loss": 0.3409,
"step": 440
},
{
"epoch": 0.68,
"grad_norm": 0.09654372185468674,
"learning_rate": 8.792995164799637e-05,
"loss": 0.3446,
"step": 450
},
{
"epoch": 0.69,
"grad_norm": 0.08725057542324066,
"learning_rate": 8.741145541779199e-05,
"loss": 0.3442,
"step": 460
},
{
"epoch": 0.71,
"grad_norm": 0.09062401205301285,
"learning_rate": 8.688365371399208e-05,
"loss": 0.3444,
"step": 470
},
{
"epoch": 0.72,
"grad_norm": 0.09887181222438812,
"learning_rate": 8.63466778184397e-05,
"loss": 0.3402,
"step": 480
},
{
"epoch": 0.74,
"grad_norm": 0.09595289826393127,
"learning_rate": 8.580066129490462e-05,
"loss": 0.3424,
"step": 490
},
{
"epoch": 0.75,
"grad_norm": 0.0977388396859169,
"learning_rate": 8.524573995586153e-05,
"loss": 0.358,
"step": 500
},
{
"epoch": 0.75,
"eval_loss": 0.26840201020240784,
"eval_runtime": 20.7882,
"eval_samples_per_second": 0.529,
"eval_steps_per_second": 0.529,
"step": 500
},
{
"epoch": 0.77,
"grad_norm": 0.08854895085096359,
"learning_rate": 8.468205182870901e-05,
"loss": 0.3328,
"step": 510
},
{
"epoch": 0.78,
"grad_norm": 0.09385243058204651,
"learning_rate": 8.410973712143747e-05,
"loss": 0.3441,
"step": 520
},
{
"epoch": 0.8,
"grad_norm": 0.09778619557619095,
"learning_rate": 8.352893818775484e-05,
"loss": 0.3451,
"step": 530
},
{
"epoch": 0.81,
"grad_norm": 0.09615397453308105,
"learning_rate": 8.293979949167839e-05,
"loss": 0.3441,
"step": 540
},
{
"epoch": 0.83,
"grad_norm": 0.10019069164991379,
"learning_rate": 8.234246757160174e-05,
"loss": 0.3309,
"step": 550
},
{
"epoch": 0.84,
"grad_norm": 0.09099920839071274,
"learning_rate": 8.17370910038459e-05,
"loss": 0.3319,
"step": 560
},
{
"epoch": 0.86,
"grad_norm": 0.10040932148694992,
"learning_rate": 8.112382036570344e-05,
"loss": 0.342,
"step": 570
},
{
"epoch": 0.87,
"grad_norm": 0.09268151968717575,
"learning_rate": 8.050280819798481e-05,
"loss": 0.334,
"step": 580
},
{
"epoch": 0.89,
"grad_norm": 0.09127725660800934,
"learning_rate": 7.987420896707645e-05,
"loss": 0.3476,
"step": 590
},
{
"epoch": 0.9,
"grad_norm": 0.09117837995290756,
"learning_rate": 7.923817902651978e-05,
"loss": 0.3351,
"step": 600
},
{
"epoch": 0.92,
"grad_norm": 0.09493087977170944,
"learning_rate": 7.859487657812095e-05,
"loss": 0.3408,
"step": 610
},
{
"epoch": 0.93,
"grad_norm": 0.09857796877622604,
"learning_rate": 7.794446163260077e-05,
"loss": 0.3416,
"step": 620
},
{
"epoch": 0.95,
"grad_norm": 0.09530830383300781,
"learning_rate": 7.728709596979471e-05,
"loss": 0.3403,
"step": 630
},
{
"epoch": 0.96,
"grad_norm": 0.10255115479230881,
"learning_rate": 7.662294309841283e-05,
"loss": 0.3349,
"step": 640
},
{
"epoch": 0.98,
"grad_norm": 0.09442761540412903,
"learning_rate": 7.595216821536981e-05,
"loss": 0.3469,
"step": 650
},
{
"epoch": 0.99,
"grad_norm": 0.09131593257188797,
"learning_rate": 7.527493816469492e-05,
"loss": 0.3232,
"step": 660
},
{
"epoch": 1.01,
"grad_norm": 0.10782450437545776,
"learning_rate": 7.459142139603236e-05,
"loss": 0.3275,
"step": 670
},
{
"epoch": 1.02,
"grad_norm": 0.10051246732473373,
"learning_rate": 7.390178792274227e-05,
"loss": 0.3168,
"step": 680
},
{
"epoch": 1.04,
"grad_norm": 0.09489897638559341,
"learning_rate": 7.32062092796127e-05,
"loss": 0.3205,
"step": 690
},
{
"epoch": 1.05,
"grad_norm": 0.10021229833364487,
"learning_rate": 7.250485848019326e-05,
"loss": 0.314,
"step": 700
},
{
"epoch": 1.07,
"grad_norm": 0.09837724268436432,
"learning_rate": 7.179790997376083e-05,
"loss": 0.3131,
"step": 710
},
{
"epoch": 1.08,
"grad_norm": 0.11071084439754486,
"learning_rate": 7.108553960192827e-05,
"loss": 0.3141,
"step": 720
},
{
"epoch": 1.1,
"grad_norm": 0.09489303082227707,
"learning_rate": 7.036792455490675e-05,
"loss": 0.3124,
"step": 730
},
{
"epoch": 1.11,
"grad_norm": 0.10039713978767395,
"learning_rate": 6.964524332743263e-05,
"loss": 0.3258,
"step": 740
},
{
"epoch": 1.13,
"grad_norm": 0.09435191005468369,
"learning_rate": 6.891767567436988e-05,
"loss": 0.318,
"step": 750
},
{
"epoch": 1.14,
"grad_norm": 0.10240574926137924,
"learning_rate": 6.818540256599913e-05,
"loss": 0.3286,
"step": 760
},
{
"epoch": 1.16,
"grad_norm": 0.10050353407859802,
"learning_rate": 6.744860614300426e-05,
"loss": 0.3096,
"step": 770
},
{
"epoch": 1.17,
"grad_norm": 0.09765986353158951,
"learning_rate": 6.670746967116793e-05,
"loss": 0.318,
"step": 780
},
{
"epoch": 1.19,
"grad_norm": 0.10131178051233292,
"learning_rate": 6.596217749578743e-05,
"loss": 0.3199,
"step": 790
},
{
"epoch": 1.2,
"grad_norm": 0.0985412448644638,
"learning_rate": 6.521291499582172e-05,
"loss": 0.3173,
"step": 800
},
{
"epoch": 1.22,
"grad_norm": 0.10045495629310608,
"learning_rate": 6.445986853778156e-05,
"loss": 0.304,
"step": 810
},
{
"epoch": 1.23,
"grad_norm": 0.10255653411149979,
"learning_rate": 6.370322542937403e-05,
"loss": 0.3215,
"step": 820
},
{
"epoch": 1.25,
"grad_norm": 0.10014262050390244,
"learning_rate": 6.294317387291276e-05,
"loss": 0.3185,
"step": 830
},
{
"epoch": 1.26,
"grad_norm": 0.10973095148801804,
"learning_rate": 6.217990291850581e-05,
"loss": 0.3128,
"step": 840
},
{
"epoch": 1.28,
"grad_norm": 0.1075139194726944,
"learning_rate": 6.141360241703264e-05,
"loss": 0.3117,
"step": 850
},
{
"epoch": 1.29,
"grad_norm": 0.10679470747709274,
"learning_rate": 6.0644462972921845e-05,
"loss": 0.314,
"step": 860
},
{
"epoch": 1.31,
"grad_norm": 0.10646017640829086,
"learning_rate": 5.98726758967415e-05,
"loss": 0.3166,
"step": 870
},
{
"epoch": 1.32,
"grad_norm": 0.10854795575141907,
"learning_rate": 5.909843315761385e-05,
"loss": 0.3104,
"step": 880
},
{
"epoch": 1.34,
"grad_norm": 0.09923075139522552,
"learning_rate": 5.832192733546621e-05,
"loss": 0.3085,
"step": 890
},
{
"epoch": 1.35,
"grad_norm": 0.10540423542261124,
"learning_rate": 5.7543351573129964e-05,
"loss": 0.3035,
"step": 900
},
{
"epoch": 1.37,
"grad_norm": 0.10513672232627869,
"learning_rate": 5.676289952829945e-05,
"loss": 0.3069,
"step": 910
},
{
"epoch": 1.39,
"grad_norm": 0.10602447390556335,
"learning_rate": 5.598076532536291e-05,
"loss": 0.3126,
"step": 920
},
{
"epoch": 1.4,
"grad_norm": 0.10258585214614868,
"learning_rate": 5.5197143507117234e-05,
"loss": 0.3148,
"step": 930
},
{
"epoch": 1.42,
"grad_norm": 0.10304014384746552,
"learning_rate": 5.441222898637877e-05,
"loss": 0.3138,
"step": 940
},
{
"epoch": 1.43,
"grad_norm": 0.10201530903577805,
"learning_rate": 5.362621699750196e-05,
"loss": 0.3104,
"step": 950
},
{
"epoch": 1.45,
"grad_norm": 0.10214639455080032,
"learning_rate": 5.28393030478181e-05,
"loss": 0.3081,
"step": 960
},
{
"epoch": 1.46,
"grad_norm": 0.09911152720451355,
"learning_rate": 5.2051682869006126e-05,
"loss": 0.3081,
"step": 970
},
{
"epoch": 1.48,
"grad_norm": 0.10031607747077942,
"learning_rate": 5.126355236840764e-05,
"loss": 0.3134,
"step": 980
},
{
"epoch": 1.49,
"grad_norm": 0.1077575534582138,
"learning_rate": 5.047510758029832e-05,
"loss": 0.3272,
"step": 990
},
{
"epoch": 1.51,
"grad_norm": 0.11312615871429443,
"learning_rate": 4.968654461712753e-05,
"loss": 0.3167,
"step": 1000
},
{
"epoch": 1.51,
"eval_loss": 0.2502507269382477,
"eval_runtime": 14.5906,
"eval_samples_per_second": 0.754,
"eval_steps_per_second": 0.754,
"step": 1000
},
{
"epoch": 1.52,
"grad_norm": 0.10545619577169418,
"learning_rate": 4.889805962073874e-05,
"loss": 0.3142,
"step": 1010
},
{
"epoch": 1.54,
"grad_norm": 0.11502473056316376,
"learning_rate": 4.8109848713582475e-05,
"loss": 0.3164,
"step": 1020
},
{
"epoch": 1.55,
"grad_norm": 0.11116404086351395,
"learning_rate": 4.7322107949934146e-05,
"loss": 0.3191,
"step": 1030
},
{
"epoch": 1.57,
"grad_norm": 0.11101904511451721,
"learning_rate": 4.653503326712886e-05,
"loss": 0.3223,
"step": 1040
},
{
"epoch": 1.58,
"grad_norm": 0.10275658220052719,
"learning_rate": 4.5748820436825204e-05,
"loss": 0.3127,
"step": 1050
},
{
"epoch": 1.6,
"grad_norm": 0.10536840558052063,
"learning_rate": 4.496366501631043e-05,
"loss": 0.3104,
"step": 1060
},
{
"epoch": 1.61,
"grad_norm": 0.11228681355714798,
"learning_rate": 4.417976229985876e-05,
"loss": 0.3181,
"step": 1070
},
{
"epoch": 1.63,
"grad_norm": 0.10409337282180786,
"learning_rate": 4.339730727015527e-05,
"loss": 0.3085,
"step": 1080
},
{
"epoch": 1.64,
"grad_norm": 0.11756953597068787,
"learning_rate": 4.261649454979714e-05,
"loss": 0.3105,
"step": 1090
},
{
"epoch": 1.66,
"grad_norm": 0.10340123623609543,
"learning_rate": 4.183751835288463e-05,
"loss": 0.3168,
"step": 1100
},
{
"epoch": 1.67,
"grad_norm": 0.11595764011144638,
"learning_rate": 4.10605724367135e-05,
"loss": 0.3172,
"step": 1110
},
{
"epoch": 1.69,
"grad_norm": 0.1063317060470581,
"learning_rate": 4.0285850053581105e-05,
"loss": 0.319,
"step": 1120
},
{
"epoch": 1.7,
"grad_norm": 0.10760544240474701,
"learning_rate": 3.9513543902718206e-05,
"loss": 0.3096,
"step": 1130
},
{
"epoch": 1.72,
"grad_norm": 0.11430344730615616,
"learning_rate": 3.87438460823582e-05,
"loss": 0.3119,
"step": 1140
},
{
"epoch": 1.73,
"grad_norm": 0.10582321882247925,
"learning_rate": 3.7976948041955904e-05,
"loss": 0.3179,
"step": 1150
},
{
"epoch": 1.75,
"grad_norm": 0.11124568432569504,
"learning_rate": 3.7213040534567725e-05,
"loss": 0.3099,
"step": 1160
},
{
"epoch": 1.76,
"grad_norm": 0.10317942500114441,
"learning_rate": 3.645231356940501e-05,
"loss": 0.3081,
"step": 1170
},
{
"epoch": 1.78,
"grad_norm": 0.10943736135959625,
"learning_rate": 3.569495636457244e-05,
"loss": 0.3103,
"step": 1180
},
{
"epoch": 1.79,
"grad_norm": 0.10795300453901291,
"learning_rate": 3.494115730000321e-05,
"loss": 0.3123,
"step": 1190
},
{
"epoch": 1.81,
"grad_norm": 0.1120479553937912,
"learning_rate": 3.4191103870602656e-05,
"loss": 0.3072,
"step": 1200
},
{
"epoch": 1.82,
"grad_norm": 0.10475881397724152,
"learning_rate": 3.344498263961201e-05,
"loss": 0.3107,
"step": 1210
},
{
"epoch": 1.84,
"grad_norm": 0.10638121515512466,
"learning_rate": 3.270297919220395e-05,
"loss": 0.3101,
"step": 1220
},
{
"epoch": 1.85,
"grad_norm": 0.10646604746580124,
"learning_rate": 3.1965278089321396e-05,
"loss": 0.3201,
"step": 1230
},
{
"epoch": 1.87,
"grad_norm": 0.10810079425573349,
"learning_rate": 3.123206282177105e-05,
"loss": 0.3129,
"step": 1240
},
{
"epoch": 1.88,
"grad_norm": 0.10837584733963013,
"learning_rate": 3.05035157645831e-05,
"loss": 0.3138,
"step": 1250
},
{
"epoch": 1.9,
"grad_norm": 0.11282283812761307,
"learning_rate": 2.9779818131648563e-05,
"loss": 0.3048,
"step": 1260
},
{
"epoch": 1.91,
"grad_norm": 0.10869032144546509,
"learning_rate": 2.9061149930645243e-05,
"loss": 0.3163,
"step": 1270
},
{
"epoch": 1.93,
"grad_norm": 0.11098149418830872,
"learning_rate": 2.8347689918263976e-05,
"loss": 0.3083,
"step": 1280
},
{
"epoch": 1.94,
"grad_norm": 0.10902676731348038,
"learning_rate": 2.763961555574575e-05,
"loss": 0.3008,
"step": 1290
},
{
"epoch": 1.96,
"grad_norm": 0.10680384933948517,
"learning_rate": 2.69371029647413e-05,
"loss": 0.3022,
"step": 1300
},
{
"epoch": 1.97,
"grad_norm": 0.10289633274078369,
"learning_rate": 2.624032688350374e-05,
"loss": 0.3045,
"step": 1310
},
{
"epoch": 1.99,
"grad_norm": 0.11078934371471405,
"learning_rate": 2.5549460623425354e-05,
"loss": 0.3065,
"step": 1320
},
{
"epoch": 2.0,
"grad_norm": 0.10170484334230423,
"learning_rate": 2.486467602592929e-05,
"loss": 0.2956,
"step": 1330
},
{
"epoch": 2.02,
"grad_norm": 0.1094578206539154,
"learning_rate": 2.4186143419726885e-05,
"loss": 0.2938,
"step": 1340
},
{
"epoch": 2.03,
"grad_norm": 0.10822325944900513,
"learning_rate": 2.351403157845125e-05,
"loss": 0.2863,
"step": 1350
},
{
"epoch": 2.05,
"grad_norm": 0.11142423003911972,
"learning_rate": 2.2848507678677633e-05,
"loss": 0.2846,
"step": 1360
},
{
"epoch": 2.06,
"grad_norm": 0.11497773230075836,
"learning_rate": 2.218973725834109e-05,
"loss": 0.2936,
"step": 1370
},
{
"epoch": 2.08,
"grad_norm": 0.10550795495510101,
"learning_rate": 2.153788417556164e-05,
"loss": 0.2888,
"step": 1380
},
{
"epoch": 2.09,
"grad_norm": 0.10813359171152115,
"learning_rate": 2.089311056788731e-05,
"loss": 0.2889,
"step": 1390
},
{
"epoch": 2.11,
"grad_norm": 0.10650567710399628,
"learning_rate": 2.0255576811965154e-05,
"loss": 0.2925,
"step": 1400
},
{
"epoch": 2.12,
"grad_norm": 0.11273263394832611,
"learning_rate": 1.9625441483650235e-05,
"loss": 0.2856,
"step": 1410
},
{
"epoch": 2.14,
"grad_norm": 0.11437318474054337,
"learning_rate": 1.9002861318562536e-05,
"loss": 0.2845,
"step": 1420
},
{
"epoch": 2.15,
"grad_norm": 0.11178340762853622,
"learning_rate": 1.8387991173101587e-05,
"loss": 0.2904,
"step": 1430
},
{
"epoch": 2.17,
"grad_norm": 0.11630496382713318,
"learning_rate": 1.7780983985928534e-05,
"loss": 0.2851,
"step": 1440
},
{
"epoch": 2.18,
"grad_norm": 0.11532077938318253,
"learning_rate": 1.7181990739925213e-05,
"loss": 0.2797,
"step": 1450
},
{
"epoch": 2.2,
"grad_norm": 0.11311406642198563,
"learning_rate": 1.6591160424639675e-05,
"loss": 0.288,
"step": 1460
},
{
"epoch": 2.21,
"grad_norm": 0.11721379309892654,
"learning_rate": 1.6008639999227527e-05,
"loss": 0.2926,
"step": 1470
},
{
"epoch": 2.23,
"grad_norm": 0.11629457771778107,
"learning_rate": 1.5434574355898306e-05,
"loss": 0.2883,
"step": 1480
},
{
"epoch": 2.24,
"grad_norm": 0.11253120750188828,
"learning_rate": 1.4869106283875972e-05,
"loss": 0.2878,
"step": 1490
},
{
"epoch": 2.26,
"grad_norm": 0.1093481034040451,
"learning_rate": 1.4312376433882457e-05,
"loss": 0.2893,
"step": 1500
},
{
"epoch": 2.26,
"eval_loss": 0.24623431265354156,
"eval_runtime": 15.3708,
"eval_samples_per_second": 0.716,
"eval_steps_per_second": 0.716,
"step": 1500
},
{
"epoch": 2.27,
"grad_norm": 0.114622101187706,
"learning_rate": 1.376452328315318e-05,
"loss": 0.2905,
"step": 1510
},
{
"epoch": 2.29,
"grad_norm": 0.11465822905302048,
"learning_rate": 1.3225683100993113e-05,
"loss": 0.2886,
"step": 1520
},
{
"epoch": 2.3,
"grad_norm": 0.11255551874637604,
"learning_rate": 1.2695989914882128e-05,
"loss": 0.2873,
"step": 1530
},
{
"epoch": 2.32,
"grad_norm": 0.11598910391330719,
"learning_rate": 1.2175575477137824e-05,
"loss": 0.2853,
"step": 1540
},
{
"epoch": 2.33,
"grad_norm": 0.11426587402820587,
"learning_rate": 1.1664569232144445e-05,
"loss": 0.2934,
"step": 1550
},
{
"epoch": 2.35,
"grad_norm": 0.11229728907346725,
"learning_rate": 1.1163098284155665e-05,
"loss": 0.2878,
"step": 1560
},
{
"epoch": 2.36,
"grad_norm": 0.10811661183834076,
"learning_rate": 1.0671287365679567e-05,
"loss": 0.2818,
"step": 1570
},
{
"epoch": 2.38,
"grad_norm": 0.109583280980587,
"learning_rate": 1.018925880645351e-05,
"loss": 0.2915,
"step": 1580
},
{
"epoch": 2.39,
"grad_norm": 0.1194503903388977,
"learning_rate": 9.717132503016685e-06,
"loss": 0.2922,
"step": 1590
},
{
"epoch": 2.41,
"grad_norm": 0.11107660830020905,
"learning_rate": 9.255025888887814e-06,
"loss": 0.2843,
"step": 1600
},
{
"epoch": 2.42,
"grad_norm": 0.12213042378425598,
"learning_rate": 8.80305390535554e-06,
"loss": 0.2867,
"step": 1610
},
{
"epoch": 2.44,
"grad_norm": 0.11774149537086487,
"learning_rate": 8.361328972888732e-06,
"loss": 0.2838,
"step": 1620
},
{
"epoch": 2.45,
"grad_norm": 0.11726924777030945,
"learning_rate": 7.929960963173727e-06,
"loss": 0.288,
"step": 1630
},
{
"epoch": 2.47,
"grad_norm": 0.12220928817987442,
"learning_rate": 7.509057171785639e-06,
"loss": 0.2844,
"step": 1640
},
{
"epoch": 2.48,
"grad_norm": 0.11534737050533295,
"learning_rate": 7.098722291500331e-06,
"loss": 0.2842,
"step": 1650
},
{
"epoch": 2.5,
"grad_norm": 0.11947780847549438,
"learning_rate": 6.699058386253865e-06,
"loss": 0.2827,
"step": 1660
},
{
"epoch": 2.51,
"grad_norm": 0.1192295104265213,
"learning_rate": 6.310164865755808e-06,
"loss": 0.2907,
"step": 1670
},
{
"epoch": 2.53,
"grad_norm": 0.11454175412654877,
"learning_rate": 5.93213846076271e-06,
"loss": 0.2833,
"step": 1680
},
{
"epoch": 2.54,
"grad_norm": 0.11072956025600433,
"learning_rate": 5.5650731990179674e-06,
"loss": 0.2869,
"step": 1690
},
{
"epoch": 2.56,
"grad_norm": 0.10620978474617004,
"learning_rate": 5.20906038186399e-06,
"loss": 0.2775,
"step": 1700
},
{
"epoch": 2.57,
"grad_norm": 0.1189485564827919,
"learning_rate": 4.864188561532507e-06,
"loss": 0.2842,
"step": 1710
},
{
"epoch": 2.59,
"grad_norm": 0.1264795958995819,
"learning_rate": 4.530543519118702e-06,
"loss": 0.2944,
"step": 1720
},
{
"epoch": 2.6,
"grad_norm": 0.10882660746574402,
"learning_rate": 4.208208243244577e-06,
"loss": 0.2903,
"step": 1730
},
{
"epoch": 2.62,
"grad_norm": 0.1184026375412941,
"learning_rate": 3.8972629094169485e-06,
"loss": 0.2934,
"step": 1740
},
{
"epoch": 2.63,
"grad_norm": 0.115041583776474,
"learning_rate": 3.5977848600851016e-06,
"loss": 0.2831,
"step": 1750
},
{
"epoch": 2.65,
"grad_norm": 0.1151689738035202,
"learning_rate": 3.309848585403169e-06,
"loss": 0.2828,
"step": 1760
},
{
"epoch": 2.66,
"grad_norm": 0.11265784502029419,
"learning_rate": 3.033525704701956e-06,
"loss": 0.2868,
"step": 1770
},
{
"epoch": 2.68,
"grad_norm": 0.10989955067634583,
"learning_rate": 2.768884948674816e-06,
"loss": 0.2798,
"step": 1780
},
{
"epoch": 2.69,
"grad_norm": 0.12034480273723602,
"learning_rate": 2.515992142282042e-06,
"loss": 0.2932,
"step": 1790
},
{
"epoch": 2.71,
"grad_norm": 0.11389750242233276,
"learning_rate": 2.2749101883780157e-06,
"loss": 0.2877,
"step": 1800
},
{
"epoch": 2.72,
"grad_norm": 0.12224046885967255,
"learning_rate": 2.0456990520651696e-06,
"loss": 0.29,
"step": 1810
},
{
"epoch": 2.74,
"grad_norm": 0.11018862575292587,
"learning_rate": 1.8284157457786833e-06,
"loss": 0.2903,
"step": 1820
},
{
"epoch": 2.75,
"grad_norm": 0.11373750865459442,
"learning_rate": 1.6231143151055838e-06,
"loss": 0.2823,
"step": 1830
},
{
"epoch": 2.77,
"grad_norm": 0.11346758902072906,
"learning_rate": 1.4298458253417968e-06,
"loss": 0.2918,
"step": 1840
},
{
"epoch": 2.79,
"grad_norm": 0.12030266970396042,
"learning_rate": 1.2486583487905324e-06,
"loss": 0.2793,
"step": 1850
},
{
"epoch": 2.8,
"grad_norm": 0.11971867829561234,
"learning_rate": 1.079596952805101e-06,
"loss": 0.2835,
"step": 1860
},
{
"epoch": 2.82,
"grad_norm": 0.11345378309488297,
"learning_rate": 9.227036885791352e-07,
"loss": 0.287,
"step": 1870
},
{
"epoch": 2.83,
"grad_norm": 0.12087972462177277,
"learning_rate": 7.78017580687107e-07,
"loss": 0.2834,
"step": 1880
},
{
"epoch": 2.85,
"grad_norm": 0.11702313274145126,
"learning_rate": 6.455746173775701e-07,
"loss": 0.2828,
"step": 1890
},
{
"epoch": 2.86,
"grad_norm": 0.11680703610181808,
"learning_rate": 5.25407741621714e-07,
"loss": 0.2821,
"step": 1900
},
{
"epoch": 2.88,
"grad_norm": 0.11461573839187622,
"learning_rate": 4.1754684291934744e-07,
"loss": 0.2784,
"step": 1910
},
{
"epoch": 2.89,
"grad_norm": 0.12143554538488388,
"learning_rate": 3.2201874986437784e-07,
"loss": 0.278,
"step": 1920
},
{
"epoch": 2.91,
"grad_norm": 0.11503802239894867,
"learning_rate": 2.3884722347164434e-07,
"loss": 0.2881,
"step": 1930
},
{
"epoch": 2.92,
"grad_norm": 0.11443481594324112,
"learning_rate": 1.6805295126677833e-07,
"loss": 0.2952,
"step": 1940
},
{
"epoch": 2.94,
"grad_norm": 0.1119546890258789,
"learning_rate": 1.0965354214051982e-07,
"loss": 0.2833,
"step": 1950
},
{
"epoch": 2.95,
"grad_norm": 0.11658598482608795,
"learning_rate": 6.366352196878756e-08,
"loss": 0.2954,
"step": 1960
},
{
"epoch": 2.97,
"grad_norm": 0.11841295659542084,
"learning_rate": 3.0094329999635906e-08,
"loss": 0.2792,
"step": 1970
},
{
"epoch": 2.98,
"grad_norm": 0.11211774498224258,
"learning_rate": 8.954316007908636e-09,
"loss": 0.2914,
"step": 1980
},
{
"epoch": 3.0,
"grad_norm": 0.12004334479570389,
"learning_rate": 2.4873821838911073e-10,
"loss": 0.2895,
"step": 1990
},
{
"epoch": 3.0,
"step": 1992,
"total_flos": 7.377073297607885e+18,
"train_loss": 0.32009275511924523,
"train_runtime": 49821.5539,
"train_samples_per_second": 6.399,
"train_steps_per_second": 0.04
}
],
"logging_steps": 10,
"max_steps": 1992,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 250,
"total_flos": 7.377073297607885e+18,
"train_batch_size": 20,
"trial_name": null,
"trial_params": null
}