{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 9140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02735229759299781,
"grad_norm": 0.8252887725830078,
"learning_rate": 0.000245,
"loss": 4.4979,
"step": 50
},
{
"epoch": 0.05470459518599562,
"grad_norm": 0.14624212682247162,
"learning_rate": 0.000495,
"loss": 0.0801,
"step": 100
},
{
"epoch": 0.08205689277899343,
"grad_norm": 0.20989255607128143,
"learning_rate": 0.0004972898230088496,
"loss": 0.045,
"step": 150
},
{
"epoch": 0.10940919037199125,
"grad_norm": 0.43385007977485657,
"learning_rate": 0.0004945243362831859,
"loss": 0.038,
"step": 200
},
{
"epoch": 0.13676148796498905,
"grad_norm": 0.1885886788368225,
"learning_rate": 0.0004917588495575221,
"loss": 0.0351,
"step": 250
},
{
"epoch": 0.16411378555798686,
"grad_norm": 0.2390187680721283,
"learning_rate": 0.0004889933628318584,
"loss": 0.0379,
"step": 300
},
{
"epoch": 0.19146608315098468,
"grad_norm": 0.14649873971939087,
"learning_rate": 0.0004862278761061947,
"loss": 0.0345,
"step": 350
},
{
"epoch": 0.2188183807439825,
"grad_norm": 0.22285760939121246,
"learning_rate": 0.000483462389380531,
"loss": 0.0308,
"step": 400
},
{
"epoch": 0.2461706783369803,
"grad_norm": 0.08991721272468567,
"learning_rate": 0.00048069690265486727,
"loss": 0.0335,
"step": 450
},
{
"epoch": 0.2735229759299781,
"grad_norm": 0.09125715494155884,
"learning_rate": 0.00047793141592920353,
"loss": 0.03,
"step": 500
},
{
"epoch": 0.30087527352297594,
"grad_norm": 0.06857843697071075,
"learning_rate": 0.00047516592920353985,
"loss": 0.0304,
"step": 550
},
{
"epoch": 0.3282275711159737,
"grad_norm": 0.11107359081506729,
"learning_rate": 0.0004724004424778761,
"loss": 0.0318,
"step": 600
},
{
"epoch": 0.35557986870897157,
"grad_norm": 0.1745302528142929,
"learning_rate": 0.00046963495575221237,
"loss": 0.031,
"step": 650
},
{
"epoch": 0.38293216630196936,
"grad_norm": 0.06578712165355682,
"learning_rate": 0.0004668694690265487,
"loss": 0.0303,
"step": 700
},
{
"epoch": 0.4102844638949672,
"grad_norm": 0.08079079538583755,
"learning_rate": 0.00046410398230088495,
"loss": 0.028,
"step": 750
},
{
"epoch": 0.437636761487965,
"grad_norm": 0.09777480363845825,
"learning_rate": 0.0004613384955752212,
"loss": 0.0309,
"step": 800
},
{
"epoch": 0.4649890590809628,
"grad_norm": 0.055102407932281494,
"learning_rate": 0.0004585730088495575,
"loss": 0.0295,
"step": 850
},
{
"epoch": 0.4923413566739606,
"grad_norm": 0.08960958570241928,
"learning_rate": 0.00045580752212389384,
"loss": 0.0308,
"step": 900
},
{
"epoch": 0.5196936542669585,
"grad_norm": 0.09604144841432571,
"learning_rate": 0.0004530420353982301,
"loss": 0.0275,
"step": 950
},
{
"epoch": 0.5470459518599562,
"grad_norm": 0.15996237099170685,
"learning_rate": 0.0004502765486725664,
"loss": 0.028,
"step": 1000
},
{
"epoch": 0.574398249452954,
"grad_norm": 0.07793421298265457,
"learning_rate": 0.0004475110619469027,
"loss": 0.0291,
"step": 1050
},
{
"epoch": 0.6017505470459519,
"grad_norm": 0.21286524832248688,
"learning_rate": 0.00044474557522123894,
"loss": 0.0267,
"step": 1100
},
{
"epoch": 0.6291028446389497,
"grad_norm": 0.15267890691757202,
"learning_rate": 0.00044198008849557525,
"loss": 0.0302,
"step": 1150
},
{
"epoch": 0.6564551422319475,
"grad_norm": 0.10255500674247742,
"learning_rate": 0.0004392146017699115,
"loss": 0.0279,
"step": 1200
},
{
"epoch": 0.6838074398249453,
"grad_norm": 0.05677573382854462,
"learning_rate": 0.0004364491150442478,
"loss": 0.0241,
"step": 1250
},
{
"epoch": 0.7111597374179431,
"grad_norm": 0.17168180644512177,
"learning_rate": 0.0004336836283185841,
"loss": 0.0267,
"step": 1300
},
{
"epoch": 0.7385120350109409,
"grad_norm": 0.06085604801774025,
"learning_rate": 0.00043091814159292035,
"loss": 0.0274,
"step": 1350
},
{
"epoch": 0.7658643326039387,
"grad_norm": 0.11685376614332199,
"learning_rate": 0.0004281526548672566,
"loss": 0.0252,
"step": 1400
},
{
"epoch": 0.7932166301969366,
"grad_norm": 0.0925152450799942,
"learning_rate": 0.00042538716814159293,
"loss": 0.0264,
"step": 1450
},
{
"epoch": 0.8205689277899344,
"grad_norm": 0.409482479095459,
"learning_rate": 0.0004226216814159292,
"loss": 0.0253,
"step": 1500
},
{
"epoch": 0.8479212253829321,
"grad_norm": 0.07285141944885254,
"learning_rate": 0.00041985619469026546,
"loss": 0.0275,
"step": 1550
},
{
"epoch": 0.87527352297593,
"grad_norm": 0.33443543314933777,
"learning_rate": 0.0004170907079646018,
"loss": 0.024,
"step": 1600
},
{
"epoch": 0.9026258205689278,
"grad_norm": 0.09892763197422028,
"learning_rate": 0.0004143252212389381,
"loss": 0.0256,
"step": 1650
},
{
"epoch": 0.9299781181619255,
"grad_norm": 0.05468318983912468,
"learning_rate": 0.00041155973451327435,
"loss": 0.024,
"step": 1700
},
{
"epoch": 0.9573304157549234,
"grad_norm": 0.0622013621032238,
"learning_rate": 0.00040879424778761066,
"loss": 0.0228,
"step": 1750
},
{
"epoch": 0.9846827133479212,
"grad_norm": 0.10611853003501892,
"learning_rate": 0.0004060287610619469,
"loss": 0.0227,
"step": 1800
},
{
"epoch": 1.012035010940919,
"grad_norm": 0.09531939029693604,
"learning_rate": 0.0004032632743362832,
"loss": 0.0244,
"step": 1850
},
{
"epoch": 1.039387308533917,
"grad_norm": 0.04730033501982689,
"learning_rate": 0.0004004977876106195,
"loss": 0.0248,
"step": 1900
},
{
"epoch": 1.0667396061269148,
"grad_norm": 0.06383775174617767,
"learning_rate": 0.00039773230088495576,
"loss": 0.0224,
"step": 1950
},
{
"epoch": 1.0940919037199124,
"grad_norm": 0.08950739353895187,
"learning_rate": 0.000394966814159292,
"loss": 0.0241,
"step": 2000
},
{
"epoch": 1.1214442013129102,
"grad_norm": 0.06031138449907303,
"learning_rate": 0.00039220132743362834,
"loss": 0.0229,
"step": 2050
},
{
"epoch": 1.148796498905908,
"grad_norm": 0.09096775949001312,
"learning_rate": 0.0003894358407079646,
"loss": 0.0265,
"step": 2100
},
{
"epoch": 1.176148796498906,
"grad_norm": 0.042759090662002563,
"learning_rate": 0.00038667035398230086,
"loss": 0.0224,
"step": 2150
},
{
"epoch": 1.2035010940919038,
"grad_norm": 0.1569543182849884,
"learning_rate": 0.0003839048672566372,
"loss": 0.0234,
"step": 2200
},
{
"epoch": 1.2308533916849016,
"grad_norm": 0.10266666859388351,
"learning_rate": 0.00038113938053097344,
"loss": 0.0221,
"step": 2250
},
{
"epoch": 1.2582056892778994,
"grad_norm": 0.11569976806640625,
"learning_rate": 0.00037837389380530976,
"loss": 0.0234,
"step": 2300
},
{
"epoch": 1.2855579868708973,
"grad_norm": 0.1363728791475296,
"learning_rate": 0.0003756084070796461,
"loss": 0.0245,
"step": 2350
},
{
"epoch": 1.312910284463895,
"grad_norm": 0.20765434205532074,
"learning_rate": 0.00037284292035398234,
"loss": 0.0224,
"step": 2400
},
{
"epoch": 1.3402625820568927,
"grad_norm": 0.085990771651268,
"learning_rate": 0.0003700774336283186,
"loss": 0.0242,
"step": 2450
},
{
"epoch": 1.3676148796498906,
"grad_norm": 0.11980952322483063,
"learning_rate": 0.00036731194690265486,
"loss": 0.0236,
"step": 2500
},
{
"epoch": 1.3949671772428884,
"grad_norm": 0.047215647995471954,
"learning_rate": 0.0003645464601769912,
"loss": 0.024,
"step": 2550
},
{
"epoch": 1.4223194748358863,
"grad_norm": 0.13020800054073334,
"learning_rate": 0.00036178097345132744,
"loss": 0.0216,
"step": 2600
},
{
"epoch": 1.449671772428884,
"grad_norm": 0.14372025430202484,
"learning_rate": 0.0003590154867256637,
"loss": 0.0209,
"step": 2650
},
{
"epoch": 1.4770240700218817,
"grad_norm": 0.05821918696165085,
"learning_rate": 0.00035625,
"loss": 0.0232,
"step": 2700
},
{
"epoch": 1.5043763676148796,
"grad_norm": 0.0778370276093483,
"learning_rate": 0.0003534845132743363,
"loss": 0.0183,
"step": 2750
},
{
"epoch": 1.5317286652078774,
"grad_norm": 0.11819358170032501,
"learning_rate": 0.00035071902654867254,
"loss": 0.0221,
"step": 2800
},
{
"epoch": 1.5590809628008753,
"grad_norm": 0.0761452466249466,
"learning_rate": 0.00034795353982300885,
"loss": 0.0203,
"step": 2850
},
{
"epoch": 1.5864332603938731,
"grad_norm": 0.0948249101638794,
"learning_rate": 0.0003451880530973451,
"loss": 0.0227,
"step": 2900
},
{
"epoch": 1.613785557986871,
"grad_norm": 0.05540154129266739,
"learning_rate": 0.0003424225663716814,
"loss": 0.0224,
"step": 2950
},
{
"epoch": 1.6411378555798688,
"grad_norm": 0.11511294543743134,
"learning_rate": 0.00033965707964601774,
"loss": 0.0217,
"step": 3000
},
{
"epoch": 1.6684901531728666,
"grad_norm": 0.05139593780040741,
"learning_rate": 0.000336891592920354,
"loss": 0.0209,
"step": 3050
},
{
"epoch": 1.6958424507658645,
"grad_norm": 0.09306413680315018,
"learning_rate": 0.00033412610619469027,
"loss": 0.0226,
"step": 3100
},
{
"epoch": 1.723194748358862,
"grad_norm": 0.08103025704622269,
"learning_rate": 0.0003313606194690266,
"loss": 0.0209,
"step": 3150
},
{
"epoch": 1.75054704595186,
"grad_norm": 0.03991864249110222,
"learning_rate": 0.00032859513274336285,
"loss": 0.0192,
"step": 3200
},
{
"epoch": 1.7778993435448578,
"grad_norm": 0.029916733503341675,
"learning_rate": 0.0003258296460176991,
"loss": 0.0195,
"step": 3250
},
{
"epoch": 1.8052516411378556,
"grad_norm": 0.06091579794883728,
"learning_rate": 0.0003230641592920354,
"loss": 0.0198,
"step": 3300
},
{
"epoch": 1.8326039387308533,
"grad_norm": 0.06632323563098907,
"learning_rate": 0.0003202986725663717,
"loss": 0.022,
"step": 3350
},
{
"epoch": 1.859956236323851,
"grad_norm": 0.04406097158789635,
"learning_rate": 0.00031753318584070795,
"loss": 0.019,
"step": 3400
},
{
"epoch": 1.887308533916849,
"grad_norm": 0.030064748600125313,
"learning_rate": 0.00031476769911504426,
"loss": 0.0219,
"step": 3450
},
{
"epoch": 1.9146608315098468,
"grad_norm": 0.08452901244163513,
"learning_rate": 0.0003120022123893805,
"loss": 0.0203,
"step": 3500
},
{
"epoch": 1.9420131291028446,
"grad_norm": 0.046714432537555695,
"learning_rate": 0.0003092367256637168,
"loss": 0.0185,
"step": 3550
},
{
"epoch": 1.9693654266958425,
"grad_norm": 0.06433264911174774,
"learning_rate": 0.0003064712389380531,
"loss": 0.0196,
"step": 3600
},
{
"epoch": 1.9967177242888403,
"grad_norm": 0.03903990983963013,
"learning_rate": 0.00030370575221238936,
"loss": 0.0197,
"step": 3650
},
{
"epoch": 2.024070021881838,
"grad_norm": 0.07869091629981995,
"learning_rate": 0.0003009402654867257,
"loss": 0.0185,
"step": 3700
},
{
"epoch": 2.051422319474836,
"grad_norm": 0.08784345537424088,
"learning_rate": 0.000298174778761062,
"loss": 0.0173,
"step": 3750
},
{
"epoch": 2.078774617067834,
"grad_norm": 0.0678941160440445,
"learning_rate": 0.00029540929203539825,
"loss": 0.0172,
"step": 3800
},
{
"epoch": 2.1061269146608317,
"grad_norm": 0.04075619950890541,
"learning_rate": 0.0002926438053097345,
"loss": 0.0187,
"step": 3850
},
{
"epoch": 2.1334792122538295,
"grad_norm": 0.04440777003765106,
"learning_rate": 0.00028987831858407083,
"loss": 0.0178,
"step": 3900
},
{
"epoch": 2.160831509846827,
"grad_norm": 0.11714845895767212,
"learning_rate": 0.0002871128318584071,
"loss": 0.0173,
"step": 3950
},
{
"epoch": 2.1881838074398248,
"grad_norm": 0.05424795299768448,
"learning_rate": 0.00028434734513274335,
"loss": 0.0191,
"step": 4000
},
{
"epoch": 2.2155361050328226,
"grad_norm": 0.05837958678603172,
"learning_rate": 0.00028158185840707967,
"loss": 0.0195,
"step": 4050
},
{
"epoch": 2.2428884026258205,
"grad_norm": 0.06023433431982994,
"learning_rate": 0.00027881637168141593,
"loss": 0.0175,
"step": 4100
},
{
"epoch": 2.2702407002188183,
"grad_norm": 0.054130081087350845,
"learning_rate": 0.0002760508849557522,
"loss": 0.0193,
"step": 4150
},
{
"epoch": 2.297592997811816,
"grad_norm": 0.14617919921875,
"learning_rate": 0.0002732853982300885,
"loss": 0.019,
"step": 4200
},
{
"epoch": 2.324945295404814,
"grad_norm": 0.04461858794093132,
"learning_rate": 0.00027051991150442477,
"loss": 0.0172,
"step": 4250
},
{
"epoch": 2.352297592997812,
"grad_norm": 0.0726858526468277,
"learning_rate": 0.00026775442477876103,
"loss": 0.0165,
"step": 4300
},
{
"epoch": 2.3796498905908097,
"grad_norm": 0.08061967045068741,
"learning_rate": 0.00026498893805309735,
"loss": 0.0176,
"step": 4350
},
{
"epoch": 2.4070021881838075,
"grad_norm": 0.10574040561914444,
"learning_rate": 0.00026222345132743366,
"loss": 0.0173,
"step": 4400
},
{
"epoch": 2.4343544857768054,
"grad_norm": 0.05545186251401901,
"learning_rate": 0.0002594579646017699,
"loss": 0.0173,
"step": 4450
},
{
"epoch": 2.461706783369803,
"grad_norm": 0.24571385979652405,
"learning_rate": 0.00025669247787610624,
"loss": 0.0176,
"step": 4500
},
{
"epoch": 2.489059080962801,
"grad_norm": 0.09013593196868896,
"learning_rate": 0.0002539269911504425,
"loss": 0.0178,
"step": 4550
},
{
"epoch": 2.516411378555799,
"grad_norm": 0.03443370759487152,
"learning_rate": 0.00025116150442477876,
"loss": 0.0161,
"step": 4600
},
{
"epoch": 2.5437636761487967,
"grad_norm": 0.0702081173658371,
"learning_rate": 0.000248396017699115,
"loss": 0.0184,
"step": 4650
},
{
"epoch": 2.5711159737417946,
"grad_norm": 0.2026512324810028,
"learning_rate": 0.00024563053097345134,
"loss": 0.0168,
"step": 4700
},
{
"epoch": 2.598468271334792,
"grad_norm": 0.09723120927810669,
"learning_rate": 0.0002428650442477876,
"loss": 0.0173,
"step": 4750
},
{
"epoch": 2.62582056892779,
"grad_norm": 0.05518170818686485,
"learning_rate": 0.0002400995575221239,
"loss": 0.0174,
"step": 4800
},
{
"epoch": 2.6531728665207877,
"grad_norm": 0.09659027308225632,
"learning_rate": 0.00023733407079646018,
"loss": 0.0163,
"step": 4850
},
{
"epoch": 2.6805251641137855,
"grad_norm": 0.05287766829133034,
"learning_rate": 0.00023456858407079644,
"loss": 0.017,
"step": 4900
},
{
"epoch": 2.7078774617067833,
"grad_norm": 0.06743517518043518,
"learning_rate": 0.00023180309734513276,
"loss": 0.0163,
"step": 4950
},
{
"epoch": 2.735229759299781,
"grad_norm": 0.07032987475395203,
"learning_rate": 0.00022903761061946905,
"loss": 0.0176,
"step": 5000
},
{
"epoch": 2.762582056892779,
"grad_norm": 0.08675131946802139,
"learning_rate": 0.0002262721238938053,
"loss": 0.0153,
"step": 5050
},
{
"epoch": 2.789934354485777,
"grad_norm": 0.0664379745721817,
"learning_rate": 0.0002235066371681416,
"loss": 0.0173,
"step": 5100
},
{
"epoch": 2.8172866520787747,
"grad_norm": 0.08365903049707413,
"learning_rate": 0.00022074115044247789,
"loss": 0.0159,
"step": 5150
},
{
"epoch": 2.8446389496717726,
"grad_norm": 0.050819192081689835,
"learning_rate": 0.00021797566371681415,
"loss": 0.0148,
"step": 5200
},
{
"epoch": 2.8719912472647704,
"grad_norm": 0.10989898443222046,
"learning_rate": 0.00021521017699115044,
"loss": 0.0163,
"step": 5250
},
{
"epoch": 2.899343544857768,
"grad_norm": 0.1679241806268692,
"learning_rate": 0.00021244469026548675,
"loss": 0.017,
"step": 5300
},
{
"epoch": 2.9266958424507656,
"grad_norm": 0.077048659324646,
"learning_rate": 0.000209679203539823,
"loss": 0.0162,
"step": 5350
},
{
"epoch": 2.9540481400437635,
"grad_norm": 0.0816880315542221,
"learning_rate": 0.0002069137168141593,
"loss": 0.0159,
"step": 5400
},
{
"epoch": 2.9814004376367613,
"grad_norm": 0.07774699479341507,
"learning_rate": 0.0002041482300884956,
"loss": 0.0154,
"step": 5450
},
{
"epoch": 3.008752735229759,
"grad_norm": 0.07249217480421066,
"learning_rate": 0.00020138274336283185,
"loss": 0.0169,
"step": 5500
},
{
"epoch": 3.036105032822757,
"grad_norm": 0.054936815053224564,
"learning_rate": 0.00019861725663716814,
"loss": 0.0145,
"step": 5550
},
{
"epoch": 3.063457330415755,
"grad_norm": 0.08779731392860413,
"learning_rate": 0.00019585176991150443,
"loss": 0.0152,
"step": 5600
},
{
"epoch": 3.0908096280087527,
"grad_norm": 0.041388992220163345,
"learning_rate": 0.00019308628318584072,
"loss": 0.0143,
"step": 5650
},
{
"epoch": 3.1181619256017505,
"grad_norm": 0.07258164137601852,
"learning_rate": 0.000190320796460177,
"loss": 0.0146,
"step": 5700
},
{
"epoch": 3.1455142231947484,
"grad_norm": 0.08956324309110641,
"learning_rate": 0.0001875553097345133,
"loss": 0.0135,
"step": 5750
},
{
"epoch": 3.1728665207877462,
"grad_norm": 0.0332624725997448,
"learning_rate": 0.00018478982300884956,
"loss": 0.0143,
"step": 5800
},
{
"epoch": 3.200218818380744,
"grad_norm": 0.10055282711982727,
"learning_rate": 0.00018202433628318585,
"loss": 0.0143,
"step": 5850
},
{
"epoch": 3.227571115973742,
"grad_norm": 0.11651066690683365,
"learning_rate": 0.00017925884955752213,
"loss": 0.0155,
"step": 5900
},
{
"epoch": 3.2549234135667398,
"grad_norm": 0.08547132462263107,
"learning_rate": 0.0001764933628318584,
"loss": 0.0141,
"step": 5950
},
{
"epoch": 3.2822757111597376,
"grad_norm": 0.07074438035488129,
"learning_rate": 0.0001737278761061947,
"loss": 0.0142,
"step": 6000
},
{
"epoch": 3.3096280087527354,
"grad_norm": 0.07982511818408966,
"learning_rate": 0.00017096238938053097,
"loss": 0.0168,
"step": 6050
},
{
"epoch": 3.3369803063457333,
"grad_norm": 0.04909352585673332,
"learning_rate": 0.00016819690265486726,
"loss": 0.014,
"step": 6100
},
{
"epoch": 3.3643326039387307,
"grad_norm": 0.07797563821077347,
"learning_rate": 0.00016543141592920355,
"loss": 0.0157,
"step": 6150
},
{
"epoch": 3.3916849015317285,
"grad_norm": 0.08203662931919098,
"learning_rate": 0.0001626659292035398,
"loss": 0.0142,
"step": 6200
},
{
"epoch": 3.4190371991247264,
"grad_norm": 0.08141785860061646,
"learning_rate": 0.0001599004424778761,
"loss": 0.0144,
"step": 6250
},
{
"epoch": 3.446389496717724,
"grad_norm": 0.05146276205778122,
"learning_rate": 0.0001571349557522124,
"loss": 0.0142,
"step": 6300
},
{
"epoch": 3.473741794310722,
"grad_norm": 0.04246249422430992,
"learning_rate": 0.00015436946902654868,
"loss": 0.0144,
"step": 6350
},
{
"epoch": 3.50109409190372,
"grad_norm": 0.07449432462453842,
"learning_rate": 0.00015160398230088497,
"loss": 0.0166,
"step": 6400
},
{
"epoch": 3.5284463894967177,
"grad_norm": 0.04963746666908264,
"learning_rate": 0.00014883849557522125,
"loss": 0.014,
"step": 6450
},
{
"epoch": 3.5557986870897156,
"grad_norm": 0.10930000245571136,
"learning_rate": 0.00014607300884955752,
"loss": 0.0142,
"step": 6500
},
{
"epoch": 3.5831509846827134,
"grad_norm": 0.09715767204761505,
"learning_rate": 0.0001433075221238938,
"loss": 0.0147,
"step": 6550
},
{
"epoch": 3.6105032822757113,
"grad_norm": 0.1151093989610672,
"learning_rate": 0.0001405420353982301,
"loss": 0.0126,
"step": 6600
},
{
"epoch": 3.637855579868709,
"grad_norm": 0.06474081426858902,
"learning_rate": 0.00013777654867256636,
"loss": 0.0132,
"step": 6650
},
{
"epoch": 3.6652078774617065,
"grad_norm": 0.10157348960638046,
"learning_rate": 0.00013501106194690267,
"loss": 0.0129,
"step": 6700
},
{
"epoch": 3.6925601750547044,
"grad_norm": 0.08445007354021072,
"learning_rate": 0.00013224557522123896,
"loss": 0.0136,
"step": 6750
},
{
"epoch": 3.719912472647702,
"grad_norm": 0.1165938451886177,
"learning_rate": 0.00012948008849557522,
"loss": 0.013,
"step": 6800
},
{
"epoch": 3.7472647702407,
"grad_norm": 0.0693984255194664,
"learning_rate": 0.0001267146017699115,
"loss": 0.0131,
"step": 6850
},
{
"epoch": 3.774617067833698,
"grad_norm": 0.07007980346679688,
"learning_rate": 0.0001239491150442478,
"loss": 0.0136,
"step": 6900
},
{
"epoch": 3.8019693654266957,
"grad_norm": 0.05304344743490219,
"learning_rate": 0.00012118362831858407,
"loss": 0.0123,
"step": 6950
},
{
"epoch": 3.8293216630196936,
"grad_norm": 0.07882869988679886,
"learning_rate": 0.00011841814159292036,
"loss": 0.0138,
"step": 7000
},
{
"epoch": 3.8566739606126914,
"grad_norm": 0.07988675683736801,
"learning_rate": 0.00011565265486725664,
"loss": 0.0142,
"step": 7050
},
{
"epoch": 3.8840262582056893,
"grad_norm": 0.06684820353984833,
"learning_rate": 0.00011288716814159291,
"loss": 0.013,
"step": 7100
},
{
"epoch": 3.911378555798687,
"grad_norm": 0.04811659827828407,
"learning_rate": 0.00011012168141592921,
"loss": 0.0145,
"step": 7150
},
{
"epoch": 3.938730853391685,
"grad_norm": 0.09577899426221848,
"learning_rate": 0.00010735619469026549,
"loss": 0.0149,
"step": 7200
},
{
"epoch": 3.966083150984683,
"grad_norm": 0.08057638257741928,
"learning_rate": 0.00010459070796460176,
"loss": 0.0121,
"step": 7250
},
{
"epoch": 3.9934354485776806,
"grad_norm": 0.07438407838344574,
"learning_rate": 0.00010182522123893805,
"loss": 0.0121,
"step": 7300
},
{
"epoch": 4.0207877461706785,
"grad_norm": 0.039558082818984985,
"learning_rate": 9.905973451327434e-05,
"loss": 0.0127,
"step": 7350
},
{
"epoch": 4.048140043763676,
"grad_norm": 0.09110087901353836,
"learning_rate": 9.629424778761062e-05,
"loss": 0.0112,
"step": 7400
},
{
"epoch": 4.075492341356674,
"grad_norm": 0.11649812757968903,
"learning_rate": 9.35287610619469e-05,
"loss": 0.012,
"step": 7450
},
{
"epoch": 4.102844638949672,
"grad_norm": 0.06791022419929504,
"learning_rate": 9.07632743362832e-05,
"loss": 0.0123,
"step": 7500
},
{
"epoch": 4.13019693654267,
"grad_norm": 0.08256133645772934,
"learning_rate": 8.799778761061947e-05,
"loss": 0.0115,
"step": 7550
},
{
"epoch": 4.157549234135668,
"grad_norm": 0.0456516407430172,
"learning_rate": 8.523230088495576e-05,
"loss": 0.0106,
"step": 7600
},
{
"epoch": 4.1849015317286655,
"grad_norm": 0.0911739319562912,
"learning_rate": 8.246681415929203e-05,
"loss": 0.0132,
"step": 7650
},
{
"epoch": 4.212253829321663,
"grad_norm": 0.04813405126333237,
"learning_rate": 7.970132743362832e-05,
"loss": 0.0122,
"step": 7700
},
{
"epoch": 4.239606126914661,
"grad_norm": 0.08700749278068542,
"learning_rate": 7.69358407079646e-05,
"loss": 0.0115,
"step": 7750
},
{
"epoch": 4.266958424507659,
"grad_norm": 0.08354539424180984,
"learning_rate": 7.417035398230089e-05,
"loss": 0.0129,
"step": 7800
},
{
"epoch": 4.294310722100656,
"grad_norm": 0.07438132911920547,
"learning_rate": 7.140486725663717e-05,
"loss": 0.0125,
"step": 7850
},
{
"epoch": 4.321663019693654,
"grad_norm": 0.04467419162392616,
"learning_rate": 6.863938053097345e-05,
"loss": 0.011,
"step": 7900
},
{
"epoch": 4.349015317286652,
"grad_norm": 0.01871863380074501,
"learning_rate": 6.587389380530974e-05,
"loss": 0.0112,
"step": 7950
},
{
"epoch": 4.3763676148796495,
"grad_norm": 0.07461551576852798,
"learning_rate": 6.310840707964601e-05,
"loss": 0.013,
"step": 8000
},
{
"epoch": 4.403719912472647,
"grad_norm": 0.04248015210032463,
"learning_rate": 6.03429203539823e-05,
"loss": 0.013,
"step": 8050
},
{
"epoch": 4.431072210065645,
"grad_norm": 0.09184252470731735,
"learning_rate": 5.7577433628318583e-05,
"loss": 0.0114,
"step": 8100
},
{
"epoch": 4.458424507658643,
"grad_norm": 0.0578514039516449,
"learning_rate": 5.481194690265487e-05,
"loss": 0.0125,
"step": 8150
},
{
"epoch": 4.485776805251641,
"grad_norm": 0.10820753872394562,
"learning_rate": 5.2046460176991154e-05,
"loss": 0.0135,
"step": 8200
},
{
"epoch": 4.513129102844639,
"grad_norm": 0.10485094785690308,
"learning_rate": 4.9280973451327436e-05,
"loss": 0.0106,
"step": 8250
},
{
"epoch": 4.540481400437637,
"grad_norm": 0.08615507930517197,
"learning_rate": 4.651548672566372e-05,
"loss": 0.0124,
"step": 8300
},
{
"epoch": 4.567833698030634,
"grad_norm": 0.08930730074644089,
"learning_rate": 4.375e-05,
"loss": 0.0121,
"step": 8350
},
{
"epoch": 4.595185995623632,
"grad_norm": 0.05729290097951889,
"learning_rate": 4.098451327433628e-05,
"loss": 0.012,
"step": 8400
},
{
"epoch": 4.62253829321663,
"grad_norm": 0.06665951758623123,
"learning_rate": 3.821902654867257e-05,
"loss": 0.0121,
"step": 8450
},
{
"epoch": 4.649890590809628,
"grad_norm": 0.0894913300871849,
"learning_rate": 3.5453539823008845e-05,
"loss": 0.0132,
"step": 8500
},
{
"epoch": 4.677242888402626,
"grad_norm": 0.05070062354207039,
"learning_rate": 3.2688053097345134e-05,
"loss": 0.0127,
"step": 8550
},
{
"epoch": 4.704595185995624,
"grad_norm": 0.0852472186088562,
"learning_rate": 2.9922566371681416e-05,
"loss": 0.013,
"step": 8600
},
{
"epoch": 4.7319474835886215,
"grad_norm": 0.04788070544600487,
"learning_rate": 2.7157079646017698e-05,
"loss": 0.0116,
"step": 8650
},
{
"epoch": 4.759299781181619,
"grad_norm": 0.05870731547474861,
"learning_rate": 2.4391592920353983e-05,
"loss": 0.0119,
"step": 8700
},
{
"epoch": 4.786652078774617,
"grad_norm": 0.055160026997327805,
"learning_rate": 2.1626106194690268e-05,
"loss": 0.013,
"step": 8750
},
{
"epoch": 4.814004376367615,
"grad_norm": 0.07681864500045776,
"learning_rate": 1.886061946902655e-05,
"loss": 0.0115,
"step": 8800
},
{
"epoch": 4.841356673960613,
"grad_norm": 0.04158329963684082,
"learning_rate": 1.6095132743362832e-05,
"loss": 0.0109,
"step": 8850
},
{
"epoch": 4.868708971553611,
"grad_norm": 0.07591590285301208,
"learning_rate": 1.3329646017699115e-05,
"loss": 0.0115,
"step": 8900
},
{
"epoch": 4.8960612691466086,
"grad_norm": 0.04335255175828934,
"learning_rate": 1.0564159292035397e-05,
"loss": 0.0126,
"step": 8950
},
{
"epoch": 4.923413566739606,
"grad_norm": 0.02843708172440529,
"learning_rate": 7.798672566371682e-06,
"loss": 0.0109,
"step": 9000
},
{
"epoch": 4.950765864332604,
"grad_norm": 0.051726795732975006,
"learning_rate": 5.033185840707965e-06,
"loss": 0.0112,
"step": 9050
},
{
"epoch": 4.978118161925602,
"grad_norm": 0.03216787800192833,
"learning_rate": 2.267699115044248e-06,
"loss": 0.0122,
"step": 9100
}
],
"logging_steps": 50,
"max_steps": 9140,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4947927084564480.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}