Rakhman16's picture
Training in progress, step 5692, checkpoint
329a611 verified
{
"best_metric": 0.20129592716693878,
"best_model_checkpoint": "./fine-tuned/checkpoint-5500",
"epoch": 3.9985950122936424,
"eval_steps": 100,
"global_step": 5692,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.035124692658939236,
"grad_norm": 31298.91015625,
"learning_rate": 2.9736472241742796e-05,
"loss": 0.2772,
"step": 50
},
{
"epoch": 0.07024938531787847,
"grad_norm": 28423.171875,
"learning_rate": 2.9472944483485594e-05,
"loss": 0.2575,
"step": 100
},
{
"epoch": 0.07024938531787847,
"eval_loss": 0.22961987555027008,
"eval_runtime": 67.6563,
"eval_samples_per_second": 65.921,
"eval_steps_per_second": 2.069,
"step": 100
},
{
"epoch": 0.1053740779768177,
"grad_norm": 28882.9609375,
"learning_rate": 2.9209416725228392e-05,
"loss": 0.24,
"step": 150
},
{
"epoch": 0.14049877063575694,
"grad_norm": 44492.234375,
"learning_rate": 2.894588896697119e-05,
"loss": 0.2427,
"step": 200
},
{
"epoch": 0.14049877063575694,
"eval_loss": 0.22477279603481293,
"eval_runtime": 67.2438,
"eval_samples_per_second": 66.326,
"eval_steps_per_second": 2.082,
"step": 200
},
{
"epoch": 0.17562346329469616,
"grad_norm": 23385.271484375,
"learning_rate": 2.8682361208713985e-05,
"loss": 0.237,
"step": 250
},
{
"epoch": 0.2107481559536354,
"grad_norm": 65184.7578125,
"learning_rate": 2.841883345045678e-05,
"loss": 0.2351,
"step": 300
},
{
"epoch": 0.2107481559536354,
"eval_loss": 0.22264569997787476,
"eval_runtime": 67.1557,
"eval_samples_per_second": 66.413,
"eval_steps_per_second": 2.085,
"step": 300
},
{
"epoch": 0.24587284861257463,
"grad_norm": 26510.09375,
"learning_rate": 2.8155305692199578e-05,
"loss": 0.2387,
"step": 350
},
{
"epoch": 0.2809975412715139,
"grad_norm": 35873.625,
"learning_rate": 2.7891777933942376e-05,
"loss": 0.239,
"step": 400
},
{
"epoch": 0.2809975412715139,
"eval_loss": 0.22040367126464844,
"eval_runtime": 67.2556,
"eval_samples_per_second": 66.314,
"eval_steps_per_second": 2.082,
"step": 400
},
{
"epoch": 0.31612223393045313,
"grad_norm": 190454.703125,
"learning_rate": 2.7628250175685175e-05,
"loss": 0.2343,
"step": 450
},
{
"epoch": 0.3512469265893923,
"grad_norm": 27248.146484375,
"learning_rate": 2.736472241742797e-05,
"loss": 0.2349,
"step": 500
},
{
"epoch": 0.3512469265893923,
"eval_loss": 0.21807625889778137,
"eval_runtime": 67.3281,
"eval_samples_per_second": 66.243,
"eval_steps_per_second": 2.079,
"step": 500
},
{
"epoch": 0.3863716192483316,
"grad_norm": 21019.255859375,
"learning_rate": 2.7101194659170764e-05,
"loss": 0.2286,
"step": 550
},
{
"epoch": 0.4214963119072708,
"grad_norm": 23071.5703125,
"learning_rate": 2.6837666900913563e-05,
"loss": 0.2311,
"step": 600
},
{
"epoch": 0.4214963119072708,
"eval_loss": 0.21645724773406982,
"eval_runtime": 67.1857,
"eval_samples_per_second": 66.383,
"eval_steps_per_second": 2.084,
"step": 600
},
{
"epoch": 0.45662100456621,
"grad_norm": 21536.572265625,
"learning_rate": 2.657413914265636e-05,
"loss": 0.2249,
"step": 650
},
{
"epoch": 0.49174569722514927,
"grad_norm": 22037.119140625,
"learning_rate": 2.631061138439916e-05,
"loss": 0.2302,
"step": 700
},
{
"epoch": 0.49174569722514927,
"eval_loss": 0.21522314846515656,
"eval_runtime": 67.377,
"eval_samples_per_second": 66.195,
"eval_steps_per_second": 2.078,
"step": 700
},
{
"epoch": 0.5268703898840885,
"grad_norm": 24826.04296875,
"learning_rate": 2.6047083626141954e-05,
"loss": 0.2295,
"step": 750
},
{
"epoch": 0.5619950825430278,
"grad_norm": 21309.46875,
"learning_rate": 2.578355586788475e-05,
"loss": 0.2265,
"step": 800
},
{
"epoch": 0.5619950825430278,
"eval_loss": 0.21485908329486847,
"eval_runtime": 67.9456,
"eval_samples_per_second": 65.641,
"eval_steps_per_second": 2.06,
"step": 800
},
{
"epoch": 0.597119775201967,
"grad_norm": 21253.212890625,
"learning_rate": 2.5520028109627547e-05,
"loss": 0.2255,
"step": 850
},
{
"epoch": 0.6322444678609063,
"grad_norm": 25884.013671875,
"learning_rate": 2.5256500351370345e-05,
"loss": 0.2189,
"step": 900
},
{
"epoch": 0.6322444678609063,
"eval_loss": 0.21369116008281708,
"eval_runtime": 67.5126,
"eval_samples_per_second": 66.062,
"eval_steps_per_second": 2.074,
"step": 900
},
{
"epoch": 0.6673691605198454,
"grad_norm": 32345.33203125,
"learning_rate": 2.4992972593113144e-05,
"loss": 0.2177,
"step": 950
},
{
"epoch": 0.7024938531787847,
"grad_norm": 22764.255859375,
"learning_rate": 2.472944483485594e-05,
"loss": 0.2205,
"step": 1000
},
{
"epoch": 0.7024938531787847,
"eval_loss": 0.2125701606273651,
"eval_runtime": 67.5281,
"eval_samples_per_second": 66.047,
"eval_steps_per_second": 2.073,
"step": 1000
},
{
"epoch": 0.7376185458377239,
"grad_norm": 26256.35546875,
"learning_rate": 2.4465917076598737e-05,
"loss": 0.2224,
"step": 1050
},
{
"epoch": 0.7727432384966632,
"grad_norm": 29107.78515625,
"learning_rate": 2.420238931834153e-05,
"loss": 0.2211,
"step": 1100
},
{
"epoch": 0.7727432384966632,
"eval_loss": 0.2117428034543991,
"eval_runtime": 67.5369,
"eval_samples_per_second": 66.038,
"eval_steps_per_second": 2.073,
"step": 1100
},
{
"epoch": 0.8078679311556024,
"grad_norm": 98354.15625,
"learning_rate": 2.393886156008433e-05,
"loss": 0.215,
"step": 1150
},
{
"epoch": 0.8429926238145417,
"grad_norm": 22886.3984375,
"learning_rate": 2.3675333801827128e-05,
"loss": 0.2229,
"step": 1200
},
{
"epoch": 0.8429926238145417,
"eval_loss": 0.2107735425233841,
"eval_runtime": 67.6295,
"eval_samples_per_second": 65.948,
"eval_steps_per_second": 2.07,
"step": 1200
},
{
"epoch": 0.8781173164734809,
"grad_norm": 20510.26171875,
"learning_rate": 2.3411806043569923e-05,
"loss": 0.2105,
"step": 1250
},
{
"epoch": 0.91324200913242,
"grad_norm": 20053.85546875,
"learning_rate": 2.314827828531272e-05,
"loss": 0.2195,
"step": 1300
},
{
"epoch": 0.91324200913242,
"eval_loss": 0.20966531336307526,
"eval_runtime": 67.6112,
"eval_samples_per_second": 65.965,
"eval_steps_per_second": 2.071,
"step": 1300
},
{
"epoch": 0.9483667017913593,
"grad_norm": 28154.595703125,
"learning_rate": 2.2884750527055516e-05,
"loss": 0.2215,
"step": 1350
},
{
"epoch": 0.9834913944502985,
"grad_norm": 28011.71484375,
"learning_rate": 2.2621222768798314e-05,
"loss": 0.2172,
"step": 1400
},
{
"epoch": 0.9834913944502985,
"eval_loss": 0.20960816740989685,
"eval_runtime": 67.6089,
"eval_samples_per_second": 65.968,
"eval_steps_per_second": 2.071,
"step": 1400
},
{
"epoch": 1.0186160871092378,
"grad_norm": 26518.01171875,
"learning_rate": 2.2357695010541112e-05,
"loss": 0.21,
"step": 1450
},
{
"epoch": 1.053740779768177,
"grad_norm": 20411.26171875,
"learning_rate": 2.2094167252283907e-05,
"loss": 0.2139,
"step": 1500
},
{
"epoch": 1.053740779768177,
"eval_loss": 0.20940540730953217,
"eval_runtime": 67.4684,
"eval_samples_per_second": 66.105,
"eval_steps_per_second": 2.075,
"step": 1500
},
{
"epoch": 1.0888654724271163,
"grad_norm": 25448.7734375,
"learning_rate": 2.1830639494026705e-05,
"loss": 0.2119,
"step": 1550
},
{
"epoch": 1.1239901650860555,
"grad_norm": 20371.7109375,
"learning_rate": 2.15671117357695e-05,
"loss": 0.2074,
"step": 1600
},
{
"epoch": 1.1239901650860555,
"eval_loss": 0.2086929827928543,
"eval_runtime": 67.511,
"eval_samples_per_second": 66.063,
"eval_steps_per_second": 2.074,
"step": 1600
},
{
"epoch": 1.1591148577449948,
"grad_norm": 24624.9609375,
"learning_rate": 2.13035839775123e-05,
"loss": 0.2109,
"step": 1650
},
{
"epoch": 1.194239550403934,
"grad_norm": 28790.974609375,
"learning_rate": 2.1040056219255097e-05,
"loss": 0.2126,
"step": 1700
},
{
"epoch": 1.194239550403934,
"eval_loss": 0.20832768082618713,
"eval_runtime": 67.1973,
"eval_samples_per_second": 66.372,
"eval_steps_per_second": 2.083,
"step": 1700
},
{
"epoch": 1.2293642430628733,
"grad_norm": 22134.93359375,
"learning_rate": 2.077652846099789e-05,
"loss": 0.2118,
"step": 1750
},
{
"epoch": 1.2644889357218125,
"grad_norm": 22432.322265625,
"learning_rate": 2.051300070274069e-05,
"loss": 0.2128,
"step": 1800
},
{
"epoch": 1.2644889357218125,
"eval_loss": 0.20813611149787903,
"eval_runtime": 67.1539,
"eval_samples_per_second": 66.415,
"eval_steps_per_second": 2.085,
"step": 1800
},
{
"epoch": 1.2996136283807518,
"grad_norm": 21562.96484375,
"learning_rate": 2.0249472944483485e-05,
"loss": 0.2135,
"step": 1850
},
{
"epoch": 1.334738321039691,
"grad_norm": 22612.58203125,
"learning_rate": 1.9985945186226283e-05,
"loss": 0.2081,
"step": 1900
},
{
"epoch": 1.334738321039691,
"eval_loss": 0.2073371410369873,
"eval_runtime": 67.1629,
"eval_samples_per_second": 66.406,
"eval_steps_per_second": 2.084,
"step": 1900
},
{
"epoch": 1.36986301369863,
"grad_norm": 22550.556640625,
"learning_rate": 1.972241742796908e-05,
"loss": 0.2037,
"step": 1950
},
{
"epoch": 1.4049877063575693,
"grad_norm": 24281.9140625,
"learning_rate": 1.9458889669711876e-05,
"loss": 0.2111,
"step": 2000
},
{
"epoch": 1.4049877063575693,
"eval_loss": 0.20700447261333466,
"eval_runtime": 67.2893,
"eval_samples_per_second": 66.281,
"eval_steps_per_second": 2.081,
"step": 2000
},
{
"epoch": 1.4401123990165086,
"grad_norm": 25767.197265625,
"learning_rate": 1.9195361911454674e-05,
"loss": 0.2054,
"step": 2050
},
{
"epoch": 1.4752370916754478,
"grad_norm": 22215.111328125,
"learning_rate": 1.893183415319747e-05,
"loss": 0.2082,
"step": 2100
},
{
"epoch": 1.4752370916754478,
"eval_loss": 0.20631250739097595,
"eval_runtime": 67.1038,
"eval_samples_per_second": 66.464,
"eval_steps_per_second": 2.086,
"step": 2100
},
{
"epoch": 1.510361784334387,
"grad_norm": 27927.373046875,
"learning_rate": 1.8668306394940267e-05,
"loss": 0.2128,
"step": 2150
},
{
"epoch": 1.5454864769933263,
"grad_norm": 25635.267578125,
"learning_rate": 1.8404778636683066e-05,
"loss": 0.2078,
"step": 2200
},
{
"epoch": 1.5454864769933263,
"eval_loss": 0.20582793653011322,
"eval_runtime": 67.2723,
"eval_samples_per_second": 66.298,
"eval_steps_per_second": 2.081,
"step": 2200
},
{
"epoch": 1.5806111696522656,
"grad_norm": 25550.1171875,
"learning_rate": 1.814125087842586e-05,
"loss": 0.2058,
"step": 2250
},
{
"epoch": 1.6157358623112048,
"grad_norm": 21671.251953125,
"learning_rate": 1.787772312016866e-05,
"loss": 0.206,
"step": 2300
},
{
"epoch": 1.6157358623112048,
"eval_loss": 0.2059122622013092,
"eval_runtime": 67.4662,
"eval_samples_per_second": 66.107,
"eval_steps_per_second": 2.075,
"step": 2300
},
{
"epoch": 1.650860554970144,
"grad_norm": 21685.947265625,
"learning_rate": 1.7614195361911453e-05,
"loss": 0.2086,
"step": 2350
},
{
"epoch": 1.685985247629083,
"grad_norm": 24516.828125,
"learning_rate": 1.7350667603654252e-05,
"loss": 0.2069,
"step": 2400
},
{
"epoch": 1.685985247629083,
"eval_loss": 0.20495346188545227,
"eval_runtime": 67.1671,
"eval_samples_per_second": 66.402,
"eval_steps_per_second": 2.084,
"step": 2400
},
{
"epoch": 1.7211099402880223,
"grad_norm": 22610.7734375,
"learning_rate": 1.708713984539705e-05,
"loss": 0.2052,
"step": 2450
},
{
"epoch": 1.7562346329469616,
"grad_norm": 35525.84765625,
"learning_rate": 1.6823612087139845e-05,
"loss": 0.2051,
"step": 2500
},
{
"epoch": 1.7562346329469616,
"eval_loss": 0.20481644570827484,
"eval_runtime": 67.1059,
"eval_samples_per_second": 66.462,
"eval_steps_per_second": 2.086,
"step": 2500
},
{
"epoch": 1.7913593256059008,
"grad_norm": 20207.35546875,
"learning_rate": 1.6560084328882643e-05,
"loss": 0.2049,
"step": 2550
},
{
"epoch": 1.82648401826484,
"grad_norm": 17453.359375,
"learning_rate": 1.6296556570625438e-05,
"loss": 0.2101,
"step": 2600
},
{
"epoch": 1.82648401826484,
"eval_loss": 0.20485134422779083,
"eval_runtime": 67.202,
"eval_samples_per_second": 66.367,
"eval_steps_per_second": 2.083,
"step": 2600
},
{
"epoch": 1.8616087109237793,
"grad_norm": 24568.439453125,
"learning_rate": 1.603302881236824e-05,
"loss": 0.2081,
"step": 2650
},
{
"epoch": 1.8967334035827186,
"grad_norm": 22425.1875,
"learning_rate": 1.5769501054111034e-05,
"loss": 0.2032,
"step": 2700
},
{
"epoch": 1.8967334035827186,
"eval_loss": 0.2041337788105011,
"eval_runtime": 67.2372,
"eval_samples_per_second": 66.332,
"eval_steps_per_second": 2.082,
"step": 2700
},
{
"epoch": 1.9318580962416578,
"grad_norm": 21858.3828125,
"learning_rate": 1.550597329585383e-05,
"loss": 0.2074,
"step": 2750
},
{
"epoch": 1.966982788900597,
"grad_norm": 17712.39453125,
"learning_rate": 1.5242445537596626e-05,
"loss": 0.205,
"step": 2800
},
{
"epoch": 1.966982788900597,
"eval_loss": 0.20371712744235992,
"eval_runtime": 67.1299,
"eval_samples_per_second": 66.438,
"eval_steps_per_second": 2.086,
"step": 2800
},
{
"epoch": 2.0021074815595363,
"grad_norm": 20413.91796875,
"learning_rate": 1.4978917779339424e-05,
"loss": 0.203,
"step": 2850
},
{
"epoch": 2.0372321742184756,
"grad_norm": 21380.130859375,
"learning_rate": 1.471539002108222e-05,
"loss": 0.199,
"step": 2900
},
{
"epoch": 2.0372321742184756,
"eval_loss": 0.20416177809238434,
"eval_runtime": 67.1771,
"eval_samples_per_second": 66.392,
"eval_steps_per_second": 2.084,
"step": 2900
},
{
"epoch": 2.072356866877415,
"grad_norm": 28436.697265625,
"learning_rate": 1.4451862262825019e-05,
"loss": 0.1989,
"step": 2950
},
{
"epoch": 2.107481559536354,
"grad_norm": 18739.8359375,
"learning_rate": 1.4188334504567815e-05,
"loss": 0.1982,
"step": 3000
},
{
"epoch": 2.107481559536354,
"eval_loss": 0.2037852257490158,
"eval_runtime": 67.2417,
"eval_samples_per_second": 66.328,
"eval_steps_per_second": 2.082,
"step": 3000
},
{
"epoch": 2.1426062521952933,
"grad_norm": 26514.828125,
"learning_rate": 1.3924806746310612e-05,
"loss": 0.2032,
"step": 3050
},
{
"epoch": 2.1777309448542326,
"grad_norm": 22808.0234375,
"learning_rate": 1.3661278988053408e-05,
"loss": 0.1944,
"step": 3100
},
{
"epoch": 2.1777309448542326,
"eval_loss": 0.20371171832084656,
"eval_runtime": 67.0231,
"eval_samples_per_second": 66.544,
"eval_steps_per_second": 2.089,
"step": 3100
},
{
"epoch": 2.212855637513172,
"grad_norm": 24228.18359375,
"learning_rate": 1.3397751229796205e-05,
"loss": 0.2056,
"step": 3150
},
{
"epoch": 2.247980330172111,
"grad_norm": 20969.25390625,
"learning_rate": 1.3134223471539003e-05,
"loss": 0.1948,
"step": 3200
},
{
"epoch": 2.247980330172111,
"eval_loss": 0.20387396216392517,
"eval_runtime": 66.9567,
"eval_samples_per_second": 66.61,
"eval_steps_per_second": 2.091,
"step": 3200
},
{
"epoch": 2.2831050228310503,
"grad_norm": 42587.73046875,
"learning_rate": 1.28706957132818e-05,
"loss": 0.2072,
"step": 3250
},
{
"epoch": 2.3182297154899896,
"grad_norm": 22174.130859375,
"learning_rate": 1.2607167955024596e-05,
"loss": 0.2023,
"step": 3300
},
{
"epoch": 2.3182297154899896,
"eval_loss": 0.20358328521251678,
"eval_runtime": 67.1207,
"eval_samples_per_second": 66.447,
"eval_steps_per_second": 2.086,
"step": 3300
},
{
"epoch": 2.353354408148929,
"grad_norm": 28607.568359375,
"learning_rate": 1.2343640196767393e-05,
"loss": 0.1964,
"step": 3350
},
{
"epoch": 2.388479100807868,
"grad_norm": 27227.3203125,
"learning_rate": 1.208011243851019e-05,
"loss": 0.2075,
"step": 3400
},
{
"epoch": 2.388479100807868,
"eval_loss": 0.20336925983428955,
"eval_runtime": 67.2613,
"eval_samples_per_second": 66.309,
"eval_steps_per_second": 2.081,
"step": 3400
},
{
"epoch": 2.4236037934668073,
"grad_norm": 24440.291015625,
"learning_rate": 1.1816584680252988e-05,
"loss": 0.1999,
"step": 3450
},
{
"epoch": 2.4587284861257466,
"grad_norm": 23327.6328125,
"learning_rate": 1.1553056921995784e-05,
"loss": 0.2041,
"step": 3500
},
{
"epoch": 2.4587284861257466,
"eval_loss": 0.2032385915517807,
"eval_runtime": 67.0192,
"eval_samples_per_second": 66.548,
"eval_steps_per_second": 2.089,
"step": 3500
},
{
"epoch": 2.493853178784686,
"grad_norm": 23787.681640625,
"learning_rate": 1.128952916373858e-05,
"loss": 0.1984,
"step": 3550
},
{
"epoch": 2.528977871443625,
"grad_norm": 24526.529296875,
"learning_rate": 1.1026001405481377e-05,
"loss": 0.1971,
"step": 3600
},
{
"epoch": 2.528977871443625,
"eval_loss": 0.20272360742092133,
"eval_runtime": 66.8824,
"eval_samples_per_second": 66.684,
"eval_steps_per_second": 2.093,
"step": 3600
},
{
"epoch": 2.564102564102564,
"grad_norm": 23948.60546875,
"learning_rate": 1.0762473647224174e-05,
"loss": 0.1904,
"step": 3650
},
{
"epoch": 2.5992272567615036,
"grad_norm": 17924.513671875,
"learning_rate": 1.0498945888966972e-05,
"loss": 0.1968,
"step": 3700
},
{
"epoch": 2.5992272567615036,
"eval_loss": 0.20258785784244537,
"eval_runtime": 67.0213,
"eval_samples_per_second": 66.546,
"eval_steps_per_second": 2.089,
"step": 3700
},
{
"epoch": 2.6343519494204424,
"grad_norm": 18695.21875,
"learning_rate": 1.0235418130709768e-05,
"loss": 0.1961,
"step": 3750
},
{
"epoch": 2.669476642079382,
"grad_norm": 23424.083984375,
"learning_rate": 9.971890372452565e-06,
"loss": 0.1961,
"step": 3800
},
{
"epoch": 2.669476642079382,
"eval_loss": 0.2024257928133011,
"eval_runtime": 67.1877,
"eval_samples_per_second": 66.381,
"eval_steps_per_second": 2.084,
"step": 3800
},
{
"epoch": 2.704601334738321,
"grad_norm": 18417.158203125,
"learning_rate": 9.708362614195362e-06,
"loss": 0.2004,
"step": 3850
},
{
"epoch": 2.73972602739726,
"grad_norm": 29204.578125,
"learning_rate": 9.444834855938158e-06,
"loss": 0.2,
"step": 3900
},
{
"epoch": 2.73972602739726,
"eval_loss": 0.20261028409004211,
"eval_runtime": 67.145,
"eval_samples_per_second": 66.423,
"eval_steps_per_second": 2.085,
"step": 3900
},
{
"epoch": 2.7748507200561994,
"grad_norm": 22810.859375,
"learning_rate": 9.181307097680956e-06,
"loss": 0.1955,
"step": 3950
},
{
"epoch": 2.8099754127151386,
"grad_norm": 20385.189453125,
"learning_rate": 8.917779339423753e-06,
"loss": 0.1902,
"step": 4000
},
{
"epoch": 2.8099754127151386,
"eval_loss": 0.20224925875663757,
"eval_runtime": 66.8567,
"eval_samples_per_second": 66.71,
"eval_steps_per_second": 2.094,
"step": 4000
},
{
"epoch": 2.845100105374078,
"grad_norm": 60070.58984375,
"learning_rate": 8.65425158116655e-06,
"loss": 0.1969,
"step": 4050
},
{
"epoch": 2.880224798033017,
"grad_norm": 20594.654296875,
"learning_rate": 8.390723822909348e-06,
"loss": 0.2009,
"step": 4100
},
{
"epoch": 2.880224798033017,
"eval_loss": 0.20173698663711548,
"eval_runtime": 66.8679,
"eval_samples_per_second": 66.699,
"eval_steps_per_second": 2.094,
"step": 4100
},
{
"epoch": 2.9153494906919564,
"grad_norm": 22764.1640625,
"learning_rate": 8.127196064652143e-06,
"loss": 0.1939,
"step": 4150
},
{
"epoch": 2.9504741833508956,
"grad_norm": 22604.9375,
"learning_rate": 7.86366830639494e-06,
"loss": 0.1991,
"step": 4200
},
{
"epoch": 2.9504741833508956,
"eval_loss": 0.20178209245204926,
"eval_runtime": 67.157,
"eval_samples_per_second": 66.412,
"eval_steps_per_second": 2.085,
"step": 4200
},
{
"epoch": 2.985598876009835,
"grad_norm": 23427.0,
"learning_rate": 7.600140548137737e-06,
"loss": 0.1982,
"step": 4250
},
{
"epoch": 3.020723568668774,
"grad_norm": 22872.943359375,
"learning_rate": 7.336612789880535e-06,
"loss": 0.1905,
"step": 4300
},
{
"epoch": 3.020723568668774,
"eval_loss": 0.20212285220623016,
"eval_runtime": 66.9569,
"eval_samples_per_second": 66.61,
"eval_steps_per_second": 2.091,
"step": 4300
},
{
"epoch": 3.0558482613277134,
"grad_norm": 20360.029296875,
"learning_rate": 7.073085031623331e-06,
"loss": 0.2011,
"step": 4350
},
{
"epoch": 3.0909729539866526,
"grad_norm": 26769.02734375,
"learning_rate": 6.809557273366128e-06,
"loss": 0.1939,
"step": 4400
},
{
"epoch": 3.0909729539866526,
"eval_loss": 0.20202863216400146,
"eval_runtime": 66.9701,
"eval_samples_per_second": 66.597,
"eval_steps_per_second": 2.09,
"step": 4400
},
{
"epoch": 3.126097646645592,
"grad_norm": 34976.171875,
"learning_rate": 6.546029515108924e-06,
"loss": 0.1912,
"step": 4450
},
{
"epoch": 3.161222339304531,
"grad_norm": 50123.8671875,
"learning_rate": 6.282501756851722e-06,
"loss": 0.1934,
"step": 4500
},
{
"epoch": 3.161222339304531,
"eval_loss": 0.20200392603874207,
"eval_runtime": 66.9822,
"eval_samples_per_second": 66.585,
"eval_steps_per_second": 2.09,
"step": 4500
},
{
"epoch": 3.1963470319634704,
"grad_norm": 30103.6484375,
"learning_rate": 6.018973998594519e-06,
"loss": 0.1891,
"step": 4550
},
{
"epoch": 3.2314717246224096,
"grad_norm": 22014.908203125,
"learning_rate": 5.755446240337316e-06,
"loss": 0.1933,
"step": 4600
},
{
"epoch": 3.2314717246224096,
"eval_loss": 0.20177535712718964,
"eval_runtime": 66.8767,
"eval_samples_per_second": 66.69,
"eval_steps_per_second": 2.093,
"step": 4600
},
{
"epoch": 3.266596417281349,
"grad_norm": 24894.115234375,
"learning_rate": 5.491918482080113e-06,
"loss": 0.1921,
"step": 4650
},
{
"epoch": 3.301721109940288,
"grad_norm": 21648.677734375,
"learning_rate": 5.2283907238229096e-06,
"loss": 0.1914,
"step": 4700
},
{
"epoch": 3.301721109940288,
"eval_loss": 0.20187227427959442,
"eval_runtime": 66.9001,
"eval_samples_per_second": 66.667,
"eval_steps_per_second": 2.093,
"step": 4700
},
{
"epoch": 3.3368458025992274,
"grad_norm": 24555.294921875,
"learning_rate": 4.964862965565706e-06,
"loss": 0.1914,
"step": 4750
},
{
"epoch": 3.3719704952581666,
"grad_norm": 44338.69921875,
"learning_rate": 4.7013352073085035e-06,
"loss": 0.1936,
"step": 4800
},
{
"epoch": 3.3719704952581666,
"eval_loss": 0.20171089470386505,
"eval_runtime": 67.0479,
"eval_samples_per_second": 66.52,
"eval_steps_per_second": 2.088,
"step": 4800
},
{
"epoch": 3.407095187917106,
"grad_norm": 23296.537109375,
"learning_rate": 4.4378074490513e-06,
"loss": 0.1949,
"step": 4850
},
{
"epoch": 3.442219880576045,
"grad_norm": 21337.087890625,
"learning_rate": 4.1742796907940974e-06,
"loss": 0.1902,
"step": 4900
},
{
"epoch": 3.442219880576045,
"eval_loss": 0.20151035487651825,
"eval_runtime": 66.9445,
"eval_samples_per_second": 66.622,
"eval_steps_per_second": 2.091,
"step": 4900
},
{
"epoch": 3.4773445732349844,
"grad_norm": 20258.736328125,
"learning_rate": 3.910751932536894e-06,
"loss": 0.1966,
"step": 4950
},
{
"epoch": 3.512469265893923,
"grad_norm": 22937.763671875,
"learning_rate": 3.647224174279691e-06,
"loss": 0.1949,
"step": 5000
},
{
"epoch": 3.512469265893923,
"eval_loss": 0.2013118416070938,
"eval_runtime": 67.0166,
"eval_samples_per_second": 66.551,
"eval_steps_per_second": 2.089,
"step": 5000
},
{
"epoch": 3.547593958552863,
"grad_norm": 27274.357421875,
"learning_rate": 3.383696416022488e-06,
"loss": 0.1968,
"step": 5050
},
{
"epoch": 3.5827186512118017,
"grad_norm": 26782.548828125,
"learning_rate": 3.1201686577652844e-06,
"loss": 0.1878,
"step": 5100
},
{
"epoch": 3.5827186512118017,
"eval_loss": 0.20154449343681335,
"eval_runtime": 67.1325,
"eval_samples_per_second": 66.436,
"eval_steps_per_second": 2.085,
"step": 5100
},
{
"epoch": 3.6178433438707414,
"grad_norm": 18810.177734375,
"learning_rate": 2.8566408995080814e-06,
"loss": 0.1912,
"step": 5150
},
{
"epoch": 3.65296803652968,
"grad_norm": 26744.78515625,
"learning_rate": 2.593113141250879e-06,
"loss": 0.1975,
"step": 5200
},
{
"epoch": 3.65296803652968,
"eval_loss": 0.20147912204265594,
"eval_runtime": 67.0091,
"eval_samples_per_second": 66.558,
"eval_steps_per_second": 2.089,
"step": 5200
},
{
"epoch": 3.68809272918862,
"grad_norm": 23326.36328125,
"learning_rate": 2.3295853829936753e-06,
"loss": 0.1995,
"step": 5250
},
{
"epoch": 3.7232174218475587,
"grad_norm": 21197.091796875,
"learning_rate": 2.0660576247364723e-06,
"loss": 0.1894,
"step": 5300
},
{
"epoch": 3.7232174218475587,
"eval_loss": 0.20139345526695251,
"eval_runtime": 66.9887,
"eval_samples_per_second": 66.578,
"eval_steps_per_second": 2.09,
"step": 5300
},
{
"epoch": 3.758342114506498,
"grad_norm": 23258.3671875,
"learning_rate": 1.8025298664792693e-06,
"loss": 0.1941,
"step": 5350
},
{
"epoch": 3.793466807165437,
"grad_norm": 25702.90234375,
"learning_rate": 1.539002108222066e-06,
"loss": 0.1952,
"step": 5400
},
{
"epoch": 3.793466807165437,
"eval_loss": 0.20133435726165771,
"eval_runtime": 67.0042,
"eval_samples_per_second": 66.563,
"eval_steps_per_second": 2.089,
"step": 5400
},
{
"epoch": 3.8285914998243764,
"grad_norm": 22600.765625,
"learning_rate": 1.275474349964863e-06,
"loss": 0.1912,
"step": 5450
},
{
"epoch": 3.8637161924833157,
"grad_norm": 25134.44921875,
"learning_rate": 1.0119465917076597e-06,
"loss": 0.197,
"step": 5500
},
{
"epoch": 3.8637161924833157,
"eval_loss": 0.20129592716693878,
"eval_runtime": 67.1868,
"eval_samples_per_second": 66.382,
"eval_steps_per_second": 2.084,
"step": 5500
},
{
"epoch": 3.898840885142255,
"grad_norm": 22639.22265625,
"learning_rate": 7.484188334504568e-07,
"loss": 0.1898,
"step": 5550
},
{
"epoch": 3.933965577801194,
"grad_norm": 108627.9453125,
"learning_rate": 4.848910751932538e-07,
"loss": 0.1887,
"step": 5600
},
{
"epoch": 3.933965577801194,
"eval_loss": 0.2013484090566635,
"eval_runtime": 67.1981,
"eval_samples_per_second": 66.371,
"eval_steps_per_second": 2.083,
"step": 5600
},
{
"epoch": 3.9690902704601334,
"grad_norm": 28155.427734375,
"learning_rate": 2.213633169360506e-07,
"loss": 0.1955,
"step": 5650
}
],
"logging_steps": 50,
"max_steps": 5692,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.545216223281152e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}