{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 250,
"global_step": 22737,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006597176408497163,
"grad_norm": 25.778600692749023,
"learning_rate": 9.600000000000001e-06,
"loss": 3.9255,
"step": 50
},
{
"epoch": 0.013194352816994326,
"grad_norm": 15.797815322875977,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.7945,
"step": 100
},
{
"epoch": 0.01979152922549149,
"grad_norm": 16.71779441833496,
"learning_rate": 1.9957591553651104e-05,
"loss": 1.5522,
"step": 150
},
{
"epoch": 0.02638870563398865,
"grad_norm": 14.723907470703125,
"learning_rate": 1.9913416088704335e-05,
"loss": 1.6267,
"step": 200
},
{
"epoch": 0.032985882042485815,
"grad_norm": 14.432233810424805,
"learning_rate": 1.9869240623757565e-05,
"loss": 1.5434,
"step": 250
},
{
"epoch": 0.032985882042485815,
"eval_accuracy": 0.8714285492897034,
"eval_loss": 0.8546671867370605,
"eval_runtime": 14.7911,
"eval_samples_per_second": 165.64,
"eval_steps_per_second": 5.206,
"step": 250
},
{
"epoch": 0.03958305845098298,
"grad_norm": 12.044981002807617,
"learning_rate": 1.98250651588108e-05,
"loss": 1.3113,
"step": 300
},
{
"epoch": 0.04618023485948014,
"grad_norm": 15.790678024291992,
"learning_rate": 1.978088969386403e-05,
"loss": 1.3674,
"step": 350
},
{
"epoch": 0.0527774112679773,
"grad_norm": 14.051512718200684,
"learning_rate": 1.9736714228917263e-05,
"loss": 1.3417,
"step": 400
},
{
"epoch": 0.059374587676474466,
"grad_norm": 16.866775512695312,
"learning_rate": 1.9692538763970493e-05,
"loss": 1.2831,
"step": 450
},
{
"epoch": 0.06597176408497163,
"grad_norm": 15.336055755615234,
"learning_rate": 1.9648363299023723e-05,
"loss": 1.2243,
"step": 500
},
{
"epoch": 0.06597176408497163,
"eval_accuracy": 0.882040798664093,
"eval_loss": 0.7872514724731445,
"eval_runtime": 14.0019,
"eval_samples_per_second": 174.976,
"eval_steps_per_second": 5.499,
"step": 500
},
{
"epoch": 0.07256894049346879,
"grad_norm": 21.990840911865234,
"learning_rate": 1.9604187834076954e-05,
"loss": 1.2276,
"step": 550
},
{
"epoch": 0.07916611690196595,
"grad_norm": 14.601304054260254,
"learning_rate": 1.9560012369130184e-05,
"loss": 1.2502,
"step": 600
},
{
"epoch": 0.08576329331046312,
"grad_norm": 10.78171157836914,
"learning_rate": 1.9516720413482352e-05,
"loss": 1.2247,
"step": 650
},
{
"epoch": 0.09236046971896028,
"grad_norm": 14.99619197845459,
"learning_rate": 1.9472544948535586e-05,
"loss": 1.178,
"step": 700
},
{
"epoch": 0.09895764612745744,
"grad_norm": 11.09481430053711,
"learning_rate": 1.9428369483588816e-05,
"loss": 1.2379,
"step": 750
},
{
"epoch": 0.09895764612745744,
"eval_accuracy": 0.8930612206459045,
"eval_loss": 0.7503395080566406,
"eval_runtime": 14.018,
"eval_samples_per_second": 174.775,
"eval_steps_per_second": 5.493,
"step": 750
},
{
"epoch": 0.1055548225359546,
"grad_norm": 12.924555778503418,
"learning_rate": 1.938419401864205e-05,
"loss": 1.3893,
"step": 800
},
{
"epoch": 0.11215199894445177,
"grad_norm": 16.87848663330078,
"learning_rate": 1.934001855369528e-05,
"loss": 1.1852,
"step": 850
},
{
"epoch": 0.11874917535294893,
"grad_norm": 17.876659393310547,
"learning_rate": 1.929584308874851e-05,
"loss": 1.1082,
"step": 900
},
{
"epoch": 0.1253463517614461,
"grad_norm": 14.923641204833984,
"learning_rate": 1.925166762380174e-05,
"loss": 0.9946,
"step": 950
},
{
"epoch": 0.13194352816994326,
"grad_norm": 20.28868865966797,
"learning_rate": 1.9207492158854975e-05,
"loss": 1.1834,
"step": 1000
},
{
"epoch": 0.13194352816994326,
"eval_accuracy": 0.899591863155365,
"eval_loss": 0.7308884859085083,
"eval_runtime": 14.203,
"eval_samples_per_second": 172.499,
"eval_steps_per_second": 5.421,
"step": 1000
},
{
"epoch": 0.13854070457844042,
"grad_norm": 13.742298126220703,
"learning_rate": 1.9163316693908205e-05,
"loss": 1.1556,
"step": 1050
},
{
"epoch": 0.14513788098693758,
"grad_norm": 14.966007232666016,
"learning_rate": 1.9119141228961435e-05,
"loss": 1.0251,
"step": 1100
},
{
"epoch": 0.15173505739543475,
"grad_norm": 16.00642967224121,
"learning_rate": 1.907496576401467e-05,
"loss": 1.1943,
"step": 1150
},
{
"epoch": 0.1583322338039319,
"grad_norm": 13.92847728729248,
"learning_rate": 1.90307902990679e-05,
"loss": 1.086,
"step": 1200
},
{
"epoch": 0.16492941021242907,
"grad_norm": 16.767595291137695,
"learning_rate": 1.898661483412113e-05,
"loss": 1.1236,
"step": 1250
},
{
"epoch": 0.16492941021242907,
"eval_accuracy": 0.9008163213729858,
"eval_loss": 0.6945549845695496,
"eval_runtime": 14.235,
"eval_samples_per_second": 172.11,
"eval_steps_per_second": 5.409,
"step": 1250
},
{
"epoch": 0.17152658662092624,
"grad_norm": 13.734739303588867,
"learning_rate": 1.894243936917436e-05,
"loss": 1.0485,
"step": 1300
},
{
"epoch": 0.1781237630294234,
"grad_norm": 8.18282699584961,
"learning_rate": 1.8898263904227594e-05,
"loss": 0.9481,
"step": 1350
},
{
"epoch": 0.18472093943792056,
"grad_norm": 13.874724388122559,
"learning_rate": 1.8854088439280824e-05,
"loss": 1.0898,
"step": 1400
},
{
"epoch": 0.19131811584641772,
"grad_norm": 15.291620254516602,
"learning_rate": 1.8809912974334058e-05,
"loss": 1.0863,
"step": 1450
},
{
"epoch": 0.1979152922549149,
"grad_norm": 21.6629581451416,
"learning_rate": 1.8765737509387288e-05,
"loss": 1.0756,
"step": 1500
},
{
"epoch": 0.1979152922549149,
"eval_accuracy": 0.9036734700202942,
"eval_loss": 0.6746897101402283,
"eval_runtime": 14.4342,
"eval_samples_per_second": 169.736,
"eval_steps_per_second": 5.335,
"step": 1500
},
{
"epoch": 0.20451246866341205,
"grad_norm": 14.420069694519043,
"learning_rate": 1.872156204444052e-05,
"loss": 0.9973,
"step": 1550
},
{
"epoch": 0.2111096450719092,
"grad_norm": 7.541851043701172,
"learning_rate": 1.867738657949375e-05,
"loss": 1.1098,
"step": 1600
},
{
"epoch": 0.21770682148040638,
"grad_norm": 15.377376556396484,
"learning_rate": 1.8633211114546983e-05,
"loss": 1.1745,
"step": 1650
},
{
"epoch": 0.22430399788890354,
"grad_norm": 10.262870788574219,
"learning_rate": 1.8589035649600213e-05,
"loss": 0.9654,
"step": 1700
},
{
"epoch": 0.2309011742974007,
"grad_norm": 11.869269371032715,
"learning_rate": 1.8544860184653447e-05,
"loss": 1.0919,
"step": 1750
},
{
"epoch": 0.2309011742974007,
"eval_accuracy": 0.9093877673149109,
"eval_loss": 0.649857223033905,
"eval_runtime": 14.4377,
"eval_samples_per_second": 169.695,
"eval_steps_per_second": 5.333,
"step": 1750
},
{
"epoch": 0.23749835070589786,
"grad_norm": 12.47613525390625,
"learning_rate": 1.8500684719706677e-05,
"loss": 1.0249,
"step": 1800
},
{
"epoch": 0.24409552711439503,
"grad_norm": 11.694280624389648,
"learning_rate": 1.8456509254759907e-05,
"loss": 0.9863,
"step": 1850
},
{
"epoch": 0.2506927035228922,
"grad_norm": 6.96587610244751,
"learning_rate": 1.841233378981314e-05,
"loss": 1.1091,
"step": 1900
},
{
"epoch": 0.25728987993138935,
"grad_norm": 16.962194442749023,
"learning_rate": 1.836815832486637e-05,
"loss": 1.0989,
"step": 1950
},
{
"epoch": 0.2638870563398865,
"grad_norm": 16.43683433532715,
"learning_rate": 1.83239828599196e-05,
"loss": 1.0662,
"step": 2000
},
{
"epoch": 0.2638870563398865,
"eval_accuracy": 0.9065306186676025,
"eval_loss": 0.6661304235458374,
"eval_runtime": 14.2056,
"eval_samples_per_second": 172.467,
"eval_steps_per_second": 5.42,
"step": 2000
},
{
"epoch": 0.2704842327483837,
"grad_norm": 12.090469360351562,
"learning_rate": 1.8279807394972832e-05,
"loss": 1.0456,
"step": 2050
},
{
"epoch": 0.27708140915688084,
"grad_norm": 14.27798843383789,
"learning_rate": 1.8235631930026066e-05,
"loss": 1.1349,
"step": 2100
},
{
"epoch": 0.283678585565378,
"grad_norm": 14.521726608276367,
"learning_rate": 1.8191456465079296e-05,
"loss": 1.0111,
"step": 2150
},
{
"epoch": 0.29027576197387517,
"grad_norm": 9.772090911865234,
"learning_rate": 1.814728100013253e-05,
"loss": 1.026,
"step": 2200
},
{
"epoch": 0.29687293838237233,
"grad_norm": 15.107865333557129,
"learning_rate": 1.810310553518576e-05,
"loss": 0.9415,
"step": 2250
},
{
"epoch": 0.29687293838237233,
"eval_accuracy": 0.9073469638824463,
"eval_loss": 0.6389794945716858,
"eval_runtime": 14.5287,
"eval_samples_per_second": 168.631,
"eval_steps_per_second": 5.3,
"step": 2250
},
{
"epoch": 0.3034701147908695,
"grad_norm": 15.88947582244873,
"learning_rate": 1.805893007023899e-05,
"loss": 0.9761,
"step": 2300
},
{
"epoch": 0.31006729119936666,
"grad_norm": 13.472917556762695,
"learning_rate": 1.801475460529222e-05,
"loss": 0.9748,
"step": 2350
},
{
"epoch": 0.3166644676078638,
"grad_norm": 14.00285530090332,
"learning_rate": 1.7970579140345454e-05,
"loss": 1.0238,
"step": 2400
},
{
"epoch": 0.323261644016361,
"grad_norm": 15.622306823730469,
"learning_rate": 1.7926403675398685e-05,
"loss": 1.0456,
"step": 2450
},
{
"epoch": 0.32985882042485815,
"grad_norm": 12.947722434997559,
"learning_rate": 1.788222821045192e-05,
"loss": 0.9895,
"step": 2500
},
{
"epoch": 0.32985882042485815,
"eval_accuracy": 0.9110203981399536,
"eval_loss": 0.6434822678565979,
"eval_runtime": 14.6459,
"eval_samples_per_second": 167.283,
"eval_steps_per_second": 5.257,
"step": 2500
},
{
"epoch": 0.3364559968333553,
"grad_norm": 16.64254379272461,
"learning_rate": 1.783805274550515e-05,
"loss": 0.8796,
"step": 2550
},
{
"epoch": 0.34305317324185247,
"grad_norm": 12.790375709533691,
"learning_rate": 1.779387728055838e-05,
"loss": 1.0172,
"step": 2600
},
{
"epoch": 0.34965034965034963,
"grad_norm": 13.025754928588867,
"learning_rate": 1.774970181561161e-05,
"loss": 1.014,
"step": 2650
},
{
"epoch": 0.3562475260588468,
"grad_norm": 13.217813491821289,
"learning_rate": 1.770552635066484e-05,
"loss": 0.9748,
"step": 2700
},
{
"epoch": 0.36284470246734396,
"grad_norm": 13.908524513244629,
"learning_rate": 1.7661350885718073e-05,
"loss": 0.9273,
"step": 2750
},
{
"epoch": 0.36284470246734396,
"eval_accuracy": 0.9081632494926453,
"eval_loss": 0.6303015947341919,
"eval_runtime": 14.1305,
"eval_samples_per_second": 173.384,
"eval_steps_per_second": 5.449,
"step": 2750
},
{
"epoch": 0.3694418788758411,
"grad_norm": 12.053607940673828,
"learning_rate": 1.7617175420771304e-05,
"loss": 1.0122,
"step": 2800
},
{
"epoch": 0.3760390552843383,
"grad_norm": 13.809615135192871,
"learning_rate": 1.7572999955824538e-05,
"loss": 1.0054,
"step": 2850
},
{
"epoch": 0.38263623169283545,
"grad_norm": 14.718282699584961,
"learning_rate": 1.7528824490877768e-05,
"loss": 0.8974,
"step": 2900
},
{
"epoch": 0.3892334081013326,
"grad_norm": 16.11876678466797,
"learning_rate": 1.7484649025930998e-05,
"loss": 0.9396,
"step": 2950
},
{
"epoch": 0.3958305845098298,
"grad_norm": 9.439668655395508,
"learning_rate": 1.744047356098423e-05,
"loss": 0.8734,
"step": 3000
},
{
"epoch": 0.3958305845098298,
"eval_accuracy": 0.9048979878425598,
"eval_loss": 0.6237688064575195,
"eval_runtime": 14.1809,
"eval_samples_per_second": 172.767,
"eval_steps_per_second": 5.43,
"step": 3000
},
{
"epoch": 0.40242776091832694,
"grad_norm": 10.298930168151855,
"learning_rate": 1.7396298096037462e-05,
"loss": 1.0048,
"step": 3050
},
{
"epoch": 0.4090249373268241,
"grad_norm": 7.693696975708008,
"learning_rate": 1.7352122631090693e-05,
"loss": 0.9701,
"step": 3100
},
{
"epoch": 0.41562211373532126,
"grad_norm": 16.300338745117188,
"learning_rate": 1.7307947166143926e-05,
"loss": 0.9924,
"step": 3150
},
{
"epoch": 0.4222192901438184,
"grad_norm": 10.341270446777344,
"learning_rate": 1.7263771701197157e-05,
"loss": 0.9349,
"step": 3200
},
{
"epoch": 0.4288164665523156,
"grad_norm": 14.08645248413086,
"learning_rate": 1.721959623625039e-05,
"loss": 0.974,
"step": 3250
},
{
"epoch": 0.4288164665523156,
"eval_accuracy": 0.9118367433547974,
"eval_loss": 0.6216471791267395,
"eval_runtime": 14.3141,
"eval_samples_per_second": 171.16,
"eval_steps_per_second": 5.379,
"step": 3250
},
{
"epoch": 0.43541364296081275,
"grad_norm": 20.50489044189453,
"learning_rate": 1.717542077130362e-05,
"loss": 1.0539,
"step": 3300
},
{
"epoch": 0.4420108193693099,
"grad_norm": 17.268712997436523,
"learning_rate": 1.713124530635685e-05,
"loss": 0.9389,
"step": 3350
},
{
"epoch": 0.4486079957778071,
"grad_norm": 12.712272644042969,
"learning_rate": 1.708706984141008e-05,
"loss": 0.9171,
"step": 3400
},
{
"epoch": 0.45520517218630424,
"grad_norm": 12.377297401428223,
"learning_rate": 1.704289437646331e-05,
"loss": 0.9706,
"step": 3450
},
{
"epoch": 0.4618023485948014,
"grad_norm": 18.502830505371094,
"learning_rate": 1.6998718911516545e-05,
"loss": 1.0124,
"step": 3500
},
{
"epoch": 0.4618023485948014,
"eval_accuracy": 0.9065306186676025,
"eval_loss": 0.6126046180725098,
"eval_runtime": 14.1583,
"eval_samples_per_second": 173.043,
"eval_steps_per_second": 5.438,
"step": 3500
},
{
"epoch": 0.46839952500329857,
"grad_norm": 17.543399810791016,
"learning_rate": 1.6954543446569776e-05,
"loss": 0.9215,
"step": 3550
},
{
"epoch": 0.47499670141179573,
"grad_norm": 15.049899101257324,
"learning_rate": 1.691036798162301e-05,
"loss": 0.8563,
"step": 3600
},
{
"epoch": 0.4815938778202929,
"grad_norm": 14.00575065612793,
"learning_rate": 1.686619251667624e-05,
"loss": 0.8249,
"step": 3650
},
{
"epoch": 0.48819105422879006,
"grad_norm": 19.295759201049805,
"learning_rate": 1.682201705172947e-05,
"loss": 0.8794,
"step": 3700
},
{
"epoch": 0.4947882306372872,
"grad_norm": 14.837241172790527,
"learning_rate": 1.67778415867827e-05,
"loss": 1.0013,
"step": 3750
},
{
"epoch": 0.4947882306372872,
"eval_accuracy": 0.9077550768852234,
"eval_loss": 0.6021705865859985,
"eval_runtime": 14.1781,
"eval_samples_per_second": 172.802,
"eval_steps_per_second": 5.431,
"step": 3750
},
{
"epoch": 0.5013854070457844,
"grad_norm": 12.423500061035156,
"learning_rate": 1.6733666121835934e-05,
"loss": 0.922,
"step": 3800
},
{
"epoch": 0.5079825834542816,
"grad_norm": 13.505254745483398,
"learning_rate": 1.6689490656889164e-05,
"loss": 0.9168,
"step": 3850
},
{
"epoch": 0.5145797598627787,
"grad_norm": 12.56449031829834,
"learning_rate": 1.6645315191942398e-05,
"loss": 0.9315,
"step": 3900
},
{
"epoch": 0.5211769362712759,
"grad_norm": 11.239628791809082,
"learning_rate": 1.660113972699563e-05,
"loss": 0.9265,
"step": 3950
},
{
"epoch": 0.527774112679773,
"grad_norm": 9.262091636657715,
"learning_rate": 1.655696426204886e-05,
"loss": 0.9453,
"step": 4000
},
{
"epoch": 0.527774112679773,
"eval_accuracy": 0.9077550768852234,
"eval_loss": 0.6083095669746399,
"eval_runtime": 14.2575,
"eval_samples_per_second": 171.839,
"eval_steps_per_second": 5.401,
"step": 4000
},
{
"epoch": 0.5343712890882703,
"grad_norm": 11.317748069763184,
"learning_rate": 1.651278879710209e-05,
"loss": 0.9585,
"step": 4050
},
{
"epoch": 0.5409684654967674,
"grad_norm": 13.768712997436523,
"learning_rate": 1.6468613332155323e-05,
"loss": 0.9886,
"step": 4100
},
{
"epoch": 0.5475656419052646,
"grad_norm": 11.504364967346191,
"learning_rate": 1.6424437867208553e-05,
"loss": 0.9081,
"step": 4150
},
{
"epoch": 0.5541628183137617,
"grad_norm": 16.876300811767578,
"learning_rate": 1.6380262402261787e-05,
"loss": 0.8181,
"step": 4200
},
{
"epoch": 0.5607599947222589,
"grad_norm": 15.651288986206055,
"learning_rate": 1.6336086937315017e-05,
"loss": 0.8806,
"step": 4250
},
{
"epoch": 0.5607599947222589,
"eval_accuracy": 0.9118367433547974,
"eval_loss": 0.5917608141899109,
"eval_runtime": 14.7158,
"eval_samples_per_second": 166.487,
"eval_steps_per_second": 5.232,
"step": 4250
},
{
"epoch": 0.567357171130756,
"grad_norm": 11.500801086425781,
"learning_rate": 1.6291911472368248e-05,
"loss": 0.858,
"step": 4300
},
{
"epoch": 0.5739543475392532,
"grad_norm": 10.485420227050781,
"learning_rate": 1.624773600742148e-05,
"loss": 0.8781,
"step": 4350
},
{
"epoch": 0.5805515239477503,
"grad_norm": 8.773555755615234,
"learning_rate": 1.620356054247471e-05,
"loss": 0.9059,
"step": 4400
},
{
"epoch": 0.5871487003562476,
"grad_norm": 12.097881317138672,
"learning_rate": 1.6159385077527942e-05,
"loss": 0.8475,
"step": 4450
},
{
"epoch": 0.5937458767647447,
"grad_norm": 9.051371574401855,
"learning_rate": 1.6115209612581172e-05,
"loss": 0.9649,
"step": 4500
},
{
"epoch": 0.5937458767647447,
"eval_accuracy": 0.9057142734527588,
"eval_loss": 0.5950626730918884,
"eval_runtime": 14.9206,
"eval_samples_per_second": 164.202,
"eval_steps_per_second": 5.161,
"step": 4500
},
{
"epoch": 0.6003430531732419,
"grad_norm": 15.799858093261719,
"learning_rate": 1.6071034147634406e-05,
"loss": 0.969,
"step": 4550
},
{
"epoch": 0.606940229581739,
"grad_norm": 10.038565635681152,
"learning_rate": 1.6026858682687636e-05,
"loss": 0.8685,
"step": 4600
},
{
"epoch": 0.6135374059902362,
"grad_norm": 14.452479362487793,
"learning_rate": 1.598268321774087e-05,
"loss": 0.9555,
"step": 4650
},
{
"epoch": 0.6201345823987333,
"grad_norm": 14.48049259185791,
"learning_rate": 1.59385077527941e-05,
"loss": 0.9166,
"step": 4700
},
{
"epoch": 0.6267317588072305,
"grad_norm": 10.772700309753418,
"learning_rate": 1.589433228784733e-05,
"loss": 0.877,
"step": 4750
},
{
"epoch": 0.6267317588072305,
"eval_accuracy": 0.9073469638824463,
"eval_loss": 0.5858258605003357,
"eval_runtime": 14.792,
"eval_samples_per_second": 165.63,
"eval_steps_per_second": 5.206,
"step": 4750
},
{
"epoch": 0.6333289352157276,
"grad_norm": 12.199923515319824,
"learning_rate": 1.585015682290056e-05,
"loss": 0.938,
"step": 4800
},
{
"epoch": 0.6399261116242249,
"grad_norm": 11.47739315032959,
"learning_rate": 1.5805981357953795e-05,
"loss": 0.9211,
"step": 4850
},
{
"epoch": 0.646523288032722,
"grad_norm": 12.546594619750977,
"learning_rate": 1.5761805893007025e-05,
"loss": 0.9699,
"step": 4900
},
{
"epoch": 0.6531204644412192,
"grad_norm": 15.941895484924316,
"learning_rate": 1.571763042806026e-05,
"loss": 0.8818,
"step": 4950
},
{
"epoch": 0.6597176408497163,
"grad_norm": 12.06876277923584,
"learning_rate": 1.567345496311349e-05,
"loss": 0.9814,
"step": 5000
},
{
"epoch": 0.6597176408497163,
"eval_accuracy": 0.9175510406494141,
"eval_loss": 0.5705481767654419,
"eval_runtime": 14.3212,
"eval_samples_per_second": 171.075,
"eval_steps_per_second": 5.377,
"step": 5000
},
{
"epoch": 0.6663148172582135,
"grad_norm": 11.047979354858398,
"learning_rate": 1.562927949816672e-05,
"loss": 0.8588,
"step": 5050
},
{
"epoch": 0.6729119936667106,
"grad_norm": 13.39299488067627,
"learning_rate": 1.558510403321995e-05,
"loss": 0.8922,
"step": 5100
},
{
"epoch": 0.6795091700752078,
"grad_norm": 11.451362609863281,
"learning_rate": 1.554092856827318e-05,
"loss": 1.0096,
"step": 5150
},
{
"epoch": 0.6861063464837049,
"grad_norm": 3.436371326446533,
"learning_rate": 1.5496753103326414e-05,
"loss": 0.9217,
"step": 5200
},
{
"epoch": 0.6927035228922022,
"grad_norm": 9.360651016235352,
"learning_rate": 1.5452577638379644e-05,
"loss": 0.9446,
"step": 5250
},
{
"epoch": 0.6927035228922022,
"eval_accuracy": 0.9146938920021057,
"eval_loss": 0.5739869475364685,
"eval_runtime": 14.2053,
"eval_samples_per_second": 172.471,
"eval_steps_per_second": 5.421,
"step": 5250
},
{
"epoch": 0.6993006993006993,
"grad_norm": 13.184717178344727,
"learning_rate": 1.5408402173432878e-05,
"loss": 0.9301,
"step": 5300
},
{
"epoch": 0.7058978757091965,
"grad_norm": 9.54310417175293,
"learning_rate": 1.5364226708486108e-05,
"loss": 0.8436,
"step": 5350
},
{
"epoch": 0.7124950521176936,
"grad_norm": 12.212594032287598,
"learning_rate": 1.532005124353934e-05,
"loss": 0.8547,
"step": 5400
},
{
"epoch": 0.7190922285261908,
"grad_norm": 13.941079139709473,
"learning_rate": 1.527587577859257e-05,
"loss": 0.9552,
"step": 5450
},
{
"epoch": 0.7256894049346879,
"grad_norm": 9.494156837463379,
"learning_rate": 1.5232583822944737e-05,
"loss": 0.9227,
"step": 5500
},
{
"epoch": 0.7256894049346879,
"eval_accuracy": 0.9134693741798401,
"eval_loss": 0.5912680625915527,
"eval_runtime": 14.37,
"eval_samples_per_second": 170.494,
"eval_steps_per_second": 5.358,
"step": 5500
},
{
"epoch": 0.7322865813431851,
"grad_norm": 15.922163963317871,
"learning_rate": 1.518840835799797e-05,
"loss": 0.8813,
"step": 5550
},
{
"epoch": 0.7388837577516822,
"grad_norm": 5.2287068367004395,
"learning_rate": 1.5144232893051201e-05,
"loss": 0.8519,
"step": 5600
},
{
"epoch": 0.7454809341601795,
"grad_norm": 13.272147178649902,
"learning_rate": 1.5100057428104431e-05,
"loss": 0.8223,
"step": 5650
},
{
"epoch": 0.7520781105686766,
"grad_norm": 10.859210968017578,
"learning_rate": 1.5055881963157663e-05,
"loss": 0.8603,
"step": 5700
},
{
"epoch": 0.7586752869771738,
"grad_norm": 13.087916374206543,
"learning_rate": 1.5011706498210894e-05,
"loss": 0.8208,
"step": 5750
},
{
"epoch": 0.7586752869771738,
"eval_accuracy": 0.9151020646095276,
"eval_loss": 0.5698295831680298,
"eval_runtime": 14.3053,
"eval_samples_per_second": 171.265,
"eval_steps_per_second": 5.383,
"step": 5750
},
{
"epoch": 0.7652724633856709,
"grad_norm": 11.395907402038574,
"learning_rate": 1.4967531033264127e-05,
"loss": 0.8542,
"step": 5800
},
{
"epoch": 0.7718696397941681,
"grad_norm": 12.993699073791504,
"learning_rate": 1.4923355568317358e-05,
"loss": 0.7924,
"step": 5850
},
{
"epoch": 0.7784668162026652,
"grad_norm": 12.30950927734375,
"learning_rate": 1.487918010337059e-05,
"loss": 0.9238,
"step": 5900
},
{
"epoch": 0.7850639926111624,
"grad_norm": 14.112768173217773,
"learning_rate": 1.483500463842382e-05,
"loss": 0.8303,
"step": 5950
},
{
"epoch": 0.7916611690196595,
"grad_norm": 12.148374557495117,
"learning_rate": 1.4790829173477052e-05,
"loss": 0.8254,
"step": 6000
},
{
"epoch": 0.7916611690196595,
"eval_accuracy": 0.9159183502197266,
"eval_loss": 0.5643152594566345,
"eval_runtime": 14.2837,
"eval_samples_per_second": 171.524,
"eval_steps_per_second": 5.391,
"step": 6000
},
{
"epoch": 0.7982583454281568,
"grad_norm": 22.816574096679688,
"learning_rate": 1.4746653708530282e-05,
"loss": 0.8556,
"step": 6050
},
{
"epoch": 0.8048555218366539,
"grad_norm": 20.84126853942871,
"learning_rate": 1.4702478243583516e-05,
"loss": 0.9286,
"step": 6100
},
{
"epoch": 0.8114526982451511,
"grad_norm": 10.403019905090332,
"learning_rate": 1.4658302778636746e-05,
"loss": 0.8776,
"step": 6150
},
{
"epoch": 0.8180498746536482,
"grad_norm": 12.371121406555176,
"learning_rate": 1.4614127313689978e-05,
"loss": 0.8146,
"step": 6200
},
{
"epoch": 0.8246470510621454,
"grad_norm": 8.280356407165527,
"learning_rate": 1.4569951848743209e-05,
"loss": 0.8469,
"step": 6250
},
{
"epoch": 0.8246470510621454,
"eval_accuracy": 0.9126530885696411,
"eval_loss": 0.5626720190048218,
"eval_runtime": 14.5893,
"eval_samples_per_second": 167.931,
"eval_steps_per_second": 5.278,
"step": 6250
},
{
"epoch": 0.8312442274706425,
"grad_norm": 12.503052711486816,
"learning_rate": 1.452577638379644e-05,
"loss": 0.9719,
"step": 6300
},
{
"epoch": 0.8378414038791397,
"grad_norm": 8.480955123901367,
"learning_rate": 1.4481600918849673e-05,
"loss": 0.9297,
"step": 6350
},
{
"epoch": 0.8444385802876369,
"grad_norm": 11.265844345092773,
"learning_rate": 1.4437425453902903e-05,
"loss": 0.896,
"step": 6400
},
{
"epoch": 0.8510357566961341,
"grad_norm": 14.131726264953613,
"learning_rate": 1.4393249988956135e-05,
"loss": 0.8709,
"step": 6450
},
{
"epoch": 0.8576329331046312,
"grad_norm": 15.310577392578125,
"learning_rate": 1.4349958033308302e-05,
"loss": 0.9436,
"step": 6500
},
{
"epoch": 0.8576329331046312,
"eval_accuracy": 0.9159183502197266,
"eval_loss": 0.5638322234153748,
"eval_runtime": 14.7169,
"eval_samples_per_second": 166.475,
"eval_steps_per_second": 5.232,
"step": 6500
},
{
"epoch": 0.8642301095131284,
"grad_norm": 9.686817169189453,
"learning_rate": 1.4305782568361532e-05,
"loss": 0.8938,
"step": 6550
},
{
"epoch": 0.8708272859216255,
"grad_norm": 13.079803466796875,
"learning_rate": 1.4261607103414766e-05,
"loss": 0.8065,
"step": 6600
},
{
"epoch": 0.8774244623301227,
"grad_norm": 8.047029495239258,
"learning_rate": 1.4217431638467996e-05,
"loss": 0.8281,
"step": 6650
},
{
"epoch": 0.8840216387386198,
"grad_norm": 9.289030075073242,
"learning_rate": 1.4173256173521228e-05,
"loss": 0.8449,
"step": 6700
},
{
"epoch": 0.890618815147117,
"grad_norm": 14.22453498840332,
"learning_rate": 1.4129080708574458e-05,
"loss": 0.813,
"step": 6750
},
{
"epoch": 0.890618815147117,
"eval_accuracy": 0.9167346954345703,
"eval_loss": 0.5693557262420654,
"eval_runtime": 14.6084,
"eval_samples_per_second": 167.711,
"eval_steps_per_second": 5.271,
"step": 6750
},
{
"epoch": 0.8972159915556142,
"grad_norm": 13.204727172851562,
"learning_rate": 1.408490524362769e-05,
"loss": 0.9052,
"step": 6800
},
{
"epoch": 0.9038131679641114,
"grad_norm": 13.640901565551758,
"learning_rate": 1.404072977868092e-05,
"loss": 0.9501,
"step": 6850
},
{
"epoch": 0.9104103443726085,
"grad_norm": 10.711437225341797,
"learning_rate": 1.3996554313734155e-05,
"loss": 0.9612,
"step": 6900
},
{
"epoch": 0.9170075207811057,
"grad_norm": 13.457585334777832,
"learning_rate": 1.3952378848787385e-05,
"loss": 0.8649,
"step": 6950
},
{
"epoch": 0.9236046971896028,
"grad_norm": 12.793722152709961,
"learning_rate": 1.3908203383840615e-05,
"loss": 0.7366,
"step": 7000
},
{
"epoch": 0.9236046971896028,
"eval_accuracy": 0.9187754988670349,
"eval_loss": 0.5691282153129578,
"eval_runtime": 14.243,
"eval_samples_per_second": 172.014,
"eval_steps_per_second": 5.406,
"step": 7000
},
{
"epoch": 0.9302018735981,
"grad_norm": 15.058178901672363,
"learning_rate": 1.3864027918893847e-05,
"loss": 0.9621,
"step": 7050
},
{
"epoch": 0.9367990500065971,
"grad_norm": 14.427763938903809,
"learning_rate": 1.3819852453947078e-05,
"loss": 0.9154,
"step": 7100
},
{
"epoch": 0.9433962264150944,
"grad_norm": 13.261103630065918,
"learning_rate": 1.3775676989000311e-05,
"loss": 0.8617,
"step": 7150
},
{
"epoch": 0.9499934028235915,
"grad_norm": 12.778352737426758,
"learning_rate": 1.3731501524053542e-05,
"loss": 0.8629,
"step": 7200
},
{
"epoch": 0.9565905792320887,
"grad_norm": 14.332444190979004,
"learning_rate": 1.3687326059106774e-05,
"loss": 0.899,
"step": 7250
},
{
"epoch": 0.9565905792320887,
"eval_accuracy": 0.9159183502197266,
"eval_loss": 0.5559064745903015,
"eval_runtime": 14.2133,
"eval_samples_per_second": 172.374,
"eval_steps_per_second": 5.417,
"step": 7250
},
{
"epoch": 0.9631877556405858,
"grad_norm": 8.828652381896973,
"learning_rate": 1.3643150594160004e-05,
"loss": 0.7766,
"step": 7300
},
{
"epoch": 0.969784932049083,
"grad_norm": 11.421220779418945,
"learning_rate": 1.3598975129213236e-05,
"loss": 0.8968,
"step": 7350
},
{
"epoch": 0.9763821084575801,
"grad_norm": 13.00658893585205,
"learning_rate": 1.3554799664266466e-05,
"loss": 0.8462,
"step": 7400
},
{
"epoch": 0.9829792848660773,
"grad_norm": 6.505890369415283,
"learning_rate": 1.35106241993197e-05,
"loss": 0.8478,
"step": 7450
},
{
"epoch": 0.9895764612745744,
"grad_norm": 9.694055557250977,
"learning_rate": 1.346644873437293e-05,
"loss": 0.8184,
"step": 7500
},
{
"epoch": 0.9895764612745744,
"eval_accuracy": 0.9163265228271484,
"eval_loss": 0.5564213395118713,
"eval_runtime": 14.1149,
"eval_samples_per_second": 173.576,
"eval_steps_per_second": 5.455,
"step": 7500
},
{
"epoch": 0.9961736376830717,
"grad_norm": 8.785748481750488,
"learning_rate": 1.3422273269426162e-05,
"loss": 0.8445,
"step": 7550
},
{
"epoch": 1.0027708140915688,
"grad_norm": 12.255693435668945,
"learning_rate": 1.3378097804479393e-05,
"loss": 0.7305,
"step": 7600
},
{
"epoch": 1.0093679905000659,
"grad_norm": 12.03491497039795,
"learning_rate": 1.3333922339532626e-05,
"loss": 0.695,
"step": 7650
},
{
"epoch": 1.0159651669085632,
"grad_norm": 15.055414199829102,
"learning_rate": 1.3289746874585857e-05,
"loss": 0.779,
"step": 7700
},
{
"epoch": 1.0225623433170603,
"grad_norm": 3.5831682682037354,
"learning_rate": 1.3245571409639089e-05,
"loss": 0.5876,
"step": 7750
},
{
"epoch": 1.0225623433170603,
"eval_accuracy": 0.918367326259613,
"eval_loss": 0.5775763392448425,
"eval_runtime": 14.119,
"eval_samples_per_second": 173.525,
"eval_steps_per_second": 5.454,
"step": 7750
},
{
"epoch": 1.0291595197255574,
"grad_norm": 14.637757301330566,
"learning_rate": 1.3201395944692319e-05,
"loss": 0.6372,
"step": 7800
},
{
"epoch": 1.0357566961340545,
"grad_norm": 9.048910140991211,
"learning_rate": 1.315722047974555e-05,
"loss": 0.7066,
"step": 7850
},
{
"epoch": 1.0423538725425519,
"grad_norm": 13.023659706115723,
"learning_rate": 1.3113045014798781e-05,
"loss": 0.6561,
"step": 7900
},
{
"epoch": 1.048951048951049,
"grad_norm": 13.10300350189209,
"learning_rate": 1.3068869549852012e-05,
"loss": 0.6854,
"step": 7950
},
{
"epoch": 1.055548225359546,
"grad_norm": 13.364474296569824,
"learning_rate": 1.3024694084905245e-05,
"loss": 0.7083,
"step": 8000
},
{
"epoch": 1.055548225359546,
"eval_accuracy": 0.9212244749069214,
"eval_loss": 0.5645425915718079,
"eval_runtime": 14.1023,
"eval_samples_per_second": 173.731,
"eval_steps_per_second": 5.46,
"step": 8000
},
{
"epoch": 1.0621454017680432,
"grad_norm": 10.857477188110352,
"learning_rate": 1.2980518619958476e-05,
"loss": 0.6618,
"step": 8050
},
{
"epoch": 1.0687425781765405,
"grad_norm": 13.178641319274902,
"learning_rate": 1.2936343155011708e-05,
"loss": 0.6602,
"step": 8100
},
{
"epoch": 1.0753397545850376,
"grad_norm": 8.929798126220703,
"learning_rate": 1.2892167690064938e-05,
"loss": 0.7141,
"step": 8150
},
{
"epoch": 1.0819369309935347,
"grad_norm": 14.156282424926758,
"learning_rate": 1.2847992225118172e-05,
"loss": 0.7599,
"step": 8200
},
{
"epoch": 1.0885341074020318,
"grad_norm": 11.46021842956543,
"learning_rate": 1.2803816760171402e-05,
"loss": 0.6307,
"step": 8250
},
{
"epoch": 1.0885341074020318,
"eval_accuracy": 0.9159183502197266,
"eval_loss": 0.5608085989952087,
"eval_runtime": 14.088,
"eval_samples_per_second": 173.907,
"eval_steps_per_second": 5.466,
"step": 8250
},
{
"epoch": 1.0951312838105292,
"grad_norm": 13.287457466125488,
"learning_rate": 1.2759641295224634e-05,
"loss": 0.6611,
"step": 8300
},
{
"epoch": 1.1017284602190263,
"grad_norm": 7.9682793617248535,
"learning_rate": 1.2715465830277864e-05,
"loss": 0.6308,
"step": 8350
},
{
"epoch": 1.1083256366275234,
"grad_norm": 8.86072826385498,
"learning_rate": 1.2671290365331097e-05,
"loss": 0.7035,
"step": 8400
},
{
"epoch": 1.1149228130360207,
"grad_norm": 16.224716186523438,
"learning_rate": 1.2627114900384327e-05,
"loss": 0.683,
"step": 8450
},
{
"epoch": 1.1215199894445178,
"grad_norm": 16.066835403442383,
"learning_rate": 1.258293943543756e-05,
"loss": 0.7077,
"step": 8500
},
{
"epoch": 1.1215199894445178,
"eval_accuracy": 0.918367326259613,
"eval_loss": 0.5556493401527405,
"eval_runtime": 14.2677,
"eval_samples_per_second": 171.717,
"eval_steps_per_second": 5.397,
"step": 8500
},
{
"epoch": 1.128117165853015,
"grad_norm": 16.001686096191406,
"learning_rate": 1.2538763970490791e-05,
"loss": 0.7153,
"step": 8550
},
{
"epoch": 1.134714342261512,
"grad_norm": 10.751116752624512,
"learning_rate": 1.2494588505544021e-05,
"loss": 0.6186,
"step": 8600
},
{
"epoch": 1.1413115186700091,
"grad_norm": 13.352745056152344,
"learning_rate": 1.2450413040597253e-05,
"loss": 0.6289,
"step": 8650
},
{
"epoch": 1.1479086950785065,
"grad_norm": 13.567846298217773,
"learning_rate": 1.2406237575650484e-05,
"loss": 0.5718,
"step": 8700
},
{
"epoch": 1.1545058714870036,
"grad_norm": 7.751793384552002,
"learning_rate": 1.2362062110703717e-05,
"loss": 0.5749,
"step": 8750
},
{
"epoch": 1.1545058714870036,
"eval_accuracy": 0.9167346954345703,
"eval_loss": 0.5695374011993408,
"eval_runtime": 14.4147,
"eval_samples_per_second": 169.965,
"eval_steps_per_second": 5.342,
"step": 8750
},
{
"epoch": 1.1611030478955007,
"grad_norm": 17.14850425720215,
"learning_rate": 1.2317886645756948e-05,
"loss": 0.6788,
"step": 8800
},
{
"epoch": 1.167700224303998,
"grad_norm": 15.17955493927002,
"learning_rate": 1.227371118081018e-05,
"loss": 0.7731,
"step": 8850
},
{
"epoch": 1.174297400712495,
"grad_norm": 15.97493839263916,
"learning_rate": 1.222953571586341e-05,
"loss": 0.6954,
"step": 8900
},
{
"epoch": 1.1808945771209922,
"grad_norm": 13.843533515930176,
"learning_rate": 1.2185360250916642e-05,
"loss": 0.7404,
"step": 8950
},
{
"epoch": 1.1874917535294893,
"grad_norm": 2.941951274871826,
"learning_rate": 1.2141184785969872e-05,
"loss": 0.6871,
"step": 9000
},
{
"epoch": 1.1874917535294893,
"eval_accuracy": 0.9208163022994995,
"eval_loss": 0.5665779709815979,
"eval_runtime": 14.3831,
"eval_samples_per_second": 170.339,
"eval_steps_per_second": 5.354,
"step": 9000
},
{
"epoch": 1.1940889299379864,
"grad_norm": 12.621596336364746,
"learning_rate": 1.2097009321023106e-05,
"loss": 0.6415,
"step": 9050
},
{
"epoch": 1.2006861063464838,
"grad_norm": 15.431377410888672,
"learning_rate": 1.2052833856076336e-05,
"loss": 0.6517,
"step": 9100
},
{
"epoch": 1.2072832827549809,
"grad_norm": 18.660377502441406,
"learning_rate": 1.2008658391129568e-05,
"loss": 0.7354,
"step": 9150
},
{
"epoch": 1.213880459163478,
"grad_norm": 16.014867782592773,
"learning_rate": 1.1964482926182799e-05,
"loss": 0.7325,
"step": 9200
},
{
"epoch": 1.2204776355719753,
"grad_norm": 15.354519844055176,
"learning_rate": 1.192030746123603e-05,
"loss": 0.6272,
"step": 9250
},
{
"epoch": 1.2204776355719753,
"eval_accuracy": 0.9146938920021057,
"eval_loss": 0.5714155435562134,
"eval_runtime": 14.5346,
"eval_samples_per_second": 168.563,
"eval_steps_per_second": 5.298,
"step": 9250
},
{
"epoch": 1.2270748119804724,
"grad_norm": 12.623982429504395,
"learning_rate": 1.1876131996289261e-05,
"loss": 0.7292,
"step": 9300
},
{
"epoch": 1.2336719883889695,
"grad_norm": 9.906524658203125,
"learning_rate": 1.1831956531342495e-05,
"loss": 0.6325,
"step": 9350
},
{
"epoch": 1.2402691647974666,
"grad_norm": 13.0123872756958,
"learning_rate": 1.1787781066395725e-05,
"loss": 0.6344,
"step": 9400
},
{
"epoch": 1.2468663412059637,
"grad_norm": 11.591238975524902,
"learning_rate": 1.1743605601448955e-05,
"loss": 0.7218,
"step": 9450
},
{
"epoch": 1.253463517614461,
"grad_norm": 6.004245758056641,
"learning_rate": 1.1699430136502187e-05,
"loss": 0.6815,
"step": 9500
},
{
"epoch": 1.253463517614461,
"eval_accuracy": 0.9175510406494141,
"eval_loss": 0.5650636553764343,
"eval_runtime": 14.6735,
"eval_samples_per_second": 166.967,
"eval_steps_per_second": 5.248,
"step": 9500
},
{
"epoch": 1.2600606940229582,
"grad_norm": 15.778864860534668,
"learning_rate": 1.1655254671555418e-05,
"loss": 0.7186,
"step": 9550
},
{
"epoch": 1.2666578704314553,
"grad_norm": 9.6397123336792,
"learning_rate": 1.1611079206608651e-05,
"loss": 0.6145,
"step": 9600
},
{
"epoch": 1.2732550468399526,
"grad_norm": 10.774910926818848,
"learning_rate": 1.1566903741661882e-05,
"loss": 0.7095,
"step": 9650
},
{
"epoch": 1.2798522232484497,
"grad_norm": 7.923967361450195,
"learning_rate": 1.1522728276715114e-05,
"loss": 0.674,
"step": 9700
},
{
"epoch": 1.2864493996569468,
"grad_norm": 15.660514831542969,
"learning_rate": 1.1478552811768344e-05,
"loss": 0.7405,
"step": 9750
},
{
"epoch": 1.2864493996569468,
"eval_accuracy": 0.9200000166893005,
"eval_loss": 0.5666268467903137,
"eval_runtime": 14.8203,
"eval_samples_per_second": 165.314,
"eval_steps_per_second": 5.196,
"step": 9750
},
{
"epoch": 1.293046576065444,
"grad_norm": 7.9538397789001465,
"learning_rate": 1.1434377346821576e-05,
"loss": 0.7186,
"step": 9800
},
{
"epoch": 1.299643752473941,
"grad_norm": 10.569086074829102,
"learning_rate": 1.1390201881874806e-05,
"loss": 0.6352,
"step": 9850
},
{
"epoch": 1.3062409288824384,
"grad_norm": 13.37822151184082,
"learning_rate": 1.134602641692804e-05,
"loss": 0.7077,
"step": 9900
},
{
"epoch": 1.3128381052909355,
"grad_norm": 14.899065017700195,
"learning_rate": 1.130185095198127e-05,
"loss": 0.6873,
"step": 9950
},
{
"epoch": 1.3194352816994326,
"grad_norm": 9.051203727722168,
"learning_rate": 1.1257675487034503e-05,
"loss": 0.5939,
"step": 10000
},
{
"epoch": 1.3194352816994326,
"eval_accuracy": 0.9204081892967224,
"eval_loss": 0.5752307176589966,
"eval_runtime": 14.7189,
"eval_samples_per_second": 166.453,
"eval_steps_per_second": 5.231,
"step": 10000
},
{
"epoch": 1.32603245810793,
"grad_norm": 9.774917602539062,
"learning_rate": 1.1213500022087733e-05,
"loss": 0.7171,
"step": 10050
},
{
"epoch": 1.332629634516427,
"grad_norm": 12.088335037231445,
"learning_rate": 1.1169324557140967e-05,
"loss": 0.6592,
"step": 10100
},
{
"epoch": 1.3392268109249241,
"grad_norm": 6.4445881843566895,
"learning_rate": 1.1125149092194197e-05,
"loss": 0.6631,
"step": 10150
},
{
"epoch": 1.3458239873334212,
"grad_norm": 17.67377471923828,
"learning_rate": 1.1080973627247427e-05,
"loss": 0.7658,
"step": 10200
},
{
"epoch": 1.3524211637419183,
"grad_norm": 9.594240188598633,
"learning_rate": 1.103679816230066e-05,
"loss": 0.6213,
"step": 10250
},
{
"epoch": 1.3524211637419183,
"eval_accuracy": 0.9179591536521912,
"eval_loss": 0.5678022503852844,
"eval_runtime": 14.3553,
"eval_samples_per_second": 170.669,
"eval_steps_per_second": 5.364,
"step": 10250
},
{
"epoch": 1.3590183401504157,
"grad_norm": 13.705334663391113,
"learning_rate": 1.099262269735389e-05,
"loss": 0.6486,
"step": 10300
},
{
"epoch": 1.3656155165589128,
"grad_norm": 9.945523262023926,
"learning_rate": 1.0948447232407122e-05,
"loss": 0.662,
"step": 10350
},
{
"epoch": 1.3722126929674099,
"grad_norm": 10.699588775634766,
"learning_rate": 1.0904271767460352e-05,
"loss": 0.6924,
"step": 10400
},
{
"epoch": 1.3788098693759072,
"grad_norm": 11.88040828704834,
"learning_rate": 1.0860096302513586e-05,
"loss": 0.7106,
"step": 10450
},
{
"epoch": 1.3854070457844043,
"grad_norm": 9.69964599609375,
"learning_rate": 1.0815920837566816e-05,
"loss": 0.7239,
"step": 10500
},
{
"epoch": 1.3854070457844043,
"eval_accuracy": 0.918367326259613,
"eval_loss": 0.5687153935432434,
"eval_runtime": 15.0319,
"eval_samples_per_second": 162.987,
"eval_steps_per_second": 5.122,
"step": 10500
},
{
"epoch": 1.3920042221929014,
"grad_norm": 7.956460475921631,
"learning_rate": 1.0771745372620048e-05,
"loss": 0.735,
"step": 10550
},
{
"epoch": 1.3986013986013985,
"grad_norm": 15.08421802520752,
"learning_rate": 1.0728453416972216e-05,
"loss": 0.6784,
"step": 10600
},
{
"epoch": 1.4051985750098956,
"grad_norm": 7.856141090393066,
"learning_rate": 1.0684277952025447e-05,
"loss": 0.6886,
"step": 10650
},
{
"epoch": 1.411795751418393,
"grad_norm": 20.228710174560547,
"learning_rate": 1.0640102487078679e-05,
"loss": 0.649,
"step": 10700
},
{
"epoch": 1.41839292782689,
"grad_norm": 8.827073097229004,
"learning_rate": 1.0595927022131909e-05,
"loss": 0.6133,
"step": 10750
},
{
"epoch": 1.41839292782689,
"eval_accuracy": 0.9200000166893005,
"eval_loss": 0.5682947039604187,
"eval_runtime": 14.6806,
"eval_samples_per_second": 166.887,
"eval_steps_per_second": 5.245,
"step": 10750
},
{
"epoch": 1.4249901042353872,
"grad_norm": 12.990625381469727,
"learning_rate": 1.055175155718514e-05,
"loss": 0.6635,
"step": 10800
},
{
"epoch": 1.4315872806438845,
"grad_norm": 13.446993827819824,
"learning_rate": 1.0507576092238371e-05,
"loss": 0.6803,
"step": 10850
},
{
"epoch": 1.4381844570523816,
"grad_norm": 16.174983978271484,
"learning_rate": 1.0463400627291602e-05,
"loss": 0.6497,
"step": 10900
},
{
"epoch": 1.4447816334608787,
"grad_norm": 14.54861831665039,
"learning_rate": 1.0419225162344835e-05,
"loss": 0.6812,
"step": 10950
},
{
"epoch": 1.4513788098693758,
"grad_norm": 15.023179054260254,
"learning_rate": 1.0375049697398066e-05,
"loss": 0.7493,
"step": 11000
},
{
"epoch": 1.4513788098693758,
"eval_accuracy": 0.9220408201217651,
"eval_loss": 0.5586878657341003,
"eval_runtime": 14.6736,
"eval_samples_per_second": 166.967,
"eval_steps_per_second": 5.248,
"step": 11000
},
{
"epoch": 1.457975986277873,
"grad_norm": 13.154565811157227,
"learning_rate": 1.0330874232451298e-05,
"loss": 0.6694,
"step": 11050
},
{
"epoch": 1.4645731626863703,
"grad_norm": 10.797453880310059,
"learning_rate": 1.0286698767504528e-05,
"loss": 0.6782,
"step": 11100
},
{
"epoch": 1.4711703390948674,
"grad_norm": 9.909940719604492,
"learning_rate": 1.0242523302557762e-05,
"loss": 0.6839,
"step": 11150
},
{
"epoch": 1.4777675155033645,
"grad_norm": 10.230202674865723,
"learning_rate": 1.0198347837610992e-05,
"loss": 0.671,
"step": 11200
},
{
"epoch": 1.4843646919118618,
"grad_norm": 10.81137752532959,
"learning_rate": 1.0154172372664224e-05,
"loss": 0.7648,
"step": 11250
},
{
"epoch": 1.4843646919118618,
"eval_accuracy": 0.9208163022994995,
"eval_loss": 0.5465655326843262,
"eval_runtime": 14.3093,
"eval_samples_per_second": 171.217,
"eval_steps_per_second": 5.381,
"step": 11250
},
{
"epoch": 1.490961868320359,
"grad_norm": 16.720306396484375,
"learning_rate": 1.0109996907717454e-05,
"loss": 0.7448,
"step": 11300
},
{
"epoch": 1.497559044728856,
"grad_norm": 10.62578296661377,
"learning_rate": 1.0065821442770686e-05,
"loss": 0.6811,
"step": 11350
},
{
"epoch": 1.5041562211373534,
"grad_norm": 12.68857479095459,
"learning_rate": 1.0021645977823917e-05,
"loss": 0.6984,
"step": 11400
},
{
"epoch": 1.5107533975458503,
"grad_norm": 9.58633804321289,
"learning_rate": 9.977470512877149e-06,
"loss": 0.6676,
"step": 11450
},
{
"epoch": 1.5173505739543476,
"grad_norm": 16.852190017700195,
"learning_rate": 9.93329504793038e-06,
"loss": 0.7054,
"step": 11500
},
{
"epoch": 1.5173505739543476,
"eval_accuracy": 0.9204081892967224,
"eval_loss": 0.5569261908531189,
"eval_runtime": 14.3932,
"eval_samples_per_second": 170.219,
"eval_steps_per_second": 5.35,
"step": 11500
},
{
"epoch": 1.5239477503628447,
"grad_norm": 12.476948738098145,
"learning_rate": 9.889119582983611e-06,
"loss": 0.6109,
"step": 11550
},
{
"epoch": 1.5305449267713418,
"grad_norm": 15.437007904052734,
"learning_rate": 9.844944118036843e-06,
"loss": 0.7581,
"step": 11600
},
{
"epoch": 1.5371421031798391,
"grad_norm": 14.643590927124023,
"learning_rate": 9.800768653090075e-06,
"loss": 0.7035,
"step": 11650
},
{
"epoch": 1.5437392795883362,
"grad_norm": 8.441386222839355,
"learning_rate": 9.756593188143305e-06,
"loss": 0.6943,
"step": 11700
},
{
"epoch": 1.5503364559968333,
"grad_norm": 17.568815231323242,
"learning_rate": 9.713301232495472e-06,
"loss": 0.6225,
"step": 11750
},
{
"epoch": 1.5503364559968333,
"eval_accuracy": 0.922448992729187,
"eval_loss": 0.5570839643478394,
"eval_runtime": 14.4582,
"eval_samples_per_second": 169.454,
"eval_steps_per_second": 5.326,
"step": 11750
},
{
"epoch": 1.5569336324053307,
"grad_norm": 9.131673812866211,
"learning_rate": 9.669125767548704e-06,
"loss": 0.661,
"step": 11800
},
{
"epoch": 1.5635308088138276,
"grad_norm": 12.502500534057617,
"learning_rate": 9.624950302601936e-06,
"loss": 0.635,
"step": 11850
},
{
"epoch": 1.5701279852223249,
"grad_norm": 16.2374210357666,
"learning_rate": 9.580774837655166e-06,
"loss": 0.613,
"step": 11900
},
{
"epoch": 1.576725161630822,
"grad_norm": 14.896709442138672,
"learning_rate": 9.536599372708398e-06,
"loss": 0.6502,
"step": 11950
},
{
"epoch": 1.583322338039319,
"grad_norm": 10.305893898010254,
"learning_rate": 9.49242390776163e-06,
"loss": 0.6935,
"step": 12000
},
{
"epoch": 1.583322338039319,
"eval_accuracy": 0.9200000166893005,
"eval_loss": 0.5578611493110657,
"eval_runtime": 15.0346,
"eval_samples_per_second": 162.958,
"eval_steps_per_second": 5.122,
"step": 12000
},
{
"epoch": 1.5899195144478164,
"grad_norm": 10.883207321166992,
"learning_rate": 9.44824844281486e-06,
"loss": 0.6147,
"step": 12050
},
{
"epoch": 1.5965166908563135,
"grad_norm": 5.787095069885254,
"learning_rate": 9.404072977868093e-06,
"loss": 0.6575,
"step": 12100
},
{
"epoch": 1.6031138672648106,
"grad_norm": 18.00186538696289,
"learning_rate": 9.359897512921325e-06,
"loss": 0.6837,
"step": 12150
},
{
"epoch": 1.609711043673308,
"grad_norm": 11.488051414489746,
"learning_rate": 9.315722047974555e-06,
"loss": 0.7437,
"step": 12200
},
{
"epoch": 1.6163082200818049,
"grad_norm": 8.060873031616211,
"learning_rate": 9.271546583027787e-06,
"loss": 0.6808,
"step": 12250
},
{
"epoch": 1.6163082200818049,
"eval_accuracy": 0.9204081892967224,
"eval_loss": 0.5507224202156067,
"eval_runtime": 14.9999,
"eval_samples_per_second": 163.335,
"eval_steps_per_second": 5.133,
"step": 12250
},
{
"epoch": 1.6229053964903022,
"grad_norm": 12.3642578125,
"learning_rate": 9.227371118081019e-06,
"loss": 0.6698,
"step": 12300
},
{
"epoch": 1.6295025728987993,
"grad_norm": 15.267610549926758,
"learning_rate": 9.183195653134251e-06,
"loss": 0.6803,
"step": 12350
},
{
"epoch": 1.6360997493072964,
"grad_norm": 14.562056541442871,
"learning_rate": 9.139020188187481e-06,
"loss": 0.676,
"step": 12400
},
{
"epoch": 1.6426969257157937,
"grad_norm": 10.903446197509766,
"learning_rate": 9.094844723240713e-06,
"loss": 0.6418,
"step": 12450
},
{
"epoch": 1.6492941021242908,
"grad_norm": 12.447013854980469,
"learning_rate": 9.050669258293946e-06,
"loss": 0.6042,
"step": 12500
},
{
"epoch": 1.6492941021242908,
"eval_accuracy": 0.9187754988670349,
"eval_loss": 0.556304395198822,
"eval_runtime": 14.8826,
"eval_samples_per_second": 164.622,
"eval_steps_per_second": 5.174,
"step": 12500
},
{
"epoch": 1.655891278532788,
"grad_norm": 14.32016658782959,
"learning_rate": 9.006493793347176e-06,
"loss": 0.7139,
"step": 12550
},
{
"epoch": 1.6624884549412853,
"grad_norm": 10.937993049621582,
"learning_rate": 8.962318328400406e-06,
"loss": 0.6995,
"step": 12600
},
{
"epoch": 1.6690856313497822,
"grad_norm": 9.443896293640137,
"learning_rate": 8.918142863453638e-06,
"loss": 0.6097,
"step": 12650
},
{
"epoch": 1.6756828077582795,
"grad_norm": 13.650922775268555,
"learning_rate": 8.87396739850687e-06,
"loss": 0.6407,
"step": 12700
},
{
"epoch": 1.6822799841667766,
"grad_norm": 13.268482208251953,
"learning_rate": 8.8297919335601e-06,
"loss": 0.5994,
"step": 12750
},
{
"epoch": 1.6822799841667766,
"eval_accuracy": 0.9248979687690735,
"eval_loss": 0.5621338486671448,
"eval_runtime": 14.3912,
"eval_samples_per_second": 170.243,
"eval_steps_per_second": 5.35,
"step": 12750
},
{
"epoch": 1.6888771605752737,
"grad_norm": 10.582622528076172,
"learning_rate": 8.785616468613333e-06,
"loss": 0.6642,
"step": 12800
},
{
"epoch": 1.695474336983771,
"grad_norm": 11.034931182861328,
"learning_rate": 8.741441003666565e-06,
"loss": 0.6198,
"step": 12850
},
{
"epoch": 1.7020715133922681,
"grad_norm": 13.703685760498047,
"learning_rate": 8.697265538719795e-06,
"loss": 0.6648,
"step": 12900
},
{
"epoch": 1.7086686898007653,
"grad_norm": 10.968565940856934,
"learning_rate": 8.653090073773027e-06,
"loss": 0.5644,
"step": 12950
},
{
"epoch": 1.7152658662092626,
"grad_norm": 12.422329902648926,
"learning_rate": 8.608914608826259e-06,
"loss": 0.6531,
"step": 13000
},
{
"epoch": 1.7152658662092626,
"eval_accuracy": 0.9240816235542297,
"eval_loss": 0.5617344975471497,
"eval_runtime": 15.0031,
"eval_samples_per_second": 163.299,
"eval_steps_per_second": 5.132,
"step": 13000
},
{
"epoch": 1.7218630426177595,
"grad_norm": 9.511701583862305,
"learning_rate": 8.564739143879491e-06,
"loss": 0.6159,
"step": 13050
},
{
"epoch": 1.7284602190262568,
"grad_norm": 6.499239921569824,
"learning_rate": 8.520563678932721e-06,
"loss": 0.7855,
"step": 13100
},
{
"epoch": 1.735057395434754,
"grad_norm": 7.864821910858154,
"learning_rate": 8.476388213985953e-06,
"loss": 0.6307,
"step": 13150
},
{
"epoch": 1.741654571843251,
"grad_norm": 11.460110664367676,
"learning_rate": 8.432212749039185e-06,
"loss": 0.61,
"step": 13200
},
{
"epoch": 1.7482517482517483,
"grad_norm": 12.433394432067871,
"learning_rate": 8.388037284092416e-06,
"loss": 0.6672,
"step": 13250
},
{
"epoch": 1.7482517482517483,
"eval_accuracy": 0.9236734509468079,
"eval_loss": 0.5589076280593872,
"eval_runtime": 14.2338,
"eval_samples_per_second": 172.125,
"eval_steps_per_second": 5.41,
"step": 13250
},
{
"epoch": 1.7548489246602454,
"grad_norm": 9.624537467956543,
"learning_rate": 8.343861819145648e-06,
"loss": 0.6002,
"step": 13300
},
{
"epoch": 1.7614461010687426,
"grad_norm": 14.12790584564209,
"learning_rate": 8.299686354198878e-06,
"loss": 0.6638,
"step": 13350
},
{
"epoch": 1.7680432774772399,
"grad_norm": 15.561441421508789,
"learning_rate": 8.25551088925211e-06,
"loss": 0.6112,
"step": 13400
},
{
"epoch": 1.7746404538857368,
"grad_norm": 8.115078926086426,
"learning_rate": 8.21133542430534e-06,
"loss": 0.6236,
"step": 13450
},
{
"epoch": 1.781237630294234,
"grad_norm": 5.141168117523193,
"learning_rate": 8.167159959358572e-06,
"loss": 0.6245,
"step": 13500
},
{
"epoch": 1.781237630294234,
"eval_accuracy": 0.9220408201217651,
"eval_loss": 0.557984471321106,
"eval_runtime": 14.2809,
"eval_samples_per_second": 171.557,
"eval_steps_per_second": 5.392,
"step": 13500
},
{
"epoch": 1.7878348067027312,
"grad_norm": 13.422981262207031,
"learning_rate": 8.122984494411804e-06,
"loss": 0.7146,
"step": 13550
},
{
"epoch": 1.7944319831112283,
"grad_norm": 9.977944374084473,
"learning_rate": 8.078809029465036e-06,
"loss": 0.5969,
"step": 13600
},
{
"epoch": 1.8010291595197256,
"grad_norm": 12.0841064453125,
"learning_rate": 8.034633564518267e-06,
"loss": 0.7246,
"step": 13650
},
{
"epoch": 1.8076263359282227,
"grad_norm": 7.176680088043213,
"learning_rate": 7.990458099571499e-06,
"loss": 0.65,
"step": 13700
},
{
"epoch": 1.8142235123367199,
"grad_norm": 16.529300689697266,
"learning_rate": 7.94628263462473e-06,
"loss": 0.7136,
"step": 13750
},
{
"epoch": 1.8142235123367199,
"eval_accuracy": 0.9204081892967224,
"eval_loss": 0.5532920360565186,
"eval_runtime": 14.1588,
"eval_samples_per_second": 173.038,
"eval_steps_per_second": 5.438,
"step": 13750
},
{
"epoch": 1.8208206887452172,
"grad_norm": 14.22460651397705,
"learning_rate": 7.902107169677961e-06,
"loss": 0.7062,
"step": 13800
},
{
"epoch": 1.827417865153714,
"grad_norm": 12.760638236999512,
"learning_rate": 7.857931704731193e-06,
"loss": 0.6987,
"step": 13850
},
{
"epoch": 1.8340150415622114,
"grad_norm": 18.206119537353516,
"learning_rate": 7.813756239784425e-06,
"loss": 0.6642,
"step": 13900
},
{
"epoch": 1.8406122179707085,
"grad_norm": 10.713970184326172,
"learning_rate": 7.769580774837655e-06,
"loss": 0.6761,
"step": 13950
},
{
"epoch": 1.8472093943792056,
"grad_norm": 15.882705688476562,
"learning_rate": 7.725405309890888e-06,
"loss": 0.6766,
"step": 14000
},
{
"epoch": 1.8472093943792056,
"eval_accuracy": 0.9212244749069214,
"eval_loss": 0.5655022263526917,
"eval_runtime": 14.1648,
"eval_samples_per_second": 172.964,
"eval_steps_per_second": 5.436,
"step": 14000
},
{
"epoch": 1.853806570787703,
"grad_norm": 14.32147216796875,
"learning_rate": 7.68122984494412e-06,
"loss": 0.5758,
"step": 14050
},
{
"epoch": 1.8604037471962,
"grad_norm": 10.756980895996094,
"learning_rate": 7.63705437999735e-06,
"loss": 0.6594,
"step": 14100
},
{
"epoch": 1.8670009236046972,
"grad_norm": 14.631691932678223,
"learning_rate": 7.592878915050581e-06,
"loss": 0.7866,
"step": 14150
},
{
"epoch": 1.8735981000131945,
"grad_norm": 10.117656707763672,
"learning_rate": 7.548703450103812e-06,
"loss": 0.5798,
"step": 14200
},
{
"epoch": 1.8801952764216914,
"grad_norm": 18.793254852294922,
"learning_rate": 7.504527985157044e-06,
"loss": 0.6472,
"step": 14250
},
{
"epoch": 1.8801952764216914,
"eval_accuracy": 0.9212244749069214,
"eval_loss": 0.5508715510368347,
"eval_runtime": 14.2705,
"eval_samples_per_second": 171.683,
"eval_steps_per_second": 5.396,
"step": 14250
},
{
"epoch": 1.8867924528301887,
"grad_norm": 6.77608585357666,
"learning_rate": 7.460352520210275e-06,
"loss": 0.7387,
"step": 14300
},
{
"epoch": 1.8933896292386858,
"grad_norm": 9.065592765808105,
"learning_rate": 7.416177055263507e-06,
"loss": 0.6677,
"step": 14350
},
{
"epoch": 1.899986805647183,
"grad_norm": 16.486297607421875,
"learning_rate": 7.372001590316739e-06,
"loss": 0.6697,
"step": 14400
},
{
"epoch": 1.9065839820556802,
"grad_norm": 10.985074996948242,
"learning_rate": 7.32782612536997e-06,
"loss": 0.6711,
"step": 14450
},
{
"epoch": 1.9131811584641774,
"grad_norm": 10.440354347229004,
"learning_rate": 7.283650660423202e-06,
"loss": 0.6988,
"step": 14500
},
{
"epoch": 1.9131811584641774,
"eval_accuracy": 0.9228571653366089,
"eval_loss": 0.5527560114860535,
"eval_runtime": 14.6748,
"eval_samples_per_second": 166.953,
"eval_steps_per_second": 5.247,
"step": 14500
},
{
"epoch": 1.9197783348726745,
"grad_norm": 10.955256462097168,
"learning_rate": 7.239475195476433e-06,
"loss": 0.6301,
"step": 14550
},
{
"epoch": 1.9263755112811718,
"grad_norm": 12.683993339538574,
"learning_rate": 7.195299730529665e-06,
"loss": 0.6259,
"step": 14600
},
{
"epoch": 1.9329726876896687,
"grad_norm": 12.907076835632324,
"learning_rate": 7.151124265582896e-06,
"loss": 0.6223,
"step": 14650
},
{
"epoch": 1.939569864098166,
"grad_norm": 16.311803817749023,
"learning_rate": 7.106948800636127e-06,
"loss": 0.5702,
"step": 14700
},
{
"epoch": 1.9461670405066631,
"grad_norm": 13.10996150970459,
"learning_rate": 7.062773335689359e-06,
"loss": 0.6324,
"step": 14750
},
{
"epoch": 1.9461670405066631,
"eval_accuracy": 0.9253061413764954,
"eval_loss": 0.5507711172103882,
"eval_runtime": 14.1763,
"eval_samples_per_second": 172.824,
"eval_steps_per_second": 5.432,
"step": 14750
},
{
"epoch": 1.9527642169151602,
"grad_norm": 10.274900436401367,
"learning_rate": 7.0185978707425905e-06,
"loss": 0.6409,
"step": 14800
},
{
"epoch": 1.9593613933236576,
"grad_norm": 14.111708641052246,
"learning_rate": 6.974422405795822e-06,
"loss": 0.6609,
"step": 14850
},
{
"epoch": 1.9659585697321547,
"grad_norm": 7.0049662590026855,
"learning_rate": 6.930246940849053e-06,
"loss": 0.6581,
"step": 14900
},
{
"epoch": 1.9725557461406518,
"grad_norm": 8.995936393737793,
"learning_rate": 6.886071475902284e-06,
"loss": 0.6313,
"step": 14950
},
{
"epoch": 1.979152922549149,
"grad_norm": 19.018213272094727,
"learning_rate": 6.841896010955515e-06,
"loss": 0.6191,
"step": 15000
},
{
"epoch": 1.979152922549149,
"eval_accuracy": 0.9216326475143433,
"eval_loss": 0.5451802611351013,
"eval_runtime": 14.4726,
"eval_samples_per_second": 169.286,
"eval_steps_per_second": 5.32,
"step": 15000
},
{
"epoch": 1.985750098957646,
"grad_norm": 14.76790714263916,
"learning_rate": 6.797720546008747e-06,
"loss": 0.6665,
"step": 15050
},
{
"epoch": 1.9923472753661433,
"grad_norm": 5.290237903594971,
"learning_rate": 6.753545081061978e-06,
"loss": 0.5907,
"step": 15100
},
{
"epoch": 1.9989444517746404,
"grad_norm": 14.515754699707031,
"learning_rate": 6.70936961611521e-06,
"loss": 0.6586,
"step": 15150
},
{
"epoch": 2.0055416281831375,
"grad_norm": 5.708993434906006,
"learning_rate": 6.665194151168442e-06,
"loss": 0.5673,
"step": 15200
},
{
"epoch": 2.012138804591635,
"grad_norm": 15.482401847839355,
"learning_rate": 6.621018686221673e-06,
"loss": 0.5516,
"step": 15250
},
{
"epoch": 2.012138804591635,
"eval_accuracy": 0.923265278339386,
"eval_loss": 0.5589110255241394,
"eval_runtime": 14.2886,
"eval_samples_per_second": 171.465,
"eval_steps_per_second": 5.389,
"step": 15250
},
{
"epoch": 2.0187359810001317,
"grad_norm": 5.612030506134033,
"learning_rate": 6.576843221274905e-06,
"loss": 0.5012,
"step": 15300
},
{
"epoch": 2.025333157408629,
"grad_norm": 17.197193145751953,
"learning_rate": 6.532667756328136e-06,
"loss": 0.5227,
"step": 15350
},
{
"epoch": 2.0319303338171264,
"grad_norm": 19.402557373046875,
"learning_rate": 6.488492291381367e-06,
"loss": 0.4449,
"step": 15400
},
{
"epoch": 2.0385275102256233,
"grad_norm": 18.876649856567383,
"learning_rate": 6.444316826434599e-06,
"loss": 0.4862,
"step": 15450
},
{
"epoch": 2.0451246866341206,
"grad_norm": 9.995197296142578,
"learning_rate": 6.40014136148783e-06,
"loss": 0.5413,
"step": 15500
},
{
"epoch": 2.0451246866341206,
"eval_accuracy": 0.923265278339386,
"eval_loss": 0.5642380714416504,
"eval_runtime": 14.3185,
"eval_samples_per_second": 171.108,
"eval_steps_per_second": 5.378,
"step": 15500
},
{
"epoch": 2.051721863042618,
"grad_norm": 9.785661697387695,
"learning_rate": 6.355965896541062e-06,
"loss": 0.5462,
"step": 15550
},
{
"epoch": 2.058319039451115,
"grad_norm": 14.724440574645996,
"learning_rate": 6.3117904315942935e-06,
"loss": 0.5318,
"step": 15600
},
{
"epoch": 2.064916215859612,
"grad_norm": 11.998701095581055,
"learning_rate": 6.267614966647525e-06,
"loss": 0.5706,
"step": 15650
},
{
"epoch": 2.071513392268109,
"grad_norm": 3.4020655155181885,
"learning_rate": 6.223439501700755e-06,
"loss": 0.5055,
"step": 15700
},
{
"epoch": 2.0781105686766064,
"grad_norm": 16.408964157104492,
"learning_rate": 6.179264036753987e-06,
"loss": 0.6141,
"step": 15750
},
{
"epoch": 2.0781105686766064,
"eval_accuracy": 0.923265278339386,
"eval_loss": 0.5610572695732117,
"eval_runtime": 14.2227,
"eval_samples_per_second": 172.26,
"eval_steps_per_second": 5.414,
"step": 15750
},
{
"epoch": 2.0847077450851037,
"grad_norm": 17.39078140258789,
"learning_rate": 6.135088571807218e-06,
"loss": 0.5247,
"step": 15800
},
{
"epoch": 2.0913049214936006,
"grad_norm": 17.45914649963379,
"learning_rate": 6.09091310686045e-06,
"loss": 0.4817,
"step": 15850
},
{
"epoch": 2.097902097902098,
"grad_norm": 13.649807929992676,
"learning_rate": 6.0467376419136814e-06,
"loss": 0.4599,
"step": 15900
},
{
"epoch": 2.1044992743105952,
"grad_norm": 8.314746856689453,
"learning_rate": 6.002562176966913e-06,
"loss": 0.5676,
"step": 15950
},
{
"epoch": 2.111096450719092,
"grad_norm": 14.881856918334961,
"learning_rate": 5.958386712020145e-06,
"loss": 0.3992,
"step": 16000
},
{
"epoch": 2.111096450719092,
"eval_accuracy": 0.9236734509468079,
"eval_loss": 0.5720360279083252,
"eval_runtime": 14.2571,
"eval_samples_per_second": 171.844,
"eval_steps_per_second": 5.401,
"step": 16000
},
{
"epoch": 2.1176936271275895,
"grad_norm": 21.33131217956543,
"learning_rate": 5.914211247073376e-06,
"loss": 0.5337,
"step": 16050
},
{
"epoch": 2.1242908035360863,
"grad_norm": 14.612150192260742,
"learning_rate": 5.870035782126608e-06,
"loss": 0.4641,
"step": 16100
},
{
"epoch": 2.1308879799445837,
"grad_norm": 19.05860137939453,
"learning_rate": 5.825860317179839e-06,
"loss": 0.5636,
"step": 16150
},
{
"epoch": 2.137485156353081,
"grad_norm": 13.695535659790039,
"learning_rate": 5.78168485223307e-06,
"loss": 0.4811,
"step": 16200
},
{
"epoch": 2.144082332761578,
"grad_norm": 11.873661041259766,
"learning_rate": 5.737509387286302e-06,
"loss": 0.499,
"step": 16250
},
{
"epoch": 2.144082332761578,
"eval_accuracy": 0.9216326475143433,
"eval_loss": 0.5672578811645508,
"eval_runtime": 14.4037,
"eval_samples_per_second": 170.096,
"eval_steps_per_second": 5.346,
"step": 16250
},
{
"epoch": 2.150679509170075,
"grad_norm": 10.252338409423828,
"learning_rate": 5.693333922339533e-06,
"loss": 0.5822,
"step": 16300
},
{
"epoch": 2.1572766855785726,
"grad_norm": 21.956472396850586,
"learning_rate": 5.6491584573927645e-06,
"loss": 0.5935,
"step": 16350
},
{
"epoch": 2.1638738619870694,
"grad_norm": 10.932018280029297,
"learning_rate": 5.6049829924459966e-06,
"loss": 0.5028,
"step": 16400
},
{
"epoch": 2.1704710383955668,
"grad_norm": 11.411332130432129,
"learning_rate": 5.560807527499228e-06,
"loss": 0.5118,
"step": 16450
},
{
"epoch": 2.1770682148040637,
"grad_norm": 12.977612495422363,
"learning_rate": 5.516632062552458e-06,
"loss": 0.5623,
"step": 16500
},
{
"epoch": 2.1770682148040637,
"eval_accuracy": 0.9261224269866943,
"eval_loss": 0.5655830502510071,
"eval_runtime": 14.5501,
"eval_samples_per_second": 168.384,
"eval_steps_per_second": 5.292,
"step": 16500
},
{
"epoch": 2.183665391212561,
"grad_norm": 16.45384979248047,
"learning_rate": 5.47245659760569e-06,
"loss": 0.481,
"step": 16550
},
{
"epoch": 2.1902625676210583,
"grad_norm": 10.353941917419434,
"learning_rate": 5.428281132658921e-06,
"loss": 0.5461,
"step": 16600
},
{
"epoch": 2.196859744029555,
"grad_norm": 13.859786987304688,
"learning_rate": 5.3841056677121524e-06,
"loss": 0.5802,
"step": 16650
},
{
"epoch": 2.2034569204380525,
"grad_norm": 14.852931022644043,
"learning_rate": 5.3399302027653845e-06,
"loss": 0.5269,
"step": 16700
},
{
"epoch": 2.21005409684655,
"grad_norm": 11.935972213745117,
"learning_rate": 5.295754737818616e-06,
"loss": 0.5022,
"step": 16750
},
{
"epoch": 2.21005409684655,
"eval_accuracy": 0.9220408201217651,
"eval_loss": 0.5671045184135437,
"eval_runtime": 14.353,
"eval_samples_per_second": 170.696,
"eval_steps_per_second": 5.365,
"step": 16750
},
{
"epoch": 2.2166512732550467,
"grad_norm": 6.890115261077881,
"learning_rate": 5.251579272871848e-06,
"loss": 0.5203,
"step": 16800
},
{
"epoch": 2.223248449663544,
"grad_norm": 10.788956642150879,
"learning_rate": 5.207403807925079e-06,
"loss": 0.5461,
"step": 16850
},
{
"epoch": 2.2298456260720414,
"grad_norm": 10.99864387512207,
"learning_rate": 5.16322834297831e-06,
"loss": 0.5711,
"step": 16900
},
{
"epoch": 2.2364428024805383,
"grad_norm": 14.6043062210083,
"learning_rate": 5.119052878031542e-06,
"loss": 0.5615,
"step": 16950
},
{
"epoch": 2.2430399788890356,
"grad_norm": 11.577956199645996,
"learning_rate": 5.074877413084773e-06,
"loss": 0.5748,
"step": 17000
},
{
"epoch": 2.2430399788890356,
"eval_accuracy": 0.9257143139839172,
"eval_loss": 0.560461699962616,
"eval_runtime": 14.6816,
"eval_samples_per_second": 166.875,
"eval_steps_per_second": 5.245,
"step": 17000
},
{
"epoch": 2.2496371552975325,
"grad_norm": 10.84367561340332,
"learning_rate": 5.030701948138005e-06,
"loss": 0.5272,
"step": 17050
},
{
"epoch": 2.25623433170603,
"grad_norm": 9.228910446166992,
"learning_rate": 4.9865264831912355e-06,
"loss": 0.4401,
"step": 17100
},
{
"epoch": 2.262831508114527,
"grad_norm": 5.861785411834717,
"learning_rate": 4.9423510182444676e-06,
"loss": 0.5158,
"step": 17150
},
{
"epoch": 2.269428684523024,
"grad_norm": 19.159400939941406,
"learning_rate": 4.898175553297699e-06,
"loss": 0.5163,
"step": 17200
},
{
"epoch": 2.2760258609315214,
"grad_norm": 9.03962516784668,
"learning_rate": 4.85400008835093e-06,
"loss": 0.5195,
"step": 17250
},
{
"epoch": 2.2760258609315214,
"eval_accuracy": 0.9236734509468079,
"eval_loss": 0.5647316575050354,
"eval_runtime": 14.3918,
"eval_samples_per_second": 170.236,
"eval_steps_per_second": 5.35,
"step": 17250
},
{
"epoch": 2.2826230373400183,
"grad_norm": 11.594718933105469,
"learning_rate": 4.809824623404162e-06,
"loss": 0.5235,
"step": 17300
},
{
"epoch": 2.2892202137485156,
"grad_norm": 15.447309494018555,
"learning_rate": 4.765649158457393e-06,
"loss": 0.5335,
"step": 17350
},
{
"epoch": 2.295817390157013,
"grad_norm": 8.447811126708984,
"learning_rate": 4.721473693510625e-06,
"loss": 0.4915,
"step": 17400
},
{
"epoch": 2.30241456656551,
"grad_norm": 11.305243492126465,
"learning_rate": 4.677298228563856e-06,
"loss": 0.4915,
"step": 17450
},
{
"epoch": 2.309011742974007,
"grad_norm": 16.881988525390625,
"learning_rate": 4.6331227636170875e-06,
"loss": 0.4959,
"step": 17500
},
{
"epoch": 2.309011742974007,
"eval_accuracy": 0.923265278339386,
"eval_loss": 0.5674872398376465,
"eval_runtime": 14.3341,
"eval_samples_per_second": 170.921,
"eval_steps_per_second": 5.372,
"step": 17500
},
{
"epoch": 2.3156089193825045,
"grad_norm": 8.994074821472168,
"learning_rate": 4.588947298670319e-06,
"loss": 0.5161,
"step": 17550
},
{
"epoch": 2.3222060957910013,
"grad_norm": 14.556620597839355,
"learning_rate": 4.54477183372355e-06,
"loss": 0.4944,
"step": 17600
},
{
"epoch": 2.3288032721994987,
"grad_norm": 15.484505653381348,
"learning_rate": 4.500596368776782e-06,
"loss": 0.5052,
"step": 17650
},
{
"epoch": 2.335400448607996,
"grad_norm": 6.690243244171143,
"learning_rate": 4.456420903830013e-06,
"loss": 0.4937,
"step": 17700
},
{
"epoch": 2.341997625016493,
"grad_norm": 12.30466365814209,
"learning_rate": 4.4131289481821795e-06,
"loss": 0.4695,
"step": 17750
},
{
"epoch": 2.341997625016493,
"eval_accuracy": 0.9253061413764954,
"eval_loss": 0.561543345451355,
"eval_runtime": 14.3063,
"eval_samples_per_second": 171.254,
"eval_steps_per_second": 5.382,
"step": 17750
},
{
"epoch": 2.34859480142499,
"grad_norm": 14.061612129211426,
"learning_rate": 4.3689534832354116e-06,
"loss": 0.5159,
"step": 17800
},
{
"epoch": 2.355191977833487,
"grad_norm": 17.726974487304688,
"learning_rate": 4.324778018288643e-06,
"loss": 0.4992,
"step": 17850
},
{
"epoch": 2.3617891542419844,
"grad_norm": 7.066623687744141,
"learning_rate": 4.280602553341875e-06,
"loss": 0.5288,
"step": 17900
},
{
"epoch": 2.3683863306504818,
"grad_norm": 18.694576263427734,
"learning_rate": 4.236427088395106e-06,
"loss": 0.5247,
"step": 17950
},
{
"epoch": 2.3749835070589786,
"grad_norm": 14.194579124450684,
"learning_rate": 4.192251623448337e-06,
"loss": 0.5491,
"step": 18000
},
{
"epoch": 2.3749835070589786,
"eval_accuracy": 0.9257143139839172,
"eval_loss": 0.5593844652175903,
"eval_runtime": 14.4387,
"eval_samples_per_second": 169.683,
"eval_steps_per_second": 5.333,
"step": 18000
},
{
"epoch": 2.381580683467476,
"grad_norm": 8.745909690856934,
"learning_rate": 4.148076158501568e-06,
"loss": 0.5332,
"step": 18050
},
{
"epoch": 2.388177859875973,
"grad_norm": 7.993963241577148,
"learning_rate": 4.1039006935547995e-06,
"loss": 0.529,
"step": 18100
},
{
"epoch": 2.39477503628447,
"grad_norm": 11.705142974853516,
"learning_rate": 4.0597252286080315e-06,
"loss": 0.5534,
"step": 18150
},
{
"epoch": 2.4013722126929675,
"grad_norm": 15.30136775970459,
"learning_rate": 4.015549763661263e-06,
"loss": 0.5595,
"step": 18200
},
{
"epoch": 2.4079693891014644,
"grad_norm": 14.33283805847168,
"learning_rate": 3.971374298714495e-06,
"loss": 0.573,
"step": 18250
},
{
"epoch": 2.4079693891014644,
"eval_accuracy": 0.9261224269866943,
"eval_loss": 0.5610310435295105,
"eval_runtime": 14.5813,
"eval_samples_per_second": 168.023,
"eval_steps_per_second": 5.281,
"step": 18250
},
{
"epoch": 2.4145665655099617,
"grad_norm": 15.337475776672363,
"learning_rate": 3.928082343066661e-06,
"loss": 0.4859,
"step": 18300
},
{
"epoch": 2.421163741918459,
"grad_norm": 14.803478240966797,
"learning_rate": 3.883906878119892e-06,
"loss": 0.5019,
"step": 18350
},
{
"epoch": 2.427760918326956,
"grad_norm": 17.378925323486328,
"learning_rate": 3.8397314131731236e-06,
"loss": 0.4771,
"step": 18400
},
{
"epoch": 2.4343580947354533,
"grad_norm": 11.473735809326172,
"learning_rate": 3.7955559482263556e-06,
"loss": 0.5062,
"step": 18450
},
{
"epoch": 2.4409552711439506,
"grad_norm": 14.394603729248047,
"learning_rate": 3.7513804832795868e-06,
"loss": 0.5342,
"step": 18500
},
{
"epoch": 2.4409552711439506,
"eval_accuracy": 0.9228571653366089,
"eval_loss": 0.5616511106491089,
"eval_runtime": 14.3525,
"eval_samples_per_second": 170.701,
"eval_steps_per_second": 5.365,
"step": 18500
},
{
"epoch": 2.4475524475524475,
"grad_norm": 13.148124694824219,
"learning_rate": 3.7072050183328184e-06,
"loss": 0.5275,
"step": 18550
},
{
"epoch": 2.454149623960945,
"grad_norm": 18.74552345275879,
"learning_rate": 3.66302955338605e-06,
"loss": 0.576,
"step": 18600
},
{
"epoch": 2.4607468003694417,
"grad_norm": 9.60922622680664,
"learning_rate": 3.6188540884392807e-06,
"loss": 0.5172,
"step": 18650
},
{
"epoch": 2.467343976777939,
"grad_norm": 19.15401840209961,
"learning_rate": 3.5746786234925123e-06,
"loss": 0.5127,
"step": 18700
},
{
"epoch": 2.4739411531864364,
"grad_norm": 14.698090553283691,
"learning_rate": 3.530503158545744e-06,
"loss": 0.4728,
"step": 18750
},
{
"epoch": 2.4739411531864364,
"eval_accuracy": 0.9248979687690735,
"eval_loss": 0.5650564432144165,
"eval_runtime": 14.464,
"eval_samples_per_second": 169.387,
"eval_steps_per_second": 5.324,
"step": 18750
},
{
"epoch": 2.4805383295949333,
"grad_norm": 11.760072708129883,
"learning_rate": 3.4863276935989755e-06,
"loss": 0.4256,
"step": 18800
},
{
"epoch": 2.4871355060034306,
"grad_norm": 11.057052612304688,
"learning_rate": 3.442152228652207e-06,
"loss": 0.4493,
"step": 18850
},
{
"epoch": 2.4937326824119275,
"grad_norm": 14.985106468200684,
"learning_rate": 3.3979767637054383e-06,
"loss": 0.4881,
"step": 18900
},
{
"epoch": 2.500329858820425,
"grad_norm": 8.24613094329834,
"learning_rate": 3.35380129875867e-06,
"loss": 0.4843,
"step": 18950
},
{
"epoch": 2.506927035228922,
"grad_norm": 12.78288459777832,
"learning_rate": 3.3096258338119015e-06,
"loss": 0.517,
"step": 19000
},
{
"epoch": 2.506927035228922,
"eval_accuracy": 0.9248979687690735,
"eval_loss": 0.5626258850097656,
"eval_runtime": 14.0676,
"eval_samples_per_second": 174.159,
"eval_steps_per_second": 5.474,
"step": 19000
},
{
"epoch": 2.513524211637419,
"grad_norm": 13.174288749694824,
"learning_rate": 3.265450368865132e-06,
"loss": 0.5927,
"step": 19050
},
{
"epoch": 2.5201213880459163,
"grad_norm": 6.971681594848633,
"learning_rate": 3.221274903918364e-06,
"loss": 0.5687,
"step": 19100
},
{
"epoch": 2.5267185644544137,
"grad_norm": 10.98085880279541,
"learning_rate": 3.1770994389715954e-06,
"loss": 0.5261,
"step": 19150
},
{
"epoch": 2.5333157408629106,
"grad_norm": 15.29484748840332,
"learning_rate": 3.132923974024827e-06,
"loss": 0.5698,
"step": 19200
},
{
"epoch": 2.539912917271408,
"grad_norm": 22.54600715637207,
"learning_rate": 3.088748509078058e-06,
"loss": 0.5593,
"step": 19250
},
{
"epoch": 2.539912917271408,
"eval_accuracy": 0.9269387722015381,
"eval_loss": 0.5581403374671936,
"eval_runtime": 14.0758,
"eval_samples_per_second": 174.057,
"eval_steps_per_second": 5.47,
"step": 19250
},
{
"epoch": 2.546510093679905,
"grad_norm": 10.993823051452637,
"learning_rate": 3.0445730441312898e-06,
"loss": 0.571,
"step": 19300
},
{
"epoch": 2.553107270088402,
"grad_norm": 21.144847869873047,
"learning_rate": 3.0003975791845214e-06,
"loss": 0.5606,
"step": 19350
},
{
"epoch": 2.5597044464968994,
"grad_norm": 16.376079559326172,
"learning_rate": 2.956222114237753e-06,
"loss": 0.4912,
"step": 19400
},
{
"epoch": 2.5663016229053968,
"grad_norm": 13.594402313232422,
"learning_rate": 2.9120466492909837e-06,
"loss": 0.4805,
"step": 19450
},
{
"epoch": 2.5728987993138936,
"grad_norm": 17.23542594909668,
"learning_rate": 2.8678711843442153e-06,
"loss": 0.5324,
"step": 19500
},
{
"epoch": 2.5728987993138936,
"eval_accuracy": 0.9281632900238037,
"eval_loss": 0.553718626499176,
"eval_runtime": 14.1602,
"eval_samples_per_second": 173.02,
"eval_steps_per_second": 5.438,
"step": 19500
},
{
"epoch": 2.579495975722391,
"grad_norm": 17.32400131225586,
"learning_rate": 2.823695719397447e-06,
"loss": 0.5584,
"step": 19550
},
{
"epoch": 2.586093152130888,
"grad_norm": 5.780141830444336,
"learning_rate": 2.7795202544506785e-06,
"loss": 0.508,
"step": 19600
},
{
"epoch": 2.592690328539385,
"grad_norm": 12.641766548156738,
"learning_rate": 2.7353447895039097e-06,
"loss": 0.5231,
"step": 19650
},
{
"epoch": 2.599287504947882,
"grad_norm": 18.93987464904785,
"learning_rate": 2.6920528338560762e-06,
"loss": 0.557,
"step": 19700
},
{
"epoch": 2.6058846813563794,
"grad_norm": 15.360589027404785,
"learning_rate": 2.647877368909308e-06,
"loss": 0.5338,
"step": 19750
},
{
"epoch": 2.6058846813563794,
"eval_accuracy": 0.9257143139839172,
"eval_loss": 0.551838219165802,
"eval_runtime": 15.1358,
"eval_samples_per_second": 161.868,
"eval_steps_per_second": 5.087,
"step": 19750
},
{
"epoch": 2.6124818577648767,
"grad_norm": 15.32451057434082,
"learning_rate": 2.6037019039625394e-06,
"loss": 0.5037,
"step": 19800
},
{
"epoch": 2.6190790341733736,
"grad_norm": 11.314981460571289,
"learning_rate": 2.559526439015771e-06,
"loss": 0.6057,
"step": 19850
},
{
"epoch": 2.625676210581871,
"grad_norm": 7.916543006896973,
"learning_rate": 2.5153509740690026e-06,
"loss": 0.5571,
"step": 19900
},
{
"epoch": 2.6322733869903683,
"grad_norm": 17.10308837890625,
"learning_rate": 2.4711755091222338e-06,
"loss": 0.5177,
"step": 19950
},
{
"epoch": 2.638870563398865,
"grad_norm": 16.19850730895996,
"learning_rate": 2.427000044175465e-06,
"loss": 0.4946,
"step": 20000
},
{
"epoch": 2.638870563398865,
"eval_accuracy": 0.9253061413764954,
"eval_loss": 0.5547569394111633,
"eval_runtime": 14.6842,
"eval_samples_per_second": 166.846,
"eval_steps_per_second": 5.244,
"step": 20000
},
{
"epoch": 2.6454677398073625,
"grad_norm": 9.776342391967773,
"learning_rate": 2.3828245792286966e-06,
"loss": 0.5256,
"step": 20050
},
{
"epoch": 2.65206491621586,
"grad_norm": 14.741767883300781,
"learning_rate": 2.338649114281928e-06,
"loss": 0.5107,
"step": 20100
},
{
"epoch": 2.6586620926243567,
"grad_norm": 10.714197158813477,
"learning_rate": 2.2944736493351593e-06,
"loss": 0.5988,
"step": 20150
},
{
"epoch": 2.665259269032854,
"grad_norm": 16.533546447753906,
"learning_rate": 2.250298184388391e-06,
"loss": 0.4907,
"step": 20200
},
{
"epoch": 2.6718564454413514,
"grad_norm": 18.46228790283203,
"learning_rate": 2.2061227194416225e-06,
"loss": 0.4697,
"step": 20250
},
{
"epoch": 2.6718564454413514,
"eval_accuracy": 0.9269387722015381,
"eval_loss": 0.5565572381019592,
"eval_runtime": 15.4517,
"eval_samples_per_second": 158.559,
"eval_steps_per_second": 4.983,
"step": 20250
},
{
"epoch": 2.6784536218498483,
"grad_norm": 11.330911636352539,
"learning_rate": 2.1619472544948537e-06,
"loss": 0.4897,
"step": 20300
},
{
"epoch": 2.6850507982583456,
"grad_norm": 12.666998863220215,
"learning_rate": 2.1177717895480853e-06,
"loss": 0.5088,
"step": 20350
},
{
"epoch": 2.6916479746668425,
"grad_norm": 21.95562171936035,
"learning_rate": 2.0735963246013165e-06,
"loss": 0.5442,
"step": 20400
},
{
"epoch": 2.69824515107534,
"grad_norm": 13.3275785446167,
"learning_rate": 2.029420859654548e-06,
"loss": 0.536,
"step": 20450
},
{
"epoch": 2.7048423274838367,
"grad_norm": 16.374469757080078,
"learning_rate": 1.9852453947077792e-06,
"loss": 0.551,
"step": 20500
},
{
"epoch": 2.7048423274838367,
"eval_accuracy": 0.9269387722015381,
"eval_loss": 0.5562152862548828,
"eval_runtime": 14.1322,
"eval_samples_per_second": 173.363,
"eval_steps_per_second": 5.449,
"step": 20500
},
{
"epoch": 2.711439503892334,
"grad_norm": 9.546135902404785,
"learning_rate": 1.941069929761011e-06,
"loss": 0.5038,
"step": 20550
},
{
"epoch": 2.7180366803008313,
"grad_norm": 8.056339263916016,
"learning_rate": 1.8968944648142424e-06,
"loss": 0.502,
"step": 20600
},
{
"epoch": 2.7246338567093282,
"grad_norm": 15.2578706741333,
"learning_rate": 1.8527189998674738e-06,
"loss": 0.5021,
"step": 20650
},
{
"epoch": 2.7312310331178256,
"grad_norm": 9.090350151062012,
"learning_rate": 1.808543534920705e-06,
"loss": 0.5441,
"step": 20700
},
{
"epoch": 2.737828209526323,
"grad_norm": 8.323760032653809,
"learning_rate": 1.7643680699739366e-06,
"loss": 0.4818,
"step": 20750
},
{
"epoch": 2.737828209526323,
"eval_accuracy": 0.9285714030265808,
"eval_loss": 0.554760754108429,
"eval_runtime": 14.1428,
"eval_samples_per_second": 173.233,
"eval_steps_per_second": 5.444,
"step": 20750
},
{
"epoch": 2.7444253859348198,
"grad_norm": 9.076456069946289,
"learning_rate": 1.720192605027168e-06,
"loss": 0.5012,
"step": 20800
},
{
"epoch": 2.751022562343317,
"grad_norm": 11.11436939239502,
"learning_rate": 1.6760171400803996e-06,
"loss": 0.5294,
"step": 20850
},
{
"epoch": 2.7576197387518144,
"grad_norm": 10.291386604309082,
"learning_rate": 1.6318416751336307e-06,
"loss": 0.4674,
"step": 20900
},
{
"epoch": 2.7642169151603113,
"grad_norm": 19.83849334716797,
"learning_rate": 1.5876662101868623e-06,
"loss": 0.5436,
"step": 20950
},
{
"epoch": 2.7708140915688086,
"grad_norm": 12.49002456665039,
"learning_rate": 1.5434907452400937e-06,
"loss": 0.4609,
"step": 21000
},
{
"epoch": 2.7708140915688086,
"eval_accuracy": 0.9269387722015381,
"eval_loss": 0.5537921190261841,
"eval_runtime": 14.1539,
"eval_samples_per_second": 173.097,
"eval_steps_per_second": 5.44,
"step": 21000
},
{
"epoch": 2.777411267977306,
"grad_norm": 11.501051902770996,
"learning_rate": 1.4993152802933253e-06,
"loss": 0.5015,
"step": 21050
},
{
"epoch": 2.784008444385803,
"grad_norm": 11.332602500915527,
"learning_rate": 1.4551398153465565e-06,
"loss": 0.5299,
"step": 21100
},
{
"epoch": 2.7906056207943,
"grad_norm": 14.520770072937012,
"learning_rate": 1.410964350399788e-06,
"loss": 0.4363,
"step": 21150
},
{
"epoch": 2.797202797202797,
"grad_norm": 19.919044494628906,
"learning_rate": 1.3667888854530195e-06,
"loss": 0.5018,
"step": 21200
},
{
"epoch": 2.8037999736112944,
"grad_norm": 17.189006805419922,
"learning_rate": 1.322613420506251e-06,
"loss": 0.5079,
"step": 21250
},
{
"epoch": 2.8037999736112944,
"eval_accuracy": 0.9265305995941162,
"eval_loss": 0.5549395680427551,
"eval_runtime": 14.2355,
"eval_samples_per_second": 172.105,
"eval_steps_per_second": 5.409,
"step": 21250
},
{
"epoch": 2.8103971500197913,
"grad_norm": 10.528189659118652,
"learning_rate": 1.2784379555594823e-06,
"loss": 0.4467,
"step": 21300
},
{
"epoch": 2.8169943264282886,
"grad_norm": 10.16166877746582,
"learning_rate": 1.2342624906127139e-06,
"loss": 0.5769,
"step": 21350
},
{
"epoch": 2.823591502836786,
"grad_norm": 5.988204002380371,
"learning_rate": 1.1900870256659452e-06,
"loss": 0.5323,
"step": 21400
},
{
"epoch": 2.830188679245283,
"grad_norm": 18.418853759765625,
"learning_rate": 1.1459115607191766e-06,
"loss": 0.4714,
"step": 21450
},
{
"epoch": 2.83678585565378,
"grad_norm": 12.283252716064453,
"learning_rate": 1.1017360957724082e-06,
"loss": 0.4491,
"step": 21500
},
{
"epoch": 2.83678585565378,
"eval_accuracy": 0.9257143139839172,
"eval_loss": 0.5538486242294312,
"eval_runtime": 14.1053,
"eval_samples_per_second": 173.693,
"eval_steps_per_second": 5.459,
"step": 21500
},
{
"epoch": 2.8433830320622775,
"grad_norm": 6.51999568939209,
"learning_rate": 1.0575606308256394e-06,
"loss": 0.4801,
"step": 21550
},
{
"epoch": 2.8499802084707744,
"grad_norm": 12.273625373840332,
"learning_rate": 1.013385165878871e-06,
"loss": 0.5132,
"step": 21600
},
{
"epoch": 2.8565773848792717,
"grad_norm": 14.393851280212402,
"learning_rate": 9.692097009321024e-07,
"loss": 0.4542,
"step": 21650
},
{
"epoch": 2.863174561287769,
"grad_norm": 14.198440551757812,
"learning_rate": 9.250342359853339e-07,
"loss": 0.5015,
"step": 21700
},
{
"epoch": 2.869771737696266,
"grad_norm": 17.55302619934082,
"learning_rate": 8.808587710385653e-07,
"loss": 0.4818,
"step": 21750
},
{
"epoch": 2.869771737696266,
"eval_accuracy": 0.9277551174163818,
"eval_loss": 0.5553678870201111,
"eval_runtime": 14.0909,
"eval_samples_per_second": 173.871,
"eval_steps_per_second": 5.465,
"step": 21750
},
{
"epoch": 2.8763689141047633,
"grad_norm": 14.505319595336914,
"learning_rate": 8.366833060917967e-07,
"loss": 0.5078,
"step": 21800
},
{
"epoch": 2.8829660905132606,
"grad_norm": 15.411627769470215,
"learning_rate": 7.92507841145028e-07,
"loss": 0.508,
"step": 21850
},
{
"epoch": 2.8895632669217575,
"grad_norm": 17.951257705688477,
"learning_rate": 7.483323761982596e-07,
"loss": 0.5331,
"step": 21900
},
{
"epoch": 2.896160443330255,
"grad_norm": 8.112133979797363,
"learning_rate": 7.041569112514909e-07,
"loss": 0.5185,
"step": 21950
},
{
"epoch": 2.9027576197387517,
"grad_norm": 16.92367172241211,
"learning_rate": 6.599814463047224e-07,
"loss": 0.4469,
"step": 22000
},
{
"epoch": 2.9027576197387517,
"eval_accuracy": 0.9265305995941162,
"eval_loss": 0.555115818977356,
"eval_runtime": 14.0889,
"eval_samples_per_second": 173.896,
"eval_steps_per_second": 5.465,
"step": 22000
},
{
"epoch": 2.909354796147249,
"grad_norm": 7.048780918121338,
"learning_rate": 6.158059813579539e-07,
"loss": 0.4762,
"step": 22050
},
{
"epoch": 2.915951972555746,
"grad_norm": 16.79896354675293,
"learning_rate": 5.716305164111853e-07,
"loss": 0.5799,
"step": 22100
},
{
"epoch": 2.9225491489642432,
"grad_norm": 10.826476097106934,
"learning_rate": 5.274550514644168e-07,
"loss": 0.4978,
"step": 22150
},
{
"epoch": 2.9291463253727406,
"grad_norm": 12.840262413024902,
"learning_rate": 4.832795865176481e-07,
"loss": 0.566,
"step": 22200
},
{
"epoch": 2.9357435017812374,
"grad_norm": 20.16173553466797,
"learning_rate": 4.391041215708796e-07,
"loss": 0.5837,
"step": 22250
},
{
"epoch": 2.9357435017812374,
"eval_accuracy": 0.9269387722015381,
"eval_loss": 0.5531713962554932,
"eval_runtime": 14.0502,
"eval_samples_per_second": 174.375,
"eval_steps_per_second": 5.48,
"step": 22250
},
{
"epoch": 2.9423406781897348,
"grad_norm": 12.399968147277832,
"learning_rate": 3.94928656624111e-07,
"loss": 0.5401,
"step": 22300
},
{
"epoch": 2.948937854598232,
"grad_norm": 18.302248001098633,
"learning_rate": 3.5075319167734247e-07,
"loss": 0.523,
"step": 22350
},
{
"epoch": 2.955535031006729,
"grad_norm": 13.304845809936523,
"learning_rate": 3.0657772673057385e-07,
"loss": 0.5913,
"step": 22400
},
{
"epoch": 2.9621322074152263,
"grad_norm": 18.745372772216797,
"learning_rate": 2.624022617838053e-07,
"loss": 0.4701,
"step": 22450
},
{
"epoch": 2.9687293838237236,
"grad_norm": 14.230325698852539,
"learning_rate": 2.1822679683703673e-07,
"loss": 0.5568,
"step": 22500
},
{
"epoch": 2.9687293838237236,
"eval_accuracy": 0.92734694480896,
"eval_loss": 0.5528694987297058,
"eval_runtime": 14.0688,
"eval_samples_per_second": 174.144,
"eval_steps_per_second": 5.473,
"step": 22500
},
{
"epoch": 2.9753265602322205,
"grad_norm": 13.408769607543945,
"learning_rate": 1.7405133189026817e-07,
"loss": 0.5266,
"step": 22550
},
{
"epoch": 2.981923736640718,
"grad_norm": 14.969887733459473,
"learning_rate": 1.298758669434996e-07,
"loss": 0.4969,
"step": 22600
},
{
"epoch": 2.988520913049215,
"grad_norm": 3.428957939147949,
"learning_rate": 8.570040199673103e-08,
"loss": 0.4917,
"step": 22650
},
{
"epoch": 2.995118089457712,
"grad_norm": 9.547283172607422,
"learning_rate": 4.152493704996245e-08,
"loss": 0.5128,
"step": 22700
}
],
"logging_steps": 50,
"max_steps": 22737,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}