{
"best_global_step": 200,
"best_metric": 0.9543563723564148,
"best_model_checkpoint": "saves/ia3/gemma-3-1b-it/train_mnli_1744802907/checkpoint-200",
"epoch": 150.0,
"eval_steps": 200,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.32,
"grad_norm": 2.375901460647583,
"learning_rate": 0.2999671025212268,
"loss": 3.6731,
"num_input_tokens_seen": 32576,
"step": 5
},
{
"epoch": 2.64,
"grad_norm": 0.6804730892181396,
"learning_rate": 0.2998334812442955,
"loss": 0.9267,
"num_input_tokens_seen": 63424,
"step": 10
},
{
"epoch": 3.96,
"grad_norm": 0.15599267184734344,
"learning_rate": 0.29959717158366866,
"loss": 0.4169,
"num_input_tokens_seen": 95360,
"step": 15
},
{
"epoch": 5.0,
"grad_norm": 0.22385859489440918,
"learning_rate": 0.29925833549418396,
"loss": 0.2866,
"num_input_tokens_seen": 120384,
"step": 20
},
{
"epoch": 6.32,
"grad_norm": 0.11509355157613754,
"learning_rate": 0.2988172051971717,
"loss": 0.2759,
"num_input_tokens_seen": 152512,
"step": 25
},
{
"epoch": 7.64,
"grad_norm": 0.027715813368558884,
"learning_rate": 0.2982740830213025,
"loss": 0.2554,
"num_input_tokens_seen": 184320,
"step": 30
},
{
"epoch": 8.96,
"grad_norm": 0.0307712834328413,
"learning_rate": 0.29762934119538625,
"loss": 0.2385,
"num_input_tokens_seen": 215296,
"step": 35
},
{
"epoch": 10.0,
"grad_norm": 0.10019869357347488,
"learning_rate": 0.29688342159326486,
"loss": 0.2027,
"num_input_tokens_seen": 239872,
"step": 40
},
{
"epoch": 11.32,
"grad_norm": 0.3066690266132355,
"learning_rate": 0.29603683543097403,
"loss": 0.2518,
"num_input_tokens_seen": 271424,
"step": 45
},
{
"epoch": 12.64,
"grad_norm": 0.0848815068602562,
"learning_rate": 0.2950901629163815,
"loss": 0.1676,
"num_input_tokens_seen": 302912,
"step": 50
},
{
"epoch": 13.96,
"grad_norm": 0.10623391717672348,
"learning_rate": 0.29404405285154145,
"loss": 0.1485,
"num_input_tokens_seen": 334464,
"step": 55
},
{
"epoch": 15.0,
"grad_norm": 0.054420698434114456,
"learning_rate": 0.29289922218803793,
"loss": 0.0873,
"num_input_tokens_seen": 359872,
"step": 60
},
{
"epoch": 16.32,
"grad_norm": 0.18882937729358673,
"learning_rate": 0.29165645553562214,
"loss": 0.0803,
"num_input_tokens_seen": 390784,
"step": 65
},
{
"epoch": 17.64,
"grad_norm": 0.13374768197536469,
"learning_rate": 0.2903166046244801,
"loss": 0.0995,
"num_input_tokens_seen": 422528,
"step": 70
},
{
"epoch": 18.96,
"grad_norm": 0.15120446681976318,
"learning_rate": 0.2888805877214992,
"loss": 0.1117,
"num_input_tokens_seen": 454400,
"step": 75
},
{
"epoch": 20.0,
"grad_norm": 0.11594400554895401,
"learning_rate": 0.28734938900093415,
"loss": 0.0746,
"num_input_tokens_seen": 479744,
"step": 80
},
{
"epoch": 21.32,
"grad_norm": 0.18303385376930237,
"learning_rate": 0.2857240578699029,
"loss": 0.1154,
"num_input_tokens_seen": 511168,
"step": 85
},
{
"epoch": 22.64,
"grad_norm": 0.2756330370903015,
"learning_rate": 0.28400570824917565,
"loss": 0.2176,
"num_input_tokens_seen": 543296,
"step": 90
},
{
"epoch": 23.96,
"grad_norm": 0.05688418075442314,
"learning_rate": 0.2821955178097488,
"loss": 0.133,
"num_input_tokens_seen": 574976,
"step": 95
},
{
"epoch": 25.0,
"grad_norm": 0.19595864415168762,
"learning_rate": 0.2802947271657287,
"loss": 0.139,
"num_input_tokens_seen": 600064,
"step": 100
},
{
"epoch": 26.32,
"grad_norm": 0.034938205033540726,
"learning_rate": 0.278304639024076,
"loss": 0.0892,
"num_input_tokens_seen": 631296,
"step": 105
},
{
"epoch": 27.64,
"grad_norm": 0.039930302649736404,
"learning_rate": 0.27622661729179593,
"loss": 0.0688,
"num_input_tokens_seen": 662976,
"step": 110
},
{
"epoch": 28.96,
"grad_norm": 0.06501033902168274,
"learning_rate": 0.27406208614118427,
"loss": 0.0574,
"num_input_tokens_seen": 694336,
"step": 115
},
{
"epoch": 30.0,
"grad_norm": 0.043391138315200806,
"learning_rate": 0.27181252903377096,
"loss": 0.0307,
"num_input_tokens_seen": 718976,
"step": 120
},
{
"epoch": 31.32,
"grad_norm": 0.006944978144019842,
"learning_rate": 0.26947948770362945,
"loss": 0.0062,
"num_input_tokens_seen": 750656,
"step": 125
},
{
"epoch": 32.64,
"grad_norm": 0.005895116366446018,
"learning_rate": 0.26706456110074944,
"loss": 0.0046,
"num_input_tokens_seen": 781952,
"step": 130
},
{
"epoch": 33.96,
"grad_norm": 0.0026924721896648407,
"learning_rate": 0.2645694042951963,
"loss": 0.0017,
"num_input_tokens_seen": 812736,
"step": 135
},
{
"epoch": 35.0,
"grad_norm": 0.00570798572152853,
"learning_rate": 0.2619957273428087,
"loss": 0.0009,
"num_input_tokens_seen": 836992,
"step": 140
},
{
"epoch": 36.32,
"grad_norm": 0.0005358898197300732,
"learning_rate": 0.2593452941132117,
"loss": 0.0004,
"num_input_tokens_seen": 868160,
"step": 145
},
{
"epoch": 37.64,
"grad_norm": 0.0008586189360357821,
"learning_rate": 0.2566199210809489,
"loss": 0.0003,
"num_input_tokens_seen": 901568,
"step": 150
},
{
"epoch": 38.96,
"grad_norm": 0.0004163524426985532,
"learning_rate": 0.25382147608056105,
"loss": 0.0002,
"num_input_tokens_seen": 931392,
"step": 155
},
{
"epoch": 40.0,
"grad_norm": 0.0008104751468636096,
"learning_rate": 0.250951877026466,
"loss": 0.0002,
"num_input_tokens_seen": 957120,
"step": 160
},
{
"epoch": 41.32,
"grad_norm": 0.00021077878773212433,
"learning_rate": 0.24801309059851584,
"loss": 0.0002,
"num_input_tokens_seen": 987648,
"step": 165
},
{
"epoch": 42.64,
"grad_norm": 0.00015898171113803983,
"learning_rate": 0.2450071308941325,
"loss": 0.0001,
"num_input_tokens_seen": 1020544,
"step": 170
},
{
"epoch": 43.96,
"grad_norm": 0.0001712896191747859,
"learning_rate": 0.2419360580479465,
"loss": 0.0001,
"num_input_tokens_seen": 1051776,
"step": 175
},
{
"epoch": 45.0,
"grad_norm": 0.000376845127902925,
"learning_rate": 0.2388019768198829,
"loss": 0.0001,
"num_input_tokens_seen": 1076864,
"step": 180
},
{
"epoch": 46.32,
"grad_norm": 0.0001391878176946193,
"learning_rate": 0.2356070351526648,
"loss": 0.0001,
"num_input_tokens_seen": 1108608,
"step": 185
},
{
"epoch": 47.64,
"grad_norm": 0.00011551726493053138,
"learning_rate": 0.23235342269971976,
"loss": 0.0001,
"num_input_tokens_seen": 1141888,
"step": 190
},
{
"epoch": 48.96,
"grad_norm": 0.0001131360768340528,
"learning_rate": 0.22904336932450164,
"loss": 0.0001,
"num_input_tokens_seen": 1172352,
"step": 195
},
{
"epoch": 50.0,
"grad_norm": 0.00024781483807601035,
"learning_rate": 0.22567914357225285,
"loss": 0.0001,
"num_input_tokens_seen": 1197376,
"step": 200
},
{
"epoch": 50.0,
"eval_mnli_eval_loss": 0.9543563723564148,
"eval_mnli_eval_runtime": 1.8747,
"eval_mnli_eval_samples_per_second": 106.685,
"eval_mnli_eval_steps_per_second": 13.336,
"num_input_tokens_seen": 1197376,
"step": 200
},
{
"epoch": 51.32,
"grad_norm": 0.00011005056876456365,
"learning_rate": 0.22226305111525727,
"loss": 0.0001,
"num_input_tokens_seen": 1229248,
"step": 205
},
{
"epoch": 52.64,
"grad_norm": 9.758443775353953e-05,
"learning_rate": 0.21879743317264727,
"loss": 0.0001,
"num_input_tokens_seen": 1261888,
"step": 210
},
{
"epoch": 53.96,
"grad_norm": 9.127436351263896e-05,
"learning_rate": 0.21528466490584913,
"loss": 0.0001,
"num_input_tokens_seen": 1292992,
"step": 215
},
{
"epoch": 55.0,
"grad_norm": 0.00014360187924467027,
"learning_rate": 0.21172715379076631,
"loss": 0.0001,
"num_input_tokens_seen": 1318016,
"step": 220
},
{
"epoch": 56.32,
"grad_norm": 8.246286597568542e-05,
"learning_rate": 0.20812733796781543,
"loss": 0.0001,
"num_input_tokens_seen": 1350080,
"step": 225
},
{
"epoch": 57.64,
"grad_norm": 8.383565000258386e-05,
"learning_rate": 0.20448768457094676,
"loss": 0.0001,
"num_input_tokens_seen": 1382272,
"step": 230
},
{
"epoch": 58.96,
"grad_norm": 7.291782821994275e-05,
"learning_rate": 0.20081068803679372,
"loss": 0.0001,
"num_input_tokens_seen": 1413632,
"step": 235
},
{
"epoch": 60.0,
"grad_norm": 0.00019672681810334325,
"learning_rate": 0.19709886839511073,
"loss": 0.0001,
"num_input_tokens_seen": 1438336,
"step": 240
},
{
"epoch": 61.32,
"grad_norm": 6.871890946058556e-05,
"learning_rate": 0.19335476954167072,
"loss": 0.0001,
"num_input_tokens_seen": 1469824,
"step": 245
},
{
"epoch": 62.64,
"grad_norm": 5.799822974950075e-05,
"learning_rate": 0.18958095749480594,
"loss": 0.0001,
"num_input_tokens_seen": 1501568,
"step": 250
},
{
"epoch": 63.96,
"grad_norm": 6.390228372765705e-05,
"learning_rate": 0.18578001863678714,
"loss": 0.0001,
"num_input_tokens_seen": 1533824,
"step": 255
},
{
"epoch": 65.0,
"grad_norm": 8.54826794238761e-05,
"learning_rate": 0.18195455794124651,
"loss": 0.0,
"num_input_tokens_seen": 1558656,
"step": 260
},
{
"epoch": 66.32,
"grad_norm": 5.925075674895197e-05,
"learning_rate": 0.17810719718785872,
"loss": 0.0001,
"num_input_tokens_seen": 1590912,
"step": 265
},
{
"epoch": 67.64,
"grad_norm": 5.148671334609389e-05,
"learning_rate": 0.17424057316550418,
"loss": 0.0,
"num_input_tokens_seen": 1622656,
"step": 270
},
{
"epoch": 68.96,
"grad_norm": 5.6448534451192245e-05,
"learning_rate": 0.17035733586514565,
"loss": 0.0001,
"num_input_tokens_seen": 1653696,
"step": 275
},
{
"epoch": 70.0,
"grad_norm": 0.0001959072978934273,
"learning_rate": 0.16646014666365677,
"loss": 0.0001,
"num_input_tokens_seen": 1678144,
"step": 280
},
{
"epoch": 71.32,
"grad_norm": 5.507750029210001e-05,
"learning_rate": 0.16255167649984736,
"loss": 0.0,
"num_input_tokens_seen": 1709312,
"step": 285
},
{
"epoch": 72.64,
"grad_norm": 4.713419184554368e-05,
"learning_rate": 0.15863460404393512,
"loss": 0.0,
"num_input_tokens_seen": 1741952,
"step": 290
},
{
"epoch": 73.96,
"grad_norm": 5.095809683552943e-05,
"learning_rate": 0.15471161386171925,
"loss": 0.0001,
"num_input_tokens_seen": 1772992,
"step": 295
},
{
"epoch": 75.0,
"grad_norm": 0.0001617352245375514,
"learning_rate": 0.1507853945747129,
"loss": 0.0,
"num_input_tokens_seen": 1797568,
"step": 300
},
{
"epoch": 76.32,
"grad_norm": 4.8264351789839566e-05,
"learning_rate": 0.14685863701749646,
"loss": 0.0,
"num_input_tokens_seen": 1829120,
"step": 305
},
{
"epoch": 77.64,
"grad_norm": 4.358548540039919e-05,
"learning_rate": 0.1429340323935536,
"loss": 0.0,
"num_input_tokens_seen": 1860352,
"step": 310
},
{
"epoch": 78.96,
"grad_norm": 4.9486257921671495e-05,
"learning_rate": 0.13901427043085526,
"loss": 0.0,
"num_input_tokens_seen": 1892736,
"step": 315
},
{
"epoch": 80.0,
"grad_norm": 0.00010933289013337344,
"learning_rate": 0.13510203753845418,
"loss": 0.0,
"num_input_tokens_seen": 1917696,
"step": 320
},
{
"epoch": 81.32,
"grad_norm": 4.788853766513057e-05,
"learning_rate": 0.13120001496535433,
"loss": 0.0,
"num_input_tokens_seen": 1948736,
"step": 325
},
{
"epoch": 82.64,
"grad_norm": 3.8914506149012595e-05,
"learning_rate": 0.12731087696291712,
"loss": 0.0,
"num_input_tokens_seen": 1980480,
"step": 330
},
{
"epoch": 83.96,
"grad_norm": 4.481687574298121e-05,
"learning_rate": 0.12343728895206252,
"loss": 0.0,
"num_input_tokens_seen": 2011776,
"step": 335
},
{
"epoch": 85.0,
"grad_norm": 9.876202238956466e-05,
"learning_rate": 0.11958190569652316,
"loss": 0.0,
"num_input_tokens_seen": 2036736,
"step": 340
},
{
"epoch": 86.32,
"grad_norm": 4.3948042730335146e-05,
"learning_rate": 0.11574736948340163,
"loss": 0.0,
"num_input_tokens_seen": 2068928,
"step": 345
},
{
"epoch": 87.64,
"grad_norm": 3.864551035803743e-05,
"learning_rate": 0.11193630831227916,
"loss": 0.0,
"num_input_tokens_seen": 2099520,
"step": 350
},
{
"epoch": 88.96,
"grad_norm": 4.2283096263417974e-05,
"learning_rate": 0.10815133409411562,
"loss": 0.0,
"num_input_tokens_seen": 2132480,
"step": 355
},
{
"epoch": 90.0,
"grad_norm": 9.957759903045371e-05,
"learning_rate": 0.10439504086117644,
"loss": 0.0,
"num_input_tokens_seen": 2157184,
"step": 360
},
{
"epoch": 91.32,
"grad_norm": 4.125876148464158e-05,
"learning_rate": 0.10067000298921251,
"loss": 0.0,
"num_input_tokens_seen": 2188032,
"step": 365
},
{
"epoch": 92.64,
"grad_norm": 3.753119381144643e-05,
"learning_rate": 0.09697877343311145,
"loss": 0.0,
"num_input_tokens_seen": 2220480,
"step": 370
},
{
"epoch": 93.96,
"grad_norm": 4.0673934563528746e-05,
"learning_rate": 0.09332388197722995,
"loss": 0.0,
"num_input_tokens_seen": 2251008,
"step": 375
},
{
"epoch": 95.0,
"grad_norm": 0.00011201734014321119,
"learning_rate": 0.089707833501606,
"loss": 0.0,
"num_input_tokens_seen": 2276736,
"step": 380
},
{
"epoch": 96.32,
"grad_norm": 3.8152145862113684e-05,
"learning_rate": 0.0861331062652391,
"loss": 0.0,
"num_input_tokens_seen": 2309504,
"step": 385
},
{
"epoch": 97.64,
"grad_norm": 3.642489173216745e-05,
"learning_rate": 0.08260215020761554,
"loss": 0.0,
"num_input_tokens_seen": 2339776,
"step": 390
},
{
"epoch": 98.96,
"grad_norm": 4.08138548664283e-05,
"learning_rate": 0.07911738526964192,
"loss": 0.0,
"num_input_tokens_seen": 2371584,
"step": 395
},
{
"epoch": 100.0,
"grad_norm": 7.001034828135744e-05,
"learning_rate": 0.07568119973513886,
"loss": 0.0,
"num_input_tokens_seen": 2396736,
"step": 400
},
{
"epoch": 100.0,
"eval_mnli_eval_loss": 0.9945777654647827,
"eval_mnli_eval_runtime": 1.8092,
"eval_mnli_eval_samples_per_second": 110.546,
"eval_mnli_eval_steps_per_second": 13.818,
"num_input_tokens_seen": 2396736,
"step": 400
},
{
"epoch": 101.32,
"grad_norm": 3.362021379871294e-05,
"learning_rate": 0.07229594859403049,
"loss": 0.0,
"num_input_tokens_seen": 2428864,
"step": 405
},
{
"epoch": 102.64,
"grad_norm": 3.9229835238074884e-05,
"learning_rate": 0.06896395192835174,
"loss": 0.0,
"num_input_tokens_seen": 2459328,
"step": 410
},
{
"epoch": 103.96,
"grad_norm": 3.417343395994976e-05,
"learning_rate": 0.06568749332218045,
"loss": 0.0,
"num_input_tokens_seen": 2491456,
"step": 415
},
{
"epoch": 105.0,
"grad_norm": 0.0001204285363201052,
"learning_rate": 0.06246881829658239,
"loss": 0.0,
"num_input_tokens_seen": 2516800,
"step": 420
},
{
"epoch": 106.32,
"grad_norm": 3.838081465801224e-05,
"learning_rate": 0.05931013277064377,
"loss": 0.0,
"num_input_tokens_seen": 2548800,
"step": 425
},
{
"epoch": 107.64,
"grad_norm": 2.7628277166513726e-05,
"learning_rate": 0.05621360154964427,
"loss": 0.0,
"num_input_tokens_seen": 2580160,
"step": 430
},
{
"epoch": 108.96,
"grad_norm": 3.643093441496603e-05,
"learning_rate": 0.053181346841407386,
"loss": 0.0,
"num_input_tokens_seen": 2611200,
"step": 435
},
{
"epoch": 110.0,
"grad_norm": 0.00010669205948943272,
"learning_rate": 0.050215446801845885,
"loss": 0.0,
"num_input_tokens_seen": 2635968,
"step": 440
},
{
"epoch": 111.32,
"grad_norm": 3.292246401542798e-05,
"learning_rate": 0.04731793411069669,
"loss": 0.0,
"num_input_tokens_seen": 2668736,
"step": 445
},
{
"epoch": 112.64,
"grad_norm": 3.169509000144899e-05,
"learning_rate": 0.044490794578424434,
"loss": 0.0,
"num_input_tokens_seen": 2699520,
"step": 450
},
{
"epoch": 113.96,
"grad_norm": 3.336479130666703e-05,
"learning_rate": 0.041735965785245674,
"loss": 0.0,
"num_input_tokens_seen": 2730304,
"step": 455
},
{
"epoch": 115.0,
"grad_norm": 0.00010339989967178553,
"learning_rate": 0.03905533575320853,
"loss": 0.0,
"num_input_tokens_seen": 2755776,
"step": 460
},
{
"epoch": 116.32,
"grad_norm": 3.4434106055414304e-05,
"learning_rate": 0.03645074165223656,
"loss": 0.0,
"num_input_tokens_seen": 2787968,
"step": 465
},
{
"epoch": 117.64,
"grad_norm": 3.564298458513804e-05,
"learning_rate": 0.03392396854102408,
"loss": 0.0,
"num_input_tokens_seen": 2819584,
"step": 470
},
{
"epoch": 118.96,
"grad_norm": 3.216815457562916e-05,
"learning_rate": 0.031476748143646434,
"loss": 0.0,
"num_input_tokens_seen": 2851072,
"step": 475
},
{
"epoch": 120.0,
"grad_norm": 0.00010380757885286584,
"learning_rate": 0.029110757662722652,
"loss": 0.0,
"num_input_tokens_seen": 2875328,
"step": 480
},
{
"epoch": 121.32,
"grad_norm": 2.93699158646632e-05,
"learning_rate": 0.026827618629944393,
"loss": 0.0,
"num_input_tokens_seen": 2906496,
"step": 485
},
{
"epoch": 122.64,
"grad_norm": 3.3641983463894576e-05,
"learning_rate": 0.024628895794759492,
"loss": 0.0,
"num_input_tokens_seen": 2939584,
"step": 490
},
{
"epoch": 123.96,
"grad_norm": 3.712475154316053e-05,
"learning_rate": 0.022516096051970434,
"loss": 0.0,
"num_input_tokens_seen": 2971008,
"step": 495
},
{
"epoch": 125.0,
"grad_norm": 7.740116416243836e-05,
"learning_rate": 0.020490667408984253,
"loss": 0.0,
"num_input_tokens_seen": 2995392,
"step": 500
},
{
"epoch": 126.32,
"grad_norm": 3.0008935937075876e-05,
"learning_rate": 0.018553997993420494,
"loss": 0.0,
"num_input_tokens_seen": 3027904,
"step": 505
},
{
"epoch": 127.64,
"grad_norm": 3.011342414538376e-05,
"learning_rate": 0.016707415101757654,
"loss": 0.0,
"num_input_tokens_seen": 3057472,
"step": 510
},
{
"epoch": 128.96,
"grad_norm": 3.3212050766451284e-05,
"learning_rate": 0.014952184289670972,
"loss": 0.0,
"num_input_tokens_seen": 3089216,
"step": 515
},
{
"epoch": 130.0,
"grad_norm": 4.444582009455189e-05,
"learning_rate": 0.013289508504683205,
"loss": 0.0,
"num_input_tokens_seen": 3114240,
"step": 520
},
{
"epoch": 131.32,
"grad_norm": 2.850958662747871e-05,
"learning_rate": 0.011720527261724938,
"loss": 0.0,
"num_input_tokens_seen": 3145600,
"step": 525
},
{
"epoch": 132.64,
"grad_norm": 3.135738006676547e-05,
"learning_rate": 0.010246315862167664,
"loss": 0.0,
"num_input_tokens_seen": 3178176,
"step": 530
},
{
"epoch": 133.96,
"grad_norm": 3.609867053455673e-05,
"learning_rate": 0.00886788465686618,
"loss": 0.0,
"num_input_tokens_seen": 3209344,
"step": 535
},
{
"epoch": 135.0,
"grad_norm": 6.155487790238112e-05,
"learning_rate": 0.007586178353714434,
"loss": 0.0,
"num_input_tokens_seen": 3234688,
"step": 540
},
{
"epoch": 136.32,
"grad_norm": 3.3471937058493495e-05,
"learning_rate": 0.006402075370189914,
"loss": 0.0,
"num_input_tokens_seen": 3265344,
"step": 545
},
{
"epoch": 137.64,
"grad_norm": 3.2188763725571334e-05,
"learning_rate": 0.005316387231330288,
"loss": 0.0,
"num_input_tokens_seen": 3297216,
"step": 550
},
{
"epoch": 138.96,
"grad_norm": 3.0012966817594133e-05,
"learning_rate": 0.004329858013554605,
"loss": 0.0,
"num_input_tokens_seen": 3328832,
"step": 555
},
{
"epoch": 140.0,
"grad_norm": 7.22868790035136e-05,
"learning_rate": 0.0034431638347104552,
"loss": 0.0,
"num_input_tokens_seen": 3353856,
"step": 560
},
{
"epoch": 141.32,
"grad_norm": 3.3028067264240235e-05,
"learning_rate": 0.0026569123906967085,
"loss": 0.0,
"num_input_tokens_seen": 3384640,
"step": 565
},
{
"epoch": 142.64,
"grad_norm": 3.098338493146002e-05,
"learning_rate": 0.0019716425389789127,
"loss": 0.0,
"num_input_tokens_seen": 3417280,
"step": 570
},
{
"epoch": 143.96,
"grad_norm": 3.733243647729978e-05,
"learning_rate": 0.0013878239292834603,
"loss": 0.0,
"num_input_tokens_seen": 3447808,
"step": 575
},
{
"epoch": 145.0,
"grad_norm": 9.276469791075215e-05,
"learning_rate": 0.0009058566817230606,
"loss": 0.0,
"num_input_tokens_seen": 3472192,
"step": 580
},
{
"epoch": 146.32,
"grad_norm": 3.6584464396582916e-05,
"learning_rate": 0.0005260711125743444,
"loss": 0.0,
"num_input_tokens_seen": 3503616,
"step": 585
},
{
"epoch": 147.64,
"grad_norm": 3.148068208247423e-05,
"learning_rate": 0.0002487275078957518,
"loss": 0.0,
"num_input_tokens_seen": 3536128,
"step": 590
},
{
"epoch": 148.96,
"grad_norm": 3.436414772295393e-05,
"learning_rate": 7.401594514025999e-05,
"loss": 0.0,
"num_input_tokens_seen": 3567872,
"step": 595
},
{
"epoch": 150.0,
"grad_norm": 7.5851748988498e-05,
"learning_rate": 2.0561628859883107e-06,
"loss": 0.0,
"num_input_tokens_seen": 3592960,
"step": 600
},
{
"epoch": 150.0,
"eval_mnli_eval_loss": 1.00465726852417,
"eval_mnli_eval_runtime": 1.8424,
"eval_mnli_eval_samples_per_second": 108.557,
"eval_mnli_eval_steps_per_second": 13.57,
"num_input_tokens_seen": 3592960,
"step": 600
},
{
"epoch": 150.0,
"num_input_tokens_seen": 3592960,
"step": 600,
"total_flos": 1.504600852758528e+16,
"train_loss": 0.06805778272234117,
"train_runtime": 800.524,
"train_samples_per_second": 47.969,
"train_steps_per_second": 0.75
}
],
"logging_steps": 5,
"max_steps": 600,
"num_input_tokens_seen": 3592960,
"num_train_epochs": 200,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.504600852758528e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}