| { |
| "best_global_step": 200, |
| "best_metric": 0.9543563723564148, |
| "best_model_checkpoint": "saves/ia3/gemma-3-1b-it/train_mnli_1744802907/checkpoint-200", |
| "epoch": 150.0, |
| "eval_steps": 200, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 1.32, |
| "grad_norm": 2.375901460647583, |
| "learning_rate": 0.2999671025212268, |
| "loss": 3.6731, |
| "num_input_tokens_seen": 32576, |
| "step": 5 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 0.6804730892181396, |
| "learning_rate": 0.2998334812442955, |
| "loss": 0.9267, |
| "num_input_tokens_seen": 63424, |
| "step": 10 |
| }, |
| { |
| "epoch": 3.96, |
| "grad_norm": 0.15599267184734344, |
| "learning_rate": 0.29959717158366866, |
| "loss": 0.4169, |
| "num_input_tokens_seen": 95360, |
| "step": 15 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.22385859489440918, |
| "learning_rate": 0.29925833549418396, |
| "loss": 0.2866, |
| "num_input_tokens_seen": 120384, |
| "step": 20 |
| }, |
| { |
| "epoch": 6.32, |
| "grad_norm": 0.11509355157613754, |
| "learning_rate": 0.2988172051971717, |
| "loss": 0.2759, |
| "num_input_tokens_seen": 152512, |
| "step": 25 |
| }, |
| { |
| "epoch": 7.64, |
| "grad_norm": 0.027715813368558884, |
| "learning_rate": 0.2982740830213025, |
| "loss": 0.2554, |
| "num_input_tokens_seen": 184320, |
| "step": 30 |
| }, |
| { |
| "epoch": 8.96, |
| "grad_norm": 0.0307712834328413, |
| "learning_rate": 0.29762934119538625, |
| "loss": 0.2385, |
| "num_input_tokens_seen": 215296, |
| "step": 35 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.10019869357347488, |
| "learning_rate": 0.29688342159326486, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 239872, |
| "step": 40 |
| }, |
| { |
| "epoch": 11.32, |
| "grad_norm": 0.3066690266132355, |
| "learning_rate": 0.29603683543097403, |
| "loss": 0.2518, |
| "num_input_tokens_seen": 271424, |
| "step": 45 |
| }, |
| { |
| "epoch": 12.64, |
| "grad_norm": 0.0848815068602562, |
| "learning_rate": 0.2950901629163815, |
| "loss": 0.1676, |
| "num_input_tokens_seen": 302912, |
| "step": 50 |
| }, |
| { |
| "epoch": 13.96, |
| "grad_norm": 0.10623391717672348, |
| "learning_rate": 0.29404405285154145, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 334464, |
| "step": 55 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.054420698434114456, |
| "learning_rate": 0.29289922218803793, |
| "loss": 0.0873, |
| "num_input_tokens_seen": 359872, |
| "step": 60 |
| }, |
| { |
| "epoch": 16.32, |
| "grad_norm": 0.18882937729358673, |
| "learning_rate": 0.29165645553562214, |
| "loss": 0.0803, |
| "num_input_tokens_seen": 390784, |
| "step": 65 |
| }, |
| { |
| "epoch": 17.64, |
| "grad_norm": 0.13374768197536469, |
| "learning_rate": 0.2903166046244801, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 422528, |
| "step": 70 |
| }, |
| { |
| "epoch": 18.96, |
| "grad_norm": 0.15120446681976318, |
| "learning_rate": 0.2888805877214992, |
| "loss": 0.1117, |
| "num_input_tokens_seen": 454400, |
| "step": 75 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.11594400554895401, |
| "learning_rate": 0.28734938900093415, |
| "loss": 0.0746, |
| "num_input_tokens_seen": 479744, |
| "step": 80 |
| }, |
| { |
| "epoch": 21.32, |
| "grad_norm": 0.18303385376930237, |
| "learning_rate": 0.2857240578699029, |
| "loss": 0.1154, |
| "num_input_tokens_seen": 511168, |
| "step": 85 |
| }, |
| { |
| "epoch": 22.64, |
| "grad_norm": 0.2756330370903015, |
| "learning_rate": 0.28400570824917565, |
| "loss": 0.2176, |
| "num_input_tokens_seen": 543296, |
| "step": 90 |
| }, |
| { |
| "epoch": 23.96, |
| "grad_norm": 0.05688418075442314, |
| "learning_rate": 0.2821955178097488, |
| "loss": 0.133, |
| "num_input_tokens_seen": 574976, |
| "step": 95 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.19595864415168762, |
| "learning_rate": 0.2802947271657287, |
| "loss": 0.139, |
| "num_input_tokens_seen": 600064, |
| "step": 100 |
| }, |
| { |
| "epoch": 26.32, |
| "grad_norm": 0.034938205033540726, |
| "learning_rate": 0.278304639024076, |
| "loss": 0.0892, |
| "num_input_tokens_seen": 631296, |
| "step": 105 |
| }, |
| { |
| "epoch": 27.64, |
| "grad_norm": 0.039930302649736404, |
| "learning_rate": 0.27622661729179593, |
| "loss": 0.0688, |
| "num_input_tokens_seen": 662976, |
| "step": 110 |
| }, |
| { |
| "epoch": 28.96, |
| "grad_norm": 0.06501033902168274, |
| "learning_rate": 0.27406208614118427, |
| "loss": 0.0574, |
| "num_input_tokens_seen": 694336, |
| "step": 115 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 0.043391138315200806, |
| "learning_rate": 0.27181252903377096, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 718976, |
| "step": 120 |
| }, |
| { |
| "epoch": 31.32, |
| "grad_norm": 0.006944978144019842, |
| "learning_rate": 0.26947948770362945, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 750656, |
| "step": 125 |
| }, |
| { |
| "epoch": 32.64, |
| "grad_norm": 0.005895116366446018, |
| "learning_rate": 0.26706456110074944, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 781952, |
| "step": 130 |
| }, |
| { |
| "epoch": 33.96, |
| "grad_norm": 0.0026924721896648407, |
| "learning_rate": 0.2645694042951963, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 812736, |
| "step": 135 |
| }, |
| { |
| "epoch": 35.0, |
| "grad_norm": 0.00570798572152853, |
| "learning_rate": 0.2619957273428087, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 836992, |
| "step": 140 |
| }, |
| { |
| "epoch": 36.32, |
| "grad_norm": 0.0005358898197300732, |
| "learning_rate": 0.2593452941132117, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 868160, |
| "step": 145 |
| }, |
| { |
| "epoch": 37.64, |
| "grad_norm": 0.0008586189360357821, |
| "learning_rate": 0.2566199210809489, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 901568, |
| "step": 150 |
| }, |
| { |
| "epoch": 38.96, |
| "grad_norm": 0.0004163524426985532, |
| "learning_rate": 0.25382147608056105, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 931392, |
| "step": 155 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 0.0008104751468636096, |
| "learning_rate": 0.250951877026466, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 957120, |
| "step": 160 |
| }, |
| { |
| "epoch": 41.32, |
| "grad_norm": 0.00021077878773212433, |
| "learning_rate": 0.24801309059851584, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 987648, |
| "step": 165 |
| }, |
| { |
| "epoch": 42.64, |
| "grad_norm": 0.00015898171113803983, |
| "learning_rate": 0.2450071308941325, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1020544, |
| "step": 170 |
| }, |
| { |
| "epoch": 43.96, |
| "grad_norm": 0.0001712896191747859, |
| "learning_rate": 0.2419360580479465, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1051776, |
| "step": 175 |
| }, |
| { |
| "epoch": 45.0, |
| "grad_norm": 0.000376845127902925, |
| "learning_rate": 0.2388019768198829, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1076864, |
| "step": 180 |
| }, |
| { |
| "epoch": 46.32, |
| "grad_norm": 0.0001391878176946193, |
| "learning_rate": 0.2356070351526648, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1108608, |
| "step": 185 |
| }, |
| { |
| "epoch": 47.64, |
| "grad_norm": 0.00011551726493053138, |
| "learning_rate": 0.23235342269971976, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1141888, |
| "step": 190 |
| }, |
| { |
| "epoch": 48.96, |
| "grad_norm": 0.0001131360768340528, |
| "learning_rate": 0.22904336932450164, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1172352, |
| "step": 195 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 0.00024781483807601035, |
| "learning_rate": 0.22567914357225285, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1197376, |
| "step": 200 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_mnli_eval_loss": 0.9543563723564148, |
| "eval_mnli_eval_runtime": 1.8747, |
| "eval_mnli_eval_samples_per_second": 106.685, |
| "eval_mnli_eval_steps_per_second": 13.336, |
| "num_input_tokens_seen": 1197376, |
| "step": 200 |
| }, |
| { |
| "epoch": 51.32, |
| "grad_norm": 0.00011005056876456365, |
| "learning_rate": 0.22226305111525727, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1229248, |
| "step": 205 |
| }, |
| { |
| "epoch": 52.64, |
| "grad_norm": 9.758443775353953e-05, |
| "learning_rate": 0.21879743317264727, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1261888, |
| "step": 210 |
| }, |
| { |
| "epoch": 53.96, |
| "grad_norm": 9.127436351263896e-05, |
| "learning_rate": 0.21528466490584913, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1292992, |
| "step": 215 |
| }, |
| { |
| "epoch": 55.0, |
| "grad_norm": 0.00014360187924467027, |
| "learning_rate": 0.21172715379076631, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1318016, |
| "step": 220 |
| }, |
| { |
| "epoch": 56.32, |
| "grad_norm": 8.246286597568542e-05, |
| "learning_rate": 0.20812733796781543, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1350080, |
| "step": 225 |
| }, |
| { |
| "epoch": 57.64, |
| "grad_norm": 8.383565000258386e-05, |
| "learning_rate": 0.20448768457094676, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1382272, |
| "step": 230 |
| }, |
| { |
| "epoch": 58.96, |
| "grad_norm": 7.291782821994275e-05, |
| "learning_rate": 0.20081068803679372, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1413632, |
| "step": 235 |
| }, |
| { |
| "epoch": 60.0, |
| "grad_norm": 0.00019672681810334325, |
| "learning_rate": 0.19709886839511073, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1438336, |
| "step": 240 |
| }, |
| { |
| "epoch": 61.32, |
| "grad_norm": 6.871890946058556e-05, |
| "learning_rate": 0.19335476954167072, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1469824, |
| "step": 245 |
| }, |
| { |
| "epoch": 62.64, |
| "grad_norm": 5.799822974950075e-05, |
| "learning_rate": 0.18958095749480594, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1501568, |
| "step": 250 |
| }, |
| { |
| "epoch": 63.96, |
| "grad_norm": 6.390228372765705e-05, |
| "learning_rate": 0.18578001863678714, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1533824, |
| "step": 255 |
| }, |
| { |
| "epoch": 65.0, |
| "grad_norm": 8.54826794238761e-05, |
| "learning_rate": 0.18195455794124651, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1558656, |
| "step": 260 |
| }, |
| { |
| "epoch": 66.32, |
| "grad_norm": 5.925075674895197e-05, |
| "learning_rate": 0.17810719718785872, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1590912, |
| "step": 265 |
| }, |
| { |
| "epoch": 67.64, |
| "grad_norm": 5.148671334609389e-05, |
| "learning_rate": 0.17424057316550418, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1622656, |
| "step": 270 |
| }, |
| { |
| "epoch": 68.96, |
| "grad_norm": 5.6448534451192245e-05, |
| "learning_rate": 0.17035733586514565, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1653696, |
| "step": 275 |
| }, |
| { |
| "epoch": 70.0, |
| "grad_norm": 0.0001959072978934273, |
| "learning_rate": 0.16646014666365677, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1678144, |
| "step": 280 |
| }, |
| { |
| "epoch": 71.32, |
| "grad_norm": 5.507750029210001e-05, |
| "learning_rate": 0.16255167649984736, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1709312, |
| "step": 285 |
| }, |
| { |
| "epoch": 72.64, |
| "grad_norm": 4.713419184554368e-05, |
| "learning_rate": 0.15863460404393512, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1741952, |
| "step": 290 |
| }, |
| { |
| "epoch": 73.96, |
| "grad_norm": 5.095809683552943e-05, |
| "learning_rate": 0.15471161386171925, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1772992, |
| "step": 295 |
| }, |
| { |
| "epoch": 75.0, |
| "grad_norm": 0.0001617352245375514, |
| "learning_rate": 0.1507853945747129, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1797568, |
| "step": 300 |
| }, |
| { |
| "epoch": 76.32, |
| "grad_norm": 4.8264351789839566e-05, |
| "learning_rate": 0.14685863701749646, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1829120, |
| "step": 305 |
| }, |
| { |
| "epoch": 77.64, |
| "grad_norm": 4.358548540039919e-05, |
| "learning_rate": 0.1429340323935536, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1860352, |
| "step": 310 |
| }, |
| { |
| "epoch": 78.96, |
| "grad_norm": 4.9486257921671495e-05, |
| "learning_rate": 0.13901427043085526, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1892736, |
| "step": 315 |
| }, |
| { |
| "epoch": 80.0, |
| "grad_norm": 0.00010933289013337344, |
| "learning_rate": 0.13510203753845418, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1917696, |
| "step": 320 |
| }, |
| { |
| "epoch": 81.32, |
| "grad_norm": 4.788853766513057e-05, |
| "learning_rate": 0.13120001496535433, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1948736, |
| "step": 325 |
| }, |
| { |
| "epoch": 82.64, |
| "grad_norm": 3.8914506149012595e-05, |
| "learning_rate": 0.12731087696291712, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1980480, |
| "step": 330 |
| }, |
| { |
| "epoch": 83.96, |
| "grad_norm": 4.481687574298121e-05, |
| "learning_rate": 0.12343728895206252, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2011776, |
| "step": 335 |
| }, |
| { |
| "epoch": 85.0, |
| "grad_norm": 9.876202238956466e-05, |
| "learning_rate": 0.11958190569652316, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2036736, |
| "step": 340 |
| }, |
| { |
| "epoch": 86.32, |
| "grad_norm": 4.3948042730335146e-05, |
| "learning_rate": 0.11574736948340163, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2068928, |
| "step": 345 |
| }, |
| { |
| "epoch": 87.64, |
| "grad_norm": 3.864551035803743e-05, |
| "learning_rate": 0.11193630831227916, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2099520, |
| "step": 350 |
| }, |
| { |
| "epoch": 88.96, |
| "grad_norm": 4.2283096263417974e-05, |
| "learning_rate": 0.10815133409411562, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2132480, |
| "step": 355 |
| }, |
| { |
| "epoch": 90.0, |
| "grad_norm": 9.957759903045371e-05, |
| "learning_rate": 0.10439504086117644, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2157184, |
| "step": 360 |
| }, |
| { |
| "epoch": 91.32, |
| "grad_norm": 4.125876148464158e-05, |
| "learning_rate": 0.10067000298921251, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2188032, |
| "step": 365 |
| }, |
| { |
| "epoch": 92.64, |
| "grad_norm": 3.753119381144643e-05, |
| "learning_rate": 0.09697877343311145, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2220480, |
| "step": 370 |
| }, |
| { |
| "epoch": 93.96, |
| "grad_norm": 4.0673934563528746e-05, |
| "learning_rate": 0.09332388197722995, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2251008, |
| "step": 375 |
| }, |
| { |
| "epoch": 95.0, |
| "grad_norm": 0.00011201734014321119, |
| "learning_rate": 0.089707833501606, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2276736, |
| "step": 380 |
| }, |
| { |
| "epoch": 96.32, |
| "grad_norm": 3.8152145862113684e-05, |
| "learning_rate": 0.0861331062652391, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2309504, |
| "step": 385 |
| }, |
| { |
| "epoch": 97.64, |
| "grad_norm": 3.642489173216745e-05, |
| "learning_rate": 0.08260215020761554, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2339776, |
| "step": 390 |
| }, |
| { |
| "epoch": 98.96, |
| "grad_norm": 4.08138548664283e-05, |
| "learning_rate": 0.07911738526964192, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2371584, |
| "step": 395 |
| }, |
| { |
| "epoch": 100.0, |
| "grad_norm": 7.001034828135744e-05, |
| "learning_rate": 0.07568119973513886, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2396736, |
| "step": 400 |
| }, |
| { |
| "epoch": 100.0, |
| "eval_mnli_eval_loss": 0.9945777654647827, |
| "eval_mnli_eval_runtime": 1.8092, |
| "eval_mnli_eval_samples_per_second": 110.546, |
| "eval_mnli_eval_steps_per_second": 13.818, |
| "num_input_tokens_seen": 2396736, |
| "step": 400 |
| }, |
| { |
| "epoch": 101.32, |
| "grad_norm": 3.362021379871294e-05, |
| "learning_rate": 0.07229594859403049, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2428864, |
| "step": 405 |
| }, |
| { |
| "epoch": 102.64, |
| "grad_norm": 3.9229835238074884e-05, |
| "learning_rate": 0.06896395192835174, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2459328, |
| "step": 410 |
| }, |
| { |
| "epoch": 103.96, |
| "grad_norm": 3.417343395994976e-05, |
| "learning_rate": 0.06568749332218045, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2491456, |
| "step": 415 |
| }, |
| { |
| "epoch": 105.0, |
| "grad_norm": 0.0001204285363201052, |
| "learning_rate": 0.06246881829658239, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2516800, |
| "step": 420 |
| }, |
| { |
| "epoch": 106.32, |
| "grad_norm": 3.838081465801224e-05, |
| "learning_rate": 0.05931013277064377, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2548800, |
| "step": 425 |
| }, |
| { |
| "epoch": 107.64, |
| "grad_norm": 2.7628277166513726e-05, |
| "learning_rate": 0.05621360154964427, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2580160, |
| "step": 430 |
| }, |
| { |
| "epoch": 108.96, |
| "grad_norm": 3.643093441496603e-05, |
| "learning_rate": 0.053181346841407386, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2611200, |
| "step": 435 |
| }, |
| { |
| "epoch": 110.0, |
| "grad_norm": 0.00010669205948943272, |
| "learning_rate": 0.050215446801845885, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2635968, |
| "step": 440 |
| }, |
| { |
| "epoch": 111.32, |
| "grad_norm": 3.292246401542798e-05, |
| "learning_rate": 0.04731793411069669, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2668736, |
| "step": 445 |
| }, |
| { |
| "epoch": 112.64, |
| "grad_norm": 3.169509000144899e-05, |
| "learning_rate": 0.044490794578424434, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2699520, |
| "step": 450 |
| }, |
| { |
| "epoch": 113.96, |
| "grad_norm": 3.336479130666703e-05, |
| "learning_rate": 0.041735965785245674, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2730304, |
| "step": 455 |
| }, |
| { |
| "epoch": 115.0, |
| "grad_norm": 0.00010339989967178553, |
| "learning_rate": 0.03905533575320853, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2755776, |
| "step": 460 |
| }, |
| { |
| "epoch": 116.32, |
| "grad_norm": 3.4434106055414304e-05, |
| "learning_rate": 0.03645074165223656, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2787968, |
| "step": 465 |
| }, |
| { |
| "epoch": 117.64, |
| "grad_norm": 3.564298458513804e-05, |
| "learning_rate": 0.03392396854102408, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2819584, |
| "step": 470 |
| }, |
| { |
| "epoch": 118.96, |
| "grad_norm": 3.216815457562916e-05, |
| "learning_rate": 0.031476748143646434, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2851072, |
| "step": 475 |
| }, |
| { |
| "epoch": 120.0, |
| "grad_norm": 0.00010380757885286584, |
| "learning_rate": 0.029110757662722652, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2875328, |
| "step": 480 |
| }, |
| { |
| "epoch": 121.32, |
| "grad_norm": 2.93699158646632e-05, |
| "learning_rate": 0.026827618629944393, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2906496, |
| "step": 485 |
| }, |
| { |
| "epoch": 122.64, |
| "grad_norm": 3.3641983463894576e-05, |
| "learning_rate": 0.024628895794759492, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2939584, |
| "step": 490 |
| }, |
| { |
| "epoch": 123.96, |
| "grad_norm": 3.712475154316053e-05, |
| "learning_rate": 0.022516096051970434, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2971008, |
| "step": 495 |
| }, |
| { |
| "epoch": 125.0, |
| "grad_norm": 7.740116416243836e-05, |
| "learning_rate": 0.020490667408984253, |
| "loss": 0.0, |
| "num_input_tokens_seen": 2995392, |
| "step": 500 |
| }, |
| { |
| "epoch": 126.32, |
| "grad_norm": 3.0008935937075876e-05, |
| "learning_rate": 0.018553997993420494, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3027904, |
| "step": 505 |
| }, |
| { |
| "epoch": 127.64, |
| "grad_norm": 3.011342414538376e-05, |
| "learning_rate": 0.016707415101757654, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3057472, |
| "step": 510 |
| }, |
| { |
| "epoch": 128.96, |
| "grad_norm": 3.3212050766451284e-05, |
| "learning_rate": 0.014952184289670972, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3089216, |
| "step": 515 |
| }, |
| { |
| "epoch": 130.0, |
| "grad_norm": 4.444582009455189e-05, |
| "learning_rate": 0.013289508504683205, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3114240, |
| "step": 520 |
| }, |
| { |
| "epoch": 131.32, |
| "grad_norm": 2.850958662747871e-05, |
| "learning_rate": 0.011720527261724938, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3145600, |
| "step": 525 |
| }, |
| { |
| "epoch": 132.64, |
| "grad_norm": 3.135738006676547e-05, |
| "learning_rate": 0.010246315862167664, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3178176, |
| "step": 530 |
| }, |
| { |
| "epoch": 133.96, |
| "grad_norm": 3.609867053455673e-05, |
| "learning_rate": 0.00886788465686618, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3209344, |
| "step": 535 |
| }, |
| { |
| "epoch": 135.0, |
| "grad_norm": 6.155487790238112e-05, |
| "learning_rate": 0.007586178353714434, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3234688, |
| "step": 540 |
| }, |
| { |
| "epoch": 136.32, |
| "grad_norm": 3.3471937058493495e-05, |
| "learning_rate": 0.006402075370189914, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3265344, |
| "step": 545 |
| }, |
| { |
| "epoch": 137.64, |
| "grad_norm": 3.2188763725571334e-05, |
| "learning_rate": 0.005316387231330288, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3297216, |
| "step": 550 |
| }, |
| { |
| "epoch": 138.96, |
| "grad_norm": 3.0012966817594133e-05, |
| "learning_rate": 0.004329858013554605, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3328832, |
| "step": 555 |
| }, |
| { |
| "epoch": 140.0, |
| "grad_norm": 7.22868790035136e-05, |
| "learning_rate": 0.0034431638347104552, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3353856, |
| "step": 560 |
| }, |
| { |
| "epoch": 141.32, |
| "grad_norm": 3.3028067264240235e-05, |
| "learning_rate": 0.0026569123906967085, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3384640, |
| "step": 565 |
| }, |
| { |
| "epoch": 142.64, |
| "grad_norm": 3.098338493146002e-05, |
| "learning_rate": 0.0019716425389789127, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3417280, |
| "step": 570 |
| }, |
| { |
| "epoch": 143.96, |
| "grad_norm": 3.733243647729978e-05, |
| "learning_rate": 0.0013878239292834603, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3447808, |
| "step": 575 |
| }, |
| { |
| "epoch": 145.0, |
| "grad_norm": 9.276469791075215e-05, |
| "learning_rate": 0.0009058566817230606, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3472192, |
| "step": 580 |
| }, |
| { |
| "epoch": 146.32, |
| "grad_norm": 3.6584464396582916e-05, |
| "learning_rate": 0.0005260711125743444, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3503616, |
| "step": 585 |
| }, |
| { |
| "epoch": 147.64, |
| "grad_norm": 3.148068208247423e-05, |
| "learning_rate": 0.0002487275078957518, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3536128, |
| "step": 590 |
| }, |
| { |
| "epoch": 148.96, |
| "grad_norm": 3.436414772295393e-05, |
| "learning_rate": 7.401594514025999e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3567872, |
| "step": 595 |
| }, |
| { |
| "epoch": 150.0, |
| "grad_norm": 7.5851748988498e-05, |
| "learning_rate": 2.0561628859883107e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 3592960, |
| "step": 600 |
| }, |
| { |
| "epoch": 150.0, |
| "eval_mnli_eval_loss": 1.00465726852417, |
| "eval_mnli_eval_runtime": 1.8424, |
| "eval_mnli_eval_samples_per_second": 108.557, |
| "eval_mnli_eval_steps_per_second": 13.57, |
| "num_input_tokens_seen": 3592960, |
| "step": 600 |
| }, |
| { |
| "epoch": 150.0, |
| "num_input_tokens_seen": 3592960, |
| "step": 600, |
| "total_flos": 1.504600852758528e+16, |
| "train_loss": 0.06805778272234117, |
| "train_runtime": 800.524, |
| "train_samples_per_second": 47.969, |
| "train_steps_per_second": 0.75 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 600, |
| "num_input_tokens_seen": 3592960, |
| "num_train_epochs": 200, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.504600852758528e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|