| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 370, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005405405405405406, |
| "grad_norm": 166.34339904785156, |
| "learning_rate": 0.0, |
| "loss": 7.4172, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.010810810810810811, |
| "grad_norm": 187.5918731689453, |
| "learning_rate": 5.405405405405406e-06, |
| "loss": 7.5679, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.016216216216216217, |
| "grad_norm": 52.4649658203125, |
| "learning_rate": 1.0810810810810812e-05, |
| "loss": 7.0005, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.021621621621621623, |
| "grad_norm": 37.77447509765625, |
| "learning_rate": 1.6216216216216218e-05, |
| "loss": 6.5778, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02702702702702703, |
| "grad_norm": 29.47389793395996, |
| "learning_rate": 2.1621621621621624e-05, |
| "loss": 6.1619, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.032432432432432434, |
| "grad_norm": 11.992705345153809, |
| "learning_rate": 2.702702702702703e-05, |
| "loss": 5.8593, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03783783783783784, |
| "grad_norm": 8.19919490814209, |
| "learning_rate": 3.2432432432432436e-05, |
| "loss": 5.5268, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.043243243243243246, |
| "grad_norm": 6.438775062561035, |
| "learning_rate": 3.783783783783784e-05, |
| "loss": 5.2354, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.04864864864864865, |
| "grad_norm": 3.4555141925811768, |
| "learning_rate": 4.324324324324325e-05, |
| "loss": 4.9956, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.05405405405405406, |
| "grad_norm": 3.111621856689453, |
| "learning_rate": 4.8648648648648654e-05, |
| "loss": 4.7073, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05945945945945946, |
| "grad_norm": 2.309434413909912, |
| "learning_rate": 5.405405405405406e-05, |
| "loss": 4.5115, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.06486486486486487, |
| "grad_norm": 1.941659927368164, |
| "learning_rate": 5.9459459459459466e-05, |
| "loss": 4.2527, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.07027027027027027, |
| "grad_norm": 1.9665441513061523, |
| "learning_rate": 6.486486486486487e-05, |
| "loss": 4.0545, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.07567567567567568, |
| "grad_norm": 1.7107363939285278, |
| "learning_rate": 7.027027027027028e-05, |
| "loss": 3.8214, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.08108108108108109, |
| "grad_norm": 1.6905264854431152, |
| "learning_rate": 7.567567567567568e-05, |
| "loss": 3.6188, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08648648648648649, |
| "grad_norm": 1.2384110689163208, |
| "learning_rate": 8.108108108108109e-05, |
| "loss": 3.3663, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0918918918918919, |
| "grad_norm": 1.080546498298645, |
| "learning_rate": 8.64864864864865e-05, |
| "loss": 3.181, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0972972972972973, |
| "grad_norm": 0.9721872806549072, |
| "learning_rate": 9.18918918918919e-05, |
| "loss": 2.9672, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.10270270270270271, |
| "grad_norm": 0.8164976239204407, |
| "learning_rate": 9.729729729729731e-05, |
| "loss": 2.7709, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.10810810810810811, |
| "grad_norm": 0.8994714021682739, |
| "learning_rate": 0.0001027027027027027, |
| "loss": 2.553, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11351351351351352, |
| "grad_norm": 0.9458346366882324, |
| "learning_rate": 0.00010810810810810812, |
| "loss": 2.3567, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.11891891891891893, |
| "grad_norm": 1.1655299663543701, |
| "learning_rate": 0.00011351351351351351, |
| "loss": 2.1531, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.12432432432432433, |
| "grad_norm": 1.1697014570236206, |
| "learning_rate": 0.00011891891891891893, |
| "loss": 1.9005, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.12972972972972974, |
| "grad_norm": 1.003651738166809, |
| "learning_rate": 0.00012432432432432433, |
| "loss": 1.6242, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.13513513513513514, |
| "grad_norm": 0.8397846817970276, |
| "learning_rate": 0.00012972972972972974, |
| "loss": 1.4416, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.14054054054054055, |
| "grad_norm": 0.6541157960891724, |
| "learning_rate": 0.00013513513513513514, |
| "loss": 1.2792, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.14594594594594595, |
| "grad_norm": 0.612557590007782, |
| "learning_rate": 0.00014054054054054056, |
| "loss": 1.1801, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.15135135135135136, |
| "grad_norm": 0.4974724352359772, |
| "learning_rate": 0.00014594594594594595, |
| "loss": 1.0689, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.15675675675675677, |
| "grad_norm": 0.4679795205593109, |
| "learning_rate": 0.00015135135135135137, |
| "loss": 1.0099, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.16216216216216217, |
| "grad_norm": 0.3515791893005371, |
| "learning_rate": 0.00015675675675675676, |
| "loss": 0.9363, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16756756756756758, |
| "grad_norm": 0.30370157957077026, |
| "learning_rate": 0.00016216216216216218, |
| "loss": 0.8544, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.17297297297297298, |
| "grad_norm": 0.28620409965515137, |
| "learning_rate": 0.00016756756756756757, |
| "loss": 0.8724, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.1783783783783784, |
| "grad_norm": 0.23916271328926086, |
| "learning_rate": 0.000172972972972973, |
| "loss": 0.8163, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1837837837837838, |
| "grad_norm": 0.24133414030075073, |
| "learning_rate": 0.00017837837837837839, |
| "loss": 0.8094, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.1891891891891892, |
| "grad_norm": 0.20248687267303467, |
| "learning_rate": 0.0001837837837837838, |
| "loss": 0.7914, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1945945945945946, |
| "grad_norm": 0.16612087190151215, |
| "learning_rate": 0.0001891891891891892, |
| "loss": 0.7721, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.16773800551891327, |
| "learning_rate": 0.00019459459459459462, |
| "loss": 0.7746, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.20540540540540542, |
| "grad_norm": 0.13932561874389648, |
| "learning_rate": 0.0002, |
| "loss": 0.7444, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.21081081081081082, |
| "grad_norm": 0.1528484970331192, |
| "learning_rate": 0.0001999955498150411, |
| "loss": 0.7434, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.21621621621621623, |
| "grad_norm": 0.12268492579460144, |
| "learning_rate": 0.00019998219965624734, |
| "loss": 0.7278, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22162162162162163, |
| "grad_norm": 0.11921179294586182, |
| "learning_rate": 0.0001999599507118322, |
| "loss": 0.71, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.22702702702702704, |
| "grad_norm": 0.11119277030229568, |
| "learning_rate": 0.000199928804962034, |
| "loss": 0.6873, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.23243243243243245, |
| "grad_norm": 0.10249276459217072, |
| "learning_rate": 0.0001998887651789398, |
| "loss": 0.6887, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.23783783783783785, |
| "grad_norm": 0.1001831665635109, |
| "learning_rate": 0.00019983983492623833, |
| "loss": 0.6915, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.24324324324324326, |
| "grad_norm": 0.10323046892881393, |
| "learning_rate": 0.00019978201855890308, |
| "loss": 0.6763, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.24864864864864866, |
| "grad_norm": 0.10003294050693512, |
| "learning_rate": 0.00019971532122280464, |
| "loss": 0.6561, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.25405405405405407, |
| "grad_norm": 0.08443877846002579, |
| "learning_rate": 0.00019963974885425266, |
| "loss": 0.6784, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.2594594594594595, |
| "grad_norm": 0.09182324260473251, |
| "learning_rate": 0.00019955530817946748, |
| "loss": 0.6587, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.2648648648648649, |
| "grad_norm": 0.13076290488243103, |
| "learning_rate": 0.0001994620067139815, |
| "loss": 0.6483, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 0.08880296349525452, |
| "learning_rate": 0.0001993598527619703, |
| "loss": 0.6451, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2756756756756757, |
| "grad_norm": 0.07715742290019989, |
| "learning_rate": 0.0001992488554155135, |
| "loss": 0.6329, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2810810810810811, |
| "grad_norm": 0.077003113925457, |
| "learning_rate": 0.00019912902455378556, |
| "loss": 0.6397, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.2864864864864865, |
| "grad_norm": 0.06974707543849945, |
| "learning_rate": 0.00019900037084217637, |
| "loss": 0.619, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.2918918918918919, |
| "grad_norm": 0.06869279593229294, |
| "learning_rate": 0.00019886290573134228, |
| "loss": 0.6326, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.2972972972972973, |
| "grad_norm": 0.06709641218185425, |
| "learning_rate": 0.00019871664145618657, |
| "loss": 0.6305, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.3027027027027027, |
| "grad_norm": 0.07039665430784225, |
| "learning_rate": 0.00019856159103477086, |
| "loss": 0.6058, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.3081081081081081, |
| "grad_norm": 0.07261249423027039, |
| "learning_rate": 0.00019839776826715614, |
| "loss": 0.6133, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.31351351351351353, |
| "grad_norm": 0.07141660153865814, |
| "learning_rate": 0.0001982251877341748, |
| "loss": 0.6136, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.31891891891891894, |
| "grad_norm": 0.06658609956502914, |
| "learning_rate": 0.0001980438647961327, |
| "loss": 0.6407, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.32432432432432434, |
| "grad_norm": 0.07396089285612106, |
| "learning_rate": 0.00019785381559144196, |
| "loss": 0.601, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.32972972972972975, |
| "grad_norm": 0.08898008614778519, |
| "learning_rate": 0.00019765505703518496, |
| "loss": 0.6149, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.33513513513513515, |
| "grad_norm": 0.1093701645731926, |
| "learning_rate": 0.00019744760681760832, |
| "loss": 0.6014, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.34054054054054056, |
| "grad_norm": 0.1039031520485878, |
| "learning_rate": 0.00019723148340254892, |
| "loss": 0.6126, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.34594594594594597, |
| "grad_norm": 0.14217646420001984, |
| "learning_rate": 0.00019700670602579008, |
| "loss": 0.6057, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.35135135135135137, |
| "grad_norm": 0.11971811205148697, |
| "learning_rate": 0.0001967732946933499, |
| "loss": 0.6041, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.3567567567567568, |
| "grad_norm": 0.12108401209115982, |
| "learning_rate": 0.00019653127017970034, |
| "loss": 0.5906, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.3621621621621622, |
| "grad_norm": 0.0906200259923935, |
| "learning_rate": 0.00019628065402591845, |
| "loss": 0.6014, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3675675675675676, |
| "grad_norm": 0.0969948098063469, |
| "learning_rate": 0.00019602146853776894, |
| "loss": 0.5973, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.372972972972973, |
| "grad_norm": 0.10772716253995895, |
| "learning_rate": 0.00019575373678371909, |
| "loss": 0.594, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3783783783783784, |
| "grad_norm": 0.12335596233606339, |
| "learning_rate": 0.00019547748259288536, |
| "loss": 0.6009, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3837837837837838, |
| "grad_norm": 0.14136555790901184, |
| "learning_rate": 0.00019519273055291266, |
| "loss": 0.5985, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.3891891891891892, |
| "grad_norm": 0.10604984313249588, |
| "learning_rate": 0.0001948995060077859, |
| "loss": 0.5982, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.3945945945945946, |
| "grad_norm": 0.08961839228868484, |
| "learning_rate": 0.00019459783505557424, |
| "loss": 0.5706, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.10604697465896606, |
| "learning_rate": 0.00019428774454610843, |
| "loss": 0.5898, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.40540540540540543, |
| "grad_norm": 0.10985071957111359, |
| "learning_rate": 0.00019396926207859084, |
| "loss": 0.5952, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.41081081081081083, |
| "grad_norm": 0.11850868165493011, |
| "learning_rate": 0.00019364241599913924, |
| "loss": 0.5803, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.41621621621621624, |
| "grad_norm": 0.1549469530582428, |
| "learning_rate": 0.00019330723539826375, |
| "loss": 0.5776, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.42162162162162165, |
| "grad_norm": 0.18124178051948547, |
| "learning_rate": 0.00019296375010827773, |
| "loss": 0.5757, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.42702702702702705, |
| "grad_norm": 0.16211983561515808, |
| "learning_rate": 0.0001926119907006426, |
| "loss": 0.5942, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.43243243243243246, |
| "grad_norm": 0.2041509449481964, |
| "learning_rate": 0.0001922519884832469, |
| "loss": 0.5946, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.43783783783783786, |
| "grad_norm": 0.1953067183494568, |
| "learning_rate": 0.00019188377549761963, |
| "loss": 0.6017, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.44324324324324327, |
| "grad_norm": 0.19392773509025574, |
| "learning_rate": 0.0001915073845160786, |
| "loss": 0.5896, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4486486486486487, |
| "grad_norm": 0.1343798190355301, |
| "learning_rate": 0.0001911228490388136, |
| "loss": 0.5771, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.4540540540540541, |
| "grad_norm": 0.22122260928153992, |
| "learning_rate": 0.00019073020329090444, |
| "loss": 0.5948, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.4594594594594595, |
| "grad_norm": 0.23926760256290436, |
| "learning_rate": 0.00019032948221927524, |
| "loss": 0.5894, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4648648648648649, |
| "grad_norm": 0.3518514335155487, |
| "learning_rate": 0.00018992072148958368, |
| "loss": 0.6073, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.4702702702702703, |
| "grad_norm": 0.46678459644317627, |
| "learning_rate": 0.00018950395748304678, |
| "loss": 0.6006, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.4756756756756757, |
| "grad_norm": 0.3574659824371338, |
| "learning_rate": 0.00018907922729320285, |
| "loss": 0.5843, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4810810810810811, |
| "grad_norm": 0.1582118421792984, |
| "learning_rate": 0.00018864656872260985, |
| "loss": 0.5856, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.4864864864864865, |
| "grad_norm": 0.41187095642089844, |
| "learning_rate": 0.00018820602027948114, |
| "loss": 0.6022, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4918918918918919, |
| "grad_norm": 0.26722782850265503, |
| "learning_rate": 0.00018775762117425777, |
| "loss": 0.5737, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.4972972972972973, |
| "grad_norm": 0.2423318475484848, |
| "learning_rate": 0.00018730141131611882, |
| "loss": 0.5646, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.5027027027027027, |
| "grad_norm": 0.3524855673313141, |
| "learning_rate": 0.00018683743130942928, |
| "loss": 0.5733, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.5081081081081081, |
| "grad_norm": 0.13441210985183716, |
| "learning_rate": 0.00018636572245012606, |
| "loss": 0.5813, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.5135135135135135, |
| "grad_norm": 0.30515116453170776, |
| "learning_rate": 0.00018588632672204264, |
| "loss": 0.5542, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.518918918918919, |
| "grad_norm": 0.17132249474525452, |
| "learning_rate": 0.0001853992867931721, |
| "loss": 0.5838, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.5243243243243243, |
| "grad_norm": 0.28823599219322205, |
| "learning_rate": 0.0001849046460118698, |
| "loss": 0.588, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.5297297297297298, |
| "grad_norm": 0.31162044405937195, |
| "learning_rate": 0.00018440244840299506, |
| "loss": 0.5747, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.5351351351351351, |
| "grad_norm": 0.1594020426273346, |
| "learning_rate": 0.00018389273866399275, |
| "loss": 0.572, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 0.27514341473579407, |
| "learning_rate": 0.00018337556216091517, |
| "loss": 0.5828, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5459459459459459, |
| "grad_norm": 0.30611446499824524, |
| "learning_rate": 0.00018285096492438424, |
| "loss": 0.5731, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5513513513513514, |
| "grad_norm": 0.4014338552951813, |
| "learning_rate": 0.00018231899364549455, |
| "loss": 0.5715, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5567567567567567, |
| "grad_norm": 0.5616762042045593, |
| "learning_rate": 0.0001817796956716578, |
| "loss": 0.605, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5621621621621622, |
| "grad_norm": 0.5975009202957153, |
| "learning_rate": 0.0001812331190023886, |
| "loss": 0.5571, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5675675675675675, |
| "grad_norm": 0.34755927324295044, |
| "learning_rate": 0.00018067931228503246, |
| "loss": 0.5662, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.572972972972973, |
| "grad_norm": 0.4398202896118164, |
| "learning_rate": 0.00018011832481043576, |
| "loss": 0.5827, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5783783783783784, |
| "grad_norm": 0.520788848400116, |
| "learning_rate": 0.000179550206508559, |
| "loss": 0.5825, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5837837837837838, |
| "grad_norm": 0.3554728627204895, |
| "learning_rate": 0.0001789750079440326, |
| "loss": 0.5745, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5891891891891892, |
| "grad_norm": 0.4278441369533539, |
| "learning_rate": 0.00017839278031165658, |
| "loss": 0.5901, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.5945945945945946, |
| "grad_norm": 0.5219722390174866, |
| "learning_rate": 0.00017780357543184397, |
| "loss": 0.5574, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.24311627447605133, |
| "learning_rate": 0.00017720744574600863, |
| "loss": 0.5721, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.6054054054054054, |
| "grad_norm": 0.37851300835609436, |
| "learning_rate": 0.0001766044443118978, |
| "loss": 0.5522, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.6108108108108108, |
| "grad_norm": 0.2819484770298004, |
| "learning_rate": 0.00017599462479886974, |
| "loss": 0.5762, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.6162162162162163, |
| "grad_norm": 0.4176675081253052, |
| "learning_rate": 0.00017537804148311695, |
| "loss": 0.5871, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.6216216216216216, |
| "grad_norm": 0.5771986842155457, |
| "learning_rate": 0.00017475474924283536, |
| "loss": 0.5898, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6270270270270271, |
| "grad_norm": 0.5216075778007507, |
| "learning_rate": 0.00017412480355334005, |
| "loss": 0.5817, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6324324324324324, |
| "grad_norm": 0.48448437452316284, |
| "learning_rate": 0.0001734882604821276, |
| "loss": 0.5815, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.6378378378378379, |
| "grad_norm": 0.42380860447883606, |
| "learning_rate": 0.0001728451766838861, |
| "loss": 0.5781, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.6432432432432432, |
| "grad_norm": 0.27722859382629395, |
| "learning_rate": 0.00017219560939545246, |
| "loss": 0.5797, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6486486486486487, |
| "grad_norm": 0.3511153757572174, |
| "learning_rate": 0.0001715396164307182, |
| "loss": 0.5978, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.654054054054054, |
| "grad_norm": 0.3476790189743042, |
| "learning_rate": 0.00017087725617548385, |
| "loss": 0.5633, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.6594594594594595, |
| "grad_norm": 0.359022319316864, |
| "learning_rate": 0.00017020858758226229, |
| "loss": 0.5767, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6648648648648648, |
| "grad_norm": 0.3652413487434387, |
| "learning_rate": 0.00016953367016503182, |
| "loss": 0.5803, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6702702702702703, |
| "grad_norm": 0.3911918103694916, |
| "learning_rate": 0.00016885256399393924, |
| "loss": 0.5669, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.6756756756756757, |
| "grad_norm": 0.29855138063430786, |
| "learning_rate": 0.00016816532968995328, |
| "loss": 0.5701, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6810810810810811, |
| "grad_norm": 0.3289024531841278, |
| "learning_rate": 0.00016747202841946928, |
| "loss": 0.5691, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6864864864864865, |
| "grad_norm": 0.43118664622306824, |
| "learning_rate": 0.00016677272188886483, |
| "loss": 0.595, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.6918918918918919, |
| "grad_norm": 0.48039013147354126, |
| "learning_rate": 0.00016606747233900815, |
| "loss": 0.5894, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.6972972972972973, |
| "grad_norm": 0.5704895853996277, |
| "learning_rate": 0.00016535634253971794, |
| "loss": 0.571, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.7027027027027027, |
| "grad_norm": 0.4907408356666565, |
| "learning_rate": 0.00016463939578417692, |
| "loss": 0.5705, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7081081081081081, |
| "grad_norm": 0.40252187848091125, |
| "learning_rate": 0.0001639166958832985, |
| "loss": 0.565, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.7135135135135136, |
| "grad_norm": 0.5997945070266724, |
| "learning_rate": 0.00016318830716004722, |
| "loss": 0.5746, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.7189189189189189, |
| "grad_norm": 0.6897152066230774, |
| "learning_rate": 0.0001624542944437139, |
| "loss": 0.5744, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.7243243243243244, |
| "grad_norm": 0.6489009857177734, |
| "learning_rate": 0.00016171472306414554, |
| "loss": 0.5905, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.7297297297297297, |
| "grad_norm": 0.5703084468841553, |
| "learning_rate": 0.0001609696588459307, |
| "loss": 0.5893, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7351351351351352, |
| "grad_norm": 0.5917540192604065, |
| "learning_rate": 0.00016021916810254097, |
| "loss": 0.5878, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.7405405405405405, |
| "grad_norm": 0.6699403524398804, |
| "learning_rate": 0.00015946331763042867, |
| "loss": 0.5776, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.745945945945946, |
| "grad_norm": 0.6214162111282349, |
| "learning_rate": 0.00015870217470308188, |
| "loss": 0.5866, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7513513513513513, |
| "grad_norm": 0.5269213914871216, |
| "learning_rate": 0.0001579358070650367, |
| "loss": 0.5797, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7567567567567568, |
| "grad_norm": 0.5156534910202026, |
| "learning_rate": 0.00015716428292584787, |
| "loss": 0.5837, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7621621621621621, |
| "grad_norm": 0.3855270445346832, |
| "learning_rate": 0.0001563876709540178, |
| "loss": 0.5794, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.7675675675675676, |
| "grad_norm": 0.42134228348731995, |
| "learning_rate": 0.00015560604027088477, |
| "loss": 0.5607, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.772972972972973, |
| "grad_norm": 0.2681983709335327, |
| "learning_rate": 0.00015481946044447099, |
| "loss": 0.5887, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.7783783783783784, |
| "grad_norm": 0.5026779174804688, |
| "learning_rate": 0.00015402800148329071, |
| "loss": 0.5951, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7837837837837838, |
| "grad_norm": 0.5202389359474182, |
| "learning_rate": 0.0001532317338301192, |
| "loss": 0.5963, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.7891891891891892, |
| "grad_norm": 0.43367475271224976, |
| "learning_rate": 0.00015243072835572318, |
| "loss": 0.563, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.7945945945945946, |
| "grad_norm": 0.4822995960712433, |
| "learning_rate": 0.00015162505635255287, |
| "loss": 0.5675, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.4385891854763031, |
| "learning_rate": 0.00015081478952839693, |
| "loss": 0.5672, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.8054054054054054, |
| "grad_norm": 0.27161282300949097, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 0.5564, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 0.47754400968551636, |
| "learning_rate": 0.0001491807602866442, |
| "loss": 0.5865, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8162162162162162, |
| "grad_norm": 0.6711156368255615, |
| "learning_rate": 0.00014835714330369446, |
| "loss": 0.5818, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.8216216216216217, |
| "grad_norm": 0.713715136051178, |
| "learning_rate": 0.000147529222356109, |
| "loss": 0.5807, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.827027027027027, |
| "grad_norm": 0.7641165852546692, |
| "learning_rate": 0.00014669707113191483, |
| "loss": 0.5604, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.8324324324324325, |
| "grad_norm": 0.6011711955070496, |
| "learning_rate": 0.00014586076369564908, |
| "loss": 0.5778, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.8378378378378378, |
| "grad_norm": 0.43020206689834595, |
| "learning_rate": 0.00014502037448176734, |
| "loss": 0.5785, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8432432432432433, |
| "grad_norm": 0.4680975377559662, |
| "learning_rate": 0.00014417597828801832, |
| "loss": 0.5785, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.8486486486486486, |
| "grad_norm": 0.5653423070907593, |
| "learning_rate": 0.00014332765026878687, |
| "loss": 0.5913, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.8540540540540541, |
| "grad_norm": 0.6896083354949951, |
| "learning_rate": 0.0001424754659284048, |
| "loss": 0.5769, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.8594594594594595, |
| "grad_norm": 0.5635109543800354, |
| "learning_rate": 0.00014161950111443077, |
| "loss": 0.5552, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.8648648648648649, |
| "grad_norm": 0.44970378279685974, |
| "learning_rate": 0.00014075983201089964, |
| "loss": 0.5716, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8702702702702703, |
| "grad_norm": 0.8098542094230652, |
| "learning_rate": 0.00013989653513154165, |
| "loss": 0.5913, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.8756756756756757, |
| "grad_norm": 0.7309775948524475, |
| "learning_rate": 0.00013902968731297255, |
| "loss": 0.593, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.8810810810810811, |
| "grad_norm": 0.7028838992118835, |
| "learning_rate": 0.00013815936570785487, |
| "loss": 0.5623, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.8864864864864865, |
| "grad_norm": 0.6845377087593079, |
| "learning_rate": 0.00013728564777803088, |
| "loss": 0.5829, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.8918918918918919, |
| "grad_norm": 0.43059009313583374, |
| "learning_rate": 0.0001364086112876284, |
| "loss": 0.5786, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.8972972972972973, |
| "grad_norm": 0.8795806765556335, |
| "learning_rate": 0.00013552833429613938, |
| "loss": 0.5714, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.9027027027027027, |
| "grad_norm": 1.0658906698226929, |
| "learning_rate": 0.00013464489515147238, |
| "loss": 0.5688, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.9081081081081082, |
| "grad_norm": 0.7074139714241028, |
| "learning_rate": 0.00013375837248297926, |
| "loss": 0.5736, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.9135135135135135, |
| "grad_norm": 0.5696041584014893, |
| "learning_rate": 0.0001328688451944569, |
| "loss": 0.5862, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.918918918918919, |
| "grad_norm": 0.46176642179489136, |
| "learning_rate": 0.00013197639245712454, |
| "loss": 0.5806, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.9243243243243243, |
| "grad_norm": 0.6477006673812866, |
| "learning_rate": 0.00013108109370257712, |
| "loss": 0.5508, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.9297297297297298, |
| "grad_norm": 0.6348613500595093, |
| "learning_rate": 0.0001301830286157157, |
| "loss": 0.5763, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.9351351351351351, |
| "grad_norm": 0.3889661729335785, |
| "learning_rate": 0.00012928227712765504, |
| "loss": 0.5795, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.9405405405405406, |
| "grad_norm": 0.7158688306808472, |
| "learning_rate": 0.00012837891940860972, |
| "loss": 0.5504, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.9459459459459459, |
| "grad_norm": 0.49243494868278503, |
| "learning_rate": 0.0001274730358607583, |
| "loss": 0.5628, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9513513513513514, |
| "grad_norm": 0.4271713197231293, |
| "learning_rate": 0.00012656470711108764, |
| "loss": 0.5696, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.9567567567567568, |
| "grad_norm": 0.6557771563529968, |
| "learning_rate": 0.00012565401400421651, |
| "loss": 0.5895, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.9621621621621622, |
| "grad_norm": 0.39298897981643677, |
| "learning_rate": 0.00012474103759520027, |
| "loss": 0.5669, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.9675675675675676, |
| "grad_norm": 0.4338141083717346, |
| "learning_rate": 0.0001238258591423165, |
| "loss": 0.5595, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.972972972972973, |
| "grad_norm": 0.4943206012248993, |
| "learning_rate": 0.000122908560099833, |
| "loss": 0.5636, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9783783783783784, |
| "grad_norm": 0.3071780502796173, |
| "learning_rate": 0.00012198922211075778, |
| "loss": 0.5771, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.9837837837837838, |
| "grad_norm": 0.3990117907524109, |
| "learning_rate": 0.00012106792699957263, |
| "loss": 0.5546, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.9891891891891892, |
| "grad_norm": 0.3104795217514038, |
| "learning_rate": 0.00012014475676495052, |
| "loss": 0.5523, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.9945945945945946, |
| "grad_norm": 0.40355923771858215, |
| "learning_rate": 0.0001192197935724573, |
| "loss": 0.5621, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.42520806193351746, |
| "learning_rate": 0.00011829311974723867, |
| "loss": 0.5742, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.0054054054054054, |
| "grad_norm": 0.2284257560968399, |
| "learning_rate": 0.00011736481776669306, |
| "loss": 0.5681, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.0108108108108107, |
| "grad_norm": 0.35243916511535645, |
| "learning_rate": 0.00011643497025313061, |
| "loss": 0.5641, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.0162162162162163, |
| "grad_norm": 0.42704927921295166, |
| "learning_rate": 0.00011550365996641979, |
| "loss": 0.5634, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.0216216216216216, |
| "grad_norm": 0.3367633819580078, |
| "learning_rate": 0.00011457096979662114, |
| "loss": 0.5705, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.027027027027027, |
| "grad_norm": 0.38994061946868896, |
| "learning_rate": 0.00011363698275661001, |
| "loss": 0.5522, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.0324324324324325, |
| "grad_norm": 0.3361996114253998, |
| "learning_rate": 0.00011270178197468789, |
| "loss": 0.5688, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.037837837837838, |
| "grad_norm": 0.29897335171699524, |
| "learning_rate": 0.00011176545068718385, |
| "loss": 0.5619, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.0432432432432432, |
| "grad_norm": 0.44845789670944214, |
| "learning_rate": 0.0001108280722310462, |
| "loss": 0.5599, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.0486486486486486, |
| "grad_norm": 0.3990190923213959, |
| "learning_rate": 0.00010988973003642499, |
| "loss": 0.5887, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.054054054054054, |
| "grad_norm": 0.4176868796348572, |
| "learning_rate": 0.00010895050761924668, |
| "loss": 0.5816, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.0594594594594595, |
| "grad_norm": 0.5121976733207703, |
| "learning_rate": 0.00010801048857378071, |
| "loss": 0.5629, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.0648648648648649, |
| "grad_norm": 0.42421454191207886, |
| "learning_rate": 0.00010706975656519946, |
| "loss": 0.5549, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.0702702702702702, |
| "grad_norm": 0.34359657764434814, |
| "learning_rate": 0.00010612839532213164, |
| "loss": 0.554, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.0756756756756758, |
| "grad_norm": 0.2899879813194275, |
| "learning_rate": 0.00010518648862921012, |
| "loss": 0.5679, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.0810810810810811, |
| "grad_norm": 0.3595804274082184, |
| "learning_rate": 0.00010424412031961484, |
| "loss": 0.5765, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0864864864864865, |
| "grad_norm": 0.4479254484176636, |
| "learning_rate": 0.00010330137426761135, |
| "loss": 0.5824, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.0918918918918918, |
| "grad_norm": 0.4118141829967499, |
| "learning_rate": 0.00010235833438108571, |
| "loss": 0.5684, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.0972972972972972, |
| "grad_norm": 0.3013007640838623, |
| "learning_rate": 0.00010141508459407623, |
| "loss": 0.5634, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.1027027027027028, |
| "grad_norm": 0.32391178607940674, |
| "learning_rate": 0.00010047170885930324, |
| "loss": 0.5702, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.1081081081081081, |
| "grad_norm": 0.35440289974212646, |
| "learning_rate": 9.95282911406968e-05, |
| "loss": 0.5679, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.1135135135135135, |
| "grad_norm": 0.3799758851528168, |
| "learning_rate": 9.858491540592382e-05, |
| "loss": 0.5769, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.118918918918919, |
| "grad_norm": 0.37513500452041626, |
| "learning_rate": 9.764166561891432e-05, |
| "loss": 0.5669, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.1243243243243244, |
| "grad_norm": 0.4138847291469574, |
| "learning_rate": 9.669862573238863e-05, |
| "loss": 0.5883, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.1297297297297297, |
| "grad_norm": 0.4087463915348053, |
| "learning_rate": 9.57558796803852e-05, |
| "loss": 0.5564, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.135135135135135, |
| "grad_norm": 0.3634737432003021, |
| "learning_rate": 9.48135113707899e-05, |
| "loss": 0.5623, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.1405405405405404, |
| "grad_norm": 0.28195977210998535, |
| "learning_rate": 9.38716046778684e-05, |
| "loss": 0.5632, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.145945945945946, |
| "grad_norm": 0.26449114084243774, |
| "learning_rate": 9.293024343480055e-05, |
| "loss": 0.5664, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.1513513513513514, |
| "grad_norm": 0.3275916576385498, |
| "learning_rate": 9.198951142621929e-05, |
| "loss": 0.5582, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.1567567567567567, |
| "grad_norm": 0.2918509840965271, |
| "learning_rate": 9.104949238075336e-05, |
| "loss": 0.5595, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.1621621621621623, |
| "grad_norm": 0.31472450494766235, |
| "learning_rate": 9.011026996357503e-05, |
| "loss": 0.5579, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.1675675675675676, |
| "grad_norm": 0.251597136259079, |
| "learning_rate": 8.917192776895382e-05, |
| "loss": 0.5248, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.172972972972973, |
| "grad_norm": 0.364433228969574, |
| "learning_rate": 8.823454931281616e-05, |
| "loss": 0.5691, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.1783783783783783, |
| "grad_norm": 0.4497614800930023, |
| "learning_rate": 8.729821802531212e-05, |
| "loss": 0.5468, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.1837837837837837, |
| "grad_norm": 0.3149522542953491, |
| "learning_rate": 8.636301724339004e-05, |
| "loss": 0.5561, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.1891891891891893, |
| "grad_norm": 0.3531090021133423, |
| "learning_rate": 8.542903020337887e-05, |
| "loss": 0.5607, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.1945945945945946, |
| "grad_norm": 0.36666902899742126, |
| "learning_rate": 8.449634003358022e-05, |
| "loss": 0.5629, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.27004843950271606, |
| "learning_rate": 8.356502974686941e-05, |
| "loss": 0.5759, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.2054054054054055, |
| "grad_norm": 0.38194379210472107, |
| "learning_rate": 8.263518223330697e-05, |
| "loss": 0.5626, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.2108108108108109, |
| "grad_norm": 0.42150571942329407, |
| "learning_rate": 8.170688025276134e-05, |
| "loss": 0.5692, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.2162162162162162, |
| "grad_norm": 0.3516136407852173, |
| "learning_rate": 8.078020642754274e-05, |
| "loss": 0.5656, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.2216216216216216, |
| "grad_norm": 0.2839685082435608, |
| "learning_rate": 7.985524323504948e-05, |
| "loss": 0.5591, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.227027027027027, |
| "grad_norm": 0.2869662046432495, |
| "learning_rate": 7.89320730004274e-05, |
| "loss": 0.5482, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.2324324324324325, |
| "grad_norm": 0.3703990578651428, |
| "learning_rate": 7.801077788924224e-05, |
| "loss": 0.5596, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.2378378378378379, |
| "grad_norm": 0.36322546005249023, |
| "learning_rate": 7.709143990016702e-05, |
| "loss": 0.5678, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.2432432432432432, |
| "grad_norm": 0.33027511835098267, |
| "learning_rate": 7.617414085768351e-05, |
| "loss": 0.5607, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.2486486486486488, |
| "grad_norm": 0.2999548316001892, |
| "learning_rate": 7.525896240479976e-05, |
| "loss": 0.5468, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.2540540540540541, |
| "grad_norm": 0.2634562849998474, |
| "learning_rate": 7.434598599578351e-05, |
| "loss": 0.5731, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.2594594594594595, |
| "grad_norm": 0.3004055917263031, |
| "learning_rate": 7.343529288891239e-05, |
| "loss": 0.5608, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.2648648648648648, |
| "grad_norm": 0.3801259994506836, |
| "learning_rate": 7.252696413924174e-05, |
| "loss": 0.5561, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.2702702702702702, |
| "grad_norm": 0.297504186630249, |
| "learning_rate": 7.162108059139032e-05, |
| "loss": 0.5571, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.2756756756756757, |
| "grad_norm": 0.2872467637062073, |
| "learning_rate": 7.071772287234497e-05, |
| "loss": 0.5487, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.281081081081081, |
| "grad_norm": 0.3155842423439026, |
| "learning_rate": 6.981697138428434e-05, |
| "loss": 0.5582, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.2864864864864864, |
| "grad_norm": 0.27752622961997986, |
| "learning_rate": 6.891890629742288e-05, |
| "loss": 0.5403, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.291891891891892, |
| "grad_norm": 0.3249455690383911, |
| "learning_rate": 6.802360754287547e-05, |
| "loss": 0.5583, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.2972972972972974, |
| "grad_norm": 0.308685302734375, |
| "learning_rate": 6.713115480554313e-05, |
| "loss": 0.5597, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3027027027027027, |
| "grad_norm": 0.2561638355255127, |
| "learning_rate": 6.624162751702076e-05, |
| "loss": 0.5391, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.308108108108108, |
| "grad_norm": 0.4116757810115814, |
| "learning_rate": 6.535510484852767e-05, |
| "loss": 0.5485, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.3135135135135134, |
| "grad_norm": 0.3048592805862427, |
| "learning_rate": 6.447166570386063e-05, |
| "loss": 0.5495, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.318918918918919, |
| "grad_norm": 0.26773855090141296, |
| "learning_rate": 6.35913887123716e-05, |
| "loss": 0.5816, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.3243243243243243, |
| "grad_norm": 0.4389781653881073, |
| "learning_rate": 6.271435222196916e-05, |
| "loss": 0.5456, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.3297297297297297, |
| "grad_norm": 0.2906099855899811, |
| "learning_rate": 6.184063429214515e-05, |
| "loss": 0.5579, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.3351351351351353, |
| "grad_norm": 0.29588866233825684, |
| "learning_rate": 6.097031268702746e-05, |
| "loss": 0.5451, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.3405405405405406, |
| "grad_norm": 0.37067651748657227, |
| "learning_rate": 6.010346486845837e-05, |
| "loss": 0.5613, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.345945945945946, |
| "grad_norm": 0.28503182530403137, |
| "learning_rate": 5.924016798910037e-05, |
| "loss": 0.5541, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.3513513513513513, |
| "grad_norm": 0.2947586178779602, |
| "learning_rate": 5.838049888556925e-05, |
| "loss": 0.5543, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3567567567567567, |
| "grad_norm": 0.2248247265815735, |
| "learning_rate": 5.752453407159522e-05, |
| "loss": 0.5423, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.3621621621621622, |
| "grad_norm": 0.2677771747112274, |
| "learning_rate": 5.667234973121317e-05, |
| "loss": 0.5573, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.3675675675675676, |
| "grad_norm": 0.22564172744750977, |
| "learning_rate": 5.5824021711981686e-05, |
| "loss": 0.5543, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.372972972972973, |
| "grad_norm": 0.23986554145812988, |
| "learning_rate": 5.497962551823266e-05, |
| "loss": 0.5529, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.3783783783783785, |
| "grad_norm": 0.2411298155784607, |
| "learning_rate": 5.4139236304350935e-05, |
| "loss": 0.5596, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.3837837837837839, |
| "grad_norm": 0.22724369168281555, |
| "learning_rate": 5.33029288680852e-05, |
| "loss": 0.5603, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.3891891891891892, |
| "grad_norm": 0.21808037161827087, |
| "learning_rate": 5.247077764389099e-05, |
| "loss": 0.5581, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.3945945945945946, |
| "grad_norm": 0.20949189364910126, |
| "learning_rate": 5.1642856696305575e-05, |
| "loss": 0.5318, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.20597508549690247, |
| "learning_rate": 5.081923971335582e-05, |
| "loss": 0.5507, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.4054054054054055, |
| "grad_norm": 0.27058327198028564, |
| "learning_rate": 5.000000000000002e-05, |
| "loss": 0.5589, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4108108108108108, |
| "grad_norm": 0.22068722546100616, |
| "learning_rate": 4.918521047160308e-05, |
| "loss": 0.5444, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.4162162162162162, |
| "grad_norm": 0.33482491970062256, |
| "learning_rate": 4.837494364744711e-05, |
| "loss": 0.5403, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.4216216216216218, |
| "grad_norm": 0.22971421480178833, |
| "learning_rate": 4.756927164427685e-05, |
| "loss": 0.54, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.427027027027027, |
| "grad_norm": 0.22710531949996948, |
| "learning_rate": 4.6768266169880804e-05, |
| "loss": 0.5614, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.4324324324324325, |
| "grad_norm": 0.2634375989437103, |
| "learning_rate": 4.597199851670932e-05, |
| "loss": 0.5588, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.4378378378378378, |
| "grad_norm": 0.22090476751327515, |
| "learning_rate": 4.518053955552903e-05, |
| "loss": 0.5664, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.4432432432432432, |
| "grad_norm": 0.2724359333515167, |
| "learning_rate": 4.4393959729115244e-05, |
| "loss": 0.5539, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.4486486486486487, |
| "grad_norm": 0.20361758768558502, |
| "learning_rate": 4.3612329045982236e-05, |
| "loss": 0.5434, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.454054054054054, |
| "grad_norm": 0.22764244675636292, |
| "learning_rate": 4.283571707415214e-05, |
| "loss": 0.5617, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.4594594594594594, |
| "grad_norm": 0.2354433387517929, |
| "learning_rate": 4.206419293496333e-05, |
| "loss": 0.5568, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.464864864864865, |
| "grad_norm": 0.23961907625198364, |
| "learning_rate": 4.129782529691815e-05, |
| "loss": 0.5705, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.4702702702702704, |
| "grad_norm": 0.20233069360256195, |
| "learning_rate": 4.053668236957134e-05, |
| "loss": 0.5673, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.4756756756756757, |
| "grad_norm": 0.22353792190551758, |
| "learning_rate": 3.978083189745907e-05, |
| "loss": 0.5505, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.481081081081081, |
| "grad_norm": 0.20876029133796692, |
| "learning_rate": 3.903034115406931e-05, |
| "loss": 0.5569, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.4864864864864864, |
| "grad_norm": 0.22701065242290497, |
| "learning_rate": 3.828527693585451e-05, |
| "loss": 0.5719, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.491891891891892, |
| "grad_norm": 0.21358339488506317, |
| "learning_rate": 3.7545705556286126e-05, |
| "loss": 0.546, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.4972972972972973, |
| "grad_norm": 0.21339459717273712, |
| "learning_rate": 3.681169283995279e-05, |
| "loss": 0.533, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.5027027027027027, |
| "grad_norm": 0.22355295717716217, |
| "learning_rate": 3.6083304116701535e-05, |
| "loss": 0.5425, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.5081081081081082, |
| "grad_norm": 0.2224012166261673, |
| "learning_rate": 3.536060421582309e-05, |
| "loss": 0.5545, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.5135135135135136, |
| "grad_norm": 0.2069329172372818, |
| "learning_rate": 3.464365746028208e-05, |
| "loss": 0.5269, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.518918918918919, |
| "grad_norm": 0.2479923665523529, |
| "learning_rate": 3.393252766099187e-05, |
| "loss": 0.5583, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.5243243243243243, |
| "grad_norm": 0.22443261742591858, |
| "learning_rate": 3.322727811113516e-05, |
| "loss": 0.5602, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.5297297297297296, |
| "grad_norm": 0.24400541186332703, |
| "learning_rate": 3.252797158053077e-05, |
| "loss": 0.548, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.535135135135135, |
| "grad_norm": 0.1754683256149292, |
| "learning_rate": 3.1834670310046734e-05, |
| "loss": 0.5437, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.5405405405405406, |
| "grad_norm": 0.1994575560092926, |
| "learning_rate": 3.114743600606078e-05, |
| "loss": 0.5526, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.545945945945946, |
| "grad_norm": 0.22562991082668304, |
| "learning_rate": 3.0466329834968233e-05, |
| "loss": 0.5436, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.5513513513513515, |
| "grad_norm": 0.18434454500675201, |
| "learning_rate": 2.979141241773775e-05, |
| "loss": 0.5442, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.5567567567567568, |
| "grad_norm": 0.2776215970516205, |
| "learning_rate": 2.9122743824516195e-05, |
| "loss": 0.5697, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.5621621621621622, |
| "grad_norm": 0.17563948035240173, |
| "learning_rate": 2.8460383569281824e-05, |
| "loss": 0.5298, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.5675675675675675, |
| "grad_norm": 0.22039195895195007, |
| "learning_rate": 2.7804390604547557e-05, |
| "loss": 0.5405, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.572972972972973, |
| "grad_norm": 0.19329974055290222, |
| "learning_rate": 2.7154823316113932e-05, |
| "loss": 0.5485, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.5783783783783782, |
| "grad_norm": 0.1922396570444107, |
| "learning_rate": 2.6511739517872426e-05, |
| "loss": 0.554, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.5837837837837838, |
| "grad_norm": 0.22808901965618134, |
| "learning_rate": 2.587519644666001e-05, |
| "loss": 0.5453, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.5891891891891892, |
| "grad_norm": 0.17797237634658813, |
| "learning_rate": 2.5245250757164663e-05, |
| "loss": 0.554, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.5945945945945947, |
| "grad_norm": 0.2405869960784912, |
| "learning_rate": 2.462195851688306e-05, |
| "loss": 0.5283, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.17364168167114258, |
| "learning_rate": 2.4005375201130274e-05, |
| "loss": 0.5506, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.6054054054054054, |
| "grad_norm": 0.19880247116088867, |
| "learning_rate": 2.339555568810221e-05, |
| "loss": 0.5295, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.6108108108108108, |
| "grad_norm": 0.15347057580947876, |
| "learning_rate": 2.2792554253991415e-05, |
| "loss": 0.5462, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.6162162162162161, |
| "grad_norm": 0.192508727312088, |
| "learning_rate": 2.2196424568156073e-05, |
| "loss": 0.5531, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.6216216216216215, |
| "grad_norm": 0.1681094914674759, |
| "learning_rate": 2.160721968834344e-05, |
| "loss": 0.5553, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.627027027027027, |
| "grad_norm": 0.17024287581443787, |
| "learning_rate": 2.102499205596743e-05, |
| "loss": 0.5511, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.6324324324324324, |
| "grad_norm": 0.17030170559883118, |
| "learning_rate": 2.0449793491441028e-05, |
| "loss": 0.5479, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.637837837837838, |
| "grad_norm": 0.17550453543663025, |
| "learning_rate": 1.9881675189564254e-05, |
| "loss": 0.549, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.6432432432432433, |
| "grad_norm": 0.1620909720659256, |
| "learning_rate": 1.93206877149676e-05, |
| "loss": 0.5547, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.6486486486486487, |
| "grad_norm": 0.18618185818195343, |
| "learning_rate": 1.8766880997611424e-05, |
| "loss": 0.5649, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.654054054054054, |
| "grad_norm": 0.16562554240226746, |
| "learning_rate": 1.8220304328342252e-05, |
| "loss": 0.5274, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.6594594594594594, |
| "grad_norm": 0.16524571180343628, |
| "learning_rate": 1.7681006354505493e-05, |
| "loss": 0.5461, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.6648648648648647, |
| "grad_norm": 0.14831188321113586, |
| "learning_rate": 1.7149035075615794e-05, |
| "loss": 0.5544, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.6702702702702703, |
| "grad_norm": 0.1687830537557602, |
| "learning_rate": 1.6624437839084862e-05, |
| "loss": 0.5393, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.6756756756756757, |
| "grad_norm": 0.1825455278158188, |
| "learning_rate": 1.6107261336007285e-05, |
| "loss": 0.5389, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.6810810810810812, |
| "grad_norm": 0.16670149564743042, |
| "learning_rate": 1.5597551597004966e-05, |
| "loss": 0.5369, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.6864864864864866, |
| "grad_norm": 0.1589886099100113, |
| "learning_rate": 1.5095353988130235e-05, |
| "loss": 0.5592, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.691891891891892, |
| "grad_norm": 0.17790549993515015, |
| "learning_rate": 1.4600713206827932e-05, |
| "loss": 0.5612, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.6972972972972973, |
| "grad_norm": 0.15449143946170807, |
| "learning_rate": 1.4113673277957395e-05, |
| "loss": 0.5439, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.7027027027027026, |
| "grad_norm": 0.14207641780376434, |
| "learning_rate": 1.3634277549873953e-05, |
| "loss": 0.5411, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.708108108108108, |
| "grad_norm": 0.16680589318275452, |
| "learning_rate": 1.3162568690570743e-05, |
| "loss": 0.5333, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.7135135135135136, |
| "grad_norm": 0.16983501613140106, |
| "learning_rate": 1.2698588683881186e-05, |
| "loss": 0.5374, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.718918918918919, |
| "grad_norm": 0.14857017993927002, |
| "learning_rate": 1.224237882574224e-05, |
| "loss": 0.5399, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.7243243243243245, |
| "grad_norm": 0.14769425988197327, |
| "learning_rate": 1.1793979720518866e-05, |
| "loss": 0.5586, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.7297297297297298, |
| "grad_norm": 0.17530100047588348, |
| "learning_rate": 1.1353431277390126e-05, |
| "loss": 0.5618, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7351351351351352, |
| "grad_norm": 0.156886026263237, |
| "learning_rate": 1.0920772706797167e-05, |
| "loss": 0.5622, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.7405405405405405, |
| "grad_norm": 0.14536841213703156, |
| "learning_rate": 1.0496042516953209e-05, |
| "loss": 0.5428, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.7459459459459459, |
| "grad_norm": 0.162484809756279, |
| "learning_rate": 1.0079278510416313e-05, |
| "loss": 0.5443, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.7513513513513512, |
| "grad_norm": 0.16919377446174622, |
| "learning_rate": 9.670517780724775e-06, |
| "loss": 0.5431, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.7567567567567568, |
| "grad_norm": 0.1344226747751236, |
| "learning_rate": 9.269796709095558e-06, |
| "loss": 0.5556, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.7621621621621621, |
| "grad_norm": 0.14219771325588226, |
| "learning_rate": 8.87715096118642e-06, |
| "loss": 0.5542, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.7675675675675677, |
| "grad_norm": 0.14828014373779297, |
| "learning_rate": 8.492615483921395e-06, |
| "loss": 0.5307, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.772972972972973, |
| "grad_norm": 0.15327297151088715, |
| "learning_rate": 8.116224502380387e-06, |
| "loss": 0.5572, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.7783783783783784, |
| "grad_norm": 0.13973894715309143, |
| "learning_rate": 7.74801151675314e-06, |
| "loss": 0.5585, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.7837837837837838, |
| "grad_norm": 0.14004996418952942, |
| "learning_rate": 7.3880092993574125e-06, |
| "loss": 0.5583, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.7891891891891891, |
| "grad_norm": 0.14245465397834778, |
| "learning_rate": 7.03624989172228e-06, |
| "loss": 0.5327, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.7945945945945945, |
| "grad_norm": 0.13383865356445312, |
| "learning_rate": 6.692764601736268e-06, |
| "loss": 0.5423, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.13818518817424774, |
| "learning_rate": 6.357584000860761e-06, |
| "loss": 0.5416, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.8054054054054054, |
| "grad_norm": 0.15666233003139496, |
| "learning_rate": 6.030737921409169e-06, |
| "loss": 0.5258, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.810810810810811, |
| "grad_norm": 0.13427962362766266, |
| "learning_rate": 5.71225545389158e-06, |
| "loss": 0.551, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.8162162162162163, |
| "grad_norm": 0.4789113402366638, |
| "learning_rate": 5.402164944425758e-06, |
| "loss": 0.5499, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.8216216216216217, |
| "grad_norm": 0.13906101882457733, |
| "learning_rate": 5.100493992214128e-06, |
| "loss": 0.5499, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.827027027027027, |
| "grad_norm": 0.13979266583919525, |
| "learning_rate": 4.807269447087348e-06, |
| "loss": 0.5273, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.8324324324324324, |
| "grad_norm": 0.14405637979507446, |
| "learning_rate": 4.5225174071146455e-06, |
| "loss": 0.5469, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.8378378378378377, |
| "grad_norm": 0.1318741887807846, |
| "learning_rate": 4.24626321628091e-06, |
| "loss": 0.5507, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.8432432432432433, |
| "grad_norm": 0.1360122561454773, |
| "learning_rate": 3.9785314622310495e-06, |
| "loss": 0.5467, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.8486486486486486, |
| "grad_norm": 0.15425017476081848, |
| "learning_rate": 3.7193459740815674e-06, |
| "loss": 0.5557, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.8540540540540542, |
| "grad_norm": 0.13652145862579346, |
| "learning_rate": 3.4687298202996655e-06, |
| "loss": 0.5512, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.8594594594594596, |
| "grad_norm": 0.13713712990283966, |
| "learning_rate": 3.226705306650113e-06, |
| "loss": 0.5333, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.864864864864865, |
| "grad_norm": 0.19826094806194305, |
| "learning_rate": 2.9932939742099208e-06, |
| "loss": 0.5419, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.8702702702702703, |
| "grad_norm": 0.13193316757678986, |
| "learning_rate": 2.7685165974510986e-06, |
| "loss": 0.5494, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.8756756756756756, |
| "grad_norm": 0.13418689370155334, |
| "learning_rate": 2.552393182391677e-06, |
| "loss": 0.5572, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.881081081081081, |
| "grad_norm": 0.1294754445552826, |
| "learning_rate": 2.3449429648150665e-06, |
| "loss": 0.5414, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.8864864864864865, |
| "grad_norm": 0.12898264825344086, |
| "learning_rate": 2.1461844085580385e-06, |
| "loss": 0.5634, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.8918918918918919, |
| "grad_norm": 0.12305044382810593, |
| "learning_rate": 1.9561352038673263e-06, |
| "loss": 0.5484, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.8972972972972975, |
| "grad_norm": 0.14300554990768433, |
| "learning_rate": 1.7748122658251876e-06, |
| "loss": 0.5359, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.9027027027027028, |
| "grad_norm": 0.12813441455364227, |
| "learning_rate": 1.6022317328438506e-06, |
| "loss": 0.5343, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.9081081081081082, |
| "grad_norm": 0.12812206149101257, |
| "learning_rate": 1.4384089652291543e-06, |
| "loss": 0.547, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.9135135135135135, |
| "grad_norm": 0.12215188145637512, |
| "learning_rate": 1.2833585438134287e-06, |
| "loss": 0.561, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.9189189189189189, |
| "grad_norm": 0.1359054297208786, |
| "learning_rate": 1.1370942686577347e-06, |
| "loss": 0.553, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.9243243243243242, |
| "grad_norm": 0.13287091255187988, |
| "learning_rate": 9.996291578236228e-07, |
| "loss": 0.5247, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.9297297297297298, |
| "grad_norm": 0.12276577204465866, |
| "learning_rate": 8.709754462144615e-07, |
| "loss": 0.5427, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.9351351351351351, |
| "grad_norm": 0.13017451763153076, |
| "learning_rate": 7.511445844864962e-07, |
| "loss": 0.5476, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.9405405405405407, |
| "grad_norm": 0.12089353799819946, |
| "learning_rate": 6.401472380297091e-07, |
| "loss": 0.5236, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.945945945945946, |
| "grad_norm": 0.12430868297815323, |
| "learning_rate": 5.379932860185122e-07, |
| "loss": 0.5372, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.9513513513513514, |
| "grad_norm": 0.11779884248971939, |
| "learning_rate": 4.44691820532539e-07, |
| "loss": 0.5464, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.9567567567567568, |
| "grad_norm": 0.1252037137746811, |
| "learning_rate": 3.6025114574734785e-07, |
| "loss": 0.564, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.962162162162162, |
| "grad_norm": 0.11790075153112411, |
| "learning_rate": 2.846787771953574e-07, |
| "loss": 0.5419, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.9675675675675675, |
| "grad_norm": 0.11766640096902847, |
| "learning_rate": 2.179814410969261e-07, |
| "loss": 0.5334, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.972972972972973, |
| "grad_norm": 0.11944753676652908, |
| "learning_rate": 1.6016507376169777e-07, |
| "loss": 0.5384, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.9783783783783784, |
| "grad_norm": 0.12327762693166733, |
| "learning_rate": 1.1123482106021322e-07, |
| "loss": 0.5508, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.983783783783784, |
| "grad_norm": 0.1193946972489357, |
| "learning_rate": 7.119503796599868e-08, |
| "loss": 0.5294, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.9891891891891893, |
| "grad_norm": 0.12587259709835052, |
| "learning_rate": 4.0049288167842705e-08, |
| "loss": 0.5297, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.9945945945945946, |
| "grad_norm": 0.1217728778719902, |
| "learning_rate": 1.7800343752683023e-08, |
| "loss": 0.5367, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.12325151264667511, |
| "learning_rate": 4.4501849589040355e-09, |
| "loss": 0.5457, |
| "step": 370 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 370, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.443139033277399e+19, |
| "train_batch_size": 24, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|