diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.650393527436843, + "epoch": 0.7504540701194342, "eval_steps": 500, - "global_step": 11817, + "global_step": 13635, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -82726,6 +82726,12732 @@ "learning_rate": 7.634761785456671e-06, "loss": 0.7012, "step": 11817 + }, + { + "epoch": 0.6504485662391987, + "grad_norm": 0.7214943170547485, + "learning_rate": 7.634393374521478e-06, + "loss": 0.7386, + "step": 11818 + }, + { + "epoch": 0.6505036050415542, + "grad_norm": 0.7026216387748718, + "learning_rate": 7.63402494378689e-06, + "loss": 0.7444, + "step": 11819 + }, + { + "epoch": 0.6505586438439099, + "grad_norm": 0.6271201372146606, + "learning_rate": 7.633656493255677e-06, + "loss": 0.6567, + "step": 11820 + }, + { + "epoch": 0.6506136826462656, + "grad_norm": 0.8359349370002747, + "learning_rate": 7.633288022930606e-06, + "loss": 0.7081, + "step": 11821 + }, + { + "epoch": 0.6506687214486213, + "grad_norm": 0.7009666562080383, + "learning_rate": 7.632919532814444e-06, + "loss": 0.6892, + "step": 11822 + }, + { + "epoch": 0.6507237602509769, + "grad_norm": 0.7445069551467896, + "learning_rate": 7.632551022909966e-06, + "loss": 0.7854, + "step": 11823 + }, + { + "epoch": 0.6507787990533326, + "grad_norm": 0.7204466462135315, + "learning_rate": 7.63218249321994e-06, + "loss": 0.8065, + "step": 11824 + }, + { + "epoch": 0.6508338378556883, + "grad_norm": 0.7058166265487671, + "learning_rate": 7.631813943747135e-06, + "loss": 0.6668, + "step": 11825 + }, + { + "epoch": 0.650888876658044, + "grad_norm": 0.739919126033783, + "learning_rate": 7.631445374494319e-06, + "loss": 0.8657, + "step": 11826 + }, + { + "epoch": 0.6509439154603995, + "grad_norm": 1.0444670915603638, + "learning_rate": 7.631076785464263e-06, + "loss": 0.7226, + "step": 11827 + }, + { + "epoch": 0.6509989542627552, + "grad_norm": 0.7146627306938171, + "learning_rate": 7.630708176659743e-06, + "loss": 0.7567, + "step": 11828 + }, + { + "epoch": 0.6510539930651109, + "grad_norm": 0.6981074810028076, + "learning_rate": 7.630339548083521e-06, + "loss": 0.7158, + "step": 11829 + }, + { + "epoch": 0.6511090318674666, + "grad_norm": 0.7620309591293335, + "learning_rate": 7.629970899738372e-06, + "loss": 0.811, + "step": 11830 + }, + { + "epoch": 0.6511640706698222, + "grad_norm": 0.7017341256141663, + "learning_rate": 7.629602231627066e-06, + "loss": 0.7092, + "step": 11831 + }, + { + "epoch": 0.6512191094721779, + "grad_norm": 0.733524739742279, + "learning_rate": 7.629233543752373e-06, + "loss": 0.859, + "step": 11832 + }, + { + "epoch": 0.6512741482745336, + "grad_norm": 0.7246975898742676, + "learning_rate": 7.628864836117065e-06, + "loss": 0.7732, + "step": 11833 + }, + { + "epoch": 0.6513291870768892, + "grad_norm": 0.5763251185417175, + "learning_rate": 7.628496108723911e-06, + "loss": 0.6632, + "step": 11834 + }, + { + "epoch": 0.6513842258792448, + "grad_norm": 0.6120070815086365, + "learning_rate": 7.628127361575685e-06, + "loss": 0.6809, + "step": 11835 + }, + { + "epoch": 0.6514392646816005, + "grad_norm": 0.8650742769241333, + "learning_rate": 7.627758594675157e-06, + "loss": 0.6388, + "step": 11836 + }, + { + "epoch": 0.6514943034839562, + "grad_norm": 0.8650027513504028, + "learning_rate": 7.627389808025099e-06, + "loss": 0.7622, + "step": 11837 + }, + { + "epoch": 0.6515493422863119, + "grad_norm": 0.6683071851730347, + "learning_rate": 7.627021001628283e-06, + "loss": 0.7424, + "step": 11838 + }, + { + "epoch": 0.6516043810886675, + "grad_norm": 0.6821237206459045, + "learning_rate": 7.626652175487479e-06, + "loss": 0.7844, + "step": 11839 + }, + { + "epoch": 0.6516594198910232, + "grad_norm": 0.7142770886421204, + "learning_rate": 7.626283329605462e-06, + "loss": 0.7706, + "step": 11840 + }, + { + "epoch": 0.6517144586933789, + "grad_norm": 0.7870625257492065, + "learning_rate": 7.625914463985002e-06, + "loss": 0.7673, + "step": 11841 + }, + { + "epoch": 0.6517694974957345, + "grad_norm": 0.7386491894721985, + "learning_rate": 7.62554557862887e-06, + "loss": 0.7562, + "step": 11842 + }, + { + "epoch": 0.6518245362980901, + "grad_norm": 0.6529993414878845, + "learning_rate": 7.625176673539843e-06, + "loss": 0.8258, + "step": 11843 + }, + { + "epoch": 0.6518795751004458, + "grad_norm": 0.7010294795036316, + "learning_rate": 7.6248077487206895e-06, + "loss": 0.7773, + "step": 11844 + }, + { + "epoch": 0.6519346139028015, + "grad_norm": 0.6699075698852539, + "learning_rate": 7.624438804174184e-06, + "loss": 0.7163, + "step": 11845 + }, + { + "epoch": 0.6519896527051572, + "grad_norm": 0.6600161790847778, + "learning_rate": 7.624069839903099e-06, + "loss": 0.7355, + "step": 11846 + }, + { + "epoch": 0.6520446915075128, + "grad_norm": 0.6556873321533203, + "learning_rate": 7.623700855910205e-06, + "loss": 0.627, + "step": 11847 + }, + { + "epoch": 0.6520997303098685, + "grad_norm": 0.6867008805274963, + "learning_rate": 7.623331852198281e-06, + "loss": 0.8228, + "step": 11848 + }, + { + "epoch": 0.6521547691122241, + "grad_norm": 0.6885474324226379, + "learning_rate": 7.622962828770095e-06, + "loss": 0.6804, + "step": 11849 + }, + { + "epoch": 0.6522098079145798, + "grad_norm": 0.6903913021087646, + "learning_rate": 7.622593785628425e-06, + "loss": 0.6553, + "step": 11850 + }, + { + "epoch": 0.6522648467169354, + "grad_norm": 0.6581684947013855, + "learning_rate": 7.622224722776039e-06, + "loss": 0.7102, + "step": 11851 + }, + { + "epoch": 0.6523198855192911, + "grad_norm": 0.8261715769767761, + "learning_rate": 7.621855640215716e-06, + "loss": 0.676, + "step": 11852 + }, + { + "epoch": 0.6523749243216468, + "grad_norm": 0.6238247752189636, + "learning_rate": 7.6214865379502265e-06, + "loss": 0.7065, + "step": 11853 + }, + { + "epoch": 0.6524299631240025, + "grad_norm": 0.7350416779518127, + "learning_rate": 7.621117415982346e-06, + "loss": 0.7512, + "step": 11854 + }, + { + "epoch": 0.652485001926358, + "grad_norm": 0.7337208390235901, + "learning_rate": 7.620748274314851e-06, + "loss": 0.7593, + "step": 11855 + }, + { + "epoch": 0.6525400407287137, + "grad_norm": 0.6568214297294617, + "learning_rate": 7.620379112950511e-06, + "loss": 0.7363, + "step": 11856 + }, + { + "epoch": 0.6525950795310694, + "grad_norm": 0.7099055647850037, + "learning_rate": 7.620009931892105e-06, + "loss": 0.6631, + "step": 11857 + }, + { + "epoch": 0.652650118333425, + "grad_norm": 0.6563010215759277, + "learning_rate": 7.6196407311424035e-06, + "loss": 0.6617, + "step": 11858 + }, + { + "epoch": 0.6527051571357807, + "grad_norm": 0.6664251685142517, + "learning_rate": 7.6192715107041845e-06, + "loss": 0.7898, + "step": 11859 + }, + { + "epoch": 0.6527601959381364, + "grad_norm": 0.6524507403373718, + "learning_rate": 7.618902270580222e-06, + "loss": 0.767, + "step": 11860 + }, + { + "epoch": 0.6528152347404921, + "grad_norm": 0.7391313910484314, + "learning_rate": 7.61853301077329e-06, + "loss": 0.6015, + "step": 11861 + }, + { + "epoch": 0.6528702735428477, + "grad_norm": 0.7691878080368042, + "learning_rate": 7.618163731286167e-06, + "loss": 0.718, + "step": 11862 + }, + { + "epoch": 0.6529253123452033, + "grad_norm": 0.6524633765220642, + "learning_rate": 7.617794432121625e-06, + "loss": 0.6841, + "step": 11863 + }, + { + "epoch": 0.652980351147559, + "grad_norm": 0.7125405073165894, + "learning_rate": 7.61742511328244e-06, + "loss": 0.7654, + "step": 11864 + }, + { + "epoch": 0.6530353899499147, + "grad_norm": 0.7123568058013916, + "learning_rate": 7.617055774771389e-06, + "loss": 0.7189, + "step": 11865 + }, + { + "epoch": 0.6530904287522703, + "grad_norm": 0.6968240141868591, + "learning_rate": 7.616686416591248e-06, + "loss": 0.7201, + "step": 11866 + }, + { + "epoch": 0.653145467554626, + "grad_norm": 0.7208551168441772, + "learning_rate": 7.616317038744792e-06, + "loss": 0.6644, + "step": 11867 + }, + { + "epoch": 0.6532005063569817, + "grad_norm": 0.7320911884307861, + "learning_rate": 7.615947641234798e-06, + "loss": 0.7118, + "step": 11868 + }, + { + "epoch": 0.6532555451593374, + "grad_norm": 0.7762041687965393, + "learning_rate": 7.615578224064041e-06, + "loss": 0.7501, + "step": 11869 + }, + { + "epoch": 0.653310583961693, + "grad_norm": 0.7455989718437195, + "learning_rate": 7.6152087872352975e-06, + "loss": 0.8058, + "step": 11870 + }, + { + "epoch": 0.6533656227640486, + "grad_norm": 0.736044704914093, + "learning_rate": 7.614839330751347e-06, + "loss": 0.727, + "step": 11871 + }, + { + "epoch": 0.6534206615664043, + "grad_norm": 0.680171012878418, + "learning_rate": 7.614469854614961e-06, + "loss": 0.6722, + "step": 11872 + }, + { + "epoch": 0.65347570036876, + "grad_norm": 0.7598134279251099, + "learning_rate": 7.614100358828922e-06, + "loss": 0.7472, + "step": 11873 + }, + { + "epoch": 0.6535307391711156, + "grad_norm": 0.8288099765777588, + "learning_rate": 7.613730843396003e-06, + "loss": 0.7493, + "step": 11874 + }, + { + "epoch": 0.6535857779734713, + "grad_norm": 0.6436724066734314, + "learning_rate": 7.613361308318984e-06, + "loss": 0.7103, + "step": 11875 + }, + { + "epoch": 0.653640816775827, + "grad_norm": 0.671334981918335, + "learning_rate": 7.612991753600639e-06, + "loss": 0.6949, + "step": 11876 + }, + { + "epoch": 0.6536958555781827, + "grad_norm": 0.6019170880317688, + "learning_rate": 7.61262217924375e-06, + "loss": 0.6116, + "step": 11877 + }, + { + "epoch": 0.6537508943805382, + "grad_norm": 1.4682546854019165, + "learning_rate": 7.61225258525109e-06, + "loss": 0.9343, + "step": 11878 + }, + { + "epoch": 0.6538059331828939, + "grad_norm": 0.656822681427002, + "learning_rate": 7.611882971625439e-06, + "loss": 0.7357, + "step": 11879 + }, + { + "epoch": 0.6538609719852496, + "grad_norm": 0.635734498500824, + "learning_rate": 7.611513338369576e-06, + "loss": 0.6263, + "step": 11880 + }, + { + "epoch": 0.6539160107876053, + "grad_norm": 0.7123430967330933, + "learning_rate": 7.611143685486277e-06, + "loss": 0.8446, + "step": 11881 + }, + { + "epoch": 0.6539710495899609, + "grad_norm": 0.7597065567970276, + "learning_rate": 7.610774012978322e-06, + "loss": 0.7449, + "step": 11882 + }, + { + "epoch": 0.6540260883923166, + "grad_norm": 0.7555896043777466, + "learning_rate": 7.610404320848486e-06, + "loss": 0.7575, + "step": 11883 + }, + { + "epoch": 0.6540811271946723, + "grad_norm": 0.7572906613349915, + "learning_rate": 7.6100346090995506e-06, + "loss": 0.7547, + "step": 11884 + }, + { + "epoch": 0.654136165997028, + "grad_norm": 0.6663275957107544, + "learning_rate": 7.609664877734295e-06, + "loss": 0.7038, + "step": 11885 + }, + { + "epoch": 0.6541912047993835, + "grad_norm": 0.7346611618995667, + "learning_rate": 7.609295126755496e-06, + "loss": 0.7902, + "step": 11886 + }, + { + "epoch": 0.6542462436017392, + "grad_norm": 0.6846545338630676, + "learning_rate": 7.608925356165934e-06, + "loss": 0.7334, + "step": 11887 + }, + { + "epoch": 0.6543012824040949, + "grad_norm": 0.6714815497398376, + "learning_rate": 7.608555565968385e-06, + "loss": 0.7204, + "step": 11888 + }, + { + "epoch": 0.6543563212064506, + "grad_norm": 0.805095374584198, + "learning_rate": 7.608185756165634e-06, + "loss": 0.8521, + "step": 11889 + }, + { + "epoch": 0.6544113600088062, + "grad_norm": 0.8415316343307495, + "learning_rate": 7.607815926760456e-06, + "loss": 0.7076, + "step": 11890 + }, + { + "epoch": 0.6544663988111619, + "grad_norm": 0.7665743231773376, + "learning_rate": 7.607446077755632e-06, + "loss": 0.8072, + "step": 11891 + }, + { + "epoch": 0.6545214376135176, + "grad_norm": 0.6705248355865479, + "learning_rate": 7.607076209153939e-06, + "loss": 0.6607, + "step": 11892 + }, + { + "epoch": 0.6545764764158732, + "grad_norm": 0.6791796684265137, + "learning_rate": 7.606706320958159e-06, + "loss": 0.773, + "step": 11893 + }, + { + "epoch": 0.6546315152182288, + "grad_norm": 0.8177357316017151, + "learning_rate": 7.606336413171075e-06, + "loss": 0.8114, + "step": 11894 + }, + { + "epoch": 0.6546865540205845, + "grad_norm": 0.9491637945175171, + "learning_rate": 7.605966485795462e-06, + "loss": 0.7424, + "step": 11895 + }, + { + "epoch": 0.6547415928229402, + "grad_norm": 0.7326256036758423, + "learning_rate": 7.605596538834103e-06, + "loss": 0.8176, + "step": 11896 + }, + { + "epoch": 0.6547966316252959, + "grad_norm": 0.6081808805465698, + "learning_rate": 7.6052265722897775e-06, + "loss": 0.6827, + "step": 11897 + }, + { + "epoch": 0.6548516704276515, + "grad_norm": 0.7165681719779968, + "learning_rate": 7.604856586165268e-06, + "loss": 0.7854, + "step": 11898 + }, + { + "epoch": 0.6549067092300072, + "grad_norm": 0.8777725100517273, + "learning_rate": 7.604486580463353e-06, + "loss": 0.8084, + "step": 11899 + }, + { + "epoch": 0.6549617480323628, + "grad_norm": 0.6814439296722412, + "learning_rate": 7.604116555186811e-06, + "loss": 0.6869, + "step": 11900 + }, + { + "epoch": 0.6550167868347184, + "grad_norm": 0.7060914635658264, + "learning_rate": 7.60374651033843e-06, + "loss": 0.7066, + "step": 11901 + }, + { + "epoch": 0.6550718256370741, + "grad_norm": 0.6823089718818665, + "learning_rate": 7.603376445920987e-06, + "loss": 0.6095, + "step": 11902 + }, + { + "epoch": 0.6551268644394298, + "grad_norm": 0.7099863290786743, + "learning_rate": 7.603006361937262e-06, + "loss": 0.8037, + "step": 11903 + }, + { + "epoch": 0.6551819032417855, + "grad_norm": 0.6479066610336304, + "learning_rate": 7.602636258390037e-06, + "loss": 0.6844, + "step": 11904 + }, + { + "epoch": 0.6552369420441411, + "grad_norm": 0.6663268804550171, + "learning_rate": 7.602266135282097e-06, + "loss": 0.735, + "step": 11905 + }, + { + "epoch": 0.6552919808464968, + "grad_norm": 0.8670598268508911, + "learning_rate": 7.60189599261622e-06, + "loss": 0.779, + "step": 11906 + }, + { + "epoch": 0.6553470196488524, + "grad_norm": 0.607631504535675, + "learning_rate": 7.601525830395189e-06, + "loss": 0.6288, + "step": 11907 + }, + { + "epoch": 0.6554020584512081, + "grad_norm": 0.9054927229881287, + "learning_rate": 7.601155648621786e-06, + "loss": 0.8562, + "step": 11908 + }, + { + "epoch": 0.6554570972535637, + "grad_norm": 0.8069004416465759, + "learning_rate": 7.6007854472987955e-06, + "loss": 0.88, + "step": 11909 + }, + { + "epoch": 0.6555121360559194, + "grad_norm": 0.6393092274665833, + "learning_rate": 7.600415226428995e-06, + "loss": 0.6908, + "step": 11910 + }, + { + "epoch": 0.6555671748582751, + "grad_norm": 0.7533125281333923, + "learning_rate": 7.600044986015172e-06, + "loss": 0.8061, + "step": 11911 + }, + { + "epoch": 0.6556222136606308, + "grad_norm": 0.6859326958656311, + "learning_rate": 7.599674726060105e-06, + "loss": 0.7603, + "step": 11912 + }, + { + "epoch": 0.6556772524629864, + "grad_norm": 0.7284619808197021, + "learning_rate": 7.59930444656658e-06, + "loss": 0.7698, + "step": 11913 + }, + { + "epoch": 0.655732291265342, + "grad_norm": 1.074234127998352, + "learning_rate": 7.598934147537378e-06, + "loss": 0.8252, + "step": 11914 + }, + { + "epoch": 0.6557873300676977, + "grad_norm": 0.6899133920669556, + "learning_rate": 7.598563828975283e-06, + "loss": 0.6023, + "step": 11915 + }, + { + "epoch": 0.6558423688700534, + "grad_norm": 0.6736464500427246, + "learning_rate": 7.598193490883077e-06, + "loss": 0.788, + "step": 11916 + }, + { + "epoch": 0.655897407672409, + "grad_norm": 0.7646307349205017, + "learning_rate": 7.597823133263545e-06, + "loss": 0.7607, + "step": 11917 + }, + { + "epoch": 0.6559524464747647, + "grad_norm": 0.6413717865943909, + "learning_rate": 7.59745275611947e-06, + "loss": 0.6415, + "step": 11918 + }, + { + "epoch": 0.6560074852771204, + "grad_norm": 0.6605532169342041, + "learning_rate": 7.597082359453636e-06, + "loss": 0.6655, + "step": 11919 + }, + { + "epoch": 0.6560625240794761, + "grad_norm": 0.6573199033737183, + "learning_rate": 7.596711943268824e-06, + "loss": 0.624, + "step": 11920 + }, + { + "epoch": 0.6561175628818317, + "grad_norm": 0.8312102556228638, + "learning_rate": 7.596341507567822e-06, + "loss": 0.6803, + "step": 11921 + }, + { + "epoch": 0.6561726016841873, + "grad_norm": 0.6915873289108276, + "learning_rate": 7.59597105235341e-06, + "loss": 0.6897, + "step": 11922 + }, + { + "epoch": 0.656227640486543, + "grad_norm": 0.6916965842247009, + "learning_rate": 7.595600577628377e-06, + "loss": 0.7154, + "step": 11923 + }, + { + "epoch": 0.6562826792888987, + "grad_norm": 0.6712722182273865, + "learning_rate": 7.595230083395501e-06, + "loss": 0.7236, + "step": 11924 + }, + { + "epoch": 0.6563377180912543, + "grad_norm": 0.6514019966125488, + "learning_rate": 7.594859569657575e-06, + "loss": 0.6895, + "step": 11925 + }, + { + "epoch": 0.65639275689361, + "grad_norm": 0.7300555109977722, + "learning_rate": 7.594489036417378e-06, + "loss": 0.7563, + "step": 11926 + }, + { + "epoch": 0.6564477956959657, + "grad_norm": 0.8076907396316528, + "learning_rate": 7.594118483677695e-06, + "loss": 0.8883, + "step": 11927 + }, + { + "epoch": 0.6565028344983214, + "grad_norm": 0.666466236114502, + "learning_rate": 7.5937479114413114e-06, + "loss": 0.7641, + "step": 11928 + }, + { + "epoch": 0.6565578733006769, + "grad_norm": 0.6621832251548767, + "learning_rate": 7.593377319711013e-06, + "loss": 0.6687, + "step": 11929 + }, + { + "epoch": 0.6566129121030326, + "grad_norm": 0.8757139444351196, + "learning_rate": 7.593006708489585e-06, + "loss": 0.7746, + "step": 11930 + }, + { + "epoch": 0.6566679509053883, + "grad_norm": 0.646801769733429, + "learning_rate": 7.5926360777798135e-06, + "loss": 0.6884, + "step": 11931 + }, + { + "epoch": 0.656722989707744, + "grad_norm": 0.6703395843505859, + "learning_rate": 7.592265427584482e-06, + "loss": 0.6822, + "step": 11932 + }, + { + "epoch": 0.6567780285100996, + "grad_norm": 0.7653201222419739, + "learning_rate": 7.591894757906378e-06, + "loss": 0.7999, + "step": 11933 + }, + { + "epoch": 0.6568330673124553, + "grad_norm": 0.6921548247337341, + "learning_rate": 7.591524068748288e-06, + "loss": 0.7177, + "step": 11934 + }, + { + "epoch": 0.656888106114811, + "grad_norm": 0.7085320353507996, + "learning_rate": 7.591153360112995e-06, + "loss": 0.8395, + "step": 11935 + }, + { + "epoch": 0.6569431449171667, + "grad_norm": 0.6565294861793518, + "learning_rate": 7.590782632003287e-06, + "loss": 0.6969, + "step": 11936 + }, + { + "epoch": 0.6569981837195222, + "grad_norm": 0.7023206353187561, + "learning_rate": 7.590411884421952e-06, + "loss": 0.7321, + "step": 11937 + }, + { + "epoch": 0.6570532225218779, + "grad_norm": 0.7848044633865356, + "learning_rate": 7.590041117371774e-06, + "loss": 0.8857, + "step": 11938 + }, + { + "epoch": 0.6571082613242336, + "grad_norm": 1.004591703414917, + "learning_rate": 7.589670330855541e-06, + "loss": 0.8267, + "step": 11939 + }, + { + "epoch": 0.6571633001265893, + "grad_norm": 0.7525139451026917, + "learning_rate": 7.589299524876036e-06, + "loss": 0.6857, + "step": 11940 + }, + { + "epoch": 0.6572183389289449, + "grad_norm": 0.746224582195282, + "learning_rate": 7.588928699436051e-06, + "loss": 0.805, + "step": 11941 + }, + { + "epoch": 0.6572733777313006, + "grad_norm": 0.6304495930671692, + "learning_rate": 7.588557854538371e-06, + "loss": 0.652, + "step": 11942 + }, + { + "epoch": 0.6573284165336563, + "grad_norm": 0.761688768863678, + "learning_rate": 7.588186990185783e-06, + "loss": 0.7954, + "step": 11943 + }, + { + "epoch": 0.6573834553360118, + "grad_norm": 0.7735103368759155, + "learning_rate": 7.587816106381073e-06, + "loss": 0.7584, + "step": 11944 + }, + { + "epoch": 0.6574384941383675, + "grad_norm": 0.7351566553115845, + "learning_rate": 7.5874452031270305e-06, + "loss": 0.7984, + "step": 11945 + }, + { + "epoch": 0.6574935329407232, + "grad_norm": 0.7054993510246277, + "learning_rate": 7.587074280426443e-06, + "loss": 0.7057, + "step": 11946 + }, + { + "epoch": 0.6575485717430789, + "grad_norm": 0.7444368004798889, + "learning_rate": 7.586703338282099e-06, + "loss": 0.7476, + "step": 11947 + }, + { + "epoch": 0.6576036105454345, + "grad_norm": 0.6944568157196045, + "learning_rate": 7.586332376696782e-06, + "loss": 0.6874, + "step": 11948 + }, + { + "epoch": 0.6576586493477902, + "grad_norm": 0.6595578193664551, + "learning_rate": 7.585961395673287e-06, + "loss": 0.7541, + "step": 11949 + }, + { + "epoch": 0.6577136881501459, + "grad_norm": 0.6669502258300781, + "learning_rate": 7.585590395214396e-06, + "loss": 0.7515, + "step": 11950 + }, + { + "epoch": 0.6577687269525015, + "grad_norm": 0.7254583835601807, + "learning_rate": 7.585219375322901e-06, + "loss": 0.8089, + "step": 11951 + }, + { + "epoch": 0.6578237657548571, + "grad_norm": 1.0479141473770142, + "learning_rate": 7.584848336001587e-06, + "loss": 0.8108, + "step": 11952 + }, + { + "epoch": 0.6578788045572128, + "grad_norm": 0.6928718686103821, + "learning_rate": 7.584477277253246e-06, + "loss": 0.6325, + "step": 11953 + }, + { + "epoch": 0.6579338433595685, + "grad_norm": 0.8926869630813599, + "learning_rate": 7.584106199080666e-06, + "loss": 0.7294, + "step": 11954 + }, + { + "epoch": 0.6579888821619242, + "grad_norm": 0.7209964394569397, + "learning_rate": 7.583735101486635e-06, + "loss": 0.7646, + "step": 11955 + }, + { + "epoch": 0.6580439209642798, + "grad_norm": 0.7619316577911377, + "learning_rate": 7.583363984473941e-06, + "loss": 0.7756, + "step": 11956 + }, + { + "epoch": 0.6580989597666355, + "grad_norm": 0.6974903345108032, + "learning_rate": 7.582992848045378e-06, + "loss": 0.6497, + "step": 11957 + }, + { + "epoch": 0.6581539985689911, + "grad_norm": 0.8338617086410522, + "learning_rate": 7.582621692203731e-06, + "loss": 0.6619, + "step": 11958 + }, + { + "epoch": 0.6582090373713468, + "grad_norm": 0.9330396056175232, + "learning_rate": 7.5822505169517905e-06, + "loss": 0.8219, + "step": 11959 + }, + { + "epoch": 0.6582640761737024, + "grad_norm": 0.7725355625152588, + "learning_rate": 7.5818793222923445e-06, + "loss": 0.7262, + "step": 11960 + }, + { + "epoch": 0.6583191149760581, + "grad_norm": 0.7049654722213745, + "learning_rate": 7.5815081082281885e-06, + "loss": 0.7917, + "step": 11961 + }, + { + "epoch": 0.6583741537784138, + "grad_norm": 0.6801711916923523, + "learning_rate": 7.581136874762105e-06, + "loss": 0.6984, + "step": 11962 + }, + { + "epoch": 0.6584291925807695, + "grad_norm": 0.7774253487586975, + "learning_rate": 7.58076562189689e-06, + "loss": 0.7615, + "step": 11963 + }, + { + "epoch": 0.6584842313831251, + "grad_norm": 0.7436443567276001, + "learning_rate": 7.58039434963533e-06, + "loss": 0.7419, + "step": 11964 + }, + { + "epoch": 0.6585392701854808, + "grad_norm": 0.6857719421386719, + "learning_rate": 7.580023057980217e-06, + "loss": 0.8009, + "step": 11965 + }, + { + "epoch": 0.6585943089878364, + "grad_norm": 0.7194758653640747, + "learning_rate": 7.579651746934342e-06, + "loss": 0.7338, + "step": 11966 + }, + { + "epoch": 0.6586493477901921, + "grad_norm": 0.7248701453208923, + "learning_rate": 7.579280416500495e-06, + "loss": 0.6972, + "step": 11967 + }, + { + "epoch": 0.6587043865925477, + "grad_norm": 0.6719415783882141, + "learning_rate": 7.578909066681466e-06, + "loss": 0.7552, + "step": 11968 + }, + { + "epoch": 0.6587594253949034, + "grad_norm": 0.728338897228241, + "learning_rate": 7.578537697480046e-06, + "loss": 0.8386, + "step": 11969 + }, + { + "epoch": 0.6588144641972591, + "grad_norm": 0.7151786684989929, + "learning_rate": 7.578166308899029e-06, + "loss": 0.7186, + "step": 11970 + }, + { + "epoch": 0.6588695029996148, + "grad_norm": 0.664412260055542, + "learning_rate": 7.577794900941205e-06, + "loss": 0.6672, + "step": 11971 + }, + { + "epoch": 0.6589245418019704, + "grad_norm": 0.6915827989578247, + "learning_rate": 7.577423473609361e-06, + "loss": 0.7427, + "step": 11972 + }, + { + "epoch": 0.658979580604326, + "grad_norm": 0.705243706703186, + "learning_rate": 7.577052026906295e-06, + "loss": 0.7526, + "step": 11973 + }, + { + "epoch": 0.6590346194066817, + "grad_norm": 0.6559640169143677, + "learning_rate": 7.576680560834795e-06, + "loss": 0.8187, + "step": 11974 + }, + { + "epoch": 0.6590896582090374, + "grad_norm": 0.7359572649002075, + "learning_rate": 7.576309075397653e-06, + "loss": 0.8127, + "step": 11975 + }, + { + "epoch": 0.659144697011393, + "grad_norm": 0.6581039428710938, + "learning_rate": 7.575937570597661e-06, + "loss": 0.7066, + "step": 11976 + }, + { + "epoch": 0.6591997358137487, + "grad_norm": 0.8360844254493713, + "learning_rate": 7.5755660464376134e-06, + "loss": 0.7998, + "step": 11977 + }, + { + "epoch": 0.6592547746161044, + "grad_norm": 0.7201453447341919, + "learning_rate": 7.5751945029203e-06, + "loss": 0.7884, + "step": 11978 + }, + { + "epoch": 0.6593098134184601, + "grad_norm": 0.6985270977020264, + "learning_rate": 7.574822940048514e-06, + "loss": 0.7268, + "step": 11979 + }, + { + "epoch": 0.6593648522208156, + "grad_norm": 0.6405925154685974, + "learning_rate": 7.574451357825048e-06, + "loss": 0.6848, + "step": 11980 + }, + { + "epoch": 0.6594198910231713, + "grad_norm": 0.6656618714332581, + "learning_rate": 7.574079756252694e-06, + "loss": 0.7755, + "step": 11981 + }, + { + "epoch": 0.659474929825527, + "grad_norm": 0.8461045622825623, + "learning_rate": 7.573708135334248e-06, + "loss": 0.7171, + "step": 11982 + }, + { + "epoch": 0.6595299686278827, + "grad_norm": 0.5527384877204895, + "learning_rate": 7.573336495072498e-06, + "loss": 0.6668, + "step": 11983 + }, + { + "epoch": 0.6595850074302383, + "grad_norm": 0.6703749299049377, + "learning_rate": 7.572964835470241e-06, + "loss": 0.7128, + "step": 11984 + }, + { + "epoch": 0.659640046232594, + "grad_norm": 0.6824783682823181, + "learning_rate": 7.57259315653027e-06, + "loss": 0.8007, + "step": 11985 + }, + { + "epoch": 0.6596950850349497, + "grad_norm": 0.7369599938392639, + "learning_rate": 7.572221458255377e-06, + "loss": 0.7507, + "step": 11986 + }, + { + "epoch": 0.6597501238373052, + "grad_norm": 0.6976807713508606, + "learning_rate": 7.571849740648356e-06, + "loss": 0.7787, + "step": 11987 + }, + { + "epoch": 0.6598051626396609, + "grad_norm": 0.6735848784446716, + "learning_rate": 7.571478003711998e-06, + "loss": 0.6791, + "step": 11988 + }, + { + "epoch": 0.6598602014420166, + "grad_norm": 0.7245956659317017, + "learning_rate": 7.5711062474491025e-06, + "loss": 0.7999, + "step": 11989 + }, + { + "epoch": 0.6599152402443723, + "grad_norm": 0.760748565196991, + "learning_rate": 7.5707344718624595e-06, + "loss": 0.7904, + "step": 11990 + }, + { + "epoch": 0.6599702790467279, + "grad_norm": 0.6745715141296387, + "learning_rate": 7.5703626769548654e-06, + "loss": 0.6938, + "step": 11991 + }, + { + "epoch": 0.6600253178490836, + "grad_norm": 0.7301452159881592, + "learning_rate": 7.569990862729113e-06, + "loss": 0.7546, + "step": 11992 + }, + { + "epoch": 0.6600803566514393, + "grad_norm": 0.68801349401474, + "learning_rate": 7.569619029187998e-06, + "loss": 0.7592, + "step": 11993 + }, + { + "epoch": 0.660135395453795, + "grad_norm": 0.6839548349380493, + "learning_rate": 7.569247176334313e-06, + "loss": 0.7139, + "step": 11994 + }, + { + "epoch": 0.6601904342561505, + "grad_norm": 0.7490861415863037, + "learning_rate": 7.568875304170854e-06, + "loss": 0.7939, + "step": 11995 + }, + { + "epoch": 0.6602454730585062, + "grad_norm": 0.7098836302757263, + "learning_rate": 7.568503412700416e-06, + "loss": 0.7824, + "step": 11996 + }, + { + "epoch": 0.6603005118608619, + "grad_norm": 0.7427988052368164, + "learning_rate": 7.568131501925795e-06, + "loss": 0.7492, + "step": 11997 + }, + { + "epoch": 0.6603555506632176, + "grad_norm": 0.6715356111526489, + "learning_rate": 7.567759571849784e-06, + "loss": 0.6444, + "step": 11998 + }, + { + "epoch": 0.6604105894655732, + "grad_norm": 0.6697829961776733, + "learning_rate": 7.5673876224751795e-06, + "loss": 0.7064, + "step": 11999 + }, + { + "epoch": 0.6604656282679289, + "grad_norm": 0.6778494119644165, + "learning_rate": 7.567015653804777e-06, + "loss": 0.7517, + "step": 12000 + }, + { + "epoch": 0.6605206670702846, + "grad_norm": 0.6423540711402893, + "learning_rate": 7.566643665841371e-06, + "loss": 0.6321, + "step": 12001 + }, + { + "epoch": 0.6605757058726403, + "grad_norm": 0.6874244213104248, + "learning_rate": 7.566271658587761e-06, + "loss": 0.762, + "step": 12002 + }, + { + "epoch": 0.6606307446749958, + "grad_norm": 0.6805301308631897, + "learning_rate": 7.565899632046737e-06, + "loss": 0.765, + "step": 12003 + }, + { + "epoch": 0.6606857834773515, + "grad_norm": 0.7039558291435242, + "learning_rate": 7.5655275862211e-06, + "loss": 0.728, + "step": 12004 + }, + { + "epoch": 0.6607408222797072, + "grad_norm": 0.6513119339942932, + "learning_rate": 7.565155521113643e-06, + "loss": 0.7711, + "step": 12005 + }, + { + "epoch": 0.6607958610820629, + "grad_norm": 0.6483618021011353, + "learning_rate": 7.5647834367271655e-06, + "loss": 0.7015, + "step": 12006 + }, + { + "epoch": 0.6608508998844185, + "grad_norm": 0.7180553674697876, + "learning_rate": 7.564411333064461e-06, + "loss": 0.812, + "step": 12007 + }, + { + "epoch": 0.6609059386867742, + "grad_norm": 0.9036096334457397, + "learning_rate": 7.5640392101283285e-06, + "loss": 0.7858, + "step": 12008 + }, + { + "epoch": 0.6609609774891299, + "grad_norm": 0.7380802035331726, + "learning_rate": 7.563667067921563e-06, + "loss": 0.6615, + "step": 12009 + }, + { + "epoch": 0.6610160162914855, + "grad_norm": 0.6830628514289856, + "learning_rate": 7.5632949064469615e-06, + "loss": 0.7465, + "step": 12010 + }, + { + "epoch": 0.6610710550938411, + "grad_norm": 0.7562816143035889, + "learning_rate": 7.562922725707323e-06, + "loss": 0.8559, + "step": 12011 + }, + { + "epoch": 0.6611260938961968, + "grad_norm": 0.7376649379730225, + "learning_rate": 7.562550525705442e-06, + "loss": 0.7769, + "step": 12012 + }, + { + "epoch": 0.6611811326985525, + "grad_norm": 0.715466320514679, + "learning_rate": 7.562178306444116e-06, + "loss": 0.8233, + "step": 12013 + }, + { + "epoch": 0.6612361715009082, + "grad_norm": 0.6714800596237183, + "learning_rate": 7.561806067926147e-06, + "loss": 0.6025, + "step": 12014 + }, + { + "epoch": 0.6612912103032638, + "grad_norm": 0.7083391547203064, + "learning_rate": 7.561433810154328e-06, + "loss": 0.7063, + "step": 12015 + }, + { + "epoch": 0.6613462491056195, + "grad_norm": 0.8062768578529358, + "learning_rate": 7.561061533131457e-06, + "loss": 0.7992, + "step": 12016 + }, + { + "epoch": 0.6614012879079751, + "grad_norm": 0.741889476776123, + "learning_rate": 7.560689236860334e-06, + "loss": 0.8149, + "step": 12017 + }, + { + "epoch": 0.6614563267103308, + "grad_norm": 0.6834374666213989, + "learning_rate": 7.560316921343756e-06, + "loss": 0.782, + "step": 12018 + }, + { + "epoch": 0.6615113655126864, + "grad_norm": 0.7469872236251831, + "learning_rate": 7.559944586584522e-06, + "loss": 0.759, + "step": 12019 + }, + { + "epoch": 0.6615664043150421, + "grad_norm": 0.8300836086273193, + "learning_rate": 7.559572232585428e-06, + "loss": 0.8637, + "step": 12020 + }, + { + "epoch": 0.6616214431173978, + "grad_norm": 0.6241582632064819, + "learning_rate": 7.559199859349276e-06, + "loss": 0.7134, + "step": 12021 + }, + { + "epoch": 0.6616764819197535, + "grad_norm": 0.6696488261222839, + "learning_rate": 7.5588274668788634e-06, + "loss": 0.7457, + "step": 12022 + }, + { + "epoch": 0.6617315207221091, + "grad_norm": 0.7090815305709839, + "learning_rate": 7.558455055176987e-06, + "loss": 0.7449, + "step": 12023 + }, + { + "epoch": 0.6617865595244647, + "grad_norm": 0.6925215125083923, + "learning_rate": 7.558082624246448e-06, + "loss": 0.758, + "step": 12024 + }, + { + "epoch": 0.6618415983268204, + "grad_norm": 0.6658454537391663, + "learning_rate": 7.5577101740900425e-06, + "loss": 0.6918, + "step": 12025 + }, + { + "epoch": 0.6618966371291761, + "grad_norm": 0.6646405458450317, + "learning_rate": 7.557337704710574e-06, + "loss": 0.7293, + "step": 12026 + }, + { + "epoch": 0.6619516759315317, + "grad_norm": 0.6630399227142334, + "learning_rate": 7.556965216110841e-06, + "loss": 0.7572, + "step": 12027 + }, + { + "epoch": 0.6620067147338874, + "grad_norm": 0.7333918809890747, + "learning_rate": 7.556592708293641e-06, + "loss": 0.8012, + "step": 12028 + }, + { + "epoch": 0.6620617535362431, + "grad_norm": 0.7399254441261292, + "learning_rate": 7.556220181261773e-06, + "loss": 0.8406, + "step": 12029 + }, + { + "epoch": 0.6621167923385987, + "grad_norm": 0.6244909167289734, + "learning_rate": 7.55584763501804e-06, + "loss": 0.7427, + "step": 12030 + }, + { + "epoch": 0.6621718311409543, + "grad_norm": 0.6991485953330994, + "learning_rate": 7.55547506956524e-06, + "loss": 0.7583, + "step": 12031 + }, + { + "epoch": 0.66222686994331, + "grad_norm": 0.7115411162376404, + "learning_rate": 7.555102484906174e-06, + "loss": 0.7951, + "step": 12032 + }, + { + "epoch": 0.6622819087456657, + "grad_norm": 0.7684284448623657, + "learning_rate": 7.554729881043641e-06, + "loss": 0.717, + "step": 12033 + }, + { + "epoch": 0.6623369475480213, + "grad_norm": 0.7705931067466736, + "learning_rate": 7.554357257980443e-06, + "loss": 0.6903, + "step": 12034 + }, + { + "epoch": 0.662391986350377, + "grad_norm": 0.9283333420753479, + "learning_rate": 7.553984615719379e-06, + "loss": 0.7845, + "step": 12035 + }, + { + "epoch": 0.6624470251527327, + "grad_norm": 0.6867572665214539, + "learning_rate": 7.553611954263249e-06, + "loss": 0.8796, + "step": 12036 + }, + { + "epoch": 0.6625020639550884, + "grad_norm": 0.6129451990127563, + "learning_rate": 7.553239273614855e-06, + "loss": 0.6308, + "step": 12037 + }, + { + "epoch": 0.662557102757444, + "grad_norm": 0.749679446220398, + "learning_rate": 7.552866573777e-06, + "loss": 0.8308, + "step": 12038 + }, + { + "epoch": 0.6626121415597996, + "grad_norm": 0.7651422619819641, + "learning_rate": 7.552493854752483e-06, + "loss": 0.7266, + "step": 12039 + }, + { + "epoch": 0.6626671803621553, + "grad_norm": 0.9293195009231567, + "learning_rate": 7.552121116544104e-06, + "loss": 0.7795, + "step": 12040 + }, + { + "epoch": 0.662722219164511, + "grad_norm": 0.7321802973747253, + "learning_rate": 7.5517483591546655e-06, + "loss": 0.7294, + "step": 12041 + }, + { + "epoch": 0.6627772579668666, + "grad_norm": 0.702414333820343, + "learning_rate": 7.551375582586971e-06, + "loss": 0.7954, + "step": 12042 + }, + { + "epoch": 0.6628322967692223, + "grad_norm": 0.7497946619987488, + "learning_rate": 7.551002786843819e-06, + "loss": 0.7654, + "step": 12043 + }, + { + "epoch": 0.662887335571578, + "grad_norm": 0.6125331521034241, + "learning_rate": 7.550629971928017e-06, + "loss": 0.7299, + "step": 12044 + }, + { + "epoch": 0.6629423743739337, + "grad_norm": 0.7252177596092224, + "learning_rate": 7.550257137842358e-06, + "loss": 0.7553, + "step": 12045 + }, + { + "epoch": 0.6629974131762892, + "grad_norm": 0.6463978886604309, + "learning_rate": 7.5498842845896515e-06, + "loss": 0.7114, + "step": 12046 + }, + { + "epoch": 0.6630524519786449, + "grad_norm": 0.7392497062683105, + "learning_rate": 7.549511412172696e-06, + "loss": 0.6801, + "step": 12047 + }, + { + "epoch": 0.6631074907810006, + "grad_norm": 0.8068972229957581, + "learning_rate": 7.549138520594297e-06, + "loss": 0.8207, + "step": 12048 + }, + { + "epoch": 0.6631625295833563, + "grad_norm": 0.7632858753204346, + "learning_rate": 7.548765609857254e-06, + "loss": 0.7095, + "step": 12049 + }, + { + "epoch": 0.6632175683857119, + "grad_norm": 0.7252069115638733, + "learning_rate": 7.5483926799643705e-06, + "loss": 0.7796, + "step": 12050 + }, + { + "epoch": 0.6632726071880676, + "grad_norm": 1.048311471939087, + "learning_rate": 7.54801973091845e-06, + "loss": 0.7306, + "step": 12051 + }, + { + "epoch": 0.6633276459904233, + "grad_norm": 0.7432072758674622, + "learning_rate": 7.547646762722296e-06, + "loss": 0.8209, + "step": 12052 + }, + { + "epoch": 0.663382684792779, + "grad_norm": 0.7191399335861206, + "learning_rate": 7.547273775378709e-06, + "loss": 0.7011, + "step": 12053 + }, + { + "epoch": 0.6634377235951345, + "grad_norm": 0.5776329636573792, + "learning_rate": 7.5469007688904975e-06, + "loss": 0.6055, + "step": 12054 + }, + { + "epoch": 0.6634927623974902, + "grad_norm": 0.9296837449073792, + "learning_rate": 7.546527743260459e-06, + "loss": 0.7413, + "step": 12055 + }, + { + "epoch": 0.6635478011998459, + "grad_norm": 0.7279512286186218, + "learning_rate": 7.5461546984914e-06, + "loss": 0.7734, + "step": 12056 + }, + { + "epoch": 0.6636028400022016, + "grad_norm": 0.7297198176383972, + "learning_rate": 7.545781634586125e-06, + "loss": 0.7535, + "step": 12057 + }, + { + "epoch": 0.6636578788045572, + "grad_norm": 0.7094287872314453, + "learning_rate": 7.545408551547435e-06, + "loss": 0.7587, + "step": 12058 + }, + { + "epoch": 0.6637129176069129, + "grad_norm": 0.7559607028961182, + "learning_rate": 7.5450354493781374e-06, + "loss": 0.7358, + "step": 12059 + }, + { + "epoch": 0.6637679564092686, + "grad_norm": 0.8472892045974731, + "learning_rate": 7.544662328081034e-06, + "loss": 0.7537, + "step": 12060 + }, + { + "epoch": 0.6638229952116242, + "grad_norm": 0.6346176862716675, + "learning_rate": 7.544289187658929e-06, + "loss": 0.7658, + "step": 12061 + }, + { + "epoch": 0.6638780340139798, + "grad_norm": 0.7949367165565491, + "learning_rate": 7.543916028114628e-06, + "loss": 0.6837, + "step": 12062 + }, + { + "epoch": 0.6639330728163355, + "grad_norm": 0.7177689671516418, + "learning_rate": 7.5435428494509355e-06, + "loss": 0.7218, + "step": 12063 + }, + { + "epoch": 0.6639881116186912, + "grad_norm": 0.90680330991745, + "learning_rate": 7.5431696516706555e-06, + "loss": 0.8274, + "step": 12064 + }, + { + "epoch": 0.6640431504210469, + "grad_norm": 0.7799603939056396, + "learning_rate": 7.5427964347765916e-06, + "loss": 0.7528, + "step": 12065 + }, + { + "epoch": 0.6640981892234025, + "grad_norm": 0.7668048739433289, + "learning_rate": 7.542423198771553e-06, + "loss": 0.746, + "step": 12066 + }, + { + "epoch": 0.6641532280257582, + "grad_norm": 1.0042381286621094, + "learning_rate": 7.542049943658341e-06, + "loss": 0.7836, + "step": 12067 + }, + { + "epoch": 0.6642082668281138, + "grad_norm": 0.6915723085403442, + "learning_rate": 7.541676669439761e-06, + "loss": 0.8042, + "step": 12068 + }, + { + "epoch": 0.6642633056304695, + "grad_norm": 0.7268955707550049, + "learning_rate": 7.5413033761186215e-06, + "loss": 0.689, + "step": 12069 + }, + { + "epoch": 0.6643183444328251, + "grad_norm": 0.6418740749359131, + "learning_rate": 7.540930063697726e-06, + "loss": 0.6302, + "step": 12070 + }, + { + "epoch": 0.6643733832351808, + "grad_norm": 0.696384847164154, + "learning_rate": 7.540556732179879e-06, + "loss": 0.7978, + "step": 12071 + }, + { + "epoch": 0.6644284220375365, + "grad_norm": 0.7400668859481812, + "learning_rate": 7.540183381567889e-06, + "loss": 0.8768, + "step": 12072 + }, + { + "epoch": 0.6644834608398921, + "grad_norm": 0.6653871536254883, + "learning_rate": 7.539810011864559e-06, + "loss": 0.8107, + "step": 12073 + }, + { + "epoch": 0.6645384996422478, + "grad_norm": 0.7635810971260071, + "learning_rate": 7.539436623072698e-06, + "loss": 0.8476, + "step": 12074 + }, + { + "epoch": 0.6645935384446034, + "grad_norm": 0.6583054661750793, + "learning_rate": 7.53906321519511e-06, + "loss": 0.7093, + "step": 12075 + }, + { + "epoch": 0.6646485772469591, + "grad_norm": 0.8294859528541565, + "learning_rate": 7.538689788234604e-06, + "loss": 0.8107, + "step": 12076 + }, + { + "epoch": 0.6647036160493147, + "grad_norm": 0.6711081862449646, + "learning_rate": 7.538316342193983e-06, + "loss": 0.7491, + "step": 12077 + }, + { + "epoch": 0.6647586548516704, + "grad_norm": 0.7375408411026001, + "learning_rate": 7.5379428770760575e-06, + "loss": 0.7853, + "step": 12078 + }, + { + "epoch": 0.6648136936540261, + "grad_norm": 0.7322511672973633, + "learning_rate": 7.537569392883633e-06, + "loss": 0.7568, + "step": 12079 + }, + { + "epoch": 0.6648687324563818, + "grad_norm": 0.6390300393104553, + "learning_rate": 7.537195889619515e-06, + "loss": 0.7191, + "step": 12080 + }, + { + "epoch": 0.6649237712587374, + "grad_norm": 0.8155800104141235, + "learning_rate": 7.536822367286514e-06, + "loss": 0.7499, + "step": 12081 + }, + { + "epoch": 0.664978810061093, + "grad_norm": 0.7942230701446533, + "learning_rate": 7.536448825887432e-06, + "loss": 0.7797, + "step": 12082 + }, + { + "epoch": 0.6650338488634487, + "grad_norm": 0.7103378176689148, + "learning_rate": 7.536075265425083e-06, + "loss": 0.6814, + "step": 12083 + }, + { + "epoch": 0.6650888876658044, + "grad_norm": 0.8164991736412048, + "learning_rate": 7.535701685902268e-06, + "loss": 0.7917, + "step": 12084 + }, + { + "epoch": 0.66514392646816, + "grad_norm": 0.6970370411872864, + "learning_rate": 7.535328087321799e-06, + "loss": 0.7266, + "step": 12085 + }, + { + "epoch": 0.6651989652705157, + "grad_norm": 0.6468706130981445, + "learning_rate": 7.534954469686484e-06, + "loss": 0.7229, + "step": 12086 + }, + { + "epoch": 0.6652540040728714, + "grad_norm": 0.6551242470741272, + "learning_rate": 7.534580832999128e-06, + "loss": 0.6759, + "step": 12087 + }, + { + "epoch": 0.6653090428752271, + "grad_norm": 0.670215368270874, + "learning_rate": 7.534207177262543e-06, + "loss": 0.761, + "step": 12088 + }, + { + "epoch": 0.6653640816775827, + "grad_norm": 0.7365970015525818, + "learning_rate": 7.533833502479533e-06, + "loss": 0.7628, + "step": 12089 + }, + { + "epoch": 0.6654191204799383, + "grad_norm": 0.7419471740722656, + "learning_rate": 7.53345980865291e-06, + "loss": 0.8093, + "step": 12090 + }, + { + "epoch": 0.665474159282294, + "grad_norm": 0.6573269963264465, + "learning_rate": 7.53308609578548e-06, + "loss": 0.6806, + "step": 12091 + }, + { + "epoch": 0.6655291980846497, + "grad_norm": 0.9270638227462769, + "learning_rate": 7.5327123638800545e-06, + "loss": 0.8612, + "step": 12092 + }, + { + "epoch": 0.6655842368870053, + "grad_norm": 0.85124671459198, + "learning_rate": 7.532338612939441e-06, + "loss": 0.6776, + "step": 12093 + }, + { + "epoch": 0.665639275689361, + "grad_norm": 0.7791070342063904, + "learning_rate": 7.531964842966446e-06, + "loss": 0.7571, + "step": 12094 + }, + { + "epoch": 0.6656943144917167, + "grad_norm": 0.6604436635971069, + "learning_rate": 7.5315910539638825e-06, + "loss": 0.781, + "step": 12095 + }, + { + "epoch": 0.6657493532940724, + "grad_norm": 0.7567091584205627, + "learning_rate": 7.531217245934559e-06, + "loss": 0.8005, + "step": 12096 + }, + { + "epoch": 0.6658043920964279, + "grad_norm": 0.660637378692627, + "learning_rate": 7.530843418881282e-06, + "loss": 0.7351, + "step": 12097 + }, + { + "epoch": 0.6658594308987836, + "grad_norm": 0.6305738687515259, + "learning_rate": 7.530469572806865e-06, + "loss": 0.7452, + "step": 12098 + }, + { + "epoch": 0.6659144697011393, + "grad_norm": 0.8291265368461609, + "learning_rate": 7.5300957077141164e-06, + "loss": 0.7799, + "step": 12099 + }, + { + "epoch": 0.665969508503495, + "grad_norm": 0.7459661364555359, + "learning_rate": 7.5297218236058456e-06, + "loss": 0.8273, + "step": 12100 + }, + { + "epoch": 0.6660245473058506, + "grad_norm": 0.7570028901100159, + "learning_rate": 7.529347920484862e-06, + "loss": 0.7622, + "step": 12101 + }, + { + "epoch": 0.6660795861082063, + "grad_norm": 0.733403205871582, + "learning_rate": 7.528973998353977e-06, + "loss": 0.8357, + "step": 12102 + }, + { + "epoch": 0.666134624910562, + "grad_norm": 0.8814442753791809, + "learning_rate": 7.528600057216e-06, + "loss": 0.727, + "step": 12103 + }, + { + "epoch": 0.6661896637129177, + "grad_norm": 0.629338800907135, + "learning_rate": 7.528226097073742e-06, + "loss": 0.6758, + "step": 12104 + }, + { + "epoch": 0.6662447025152732, + "grad_norm": 0.7786098122596741, + "learning_rate": 7.527852117930014e-06, + "loss": 0.7476, + "step": 12105 + }, + { + "epoch": 0.6662997413176289, + "grad_norm": 0.6604528427124023, + "learning_rate": 7.527478119787626e-06, + "loss": 0.7275, + "step": 12106 + }, + { + "epoch": 0.6663547801199846, + "grad_norm": 0.6937400698661804, + "learning_rate": 7.527104102649387e-06, + "loss": 0.7187, + "step": 12107 + }, + { + "epoch": 0.6664098189223403, + "grad_norm": 0.6863219738006592, + "learning_rate": 7.526730066518113e-06, + "loss": 0.7512, + "step": 12108 + }, + { + "epoch": 0.6664648577246959, + "grad_norm": 0.7771461606025696, + "learning_rate": 7.526356011396609e-06, + "loss": 0.8439, + "step": 12109 + }, + { + "epoch": 0.6665198965270516, + "grad_norm": 0.7223722338676453, + "learning_rate": 7.525981937287692e-06, + "loss": 0.6488, + "step": 12110 + }, + { + "epoch": 0.6665749353294073, + "grad_norm": 0.8091556429862976, + "learning_rate": 7.52560784419417e-06, + "loss": 0.6618, + "step": 12111 + }, + { + "epoch": 0.666629974131763, + "grad_norm": 0.6435044407844543, + "learning_rate": 7.525233732118856e-06, + "loss": 0.6994, + "step": 12112 + }, + { + "epoch": 0.6666850129341185, + "grad_norm": 0.6933714151382446, + "learning_rate": 7.52485960106456e-06, + "loss": 0.6917, + "step": 12113 + }, + { + "epoch": 0.6667400517364742, + "grad_norm": 0.693192720413208, + "learning_rate": 7.524485451034097e-06, + "loss": 0.7941, + "step": 12114 + }, + { + "epoch": 0.6667950905388299, + "grad_norm": 1.1374844312667847, + "learning_rate": 7.524111282030275e-06, + "loss": 0.9112, + "step": 12115 + }, + { + "epoch": 0.6668501293411855, + "grad_norm": 0.6917465329170227, + "learning_rate": 7.523737094055911e-06, + "loss": 0.681, + "step": 12116 + }, + { + "epoch": 0.6669051681435412, + "grad_norm": 0.8057913184165955, + "learning_rate": 7.523362887113812e-06, + "loss": 0.8186, + "step": 12117 + }, + { + "epoch": 0.6669602069458969, + "grad_norm": 0.7194918394088745, + "learning_rate": 7.522988661206795e-06, + "loss": 0.7875, + "step": 12118 + }, + { + "epoch": 0.6670152457482525, + "grad_norm": 0.6829916834831238, + "learning_rate": 7.52261441633767e-06, + "loss": 0.6506, + "step": 12119 + }, + { + "epoch": 0.6670702845506081, + "grad_norm": 0.7869738936424255, + "learning_rate": 7.5222401525092495e-06, + "loss": 0.7091, + "step": 12120 + }, + { + "epoch": 0.6671253233529638, + "grad_norm": 0.6835895776748657, + "learning_rate": 7.5218658697243475e-06, + "loss": 0.7839, + "step": 12121 + }, + { + "epoch": 0.6671803621553195, + "grad_norm": 0.7462154030799866, + "learning_rate": 7.521491567985776e-06, + "loss": 0.7073, + "step": 12122 + }, + { + "epoch": 0.6672354009576752, + "grad_norm": 0.6413764953613281, + "learning_rate": 7.52111724729635e-06, + "loss": 0.6472, + "step": 12123 + }, + { + "epoch": 0.6672904397600308, + "grad_norm": 0.7085923552513123, + "learning_rate": 7.520742907658881e-06, + "loss": 0.8167, + "step": 12124 + }, + { + "epoch": 0.6673454785623865, + "grad_norm": 0.6490428447723389, + "learning_rate": 7.520368549076182e-06, + "loss": 0.7693, + "step": 12125 + }, + { + "epoch": 0.6674005173647422, + "grad_norm": 0.7082974910736084, + "learning_rate": 7.51999417155107e-06, + "loss": 0.6707, + "step": 12126 + }, + { + "epoch": 0.6674555561670978, + "grad_norm": 0.704335629940033, + "learning_rate": 7.519619775086355e-06, + "loss": 0.825, + "step": 12127 + }, + { + "epoch": 0.6675105949694534, + "grad_norm": 0.6815123558044434, + "learning_rate": 7.519245359684852e-06, + "loss": 0.762, + "step": 12128 + }, + { + "epoch": 0.6675656337718091, + "grad_norm": 0.6497910618782043, + "learning_rate": 7.518870925349376e-06, + "loss": 0.6934, + "step": 12129 + }, + { + "epoch": 0.6676206725741648, + "grad_norm": 0.6699943542480469, + "learning_rate": 7.51849647208274e-06, + "loss": 0.7816, + "step": 12130 + }, + { + "epoch": 0.6676757113765205, + "grad_norm": 0.7139337062835693, + "learning_rate": 7.51812199988776e-06, + "loss": 0.679, + "step": 12131 + }, + { + "epoch": 0.6677307501788761, + "grad_norm": 0.6762346029281616, + "learning_rate": 7.517747508767248e-06, + "loss": 0.7477, + "step": 12132 + }, + { + "epoch": 0.6677857889812318, + "grad_norm": 0.7429338693618774, + "learning_rate": 7.517372998724017e-06, + "loss": 0.7549, + "step": 12133 + }, + { + "epoch": 0.6678408277835874, + "grad_norm": 0.7392850518226624, + "learning_rate": 7.516998469760888e-06, + "loss": 0.8167, + "step": 12134 + }, + { + "epoch": 0.6678958665859431, + "grad_norm": 0.7511306405067444, + "learning_rate": 7.516623921880671e-06, + "loss": 0.7264, + "step": 12135 + }, + { + "epoch": 0.6679509053882987, + "grad_norm": 0.6757550835609436, + "learning_rate": 7.516249355086183e-06, + "loss": 0.7405, + "step": 12136 + }, + { + "epoch": 0.6680059441906544, + "grad_norm": 0.7433735132217407, + "learning_rate": 7.515874769380238e-06, + "loss": 0.7954, + "step": 12137 + }, + { + "epoch": 0.6680609829930101, + "grad_norm": 0.7390886545181274, + "learning_rate": 7.51550016476565e-06, + "loss": 0.7487, + "step": 12138 + }, + { + "epoch": 0.6681160217953658, + "grad_norm": 0.7405929565429688, + "learning_rate": 7.5151255412452385e-06, + "loss": 0.8127, + "step": 12139 + }, + { + "epoch": 0.6681710605977214, + "grad_norm": 0.6628968715667725, + "learning_rate": 7.514750898821817e-06, + "loss": 0.7009, + "step": 12140 + }, + { + "epoch": 0.668226099400077, + "grad_norm": 0.6777421832084656, + "learning_rate": 7.514376237498199e-06, + "loss": 0.6689, + "step": 12141 + }, + { + "epoch": 0.6682811382024327, + "grad_norm": 0.617261528968811, + "learning_rate": 7.514001557277202e-06, + "loss": 0.7597, + "step": 12142 + }, + { + "epoch": 0.6683361770047884, + "grad_norm": 0.6666202545166016, + "learning_rate": 7.5136268581616446e-06, + "loss": 0.6623, + "step": 12143 + }, + { + "epoch": 0.668391215807144, + "grad_norm": 0.7170178890228271, + "learning_rate": 7.513252140154339e-06, + "loss": 0.8224, + "step": 12144 + }, + { + "epoch": 0.6684462546094997, + "grad_norm": 0.6173199415206909, + "learning_rate": 7.512877403258103e-06, + "loss": 0.6784, + "step": 12145 + }, + { + "epoch": 0.6685012934118554, + "grad_norm": 0.6906641125679016, + "learning_rate": 7.512502647475753e-06, + "loss": 0.6649, + "step": 12146 + }, + { + "epoch": 0.6685563322142111, + "grad_norm": 0.6435873508453369, + "learning_rate": 7.5121278728101065e-06, + "loss": 0.751, + "step": 12147 + }, + { + "epoch": 0.6686113710165666, + "grad_norm": 0.8345947861671448, + "learning_rate": 7.511753079263978e-06, + "loss": 0.7841, + "step": 12148 + }, + { + "epoch": 0.6686664098189223, + "grad_norm": 0.6952378153800964, + "learning_rate": 7.511378266840187e-06, + "loss": 0.8187, + "step": 12149 + }, + { + "epoch": 0.668721448621278, + "grad_norm": 0.6878920793533325, + "learning_rate": 7.5110034355415484e-06, + "loss": 0.6726, + "step": 12150 + }, + { + "epoch": 0.6687764874236337, + "grad_norm": 0.7119094729423523, + "learning_rate": 7.5106285853708805e-06, + "loss": 0.7824, + "step": 12151 + }, + { + "epoch": 0.6688315262259893, + "grad_norm": 0.7261053323745728, + "learning_rate": 7.5102537163309994e-06, + "loss": 0.7122, + "step": 12152 + }, + { + "epoch": 0.668886565028345, + "grad_norm": 0.717268168926239, + "learning_rate": 7.509878828424725e-06, + "loss": 0.7144, + "step": 12153 + }, + { + "epoch": 0.6689416038307007, + "grad_norm": 0.8373270630836487, + "learning_rate": 7.5095039216548725e-06, + "loss": 0.7941, + "step": 12154 + }, + { + "epoch": 0.6689966426330564, + "grad_norm": 0.7113829851150513, + "learning_rate": 7.509128996024259e-06, + "loss": 0.705, + "step": 12155 + }, + { + "epoch": 0.6690516814354119, + "grad_norm": 0.7894094586372375, + "learning_rate": 7.508754051535705e-06, + "loss": 0.8284, + "step": 12156 + }, + { + "epoch": 0.6691067202377676, + "grad_norm": 0.6739659905433655, + "learning_rate": 7.508379088192028e-06, + "loss": 0.7264, + "step": 12157 + }, + { + "epoch": 0.6691617590401233, + "grad_norm": 0.735211193561554, + "learning_rate": 7.508004105996043e-06, + "loss": 0.8187, + "step": 12158 + }, + { + "epoch": 0.6692167978424789, + "grad_norm": 0.7438055872917175, + "learning_rate": 7.507629104950571e-06, + "loss": 0.8949, + "step": 12159 + }, + { + "epoch": 0.6692718366448346, + "grad_norm": 1.0734246969223022, + "learning_rate": 7.507254085058431e-06, + "loss": 0.7687, + "step": 12160 + }, + { + "epoch": 0.6693268754471903, + "grad_norm": 0.6719897985458374, + "learning_rate": 7.50687904632244e-06, + "loss": 0.7522, + "step": 12161 + }, + { + "epoch": 0.669381914249546, + "grad_norm": 0.7063966989517212, + "learning_rate": 7.506503988745416e-06, + "loss": 0.7794, + "step": 12162 + }, + { + "epoch": 0.6694369530519015, + "grad_norm": 0.6582265496253967, + "learning_rate": 7.506128912330179e-06, + "loss": 0.7012, + "step": 12163 + }, + { + "epoch": 0.6694919918542572, + "grad_norm": 0.7764506340026855, + "learning_rate": 7.50575381707955e-06, + "loss": 0.7816, + "step": 12164 + }, + { + "epoch": 0.6695470306566129, + "grad_norm": 0.7659780383110046, + "learning_rate": 7.505378702996344e-06, + "loss": 0.753, + "step": 12165 + }, + { + "epoch": 0.6696020694589686, + "grad_norm": 0.9013122916221619, + "learning_rate": 7.505003570083385e-06, + "loss": 0.8255, + "step": 12166 + }, + { + "epoch": 0.6696571082613242, + "grad_norm": 0.6417272686958313, + "learning_rate": 7.504628418343487e-06, + "loss": 0.6236, + "step": 12167 + }, + { + "epoch": 0.6697121470636799, + "grad_norm": 0.7511595487594604, + "learning_rate": 7.504253247779474e-06, + "loss": 0.7961, + "step": 12168 + }, + { + "epoch": 0.6697671858660356, + "grad_norm": 0.7987878918647766, + "learning_rate": 7.503878058394163e-06, + "loss": 0.7249, + "step": 12169 + }, + { + "epoch": 0.6698222246683913, + "grad_norm": 0.6860646605491638, + "learning_rate": 7.503502850190374e-06, + "loss": 0.7973, + "step": 12170 + }, + { + "epoch": 0.6698772634707468, + "grad_norm": 0.7334334850311279, + "learning_rate": 7.50312762317093e-06, + "loss": 0.8756, + "step": 12171 + }, + { + "epoch": 0.6699323022731025, + "grad_norm": 0.7792186737060547, + "learning_rate": 7.502752377338647e-06, + "loss": 0.8393, + "step": 12172 + }, + { + "epoch": 0.6699873410754582, + "grad_norm": 0.6532536149024963, + "learning_rate": 7.502377112696346e-06, + "loss": 0.6509, + "step": 12173 + }, + { + "epoch": 0.6700423798778139, + "grad_norm": 0.6595458984375, + "learning_rate": 7.50200182924685e-06, + "loss": 0.781, + "step": 12174 + }, + { + "epoch": 0.6700974186801695, + "grad_norm": 0.6668636202812195, + "learning_rate": 7.501626526992978e-06, + "loss": 0.7702, + "step": 12175 + }, + { + "epoch": 0.6701524574825252, + "grad_norm": 0.686851441860199, + "learning_rate": 7.501251205937551e-06, + "loss": 0.8648, + "step": 12176 + }, + { + "epoch": 0.6702074962848809, + "grad_norm": 0.7363078594207764, + "learning_rate": 7.500875866083388e-06, + "loss": 0.7309, + "step": 12177 + }, + { + "epoch": 0.6702625350872365, + "grad_norm": 0.6927379369735718, + "learning_rate": 7.500500507433312e-06, + "loss": 0.7258, + "step": 12178 + }, + { + "epoch": 0.6703175738895921, + "grad_norm": 0.6589936017990112, + "learning_rate": 7.5001251299901455e-06, + "loss": 0.6776, + "step": 12179 + }, + { + "epoch": 0.6703726126919478, + "grad_norm": 0.6402539610862732, + "learning_rate": 7.499749733756707e-06, + "loss": 0.7467, + "step": 12180 + }, + { + "epoch": 0.6704276514943035, + "grad_norm": 0.776469886302948, + "learning_rate": 7.499374318735817e-06, + "loss": 0.7856, + "step": 12181 + }, + { + "epoch": 0.6704826902966592, + "grad_norm": 0.7062460780143738, + "learning_rate": 7.4989988849303e-06, + "loss": 0.8286, + "step": 12182 + }, + { + "epoch": 0.6705377290990148, + "grad_norm": 0.6725799441337585, + "learning_rate": 7.4986234323429755e-06, + "loss": 0.7517, + "step": 12183 + }, + { + "epoch": 0.6705927679013705, + "grad_norm": 0.6444042921066284, + "learning_rate": 7.498247960976667e-06, + "loss": 0.5984, + "step": 12184 + }, + { + "epoch": 0.6706478067037261, + "grad_norm": 0.6968628764152527, + "learning_rate": 7.497872470834195e-06, + "loss": 0.6996, + "step": 12185 + }, + { + "epoch": 0.6707028455060818, + "grad_norm": 0.643500030040741, + "learning_rate": 7.497496961918381e-06, + "loss": 0.6252, + "step": 12186 + }, + { + "epoch": 0.6707578843084374, + "grad_norm": 0.7026870846748352, + "learning_rate": 7.49712143423205e-06, + "loss": 0.7883, + "step": 12187 + }, + { + "epoch": 0.6708129231107931, + "grad_norm": 0.8169240951538086, + "learning_rate": 7.496745887778022e-06, + "loss": 0.6717, + "step": 12188 + }, + { + "epoch": 0.6708679619131488, + "grad_norm": 0.6611927151679993, + "learning_rate": 7.496370322559121e-06, + "loss": 0.6674, + "step": 12189 + }, + { + "epoch": 0.6709230007155045, + "grad_norm": 0.7330195307731628, + "learning_rate": 7.495994738578169e-06, + "loss": 0.7809, + "step": 12190 + }, + { + "epoch": 0.6709780395178601, + "grad_norm": 0.6469636559486389, + "learning_rate": 7.495619135837988e-06, + "loss": 0.6511, + "step": 12191 + }, + { + "epoch": 0.6710330783202157, + "grad_norm": 0.6558564901351929, + "learning_rate": 7.495243514341402e-06, + "loss": 0.7284, + "step": 12192 + }, + { + "epoch": 0.6710881171225714, + "grad_norm": 0.6736281514167786, + "learning_rate": 7.494867874091233e-06, + "loss": 0.7007, + "step": 12193 + }, + { + "epoch": 0.6711431559249271, + "grad_norm": 0.7302053570747375, + "learning_rate": 7.494492215090304e-06, + "loss": 0.77, + "step": 12194 + }, + { + "epoch": 0.6711981947272827, + "grad_norm": 0.7368764877319336, + "learning_rate": 7.494116537341442e-06, + "loss": 0.8478, + "step": 12195 + }, + { + "epoch": 0.6712532335296384, + "grad_norm": 0.782767653465271, + "learning_rate": 7.493740840847466e-06, + "loss": 0.813, + "step": 12196 + }, + { + "epoch": 0.6713082723319941, + "grad_norm": 0.6787601113319397, + "learning_rate": 7.493365125611202e-06, + "loss": 0.7507, + "step": 12197 + }, + { + "epoch": 0.6713633111343498, + "grad_norm": 0.6912569999694824, + "learning_rate": 7.4929893916354715e-06, + "loss": 0.8003, + "step": 12198 + }, + { + "epoch": 0.6714183499367053, + "grad_norm": 0.7625328898429871, + "learning_rate": 7.4926136389231005e-06, + "loss": 0.8021, + "step": 12199 + }, + { + "epoch": 0.671473388739061, + "grad_norm": 0.6720984578132629, + "learning_rate": 7.4922378674769146e-06, + "loss": 0.7757, + "step": 12200 + }, + { + "epoch": 0.6715284275414167, + "grad_norm": 0.7816714644432068, + "learning_rate": 7.491862077299734e-06, + "loss": 0.7086, + "step": 12201 + }, + { + "epoch": 0.6715834663437723, + "grad_norm": 0.7546358108520508, + "learning_rate": 7.491486268394387e-06, + "loss": 0.8365, + "step": 12202 + }, + { + "epoch": 0.671638505146128, + "grad_norm": 0.7201979756355286, + "learning_rate": 7.491110440763695e-06, + "loss": 0.835, + "step": 12203 + }, + { + "epoch": 0.6716935439484837, + "grad_norm": 0.8177551031112671, + "learning_rate": 7.490734594410484e-06, + "loss": 0.8636, + "step": 12204 + }, + { + "epoch": 0.6717485827508394, + "grad_norm": 0.7433933019638062, + "learning_rate": 7.490358729337578e-06, + "loss": 0.745, + "step": 12205 + }, + { + "epoch": 0.671803621553195, + "grad_norm": 0.8013591170310974, + "learning_rate": 7.489982845547802e-06, + "loss": 0.7638, + "step": 12206 + }, + { + "epoch": 0.6718586603555506, + "grad_norm": 0.6561495065689087, + "learning_rate": 7.489606943043982e-06, + "loss": 0.7997, + "step": 12207 + }, + { + "epoch": 0.6719136991579063, + "grad_norm": 0.7291023135185242, + "learning_rate": 7.489231021828943e-06, + "loss": 0.7452, + "step": 12208 + }, + { + "epoch": 0.671968737960262, + "grad_norm": 0.6978216171264648, + "learning_rate": 7.488855081905511e-06, + "loss": 0.7984, + "step": 12209 + }, + { + "epoch": 0.6720237767626176, + "grad_norm": 0.701006293296814, + "learning_rate": 7.488479123276507e-06, + "loss": 0.7218, + "step": 12210 + }, + { + "epoch": 0.6720788155649733, + "grad_norm": 0.7275286912918091, + "learning_rate": 7.488103145944763e-06, + "loss": 0.6872, + "step": 12211 + }, + { + "epoch": 0.672133854367329, + "grad_norm": 0.7319645881652832, + "learning_rate": 7.487727149913101e-06, + "loss": 0.7862, + "step": 12212 + }, + { + "epoch": 0.6721888931696847, + "grad_norm": 0.7143612504005432, + "learning_rate": 7.487351135184348e-06, + "loss": 0.838, + "step": 12213 + }, + { + "epoch": 0.6722439319720402, + "grad_norm": 0.7135382294654846, + "learning_rate": 7.486975101761329e-06, + "loss": 0.7263, + "step": 12214 + }, + { + "epoch": 0.6722989707743959, + "grad_norm": 0.6283460259437561, + "learning_rate": 7.486599049646872e-06, + "loss": 0.7262, + "step": 12215 + }, + { + "epoch": 0.6723540095767516, + "grad_norm": 0.7196768522262573, + "learning_rate": 7.486222978843801e-06, + "loss": 0.6752, + "step": 12216 + }, + { + "epoch": 0.6724090483791073, + "grad_norm": 0.5856572389602661, + "learning_rate": 7.485846889354944e-06, + "loss": 0.6779, + "step": 12217 + }, + { + "epoch": 0.6724640871814629, + "grad_norm": 0.7671294808387756, + "learning_rate": 7.485470781183126e-06, + "loss": 0.766, + "step": 12218 + }, + { + "epoch": 0.6725191259838186, + "grad_norm": 0.6780520677566528, + "learning_rate": 7.485094654331177e-06, + "loss": 0.7474, + "step": 12219 + }, + { + "epoch": 0.6725741647861743, + "grad_norm": 0.7537981867790222, + "learning_rate": 7.484718508801921e-06, + "loss": 0.8347, + "step": 12220 + }, + { + "epoch": 0.67262920358853, + "grad_norm": 0.7451551556587219, + "learning_rate": 7.484342344598186e-06, + "loss": 0.8217, + "step": 12221 + }, + { + "epoch": 0.6726842423908855, + "grad_norm": 0.6656951904296875, + "learning_rate": 7.483966161722798e-06, + "loss": 0.7437, + "step": 12222 + }, + { + "epoch": 0.6727392811932412, + "grad_norm": 0.7306267619132996, + "learning_rate": 7.483589960178586e-06, + "loss": 0.8495, + "step": 12223 + }, + { + "epoch": 0.6727943199955969, + "grad_norm": 0.6619658470153809, + "learning_rate": 7.483213739968376e-06, + "loss": 0.6379, + "step": 12224 + }, + { + "epoch": 0.6728493587979526, + "grad_norm": 0.7066444754600525, + "learning_rate": 7.4828375010949974e-06, + "loss": 0.7307, + "step": 12225 + }, + { + "epoch": 0.6729043976003082, + "grad_norm": 0.7356079816818237, + "learning_rate": 7.482461243561276e-06, + "loss": 0.7781, + "step": 12226 + }, + { + "epoch": 0.6729594364026639, + "grad_norm": 0.6759988069534302, + "learning_rate": 7.48208496737004e-06, + "loss": 0.7808, + "step": 12227 + }, + { + "epoch": 0.6730144752050196, + "grad_norm": 0.7519234418869019, + "learning_rate": 7.481708672524119e-06, + "loss": 0.7948, + "step": 12228 + }, + { + "epoch": 0.6730695140073752, + "grad_norm": 0.6387592554092407, + "learning_rate": 7.48133235902634e-06, + "loss": 0.7423, + "step": 12229 + }, + { + "epoch": 0.6731245528097308, + "grad_norm": 1.0615060329437256, + "learning_rate": 7.480956026879529e-06, + "loss": 0.8668, + "step": 12230 + }, + { + "epoch": 0.6731795916120865, + "grad_norm": 0.7578469514846802, + "learning_rate": 7.480579676086519e-06, + "loss": 0.812, + "step": 12231 + }, + { + "epoch": 0.6732346304144422, + "grad_norm": 0.6669226884841919, + "learning_rate": 7.480203306650134e-06, + "loss": 0.7002, + "step": 12232 + }, + { + "epoch": 0.6732896692167979, + "grad_norm": 0.7110459208488464, + "learning_rate": 7.479826918573208e-06, + "loss": 0.8542, + "step": 12233 + }, + { + "epoch": 0.6733447080191535, + "grad_norm": 0.6632254123687744, + "learning_rate": 7.479450511858563e-06, + "loss": 0.6784, + "step": 12234 + }, + { + "epoch": 0.6733997468215092, + "grad_norm": 0.7368438839912415, + "learning_rate": 7.479074086509032e-06, + "loss": 0.7683, + "step": 12235 + }, + { + "epoch": 0.6734547856238648, + "grad_norm": 0.764905571937561, + "learning_rate": 7.478697642527447e-06, + "loss": 0.7585, + "step": 12236 + }, + { + "epoch": 0.6735098244262205, + "grad_norm": 0.7141197323799133, + "learning_rate": 7.478321179916632e-06, + "loss": 0.7409, + "step": 12237 + }, + { + "epoch": 0.6735648632285761, + "grad_norm": 0.6514197587966919, + "learning_rate": 7.477944698679419e-06, + "loss": 0.7623, + "step": 12238 + }, + { + "epoch": 0.6736199020309318, + "grad_norm": 0.7712671160697937, + "learning_rate": 7.477568198818636e-06, + "loss": 0.777, + "step": 12239 + }, + { + "epoch": 0.6736749408332875, + "grad_norm": 0.6690881252288818, + "learning_rate": 7.4771916803371145e-06, + "loss": 0.7275, + "step": 12240 + }, + { + "epoch": 0.6737299796356432, + "grad_norm": 0.7206465601921082, + "learning_rate": 7.476815143237683e-06, + "loss": 0.853, + "step": 12241 + }, + { + "epoch": 0.6737850184379988, + "grad_norm": 0.7052504420280457, + "learning_rate": 7.476438587523171e-06, + "loss": 0.774, + "step": 12242 + }, + { + "epoch": 0.6738400572403545, + "grad_norm": 1.6168169975280762, + "learning_rate": 7.476062013196411e-06, + "loss": 0.7423, + "step": 12243 + }, + { + "epoch": 0.6738950960427101, + "grad_norm": 0.715300977230072, + "learning_rate": 7.475685420260232e-06, + "loss": 0.78, + "step": 12244 + }, + { + "epoch": 0.6739501348450657, + "grad_norm": 0.7774379253387451, + "learning_rate": 7.475308808717463e-06, + "loss": 0.885, + "step": 12245 + }, + { + "epoch": 0.6740051736474214, + "grad_norm": 0.6998060941696167, + "learning_rate": 7.474932178570935e-06, + "loss": 0.807, + "step": 12246 + }, + { + "epoch": 0.6740602124497771, + "grad_norm": 0.6710013747215271, + "learning_rate": 7.47455552982348e-06, + "loss": 0.7639, + "step": 12247 + }, + { + "epoch": 0.6741152512521328, + "grad_norm": 0.707435667514801, + "learning_rate": 7.474178862477929e-06, + "loss": 0.7914, + "step": 12248 + }, + { + "epoch": 0.6741702900544884, + "grad_norm": 0.7344105243682861, + "learning_rate": 7.47380217653711e-06, + "loss": 0.7464, + "step": 12249 + }, + { + "epoch": 0.674225328856844, + "grad_norm": 0.7157585620880127, + "learning_rate": 7.473425472003858e-06, + "loss": 0.7747, + "step": 12250 + }, + { + "epoch": 0.6742803676591997, + "grad_norm": 0.6978434920310974, + "learning_rate": 7.473048748881001e-06, + "loss": 0.6903, + "step": 12251 + }, + { + "epoch": 0.6743354064615554, + "grad_norm": 0.6454086899757385, + "learning_rate": 7.472672007171372e-06, + "loss": 0.725, + "step": 12252 + }, + { + "epoch": 0.674390445263911, + "grad_norm": 0.6729341745376587, + "learning_rate": 7.4722952468778035e-06, + "loss": 0.7704, + "step": 12253 + }, + { + "epoch": 0.6744454840662667, + "grad_norm": 0.7995265126228333, + "learning_rate": 7.471918468003122e-06, + "loss": 0.7567, + "step": 12254 + }, + { + "epoch": 0.6745005228686224, + "grad_norm": 0.729629397392273, + "learning_rate": 7.471541670550165e-06, + "loss": 0.796, + "step": 12255 + }, + { + "epoch": 0.6745555616709781, + "grad_norm": 0.6923666000366211, + "learning_rate": 7.471164854521764e-06, + "loss": 0.6894, + "step": 12256 + }, + { + "epoch": 0.6746106004733337, + "grad_norm": 0.6485042572021484, + "learning_rate": 7.470788019920747e-06, + "loss": 0.6912, + "step": 12257 + }, + { + "epoch": 0.6746656392756893, + "grad_norm": 0.7569034099578857, + "learning_rate": 7.470411166749949e-06, + "loss": 0.8167, + "step": 12258 + }, + { + "epoch": 0.674720678078045, + "grad_norm": 0.6202835440635681, + "learning_rate": 7.470034295012203e-06, + "loss": 0.6409, + "step": 12259 + }, + { + "epoch": 0.6747757168804007, + "grad_norm": 0.6414007544517517, + "learning_rate": 7.4696574047103395e-06, + "loss": 0.7163, + "step": 12260 + }, + { + "epoch": 0.6748307556827563, + "grad_norm": 0.7012181878089905, + "learning_rate": 7.469280495847193e-06, + "loss": 0.7682, + "step": 12261 + }, + { + "epoch": 0.674885794485112, + "grad_norm": 0.7027888298034668, + "learning_rate": 7.468903568425596e-06, + "loss": 0.7561, + "step": 12262 + }, + { + "epoch": 0.6749408332874677, + "grad_norm": 0.7282221913337708, + "learning_rate": 7.4685266224483785e-06, + "loss": 0.7552, + "step": 12263 + }, + { + "epoch": 0.6749958720898234, + "grad_norm": 0.7349117398262024, + "learning_rate": 7.468149657918377e-06, + "loss": 0.8323, + "step": 12264 + }, + { + "epoch": 0.675050910892179, + "grad_norm": 0.8992187976837158, + "learning_rate": 7.467772674838424e-06, + "loss": 0.7589, + "step": 12265 + }, + { + "epoch": 0.6751059496945346, + "grad_norm": 0.6773034930229187, + "learning_rate": 7.4673956732113505e-06, + "loss": 0.7229, + "step": 12266 + }, + { + "epoch": 0.6751609884968903, + "grad_norm": 0.6563699841499329, + "learning_rate": 7.467018653039992e-06, + "loss": 0.7526, + "step": 12267 + }, + { + "epoch": 0.675216027299246, + "grad_norm": 0.7559765577316284, + "learning_rate": 7.466641614327181e-06, + "loss": 0.708, + "step": 12268 + }, + { + "epoch": 0.6752710661016016, + "grad_norm": 0.7077820897102356, + "learning_rate": 7.4662645570757545e-06, + "loss": 0.6568, + "step": 12269 + }, + { + "epoch": 0.6753261049039573, + "grad_norm": 0.8082162141799927, + "learning_rate": 7.465887481288541e-06, + "loss": 0.8751, + "step": 12270 + }, + { + "epoch": 0.675381143706313, + "grad_norm": 0.6940243244171143, + "learning_rate": 7.465510386968377e-06, + "loss": 0.7826, + "step": 12271 + }, + { + "epoch": 0.6754361825086687, + "grad_norm": 0.6634145379066467, + "learning_rate": 7.465133274118099e-06, + "loss": 0.6816, + "step": 12272 + }, + { + "epoch": 0.6754912213110242, + "grad_norm": 0.6797559857368469, + "learning_rate": 7.464756142740539e-06, + "loss": 0.7101, + "step": 12273 + }, + { + "epoch": 0.6755462601133799, + "grad_norm": 0.7696588635444641, + "learning_rate": 7.464378992838531e-06, + "loss": 0.8114, + "step": 12274 + }, + { + "epoch": 0.6756012989157356, + "grad_norm": 0.6733334064483643, + "learning_rate": 7.4640018244149105e-06, + "loss": 0.7585, + "step": 12275 + }, + { + "epoch": 0.6756563377180913, + "grad_norm": 0.7087474465370178, + "learning_rate": 7.463624637472512e-06, + "loss": 0.6911, + "step": 12276 + }, + { + "epoch": 0.6757113765204469, + "grad_norm": 0.6944451928138733, + "learning_rate": 7.46324743201417e-06, + "loss": 0.7726, + "step": 12277 + }, + { + "epoch": 0.6757664153228026, + "grad_norm": 0.7214855551719666, + "learning_rate": 7.46287020804272e-06, + "loss": 0.7844, + "step": 12278 + }, + { + "epoch": 0.6758214541251583, + "grad_norm": 0.7106257677078247, + "learning_rate": 7.462492965560995e-06, + "loss": 0.7724, + "step": 12279 + }, + { + "epoch": 0.675876492927514, + "grad_norm": 0.7403497695922852, + "learning_rate": 7.462115704571833e-06, + "loss": 0.7558, + "step": 12280 + }, + { + "epoch": 0.6759315317298695, + "grad_norm": 0.7157884836196899, + "learning_rate": 7.4617384250780685e-06, + "loss": 0.6681, + "step": 12281 + }, + { + "epoch": 0.6759865705322252, + "grad_norm": 0.6937661170959473, + "learning_rate": 7.461361127082538e-06, + "loss": 0.7852, + "step": 12282 + }, + { + "epoch": 0.6760416093345809, + "grad_norm": 0.7106412053108215, + "learning_rate": 7.4609838105880735e-06, + "loss": 0.7689, + "step": 12283 + }, + { + "epoch": 0.6760966481369366, + "grad_norm": 0.6860619187355042, + "learning_rate": 7.460606475597516e-06, + "loss": 0.6528, + "step": 12284 + }, + { + "epoch": 0.6761516869392922, + "grad_norm": 0.7085865139961243, + "learning_rate": 7.460229122113698e-06, + "loss": 0.7303, + "step": 12285 + }, + { + "epoch": 0.6762067257416479, + "grad_norm": 0.6648178100585938, + "learning_rate": 7.459851750139457e-06, + "loss": 0.6751, + "step": 12286 + }, + { + "epoch": 0.6762617645440036, + "grad_norm": 0.74468594789505, + "learning_rate": 7.459474359677629e-06, + "loss": 0.756, + "step": 12287 + }, + { + "epoch": 0.6763168033463591, + "grad_norm": 0.6408486366271973, + "learning_rate": 7.459096950731048e-06, + "loss": 0.7737, + "step": 12288 + }, + { + "epoch": 0.6763718421487148, + "grad_norm": 0.7204515933990479, + "learning_rate": 7.458719523302556e-06, + "loss": 0.7845, + "step": 12289 + }, + { + "epoch": 0.6764268809510705, + "grad_norm": 0.7373428344726562, + "learning_rate": 7.458342077394984e-06, + "loss": 0.7245, + "step": 12290 + }, + { + "epoch": 0.6764819197534262, + "grad_norm": 0.701654851436615, + "learning_rate": 7.45796461301117e-06, + "loss": 0.7711, + "step": 12291 + }, + { + "epoch": 0.6765369585557818, + "grad_norm": 0.7002573013305664, + "learning_rate": 7.4575871301539526e-06, + "loss": 0.8138, + "step": 12292 + }, + { + "epoch": 0.6765919973581375, + "grad_norm": 0.7460681200027466, + "learning_rate": 7.45720962882617e-06, + "loss": 0.8012, + "step": 12293 + }, + { + "epoch": 0.6766470361604932, + "grad_norm": 0.6478421092033386, + "learning_rate": 7.456832109030655e-06, + "loss": 0.7161, + "step": 12294 + }, + { + "epoch": 0.6767020749628488, + "grad_norm": 0.7101582288742065, + "learning_rate": 7.456454570770248e-06, + "loss": 0.7348, + "step": 12295 + }, + { + "epoch": 0.6767571137652044, + "grad_norm": 0.7735113501548767, + "learning_rate": 7.4560770140477865e-06, + "loss": 0.7584, + "step": 12296 + }, + { + "epoch": 0.6768121525675601, + "grad_norm": 0.6811535358428955, + "learning_rate": 7.4556994388661085e-06, + "loss": 0.7653, + "step": 12297 + }, + { + "epoch": 0.6768671913699158, + "grad_norm": 0.7445605397224426, + "learning_rate": 7.455321845228051e-06, + "loss": 0.7661, + "step": 12298 + }, + { + "epoch": 0.6769222301722715, + "grad_norm": 0.6862059831619263, + "learning_rate": 7.4549442331364505e-06, + "loss": 0.776, + "step": 12299 + }, + { + "epoch": 0.6769772689746271, + "grad_norm": 0.7030314207077026, + "learning_rate": 7.4545666025941465e-06, + "loss": 0.7393, + "step": 12300 + }, + { + "epoch": 0.6770323077769828, + "grad_norm": 0.6718610525131226, + "learning_rate": 7.454188953603978e-06, + "loss": 0.7375, + "step": 12301 + }, + { + "epoch": 0.6770873465793384, + "grad_norm": 0.6716088652610779, + "learning_rate": 7.453811286168782e-06, + "loss": 0.8021, + "step": 12302 + }, + { + "epoch": 0.6771423853816941, + "grad_norm": 0.8916372656822205, + "learning_rate": 7.453433600291395e-06, + "loss": 0.8274, + "step": 12303 + }, + { + "epoch": 0.6771974241840497, + "grad_norm": 0.7396363615989685, + "learning_rate": 7.45305589597466e-06, + "loss": 0.7892, + "step": 12304 + }, + { + "epoch": 0.6772524629864054, + "grad_norm": 0.8074424862861633, + "learning_rate": 7.452678173221413e-06, + "loss": 0.7586, + "step": 12305 + }, + { + "epoch": 0.6773075017887611, + "grad_norm": 0.6928194165229797, + "learning_rate": 7.452300432034494e-06, + "loss": 0.7914, + "step": 12306 + }, + { + "epoch": 0.6773625405911168, + "grad_norm": 0.7064313292503357, + "learning_rate": 7.451922672416739e-06, + "loss": 0.7948, + "step": 12307 + }, + { + "epoch": 0.6774175793934724, + "grad_norm": 0.6828622221946716, + "learning_rate": 7.451544894370992e-06, + "loss": 0.6723, + "step": 12308 + }, + { + "epoch": 0.677472618195828, + "grad_norm": 0.6794914603233337, + "learning_rate": 7.45116709790009e-06, + "loss": 0.7344, + "step": 12309 + }, + { + "epoch": 0.6775276569981837, + "grad_norm": 0.7643330097198486, + "learning_rate": 7.45078928300687e-06, + "loss": 0.7836, + "step": 12310 + }, + { + "epoch": 0.6775826958005394, + "grad_norm": 0.692569375038147, + "learning_rate": 7.450411449694176e-06, + "loss": 0.7608, + "step": 12311 + }, + { + "epoch": 0.677637734602895, + "grad_norm": 0.7718693614006042, + "learning_rate": 7.4500335979648455e-06, + "loss": 0.7131, + "step": 12312 + }, + { + "epoch": 0.6776927734052507, + "grad_norm": 0.6267405152320862, + "learning_rate": 7.449655727821716e-06, + "loss": 0.7543, + "step": 12313 + }, + { + "epoch": 0.6777478122076064, + "grad_norm": 0.8252732157707214, + "learning_rate": 7.4492778392676325e-06, + "loss": 0.8799, + "step": 12314 + }, + { + "epoch": 0.6778028510099621, + "grad_norm": 0.6310145854949951, + "learning_rate": 7.448899932305429e-06, + "loss": 0.7389, + "step": 12315 + }, + { + "epoch": 0.6778578898123176, + "grad_norm": 0.6115848422050476, + "learning_rate": 7.448522006937951e-06, + "loss": 0.6069, + "step": 12316 + }, + { + "epoch": 0.6779129286146733, + "grad_norm": 0.6809090971946716, + "learning_rate": 7.448144063168038e-06, + "loss": 0.7092, + "step": 12317 + }, + { + "epoch": 0.677967967417029, + "grad_norm": 0.7285470366477966, + "learning_rate": 7.447766100998529e-06, + "loss": 0.714, + "step": 12318 + }, + { + "epoch": 0.6780230062193847, + "grad_norm": 0.6637021899223328, + "learning_rate": 7.447388120432264e-06, + "loss": 0.7247, + "step": 12319 + }, + { + "epoch": 0.6780780450217403, + "grad_norm": 0.7735750675201416, + "learning_rate": 7.447010121472087e-06, + "loss": 0.7616, + "step": 12320 + }, + { + "epoch": 0.678133083824096, + "grad_norm": 0.7643262147903442, + "learning_rate": 7.446632104120836e-06, + "loss": 0.5863, + "step": 12321 + }, + { + "epoch": 0.6781881226264517, + "grad_norm": 0.6957301497459412, + "learning_rate": 7.446254068381352e-06, + "loss": 0.7125, + "step": 12322 + }, + { + "epoch": 0.6782431614288074, + "grad_norm": 0.6573877930641174, + "learning_rate": 7.445876014256479e-06, + "loss": 0.7115, + "step": 12323 + }, + { + "epoch": 0.6782982002311629, + "grad_norm": 0.6507790684700012, + "learning_rate": 7.445497941749056e-06, + "loss": 0.7266, + "step": 12324 + }, + { + "epoch": 0.6783532390335186, + "grad_norm": 0.8314819931983948, + "learning_rate": 7.4451198508619245e-06, + "loss": 0.6902, + "step": 12325 + }, + { + "epoch": 0.6784082778358743, + "grad_norm": 0.6907274127006531, + "learning_rate": 7.444741741597927e-06, + "loss": 0.8253, + "step": 12326 + }, + { + "epoch": 0.67846331663823, + "grad_norm": 0.7311725616455078, + "learning_rate": 7.444363613959904e-06, + "loss": 0.8641, + "step": 12327 + }, + { + "epoch": 0.6785183554405856, + "grad_norm": 0.6690121293067932, + "learning_rate": 7.443985467950701e-06, + "loss": 0.6966, + "step": 12328 + }, + { + "epoch": 0.6785733942429413, + "grad_norm": 0.6444346308708191, + "learning_rate": 7.443607303573155e-06, + "loss": 0.7848, + "step": 12329 + }, + { + "epoch": 0.678628433045297, + "grad_norm": 0.7553900480270386, + "learning_rate": 7.4432291208301125e-06, + "loss": 0.8196, + "step": 12330 + }, + { + "epoch": 0.6786834718476525, + "grad_norm": 0.6393183469772339, + "learning_rate": 7.442850919724411e-06, + "loss": 0.7622, + "step": 12331 + }, + { + "epoch": 0.6787385106500082, + "grad_norm": 0.7045423984527588, + "learning_rate": 7.442472700258898e-06, + "loss": 0.7483, + "step": 12332 + }, + { + "epoch": 0.6787935494523639, + "grad_norm": 0.7536678314208984, + "learning_rate": 7.442094462436414e-06, + "loss": 0.815, + "step": 12333 + }, + { + "epoch": 0.6788485882547196, + "grad_norm": 0.645391047000885, + "learning_rate": 7.441716206259801e-06, + "loss": 0.7394, + "step": 12334 + }, + { + "epoch": 0.6789036270570752, + "grad_norm": 0.8870118260383606, + "learning_rate": 7.441337931731905e-06, + "loss": 0.8076, + "step": 12335 + }, + { + "epoch": 0.6789586658594309, + "grad_norm": 0.6672457456588745, + "learning_rate": 7.440959638855564e-06, + "loss": 0.7573, + "step": 12336 + }, + { + "epoch": 0.6790137046617866, + "grad_norm": 0.7104566693305969, + "learning_rate": 7.440581327633625e-06, + "loss": 0.6855, + "step": 12337 + }, + { + "epoch": 0.6790687434641423, + "grad_norm": 0.7201581001281738, + "learning_rate": 7.4402029980689294e-06, + "loss": 0.7977, + "step": 12338 + }, + { + "epoch": 0.6791237822664978, + "grad_norm": 0.6685218811035156, + "learning_rate": 7.43982465016432e-06, + "loss": 0.8114, + "step": 12339 + }, + { + "epoch": 0.6791788210688535, + "grad_norm": 0.6913738250732422, + "learning_rate": 7.439446283922645e-06, + "loss": 0.7584, + "step": 12340 + }, + { + "epoch": 0.6792338598712092, + "grad_norm": 0.7332273721694946, + "learning_rate": 7.439067899346742e-06, + "loss": 0.7658, + "step": 12341 + }, + { + "epoch": 0.6792888986735649, + "grad_norm": 0.777909517288208, + "learning_rate": 7.438689496439458e-06, + "loss": 0.8064, + "step": 12342 + }, + { + "epoch": 0.6793439374759205, + "grad_norm": 0.7444930076599121, + "learning_rate": 7.438311075203636e-06, + "loss": 0.7896, + "step": 12343 + }, + { + "epoch": 0.6793989762782762, + "grad_norm": 0.7678806781768799, + "learning_rate": 7.4379326356421224e-06, + "loss": 0.8533, + "step": 12344 + }, + { + "epoch": 0.6794540150806319, + "grad_norm": 0.6653377413749695, + "learning_rate": 7.437554177757759e-06, + "loss": 0.7287, + "step": 12345 + }, + { + "epoch": 0.6795090538829875, + "grad_norm": 0.6270567178726196, + "learning_rate": 7.43717570155339e-06, + "loss": 0.6802, + "step": 12346 + }, + { + "epoch": 0.6795640926853431, + "grad_norm": 0.7091223001480103, + "learning_rate": 7.436797207031861e-06, + "loss": 0.7693, + "step": 12347 + }, + { + "epoch": 0.6796191314876988, + "grad_norm": 0.6583104133605957, + "learning_rate": 7.436418694196018e-06, + "loss": 0.7171, + "step": 12348 + }, + { + "epoch": 0.6796741702900545, + "grad_norm": 0.6897410750389099, + "learning_rate": 7.436040163048703e-06, + "loss": 0.7831, + "step": 12349 + }, + { + "epoch": 0.6797292090924102, + "grad_norm": 0.6506269574165344, + "learning_rate": 7.435661613592763e-06, + "loss": 0.8037, + "step": 12350 + }, + { + "epoch": 0.6797842478947658, + "grad_norm": 0.6772280931472778, + "learning_rate": 7.435283045831041e-06, + "loss": 0.8102, + "step": 12351 + }, + { + "epoch": 0.6798392866971215, + "grad_norm": 0.8470273017883301, + "learning_rate": 7.434904459766384e-06, + "loss": 0.7816, + "step": 12352 + }, + { + "epoch": 0.6798943254994771, + "grad_norm": 0.6969698071479797, + "learning_rate": 7.434525855401638e-06, + "loss": 0.6911, + "step": 12353 + }, + { + "epoch": 0.6799493643018328, + "grad_norm": 0.9969611763954163, + "learning_rate": 7.434147232739646e-06, + "loss": 0.7041, + "step": 12354 + }, + { + "epoch": 0.6800044031041884, + "grad_norm": 0.6697688698768616, + "learning_rate": 7.433768591783255e-06, + "loss": 0.6602, + "step": 12355 + }, + { + "epoch": 0.6800594419065441, + "grad_norm": 0.9857928156852722, + "learning_rate": 7.433389932535311e-06, + "loss": 0.6505, + "step": 12356 + }, + { + "epoch": 0.6801144807088998, + "grad_norm": 0.8787727355957031, + "learning_rate": 7.43301125499866e-06, + "loss": 0.7558, + "step": 12357 + }, + { + "epoch": 0.6801695195112555, + "grad_norm": 0.6035268306732178, + "learning_rate": 7.432632559176147e-06, + "loss": 0.6337, + "step": 12358 + }, + { + "epoch": 0.6802245583136111, + "grad_norm": 0.7977258563041687, + "learning_rate": 7.432253845070621e-06, + "loss": 0.7324, + "step": 12359 + }, + { + "epoch": 0.6802795971159667, + "grad_norm": 0.5842836499214172, + "learning_rate": 7.431875112684923e-06, + "loss": 0.677, + "step": 12360 + }, + { + "epoch": 0.6803346359183224, + "grad_norm": 0.7134125828742981, + "learning_rate": 7.431496362021905e-06, + "loss": 0.7034, + "step": 12361 + }, + { + "epoch": 0.6803896747206781, + "grad_norm": 0.7101823091506958, + "learning_rate": 7.431117593084411e-06, + "loss": 0.7526, + "step": 12362 + }, + { + "epoch": 0.6804447135230337, + "grad_norm": 0.6543304920196533, + "learning_rate": 7.4307388058752865e-06, + "loss": 0.7548, + "step": 12363 + }, + { + "epoch": 0.6804997523253894, + "grad_norm": 0.6522945761680603, + "learning_rate": 7.430360000397381e-06, + "loss": 0.7044, + "step": 12364 + }, + { + "epoch": 0.6805547911277451, + "grad_norm": 0.7405091524124146, + "learning_rate": 7.429981176653539e-06, + "loss": 0.8064, + "step": 12365 + }, + { + "epoch": 0.6806098299301008, + "grad_norm": 0.6454355716705322, + "learning_rate": 7.429602334646611e-06, + "loss": 0.7179, + "step": 12366 + }, + { + "epoch": 0.6806648687324564, + "grad_norm": 0.8131621479988098, + "learning_rate": 7.429223474379439e-06, + "loss": 0.7144, + "step": 12367 + }, + { + "epoch": 0.680719907534812, + "grad_norm": 0.7203080058097839, + "learning_rate": 7.428844595854876e-06, + "loss": 0.8189, + "step": 12368 + }, + { + "epoch": 0.6807749463371677, + "grad_norm": 0.650414228439331, + "learning_rate": 7.428465699075767e-06, + "loss": 0.7815, + "step": 12369 + }, + { + "epoch": 0.6808299851395234, + "grad_norm": 0.8152775168418884, + "learning_rate": 7.42808678404496e-06, + "loss": 0.7365, + "step": 12370 + }, + { + "epoch": 0.680885023941879, + "grad_norm": 0.5871601700782776, + "learning_rate": 7.427707850765302e-06, + "loss": 0.6804, + "step": 12371 + }, + { + "epoch": 0.6809400627442347, + "grad_norm": 0.7115684747695923, + "learning_rate": 7.427328899239643e-06, + "loss": 0.728, + "step": 12372 + }, + { + "epoch": 0.6809951015465904, + "grad_norm": 0.6575615406036377, + "learning_rate": 7.426949929470828e-06, + "loss": 0.725, + "step": 12373 + }, + { + "epoch": 0.681050140348946, + "grad_norm": 0.7744095325469971, + "learning_rate": 7.426570941461708e-06, + "loss": 0.7647, + "step": 12374 + }, + { + "epoch": 0.6811051791513016, + "grad_norm": 0.6856220364570618, + "learning_rate": 7.4261919352151305e-06, + "loss": 0.8121, + "step": 12375 + }, + { + "epoch": 0.6811602179536573, + "grad_norm": 0.8197830319404602, + "learning_rate": 7.425812910733943e-06, + "loss": 0.8685, + "step": 12376 + }, + { + "epoch": 0.681215256756013, + "grad_norm": 1.240628719329834, + "learning_rate": 7.425433868020996e-06, + "loss": 0.8063, + "step": 12377 + }, + { + "epoch": 0.6812702955583686, + "grad_norm": 0.8716747760772705, + "learning_rate": 7.425054807079136e-06, + "loss": 0.7384, + "step": 12378 + }, + { + "epoch": 0.6813253343607243, + "grad_norm": 0.7512598037719727, + "learning_rate": 7.4246757279112135e-06, + "loss": 0.7428, + "step": 12379 + }, + { + "epoch": 0.68138037316308, + "grad_norm": 0.7002312541007996, + "learning_rate": 7.424296630520078e-06, + "loss": 0.6066, + "step": 12380 + }, + { + "epoch": 0.6814354119654357, + "grad_norm": 0.6422720551490784, + "learning_rate": 7.423917514908578e-06, + "loss": 0.6645, + "step": 12381 + }, + { + "epoch": 0.6814904507677912, + "grad_norm": 0.8667505383491516, + "learning_rate": 7.423538381079562e-06, + "loss": 0.8663, + "step": 12382 + }, + { + "epoch": 0.6815454895701469, + "grad_norm": 0.7045377492904663, + "learning_rate": 7.423159229035881e-06, + "loss": 0.7684, + "step": 12383 + }, + { + "epoch": 0.6816005283725026, + "grad_norm": 0.7663894295692444, + "learning_rate": 7.422780058780385e-06, + "loss": 0.8051, + "step": 12384 + }, + { + "epoch": 0.6816555671748583, + "grad_norm": 0.7612582445144653, + "learning_rate": 7.42240087031592e-06, + "loss": 0.7771, + "step": 12385 + }, + { + "epoch": 0.6817106059772139, + "grad_norm": 0.8682271838188171, + "learning_rate": 7.42202166364534e-06, + "loss": 0.7761, + "step": 12386 + }, + { + "epoch": 0.6817656447795696, + "grad_norm": 0.712204098701477, + "learning_rate": 7.421642438771492e-06, + "loss": 0.7832, + "step": 12387 + }, + { + "epoch": 0.6818206835819253, + "grad_norm": 0.6726338863372803, + "learning_rate": 7.42126319569723e-06, + "loss": 0.7541, + "step": 12388 + }, + { + "epoch": 0.681875722384281, + "grad_norm": 0.647570788860321, + "learning_rate": 7.420883934425401e-06, + "loss": 0.7281, + "step": 12389 + }, + { + "epoch": 0.6819307611866365, + "grad_norm": 0.7058577537536621, + "learning_rate": 7.420504654958857e-06, + "loss": 0.8315, + "step": 12390 + }, + { + "epoch": 0.6819857999889922, + "grad_norm": 0.6683655977249146, + "learning_rate": 7.420125357300446e-06, + "loss": 0.772, + "step": 12391 + }, + { + "epoch": 0.6820408387913479, + "grad_norm": 0.6768681406974792, + "learning_rate": 7.419746041453022e-06, + "loss": 0.7023, + "step": 12392 + }, + { + "epoch": 0.6820958775937036, + "grad_norm": 0.8037514686584473, + "learning_rate": 7.419366707419434e-06, + "loss": 0.6894, + "step": 12393 + }, + { + "epoch": 0.6821509163960592, + "grad_norm": 0.6510934829711914, + "learning_rate": 7.418987355202534e-06, + "loss": 0.6411, + "step": 12394 + }, + { + "epoch": 0.6822059551984149, + "grad_norm": 0.7628617882728577, + "learning_rate": 7.418607984805173e-06, + "loss": 0.7681, + "step": 12395 + }, + { + "epoch": 0.6822609940007706, + "grad_norm": 0.7146260738372803, + "learning_rate": 7.418228596230201e-06, + "loss": 0.7003, + "step": 12396 + }, + { + "epoch": 0.6823160328031262, + "grad_norm": 0.6208338737487793, + "learning_rate": 7.41784918948047e-06, + "loss": 0.7138, + "step": 12397 + }, + { + "epoch": 0.6823710716054818, + "grad_norm": 0.7859066724777222, + "learning_rate": 7.417469764558832e-06, + "loss": 0.7984, + "step": 12398 + }, + { + "epoch": 0.6824261104078375, + "grad_norm": 0.7636224031448364, + "learning_rate": 7.417090321468138e-06, + "loss": 0.7445, + "step": 12399 + }, + { + "epoch": 0.6824811492101932, + "grad_norm": 0.9071671366691589, + "learning_rate": 7.41671086021124e-06, + "loss": 0.8058, + "step": 12400 + }, + { + "epoch": 0.6825361880125489, + "grad_norm": 0.5986278057098389, + "learning_rate": 7.416331380790991e-06, + "loss": 0.7001, + "step": 12401 + }, + { + "epoch": 0.6825912268149045, + "grad_norm": 0.6812893152236938, + "learning_rate": 7.415951883210242e-06, + "loss": 0.7745, + "step": 12402 + }, + { + "epoch": 0.6826462656172602, + "grad_norm": 0.666362464427948, + "learning_rate": 7.415572367471844e-06, + "loss": 0.7861, + "step": 12403 + }, + { + "epoch": 0.6827013044196159, + "grad_norm": 0.6963029503822327, + "learning_rate": 7.415192833578653e-06, + "loss": 0.7657, + "step": 12404 + }, + { + "epoch": 0.6827563432219715, + "grad_norm": 0.669876217842102, + "learning_rate": 7.414813281533517e-06, + "loss": 0.6441, + "step": 12405 + }, + { + "epoch": 0.6828113820243271, + "grad_norm": 0.6608602404594421, + "learning_rate": 7.414433711339293e-06, + "loss": 0.7203, + "step": 12406 + }, + { + "epoch": 0.6828664208266828, + "grad_norm": 0.7262642979621887, + "learning_rate": 7.41405412299883e-06, + "loss": 0.7842, + "step": 12407 + }, + { + "epoch": 0.6829214596290385, + "grad_norm": 0.7728527188301086, + "learning_rate": 7.413674516514983e-06, + "loss": 0.7551, + "step": 12408 + }, + { + "epoch": 0.6829764984313942, + "grad_norm": 0.7970840930938721, + "learning_rate": 7.4132948918906035e-06, + "loss": 0.8181, + "step": 12409 + }, + { + "epoch": 0.6830315372337498, + "grad_norm": 0.6672868728637695, + "learning_rate": 7.412915249128546e-06, + "loss": 0.7201, + "step": 12410 + }, + { + "epoch": 0.6830865760361055, + "grad_norm": 0.8261075019836426, + "learning_rate": 7.412535588231664e-06, + "loss": 0.6006, + "step": 12411 + }, + { + "epoch": 0.6831416148384611, + "grad_norm": 0.6768019795417786, + "learning_rate": 7.412155909202809e-06, + "loss": 0.7326, + "step": 12412 + }, + { + "epoch": 0.6831966536408168, + "grad_norm": 0.7482851147651672, + "learning_rate": 7.4117762120448364e-06, + "loss": 0.7913, + "step": 12413 + }, + { + "epoch": 0.6832516924431724, + "grad_norm": 0.7315956354141235, + "learning_rate": 7.411396496760601e-06, + "loss": 0.7949, + "step": 12414 + }, + { + "epoch": 0.6833067312455281, + "grad_norm": 0.7460561394691467, + "learning_rate": 7.411016763352954e-06, + "loss": 0.8445, + "step": 12415 + }, + { + "epoch": 0.6833617700478838, + "grad_norm": 0.7025588154792786, + "learning_rate": 7.410637011824749e-06, + "loss": 0.7658, + "step": 12416 + }, + { + "epoch": 0.6834168088502394, + "grad_norm": 0.7507885694503784, + "learning_rate": 7.410257242178842e-06, + "loss": 0.711, + "step": 12417 + }, + { + "epoch": 0.683471847652595, + "grad_norm": 0.6935780048370361, + "learning_rate": 7.409877454418088e-06, + "loss": 0.8376, + "step": 12418 + }, + { + "epoch": 0.6835268864549507, + "grad_norm": 0.7747789025306702, + "learning_rate": 7.409497648545341e-06, + "loss": 0.8173, + "step": 12419 + }, + { + "epoch": 0.6835819252573064, + "grad_norm": 0.6559001803398132, + "learning_rate": 7.4091178245634525e-06, + "loss": 0.7146, + "step": 12420 + }, + { + "epoch": 0.683636964059662, + "grad_norm": 0.7123926877975464, + "learning_rate": 7.408737982475279e-06, + "loss": 0.7544, + "step": 12421 + }, + { + "epoch": 0.6836920028620177, + "grad_norm": 0.8163334131240845, + "learning_rate": 7.408358122283678e-06, + "loss": 0.8008, + "step": 12422 + }, + { + "epoch": 0.6837470416643734, + "grad_norm": 0.6837686896324158, + "learning_rate": 7.4079782439915e-06, + "loss": 0.6595, + "step": 12423 + }, + { + "epoch": 0.6838020804667291, + "grad_norm": 0.9385979175567627, + "learning_rate": 7.407598347601601e-06, + "loss": 0.8135, + "step": 12424 + }, + { + "epoch": 0.6838571192690847, + "grad_norm": 0.7197830677032471, + "learning_rate": 7.407218433116839e-06, + "loss": 0.8401, + "step": 12425 + }, + { + "epoch": 0.6839121580714403, + "grad_norm": 0.7165716290473938, + "learning_rate": 7.406838500540069e-06, + "loss": 0.7864, + "step": 12426 + }, + { + "epoch": 0.683967196873796, + "grad_norm": 0.6844950318336487, + "learning_rate": 7.4064585498741435e-06, + "loss": 0.7409, + "step": 12427 + }, + { + "epoch": 0.6840222356761517, + "grad_norm": 0.6237946152687073, + "learning_rate": 7.40607858112192e-06, + "loss": 0.6915, + "step": 12428 + }, + { + "epoch": 0.6840772744785073, + "grad_norm": 0.7437137365341187, + "learning_rate": 7.405698594286252e-06, + "loss": 0.8191, + "step": 12429 + }, + { + "epoch": 0.684132313280863, + "grad_norm": 0.6956225633621216, + "learning_rate": 7.4053185893700006e-06, + "loss": 0.7662, + "step": 12430 + }, + { + "epoch": 0.6841873520832187, + "grad_norm": 0.6508380174636841, + "learning_rate": 7.404938566376018e-06, + "loss": 0.7758, + "step": 12431 + }, + { + "epoch": 0.6842423908855744, + "grad_norm": 0.6759025454521179, + "learning_rate": 7.404558525307159e-06, + "loss": 0.7713, + "step": 12432 + }, + { + "epoch": 0.68429742968793, + "grad_norm": 0.7280172109603882, + "learning_rate": 7.404178466166283e-06, + "loss": 0.7753, + "step": 12433 + }, + { + "epoch": 0.6843524684902856, + "grad_norm": 0.7599073052406311, + "learning_rate": 7.403798388956245e-06, + "loss": 0.6993, + "step": 12434 + }, + { + "epoch": 0.6844075072926413, + "grad_norm": 0.7962353229522705, + "learning_rate": 7.403418293679903e-06, + "loss": 0.771, + "step": 12435 + }, + { + "epoch": 0.684462546094997, + "grad_norm": 0.6714458465576172, + "learning_rate": 7.40303818034011e-06, + "loss": 0.7077, + "step": 12436 + }, + { + "epoch": 0.6845175848973526, + "grad_norm": 0.6770713925361633, + "learning_rate": 7.402658048939726e-06, + "loss": 0.7695, + "step": 12437 + }, + { + "epoch": 0.6845726236997083, + "grad_norm": 0.7337867617607117, + "learning_rate": 7.402277899481608e-06, + "loss": 0.9453, + "step": 12438 + }, + { + "epoch": 0.684627662502064, + "grad_norm": 0.7457698583602905, + "learning_rate": 7.401897731968612e-06, + "loss": 0.7569, + "step": 12439 + }, + { + "epoch": 0.6846827013044197, + "grad_norm": 0.6683285236358643, + "learning_rate": 7.401517546403595e-06, + "loss": 0.7215, + "step": 12440 + }, + { + "epoch": 0.6847377401067752, + "grad_norm": 0.6516628861427307, + "learning_rate": 7.401137342789415e-06, + "loss": 0.7433, + "step": 12441 + }, + { + "epoch": 0.6847927789091309, + "grad_norm": 0.7572295665740967, + "learning_rate": 7.400757121128932e-06, + "loss": 0.7204, + "step": 12442 + }, + { + "epoch": 0.6848478177114866, + "grad_norm": 0.6884106993675232, + "learning_rate": 7.400376881425e-06, + "loss": 0.6766, + "step": 12443 + }, + { + "epoch": 0.6849028565138423, + "grad_norm": 0.798926591873169, + "learning_rate": 7.399996623680475e-06, + "loss": 0.7673, + "step": 12444 + }, + { + "epoch": 0.6849578953161979, + "grad_norm": 0.7200846672058105, + "learning_rate": 7.399616347898221e-06, + "loss": 0.8032, + "step": 12445 + }, + { + "epoch": 0.6850129341185536, + "grad_norm": 0.7085461020469666, + "learning_rate": 7.3992360540810915e-06, + "loss": 0.7075, + "step": 12446 + }, + { + "epoch": 0.6850679729209093, + "grad_norm": 0.6885339021682739, + "learning_rate": 7.398855742231947e-06, + "loss": 0.7278, + "step": 12447 + }, + { + "epoch": 0.685123011723265, + "grad_norm": 0.6693943738937378, + "learning_rate": 7.398475412353643e-06, + "loss": 0.7134, + "step": 12448 + }, + { + "epoch": 0.6851780505256205, + "grad_norm": 0.6908173561096191, + "learning_rate": 7.398095064449041e-06, + "loss": 0.8054, + "step": 12449 + }, + { + "epoch": 0.6852330893279762, + "grad_norm": 0.6207892894744873, + "learning_rate": 7.397714698520999e-06, + "loss": 0.5789, + "step": 12450 + }, + { + "epoch": 0.6852881281303319, + "grad_norm": 0.8367832899093628, + "learning_rate": 7.397334314572374e-06, + "loss": 0.8186, + "step": 12451 + }, + { + "epoch": 0.6853431669326876, + "grad_norm": 0.7005738615989685, + "learning_rate": 7.396953912606026e-06, + "loss": 0.8177, + "step": 12452 + }, + { + "epoch": 0.6853982057350432, + "grad_norm": 0.7189906239509583, + "learning_rate": 7.396573492624814e-06, + "loss": 0.8387, + "step": 12453 + }, + { + "epoch": 0.6854532445373989, + "grad_norm": 1.040576457977295, + "learning_rate": 7.3961930546315995e-06, + "loss": 0.7165, + "step": 12454 + }, + { + "epoch": 0.6855082833397546, + "grad_norm": 0.6417170166969299, + "learning_rate": 7.3958125986292385e-06, + "loss": 0.6671, + "step": 12455 + }, + { + "epoch": 0.6855633221421102, + "grad_norm": 0.6443242430686951, + "learning_rate": 7.395432124620589e-06, + "loss": 0.6995, + "step": 12456 + }, + { + "epoch": 0.6856183609444658, + "grad_norm": 0.5764951705932617, + "learning_rate": 7.395051632608516e-06, + "loss": 0.6088, + "step": 12457 + }, + { + "epoch": 0.6856733997468215, + "grad_norm": 0.6193686127662659, + "learning_rate": 7.394671122595873e-06, + "loss": 0.7283, + "step": 12458 + }, + { + "epoch": 0.6857284385491772, + "grad_norm": 0.6773817539215088, + "learning_rate": 7.394290594585525e-06, + "loss": 0.8204, + "step": 12459 + }, + { + "epoch": 0.6857834773515328, + "grad_norm": 0.7906570434570312, + "learning_rate": 7.393910048580328e-06, + "loss": 0.7057, + "step": 12460 + }, + { + "epoch": 0.6858385161538885, + "grad_norm": 0.7544124126434326, + "learning_rate": 7.393529484583145e-06, + "loss": 0.8053, + "step": 12461 + }, + { + "epoch": 0.6858935549562442, + "grad_norm": 0.6878008842468262, + "learning_rate": 7.3931489025968365e-06, + "loss": 0.6972, + "step": 12462 + }, + { + "epoch": 0.6859485937585998, + "grad_norm": 0.6734861731529236, + "learning_rate": 7.392768302624259e-06, + "loss": 0.7921, + "step": 12463 + }, + { + "epoch": 0.6860036325609554, + "grad_norm": 0.6845618486404419, + "learning_rate": 7.392387684668276e-06, + "loss": 0.7461, + "step": 12464 + }, + { + "epoch": 0.6860586713633111, + "grad_norm": 0.6362663507461548, + "learning_rate": 7.392007048731748e-06, + "loss": 0.7108, + "step": 12465 + }, + { + "epoch": 0.6861137101656668, + "grad_norm": 0.7441046237945557, + "learning_rate": 7.391626394817537e-06, + "loss": 0.6944, + "step": 12466 + }, + { + "epoch": 0.6861687489680225, + "grad_norm": 1.0933935642242432, + "learning_rate": 7.391245722928501e-06, + "loss": 0.7744, + "step": 12467 + }, + { + "epoch": 0.6862237877703781, + "grad_norm": 0.6531348824501038, + "learning_rate": 7.3908650330675e-06, + "loss": 0.6772, + "step": 12468 + }, + { + "epoch": 0.6862788265727338, + "grad_norm": 0.7533715963363647, + "learning_rate": 7.390484325237399e-06, + "loss": 0.7385, + "step": 12469 + }, + { + "epoch": 0.6863338653750894, + "grad_norm": 0.618679940700531, + "learning_rate": 7.390103599441058e-06, + "loss": 0.6053, + "step": 12470 + }, + { + "epoch": 0.6863889041774451, + "grad_norm": 0.7102347612380981, + "learning_rate": 7.389722855681338e-06, + "loss": 0.7246, + "step": 12471 + }, + { + "epoch": 0.6864439429798007, + "grad_norm": 0.8545061945915222, + "learning_rate": 7.3893420939611e-06, + "loss": 0.7386, + "step": 12472 + }, + { + "epoch": 0.6864989817821564, + "grad_norm": 0.6298168897628784, + "learning_rate": 7.388961314283207e-06, + "loss": 0.6573, + "step": 12473 + }, + { + "epoch": 0.6865540205845121, + "grad_norm": 0.6909272074699402, + "learning_rate": 7.388580516650521e-06, + "loss": 0.7973, + "step": 12474 + }, + { + "epoch": 0.6866090593868678, + "grad_norm": 0.6782366037368774, + "learning_rate": 7.388199701065904e-06, + "loss": 0.7437, + "step": 12475 + }, + { + "epoch": 0.6866640981892234, + "grad_norm": 0.6826187372207642, + "learning_rate": 7.387818867532213e-06, + "loss": 0.6254, + "step": 12476 + }, + { + "epoch": 0.686719136991579, + "grad_norm": 0.7471422553062439, + "learning_rate": 7.387438016052318e-06, + "loss": 0.8668, + "step": 12477 + }, + { + "epoch": 0.6867741757939347, + "grad_norm": 0.7987646460533142, + "learning_rate": 7.38705714662908e-06, + "loss": 0.6759, + "step": 12478 + }, + { + "epoch": 0.6868292145962904, + "grad_norm": 0.7318877577781677, + "learning_rate": 7.386676259265356e-06, + "loss": 0.7167, + "step": 12479 + }, + { + "epoch": 0.686884253398646, + "grad_norm": 0.6655439138412476, + "learning_rate": 7.386295353964013e-06, + "loss": 0.7184, + "step": 12480 + }, + { + "epoch": 0.6869392922010017, + "grad_norm": 0.7323878407478333, + "learning_rate": 7.385914430727912e-06, + "loss": 0.7562, + "step": 12481 + }, + { + "epoch": 0.6869943310033574, + "grad_norm": 0.7813006639480591, + "learning_rate": 7.385533489559918e-06, + "loss": 0.7665, + "step": 12482 + }, + { + "epoch": 0.6870493698057131, + "grad_norm": 0.6889718770980835, + "learning_rate": 7.385152530462894e-06, + "loss": 0.6587, + "step": 12483 + }, + { + "epoch": 0.6871044086080687, + "grad_norm": 0.6930332183837891, + "learning_rate": 7.384771553439698e-06, + "loss": 0.8244, + "step": 12484 + }, + { + "epoch": 0.6871594474104243, + "grad_norm": 0.8294679522514343, + "learning_rate": 7.384390558493201e-06, + "loss": 0.6977, + "step": 12485 + }, + { + "epoch": 0.68721448621278, + "grad_norm": 0.7235204577445984, + "learning_rate": 7.384009545626262e-06, + "loss": 0.7946, + "step": 12486 + }, + { + "epoch": 0.6872695250151357, + "grad_norm": 0.6346727609634399, + "learning_rate": 7.3836285148417456e-06, + "loss": 0.6109, + "step": 12487 + }, + { + "epoch": 0.6873245638174913, + "grad_norm": 0.7168872356414795, + "learning_rate": 7.383247466142513e-06, + "loss": 0.7485, + "step": 12488 + }, + { + "epoch": 0.687379602619847, + "grad_norm": 0.6511938571929932, + "learning_rate": 7.382866399531434e-06, + "loss": 0.8048, + "step": 12489 + }, + { + "epoch": 0.6874346414222027, + "grad_norm": 0.7569704651832581, + "learning_rate": 7.3824853150113674e-06, + "loss": 0.8017, + "step": 12490 + }, + { + "epoch": 0.6874896802245584, + "grad_norm": 0.7708210945129395, + "learning_rate": 7.382104212585178e-06, + "loss": 0.7258, + "step": 12491 + }, + { + "epoch": 0.6875447190269139, + "grad_norm": 0.709702730178833, + "learning_rate": 7.381723092255731e-06, + "loss": 0.7707, + "step": 12492 + }, + { + "epoch": 0.6875997578292696, + "grad_norm": 0.6683183908462524, + "learning_rate": 7.381341954025892e-06, + "loss": 0.702, + "step": 12493 + }, + { + "epoch": 0.6876547966316253, + "grad_norm": 0.7639274597167969, + "learning_rate": 7.380960797898524e-06, + "loss": 0.7027, + "step": 12494 + }, + { + "epoch": 0.687709835433981, + "grad_norm": 0.6735698580741882, + "learning_rate": 7.380579623876492e-06, + "loss": 0.7124, + "step": 12495 + }, + { + "epoch": 0.6877648742363366, + "grad_norm": 0.6635340452194214, + "learning_rate": 7.38019843196266e-06, + "loss": 0.6968, + "step": 12496 + }, + { + "epoch": 0.6878199130386923, + "grad_norm": 0.7459729313850403, + "learning_rate": 7.379817222159895e-06, + "loss": 0.7629, + "step": 12497 + }, + { + "epoch": 0.687874951841048, + "grad_norm": 0.7408778667449951, + "learning_rate": 7.37943599447106e-06, + "loss": 0.8327, + "step": 12498 + }, + { + "epoch": 0.6879299906434037, + "grad_norm": 0.659736156463623, + "learning_rate": 7.379054748899021e-06, + "loss": 0.6746, + "step": 12499 + }, + { + "epoch": 0.6879850294457592, + "grad_norm": 0.7429264783859253, + "learning_rate": 7.3786734854466435e-06, + "loss": 0.8555, + "step": 12500 + }, + { + "epoch": 0.6880400682481149, + "grad_norm": 0.7492697834968567, + "learning_rate": 7.378292204116793e-06, + "loss": 0.7825, + "step": 12501 + }, + { + "epoch": 0.6880951070504706, + "grad_norm": 0.6664871573448181, + "learning_rate": 7.377910904912336e-06, + "loss": 0.7343, + "step": 12502 + }, + { + "epoch": 0.6881501458528262, + "grad_norm": 0.8010555505752563, + "learning_rate": 7.377529587836135e-06, + "loss": 0.6789, + "step": 12503 + }, + { + "epoch": 0.6882051846551819, + "grad_norm": 0.6339166164398193, + "learning_rate": 7.3771482528910585e-06, + "loss": 0.7471, + "step": 12504 + }, + { + "epoch": 0.6882602234575376, + "grad_norm": 0.6750906109809875, + "learning_rate": 7.376766900079973e-06, + "loss": 0.665, + "step": 12505 + }, + { + "epoch": 0.6883152622598933, + "grad_norm": 0.6440090537071228, + "learning_rate": 7.376385529405743e-06, + "loss": 0.6804, + "step": 12506 + }, + { + "epoch": 0.6883703010622488, + "grad_norm": 0.7159061431884766, + "learning_rate": 7.376004140871236e-06, + "loss": 0.7524, + "step": 12507 + }, + { + "epoch": 0.6884253398646045, + "grad_norm": 0.7551491260528564, + "learning_rate": 7.375622734479316e-06, + "loss": 0.891, + "step": 12508 + }, + { + "epoch": 0.6884803786669602, + "grad_norm": 0.6584289073944092, + "learning_rate": 7.375241310232854e-06, + "loss": 0.7313, + "step": 12509 + }, + { + "epoch": 0.6885354174693159, + "grad_norm": 0.7616147398948669, + "learning_rate": 7.374859868134713e-06, + "loss": 0.8351, + "step": 12510 + }, + { + "epoch": 0.6885904562716715, + "grad_norm": 0.669541597366333, + "learning_rate": 7.374478408187761e-06, + "loss": 0.6836, + "step": 12511 + }, + { + "epoch": 0.6886454950740272, + "grad_norm": 0.6483158469200134, + "learning_rate": 7.374096930394864e-06, + "loss": 0.6909, + "step": 12512 + }, + { + "epoch": 0.6887005338763829, + "grad_norm": 0.7079604864120483, + "learning_rate": 7.3737154347588925e-06, + "loss": 0.7151, + "step": 12513 + }, + { + "epoch": 0.6887555726787385, + "grad_norm": 0.6805073618888855, + "learning_rate": 7.373333921282709e-06, + "loss": 0.7761, + "step": 12514 + }, + { + "epoch": 0.6888106114810941, + "grad_norm": 0.757008969783783, + "learning_rate": 7.372952389969183e-06, + "loss": 0.7249, + "step": 12515 + }, + { + "epoch": 0.6888656502834498, + "grad_norm": 0.6990587711334229, + "learning_rate": 7.372570840821183e-06, + "loss": 0.7463, + "step": 12516 + }, + { + "epoch": 0.6889206890858055, + "grad_norm": 0.7405683398246765, + "learning_rate": 7.3721892738415745e-06, + "loss": 0.8039, + "step": 12517 + }, + { + "epoch": 0.6889757278881612, + "grad_norm": 0.6736571192741394, + "learning_rate": 7.371807689033228e-06, + "loss": 0.7084, + "step": 12518 + }, + { + "epoch": 0.6890307666905168, + "grad_norm": 0.752955436706543, + "learning_rate": 7.3714260863990095e-06, + "loss": 0.7951, + "step": 12519 + }, + { + "epoch": 0.6890858054928725, + "grad_norm": 0.6810917258262634, + "learning_rate": 7.3710444659417855e-06, + "loss": 0.7884, + "step": 12520 + }, + { + "epoch": 0.6891408442952281, + "grad_norm": 0.727500855922699, + "learning_rate": 7.370662827664427e-06, + "loss": 0.7617, + "step": 12521 + }, + { + "epoch": 0.6891958830975838, + "grad_norm": 0.6739845871925354, + "learning_rate": 7.3702811715698016e-06, + "loss": 0.6831, + "step": 12522 + }, + { + "epoch": 0.6892509218999394, + "grad_norm": 0.850913941860199, + "learning_rate": 7.369899497660779e-06, + "loss": 0.7658, + "step": 12523 + }, + { + "epoch": 0.6893059607022951, + "grad_norm": 0.7352884411811829, + "learning_rate": 7.369517805940223e-06, + "loss": 0.7748, + "step": 12524 + }, + { + "epoch": 0.6893609995046508, + "grad_norm": 0.6702300310134888, + "learning_rate": 7.369136096411008e-06, + "loss": 0.7557, + "step": 12525 + }, + { + "epoch": 0.6894160383070065, + "grad_norm": 0.7117186784744263, + "learning_rate": 7.368754369075999e-06, + "loss": 0.8147, + "step": 12526 + }, + { + "epoch": 0.6894710771093621, + "grad_norm": 0.6896687746047974, + "learning_rate": 7.368372623938067e-06, + "loss": 0.7753, + "step": 12527 + }, + { + "epoch": 0.6895261159117178, + "grad_norm": 0.669207751750946, + "learning_rate": 7.367990861000078e-06, + "loss": 0.739, + "step": 12528 + }, + { + "epoch": 0.6895811547140734, + "grad_norm": 0.7014279961585999, + "learning_rate": 7.367609080264906e-06, + "loss": 0.7712, + "step": 12529 + }, + { + "epoch": 0.6896361935164291, + "grad_norm": 1.0029237270355225, + "learning_rate": 7.367227281735418e-06, + "loss": 0.7641, + "step": 12530 + }, + { + "epoch": 0.6896912323187847, + "grad_norm": 0.6342340707778931, + "learning_rate": 7.3668454654144824e-06, + "loss": 0.7572, + "step": 12531 + }, + { + "epoch": 0.6897462711211404, + "grad_norm": 0.7475802302360535, + "learning_rate": 7.3664636313049696e-06, + "loss": 0.7969, + "step": 12532 + }, + { + "epoch": 0.6898013099234961, + "grad_norm": 0.7478888630867004, + "learning_rate": 7.36608177940975e-06, + "loss": 0.8299, + "step": 12533 + }, + { + "epoch": 0.6898563487258518, + "grad_norm": 0.7017174363136292, + "learning_rate": 7.365699909731694e-06, + "loss": 0.6608, + "step": 12534 + }, + { + "epoch": 0.6899113875282074, + "grad_norm": 0.7259606122970581, + "learning_rate": 7.3653180222736695e-06, + "loss": 0.7088, + "step": 12535 + }, + { + "epoch": 0.689966426330563, + "grad_norm": 0.7049521207809448, + "learning_rate": 7.364936117038548e-06, + "loss": 0.8177, + "step": 12536 + }, + { + "epoch": 0.6900214651329187, + "grad_norm": 0.6557304263114929, + "learning_rate": 7.364554194029201e-06, + "loss": 0.73, + "step": 12537 + }, + { + "epoch": 0.6900765039352744, + "grad_norm": 0.704140305519104, + "learning_rate": 7.364172253248497e-06, + "loss": 0.7671, + "step": 12538 + }, + { + "epoch": 0.69013154273763, + "grad_norm": 0.6879541873931885, + "learning_rate": 7.3637902946993064e-06, + "loss": 0.6707, + "step": 12539 + }, + { + "epoch": 0.6901865815399857, + "grad_norm": 0.7715931534767151, + "learning_rate": 7.363408318384501e-06, + "loss": 0.7494, + "step": 12540 + }, + { + "epoch": 0.6902416203423414, + "grad_norm": 0.7890990972518921, + "learning_rate": 7.363026324306952e-06, + "loss": 0.7499, + "step": 12541 + }, + { + "epoch": 0.6902966591446971, + "grad_norm": 0.7177792191505432, + "learning_rate": 7.362644312469529e-06, + "loss": 0.8053, + "step": 12542 + }, + { + "epoch": 0.6903516979470526, + "grad_norm": 0.7434332370758057, + "learning_rate": 7.3622622828751044e-06, + "loss": 0.7371, + "step": 12543 + }, + { + "epoch": 0.6904067367494083, + "grad_norm": 0.5836912989616394, + "learning_rate": 7.361880235526547e-06, + "loss": 0.6681, + "step": 12544 + }, + { + "epoch": 0.690461775551764, + "grad_norm": 0.6814625263214111, + "learning_rate": 7.3614981704267315e-06, + "loss": 0.7408, + "step": 12545 + }, + { + "epoch": 0.6905168143541196, + "grad_norm": 0.6524162292480469, + "learning_rate": 7.361116087578528e-06, + "loss": 0.6788, + "step": 12546 + }, + { + "epoch": 0.6905718531564753, + "grad_norm": 0.6614788174629211, + "learning_rate": 7.360733986984808e-06, + "loss": 0.75, + "step": 12547 + }, + { + "epoch": 0.690626891958831, + "grad_norm": 1.035152792930603, + "learning_rate": 7.360351868648442e-06, + "loss": 0.7181, + "step": 12548 + }, + { + "epoch": 0.6906819307611867, + "grad_norm": 0.7525657415390015, + "learning_rate": 7.359969732572305e-06, + "loss": 0.8149, + "step": 12549 + }, + { + "epoch": 0.6907369695635422, + "grad_norm": 0.8323431015014648, + "learning_rate": 7.359587578759267e-06, + "loss": 0.6908, + "step": 12550 + }, + { + "epoch": 0.6907920083658979, + "grad_norm": 0.7551344633102417, + "learning_rate": 7.3592054072122e-06, + "loss": 0.794, + "step": 12551 + }, + { + "epoch": 0.6908470471682536, + "grad_norm": 0.5937384366989136, + "learning_rate": 7.358823217933977e-06, + "loss": 0.6532, + "step": 12552 + }, + { + "epoch": 0.6909020859706093, + "grad_norm": 1.5515329837799072, + "learning_rate": 7.358441010927468e-06, + "loss": 0.7003, + "step": 12553 + }, + { + "epoch": 0.6909571247729649, + "grad_norm": 0.6838175654411316, + "learning_rate": 7.3580587861955495e-06, + "loss": 0.7184, + "step": 12554 + }, + { + "epoch": 0.6910121635753206, + "grad_norm": 0.7055354714393616, + "learning_rate": 7.357676543741092e-06, + "loss": 0.8372, + "step": 12555 + }, + { + "epoch": 0.6910672023776763, + "grad_norm": 0.8683249950408936, + "learning_rate": 7.3572942835669695e-06, + "loss": 0.7594, + "step": 12556 + }, + { + "epoch": 0.691122241180032, + "grad_norm": 0.8586179614067078, + "learning_rate": 7.3569120056760535e-06, + "loss": 0.8422, + "step": 12557 + }, + { + "epoch": 0.6911772799823875, + "grad_norm": 0.692132830619812, + "learning_rate": 7.356529710071217e-06, + "loss": 0.7872, + "step": 12558 + }, + { + "epoch": 0.6912323187847432, + "grad_norm": 0.7342404723167419, + "learning_rate": 7.356147396755335e-06, + "loss": 0.6908, + "step": 12559 + }, + { + "epoch": 0.6912873575870989, + "grad_norm": 0.6941357254981995, + "learning_rate": 7.35576506573128e-06, + "loss": 0.608, + "step": 12560 + }, + { + "epoch": 0.6913423963894546, + "grad_norm": 0.648225724697113, + "learning_rate": 7.355382717001925e-06, + "loss": 0.6923, + "step": 12561 + }, + { + "epoch": 0.6913974351918102, + "grad_norm": 0.6735422015190125, + "learning_rate": 7.355000350570144e-06, + "loss": 0.7502, + "step": 12562 + }, + { + "epoch": 0.6914524739941659, + "grad_norm": 0.8507662415504456, + "learning_rate": 7.3546179664388105e-06, + "loss": 0.7883, + "step": 12563 + }, + { + "epoch": 0.6915075127965216, + "grad_norm": 0.7287268042564392, + "learning_rate": 7.3542355646108e-06, + "loss": 0.8687, + "step": 12564 + }, + { + "epoch": 0.6915625515988773, + "grad_norm": 0.6085666418075562, + "learning_rate": 7.353853145088983e-06, + "loss": 0.6675, + "step": 12565 + }, + { + "epoch": 0.6916175904012328, + "grad_norm": 0.727668046951294, + "learning_rate": 7.353470707876237e-06, + "loss": 0.8591, + "step": 12566 + }, + { + "epoch": 0.6916726292035885, + "grad_norm": 0.724846601486206, + "learning_rate": 7.353088252975436e-06, + "loss": 0.8501, + "step": 12567 + }, + { + "epoch": 0.6917276680059442, + "grad_norm": 0.6801046133041382, + "learning_rate": 7.352705780389452e-06, + "loss": 0.7637, + "step": 12568 + }, + { + "epoch": 0.6917827068082999, + "grad_norm": 0.680496335029602, + "learning_rate": 7.352323290121161e-06, + "loss": 0.7308, + "step": 12569 + }, + { + "epoch": 0.6918377456106555, + "grad_norm": 0.7143607139587402, + "learning_rate": 7.351940782173439e-06, + "loss": 0.7494, + "step": 12570 + }, + { + "epoch": 0.6918927844130112, + "grad_norm": 0.679755687713623, + "learning_rate": 7.351558256549158e-06, + "loss": 0.7731, + "step": 12571 + }, + { + "epoch": 0.6919478232153669, + "grad_norm": 0.6626351475715637, + "learning_rate": 7.351175713251197e-06, + "loss": 0.8593, + "step": 12572 + }, + { + "epoch": 0.6920028620177225, + "grad_norm": 0.6830954551696777, + "learning_rate": 7.350793152282427e-06, + "loss": 0.6327, + "step": 12573 + }, + { + "epoch": 0.6920579008200781, + "grad_norm": 0.653810977935791, + "learning_rate": 7.350410573645726e-06, + "loss": 0.7341, + "step": 12574 + }, + { + "epoch": 0.6921129396224338, + "grad_norm": 0.6939566731452942, + "learning_rate": 7.3500279773439675e-06, + "loss": 0.7823, + "step": 12575 + }, + { + "epoch": 0.6921679784247895, + "grad_norm": 0.8212422728538513, + "learning_rate": 7.349645363380029e-06, + "loss": 0.6388, + "step": 12576 + }, + { + "epoch": 0.6922230172271452, + "grad_norm": 0.7703338265419006, + "learning_rate": 7.349262731756783e-06, + "loss": 0.7476, + "step": 12577 + }, + { + "epoch": 0.6922780560295008, + "grad_norm": 0.6710889935493469, + "learning_rate": 7.348880082477108e-06, + "loss": 0.7869, + "step": 12578 + }, + { + "epoch": 0.6923330948318565, + "grad_norm": 0.7384413480758667, + "learning_rate": 7.3484974155438795e-06, + "loss": 0.6628, + "step": 12579 + }, + { + "epoch": 0.6923881336342121, + "grad_norm": 0.7628176212310791, + "learning_rate": 7.348114730959973e-06, + "loss": 0.7599, + "step": 12580 + }, + { + "epoch": 0.6924431724365678, + "grad_norm": 0.683885931968689, + "learning_rate": 7.347732028728264e-06, + "loss": 0.7134, + "step": 12581 + }, + { + "epoch": 0.6924982112389234, + "grad_norm": 0.6710503697395325, + "learning_rate": 7.34734930885163e-06, + "loss": 0.7147, + "step": 12582 + }, + { + "epoch": 0.6925532500412791, + "grad_norm": 0.6984537243843079, + "learning_rate": 7.346966571332947e-06, + "loss": 0.7517, + "step": 12583 + }, + { + "epoch": 0.6926082888436348, + "grad_norm": 0.7563193440437317, + "learning_rate": 7.346583816175092e-06, + "loss": 0.7971, + "step": 12584 + }, + { + "epoch": 0.6926633276459905, + "grad_norm": 0.8407838940620422, + "learning_rate": 7.346201043380941e-06, + "loss": 0.8227, + "step": 12585 + }, + { + "epoch": 0.6927183664483461, + "grad_norm": 0.673098623752594, + "learning_rate": 7.345818252953369e-06, + "loss": 0.7514, + "step": 12586 + }, + { + "epoch": 0.6927734052507017, + "grad_norm": 0.6452111005783081, + "learning_rate": 7.345435444895257e-06, + "loss": 0.7201, + "step": 12587 + }, + { + "epoch": 0.6928284440530574, + "grad_norm": 0.8728383779525757, + "learning_rate": 7.345052619209481e-06, + "loss": 0.7452, + "step": 12588 + }, + { + "epoch": 0.692883482855413, + "grad_norm": 0.7032049298286438, + "learning_rate": 7.344669775898914e-06, + "loss": 0.8885, + "step": 12589 + }, + { + "epoch": 0.6929385216577687, + "grad_norm": 0.7744605541229248, + "learning_rate": 7.344286914966438e-06, + "loss": 0.8048, + "step": 12590 + }, + { + "epoch": 0.6929935604601244, + "grad_norm": 0.7334163784980774, + "learning_rate": 7.343904036414931e-06, + "loss": 0.8502, + "step": 12591 + }, + { + "epoch": 0.6930485992624801, + "grad_norm": 0.6684108376502991, + "learning_rate": 7.343521140247266e-06, + "loss": 0.8264, + "step": 12592 + }, + { + "epoch": 0.6931036380648357, + "grad_norm": 0.6192718744277954, + "learning_rate": 7.343138226466324e-06, + "loss": 0.6625, + "step": 12593 + }, + { + "epoch": 0.6931586768671913, + "grad_norm": 0.6410724520683289, + "learning_rate": 7.342755295074984e-06, + "loss": 0.717, + "step": 12594 + }, + { + "epoch": 0.693213715669547, + "grad_norm": 0.6854361891746521, + "learning_rate": 7.342372346076121e-06, + "loss": 0.7246, + "step": 12595 + }, + { + "epoch": 0.6932687544719027, + "grad_norm": 0.6920250058174133, + "learning_rate": 7.341989379472614e-06, + "loss": 0.7414, + "step": 12596 + }, + { + "epoch": 0.6933237932742583, + "grad_norm": 0.6545842885971069, + "learning_rate": 7.341606395267342e-06, + "loss": 0.7731, + "step": 12597 + }, + { + "epoch": 0.693378832076614, + "grad_norm": 0.6879072785377502, + "learning_rate": 7.341223393463184e-06, + "loss": 0.7272, + "step": 12598 + }, + { + "epoch": 0.6934338708789697, + "grad_norm": 0.7460979223251343, + "learning_rate": 7.340840374063018e-06, + "loss": 0.771, + "step": 12599 + }, + { + "epoch": 0.6934889096813254, + "grad_norm": 0.7836858630180359, + "learning_rate": 7.340457337069722e-06, + "loss": 0.846, + "step": 12600 + }, + { + "epoch": 0.693543948483681, + "grad_norm": 0.958403468132019, + "learning_rate": 7.340074282486174e-06, + "loss": 0.8913, + "step": 12601 + }, + { + "epoch": 0.6935989872860366, + "grad_norm": 0.6614813208580017, + "learning_rate": 7.339691210315254e-06, + "loss": 0.7129, + "step": 12602 + }, + { + "epoch": 0.6936540260883923, + "grad_norm": 0.7303252816200256, + "learning_rate": 7.339308120559843e-06, + "loss": 0.8395, + "step": 12603 + }, + { + "epoch": 0.693709064890748, + "grad_norm": 0.7341620922088623, + "learning_rate": 7.338925013222817e-06, + "loss": 0.8341, + "step": 12604 + }, + { + "epoch": 0.6937641036931036, + "grad_norm": 0.7077179551124573, + "learning_rate": 7.338541888307056e-06, + "loss": 0.7813, + "step": 12605 + }, + { + "epoch": 0.6938191424954593, + "grad_norm": 0.6654969453811646, + "learning_rate": 7.338158745815441e-06, + "loss": 0.7337, + "step": 12606 + }, + { + "epoch": 0.693874181297815, + "grad_norm": 0.6637474894523621, + "learning_rate": 7.337775585750852e-06, + "loss": 0.8197, + "step": 12607 + }, + { + "epoch": 0.6939292201001707, + "grad_norm": 0.654712975025177, + "learning_rate": 7.337392408116166e-06, + "loss": 0.6991, + "step": 12608 + }, + { + "epoch": 0.6939842589025262, + "grad_norm": 0.6698346138000488, + "learning_rate": 7.337009212914265e-06, + "loss": 0.7991, + "step": 12609 + }, + { + "epoch": 0.6940392977048819, + "grad_norm": 0.9616294503211975, + "learning_rate": 7.336626000148028e-06, + "loss": 0.7326, + "step": 12610 + }, + { + "epoch": 0.6940943365072376, + "grad_norm": 0.7749543786048889, + "learning_rate": 7.336242769820335e-06, + "loss": 0.8015, + "step": 12611 + }, + { + "epoch": 0.6941493753095933, + "grad_norm": 0.7263140678405762, + "learning_rate": 7.335859521934068e-06, + "loss": 0.7538, + "step": 12612 + }, + { + "epoch": 0.6942044141119489, + "grad_norm": 0.6383689641952515, + "learning_rate": 7.335476256492105e-06, + "loss": 0.7611, + "step": 12613 + }, + { + "epoch": 0.6942594529143046, + "grad_norm": 0.7464908957481384, + "learning_rate": 7.335092973497326e-06, + "loss": 0.7904, + "step": 12614 + }, + { + "epoch": 0.6943144917166603, + "grad_norm": 1.114864468574524, + "learning_rate": 7.334709672952615e-06, + "loss": 0.8518, + "step": 12615 + }, + { + "epoch": 0.694369530519016, + "grad_norm": 0.6712734699249268, + "learning_rate": 7.334326354860852e-06, + "loss": 0.7431, + "step": 12616 + }, + { + "epoch": 0.6944245693213715, + "grad_norm": 0.7559850811958313, + "learning_rate": 7.3339430192249166e-06, + "loss": 0.7556, + "step": 12617 + }, + { + "epoch": 0.6944796081237272, + "grad_norm": 0.7262033224105835, + "learning_rate": 7.333559666047689e-06, + "loss": 0.7624, + "step": 12618 + }, + { + "epoch": 0.6945346469260829, + "grad_norm": 0.6428695917129517, + "learning_rate": 7.333176295332053e-06, + "loss": 0.6894, + "step": 12619 + }, + { + "epoch": 0.6945896857284386, + "grad_norm": 0.7353672385215759, + "learning_rate": 7.3327929070808875e-06, + "loss": 0.7611, + "step": 12620 + }, + { + "epoch": 0.6946447245307942, + "grad_norm": 0.7063810229301453, + "learning_rate": 7.332409501297076e-06, + "loss": 0.7428, + "step": 12621 + }, + { + "epoch": 0.6946997633331499, + "grad_norm": 0.6552421450614929, + "learning_rate": 7.332026077983498e-06, + "loss": 0.7046, + "step": 12622 + }, + { + "epoch": 0.6947548021355056, + "grad_norm": 0.8843327760696411, + "learning_rate": 7.331642637143037e-06, + "loss": 0.6952, + "step": 12623 + }, + { + "epoch": 0.6948098409378612, + "grad_norm": 0.7279102802276611, + "learning_rate": 7.331259178778574e-06, + "loss": 0.7911, + "step": 12624 + }, + { + "epoch": 0.6948648797402168, + "grad_norm": 0.6585525870323181, + "learning_rate": 7.33087570289299e-06, + "loss": 0.7684, + "step": 12625 + }, + { + "epoch": 0.6949199185425725, + "grad_norm": 0.663185715675354, + "learning_rate": 7.3304922094891695e-06, + "loss": 0.6753, + "step": 12626 + }, + { + "epoch": 0.6949749573449282, + "grad_norm": 0.652765691280365, + "learning_rate": 7.330108698569993e-06, + "loss": 0.7333, + "step": 12627 + }, + { + "epoch": 0.6950299961472839, + "grad_norm": 0.7781688570976257, + "learning_rate": 7.329725170138343e-06, + "loss": 0.7312, + "step": 12628 + }, + { + "epoch": 0.6950850349496395, + "grad_norm": 0.6798241138458252, + "learning_rate": 7.329341624197102e-06, + "loss": 0.7747, + "step": 12629 + }, + { + "epoch": 0.6951400737519952, + "grad_norm": 0.7588373422622681, + "learning_rate": 7.328958060749153e-06, + "loss": 0.8535, + "step": 12630 + }, + { + "epoch": 0.6951951125543508, + "grad_norm": 0.8833348155021667, + "learning_rate": 7.328574479797379e-06, + "loss": 0.8345, + "step": 12631 + }, + { + "epoch": 0.6952501513567064, + "grad_norm": 0.799454927444458, + "learning_rate": 7.328190881344663e-06, + "loss": 0.7571, + "step": 12632 + }, + { + "epoch": 0.6953051901590621, + "grad_norm": 0.8030340671539307, + "learning_rate": 7.327807265393887e-06, + "loss": 0.7426, + "step": 12633 + }, + { + "epoch": 0.6953602289614178, + "grad_norm": 0.6246228218078613, + "learning_rate": 7.327423631947934e-06, + "loss": 0.6712, + "step": 12634 + }, + { + "epoch": 0.6954152677637735, + "grad_norm": 0.7203500866889954, + "learning_rate": 7.32703998100969e-06, + "loss": 0.8315, + "step": 12635 + }, + { + "epoch": 0.6954703065661291, + "grad_norm": 0.6128239035606384, + "learning_rate": 7.326656312582035e-06, + "loss": 0.6788, + "step": 12636 + }, + { + "epoch": 0.6955253453684848, + "grad_norm": 0.8052619695663452, + "learning_rate": 7.326272626667852e-06, + "loss": 0.8076, + "step": 12637 + }, + { + "epoch": 0.6955803841708404, + "grad_norm": 0.9128470420837402, + "learning_rate": 7.325888923270029e-06, + "loss": 0.7135, + "step": 12638 + }, + { + "epoch": 0.6956354229731961, + "grad_norm": 0.6815299391746521, + "learning_rate": 7.325505202391447e-06, + "loss": 0.7756, + "step": 12639 + }, + { + "epoch": 0.6956904617755517, + "grad_norm": 0.6278733611106873, + "learning_rate": 7.325121464034991e-06, + "loss": 0.6583, + "step": 12640 + }, + { + "epoch": 0.6957455005779074, + "grad_norm": 0.7161649465560913, + "learning_rate": 7.324737708203543e-06, + "loss": 0.7106, + "step": 12641 + }, + { + "epoch": 0.6958005393802631, + "grad_norm": 0.6827715635299683, + "learning_rate": 7.324353934899989e-06, + "loss": 0.7988, + "step": 12642 + }, + { + "epoch": 0.6958555781826188, + "grad_norm": 0.9999695420265198, + "learning_rate": 7.323970144127215e-06, + "loss": 0.8222, + "step": 12643 + }, + { + "epoch": 0.6959106169849744, + "grad_norm": 0.8048173785209656, + "learning_rate": 7.323586335888102e-06, + "loss": 0.7157, + "step": 12644 + }, + { + "epoch": 0.69596565578733, + "grad_norm": 0.7403637170791626, + "learning_rate": 7.323202510185536e-06, + "loss": 0.7516, + "step": 12645 + }, + { + "epoch": 0.6960206945896857, + "grad_norm": 0.6660793423652649, + "learning_rate": 7.322818667022402e-06, + "loss": 0.7081, + "step": 12646 + }, + { + "epoch": 0.6960757333920414, + "grad_norm": 0.713985800743103, + "learning_rate": 7.322434806401585e-06, + "loss": 0.7682, + "step": 12647 + }, + { + "epoch": 0.696130772194397, + "grad_norm": 0.739253044128418, + "learning_rate": 7.322050928325969e-06, + "loss": 0.838, + "step": 12648 + }, + { + "epoch": 0.6961858109967527, + "grad_norm": 0.8350489735603333, + "learning_rate": 7.32166703279844e-06, + "loss": 0.7627, + "step": 12649 + }, + { + "epoch": 0.6962408497991084, + "grad_norm": 0.580456018447876, + "learning_rate": 7.321283119821883e-06, + "loss": 0.6248, + "step": 12650 + }, + { + "epoch": 0.6962958886014641, + "grad_norm": 0.8619480729103088, + "learning_rate": 7.320899189399183e-06, + "loss": 0.848, + "step": 12651 + }, + { + "epoch": 0.6963509274038197, + "grad_norm": 0.6201381087303162, + "learning_rate": 7.320515241533227e-06, + "loss": 0.6506, + "step": 12652 + }, + { + "epoch": 0.6964059662061753, + "grad_norm": 0.6956773400306702, + "learning_rate": 7.320131276226898e-06, + "loss": 0.7561, + "step": 12653 + }, + { + "epoch": 0.696461005008531, + "grad_norm": 0.6382080912590027, + "learning_rate": 7.319747293483085e-06, + "loss": 0.6462, + "step": 12654 + }, + { + "epoch": 0.6965160438108867, + "grad_norm": 0.7288708686828613, + "learning_rate": 7.319363293304672e-06, + "loss": 0.7907, + "step": 12655 + }, + { + "epoch": 0.6965710826132423, + "grad_norm": 0.6280390024185181, + "learning_rate": 7.318979275694546e-06, + "loss": 0.6882, + "step": 12656 + }, + { + "epoch": 0.696626121415598, + "grad_norm": 0.7260308861732483, + "learning_rate": 7.31859524065559e-06, + "loss": 0.756, + "step": 12657 + }, + { + "epoch": 0.6966811602179537, + "grad_norm": 0.6715009212493896, + "learning_rate": 7.318211188190696e-06, + "loss": 0.7194, + "step": 12658 + }, + { + "epoch": 0.6967361990203094, + "grad_norm": 0.6770408749580383, + "learning_rate": 7.3178271183027465e-06, + "loss": 0.808, + "step": 12659 + }, + { + "epoch": 0.6967912378226649, + "grad_norm": 0.7209904789924622, + "learning_rate": 7.317443030994628e-06, + "loss": 0.7242, + "step": 12660 + }, + { + "epoch": 0.6968462766250206, + "grad_norm": 0.6943202018737793, + "learning_rate": 7.317058926269227e-06, + "loss": 0.758, + "step": 12661 + }, + { + "epoch": 0.6969013154273763, + "grad_norm": 0.6073412299156189, + "learning_rate": 7.316674804129432e-06, + "loss": 0.6571, + "step": 12662 + }, + { + "epoch": 0.696956354229732, + "grad_norm": 0.7065439224243164, + "learning_rate": 7.316290664578129e-06, + "loss": 0.7333, + "step": 12663 + }, + { + "epoch": 0.6970113930320876, + "grad_norm": 0.6275133490562439, + "learning_rate": 7.315906507618207e-06, + "loss": 0.6785, + "step": 12664 + }, + { + "epoch": 0.6970664318344433, + "grad_norm": 0.6484677791595459, + "learning_rate": 7.315522333252551e-06, + "loss": 0.7461, + "step": 12665 + }, + { + "epoch": 0.697121470636799, + "grad_norm": 0.6815413236618042, + "learning_rate": 7.315138141484049e-06, + "loss": 0.673, + "step": 12666 + }, + { + "epoch": 0.6971765094391547, + "grad_norm": 0.7227872610092163, + "learning_rate": 7.314753932315587e-06, + "loss": 0.7212, + "step": 12667 + }, + { + "epoch": 0.6972315482415102, + "grad_norm": 0.661568284034729, + "learning_rate": 7.314369705750055e-06, + "loss": 0.7633, + "step": 12668 + }, + { + "epoch": 0.6972865870438659, + "grad_norm": 0.5873990654945374, + "learning_rate": 7.3139854617903405e-06, + "loss": 0.6142, + "step": 12669 + }, + { + "epoch": 0.6973416258462216, + "grad_norm": 0.7015652656555176, + "learning_rate": 7.313601200439331e-06, + "loss": 0.6762, + "step": 12670 + }, + { + "epoch": 0.6973966646485773, + "grad_norm": 0.7060853242874146, + "learning_rate": 7.313216921699913e-06, + "loss": 0.8111, + "step": 12671 + }, + { + "epoch": 0.6974517034509329, + "grad_norm": 0.6198092699050903, + "learning_rate": 7.312832625574977e-06, + "loss": 0.7058, + "step": 12672 + }, + { + "epoch": 0.6975067422532886, + "grad_norm": 0.6785464286804199, + "learning_rate": 7.312448312067408e-06, + "loss": 0.7509, + "step": 12673 + }, + { + "epoch": 0.6975617810556443, + "grad_norm": 0.74974524974823, + "learning_rate": 7.312063981180097e-06, + "loss": 0.7679, + "step": 12674 + }, + { + "epoch": 0.6976168198579998, + "grad_norm": 0.6188651919364929, + "learning_rate": 7.311679632915934e-06, + "loss": 0.663, + "step": 12675 + }, + { + "epoch": 0.6976718586603555, + "grad_norm": 0.7458493113517761, + "learning_rate": 7.3112952672778044e-06, + "loss": 0.7316, + "step": 12676 + }, + { + "epoch": 0.6977268974627112, + "grad_norm": 0.7480403780937195, + "learning_rate": 7.310910884268597e-06, + "loss": 0.8476, + "step": 12677 + }, + { + "epoch": 0.6977819362650669, + "grad_norm": 0.6921943426132202, + "learning_rate": 7.310526483891204e-06, + "loss": 0.7931, + "step": 12678 + }, + { + "epoch": 0.6978369750674225, + "grad_norm": 0.7384023666381836, + "learning_rate": 7.3101420661485124e-06, + "loss": 0.7698, + "step": 12679 + }, + { + "epoch": 0.6978920138697782, + "grad_norm": 0.6693310141563416, + "learning_rate": 7.3097576310434105e-06, + "loss": 0.6838, + "step": 12680 + }, + { + "epoch": 0.6979470526721339, + "grad_norm": 0.6888617873191833, + "learning_rate": 7.309373178578789e-06, + "loss": 0.7196, + "step": 12681 + }, + { + "epoch": 0.6980020914744895, + "grad_norm": 0.7608165144920349, + "learning_rate": 7.308988708757536e-06, + "loss": 0.7483, + "step": 12682 + }, + { + "epoch": 0.6980571302768451, + "grad_norm": 0.6969812512397766, + "learning_rate": 7.308604221582543e-06, + "loss": 0.7415, + "step": 12683 + }, + { + "epoch": 0.6981121690792008, + "grad_norm": 0.7440872192382812, + "learning_rate": 7.3082197170566996e-06, + "loss": 0.7776, + "step": 12684 + }, + { + "epoch": 0.6981672078815565, + "grad_norm": 0.7920299768447876, + "learning_rate": 7.307835195182892e-06, + "loss": 0.746, + "step": 12685 + }, + { + "epoch": 0.6982222466839122, + "grad_norm": 0.7002919912338257, + "learning_rate": 7.3074506559640134e-06, + "loss": 0.7948, + "step": 12686 + }, + { + "epoch": 0.6982772854862678, + "grad_norm": 0.7199681997299194, + "learning_rate": 7.3070660994029554e-06, + "loss": 0.7568, + "step": 12687 + }, + { + "epoch": 0.6983323242886235, + "grad_norm": 0.6287575960159302, + "learning_rate": 7.306681525502604e-06, + "loss": 0.6564, + "step": 12688 + }, + { + "epoch": 0.6983873630909792, + "grad_norm": 0.6910778880119324, + "learning_rate": 7.306296934265853e-06, + "loss": 0.7892, + "step": 12689 + }, + { + "epoch": 0.6984424018933348, + "grad_norm": 0.6454603672027588, + "learning_rate": 7.30591232569559e-06, + "loss": 0.7848, + "step": 12690 + }, + { + "epoch": 0.6984974406956904, + "grad_norm": 0.7337101101875305, + "learning_rate": 7.305527699794709e-06, + "loss": 0.8012, + "step": 12691 + }, + { + "epoch": 0.6985524794980461, + "grad_norm": 0.6694337129592896, + "learning_rate": 7.305143056566098e-06, + "loss": 0.7767, + "step": 12692 + }, + { + "epoch": 0.6986075183004018, + "grad_norm": 0.6485214233398438, + "learning_rate": 7.30475839601265e-06, + "loss": 0.7142, + "step": 12693 + }, + { + "epoch": 0.6986625571027575, + "grad_norm": 0.6401854753494263, + "learning_rate": 7.304373718137253e-06, + "loss": 0.6562, + "step": 12694 + }, + { + "epoch": 0.6987175959051131, + "grad_norm": 0.7190635800361633, + "learning_rate": 7.303989022942801e-06, + "loss": 0.7513, + "step": 12695 + }, + { + "epoch": 0.6987726347074688, + "grad_norm": 0.7100299596786499, + "learning_rate": 7.3036043104321854e-06, + "loss": 0.759, + "step": 12696 + }, + { + "epoch": 0.6988276735098244, + "grad_norm": 0.8507145047187805, + "learning_rate": 7.303219580608295e-06, + "loss": 0.7567, + "step": 12697 + }, + { + "epoch": 0.6988827123121801, + "grad_norm": 0.6758378744125366, + "learning_rate": 7.302834833474022e-06, + "loss": 0.6751, + "step": 12698 + }, + { + "epoch": 0.6989377511145357, + "grad_norm": 0.7602974772453308, + "learning_rate": 7.30245006903226e-06, + "loss": 0.7304, + "step": 12699 + }, + { + "epoch": 0.6989927899168914, + "grad_norm": 0.7519045472145081, + "learning_rate": 7.3020652872859e-06, + "loss": 0.7573, + "step": 12700 + }, + { + "epoch": 0.6990478287192471, + "grad_norm": 0.6076456904411316, + "learning_rate": 7.301680488237832e-06, + "loss": 0.6335, + "step": 12701 + }, + { + "epoch": 0.6991028675216028, + "grad_norm": 0.6900685429573059, + "learning_rate": 7.30129567189095e-06, + "loss": 0.7787, + "step": 12702 + }, + { + "epoch": 0.6991579063239584, + "grad_norm": 0.7366316318511963, + "learning_rate": 7.300910838248146e-06, + "loss": 0.8176, + "step": 12703 + }, + { + "epoch": 0.699212945126314, + "grad_norm": 0.6658521890640259, + "learning_rate": 7.300525987312312e-06, + "loss": 0.6436, + "step": 12704 + }, + { + "epoch": 0.6992679839286697, + "grad_norm": 0.7635871171951294, + "learning_rate": 7.300141119086341e-06, + "loss": 0.8421, + "step": 12705 + }, + { + "epoch": 0.6993230227310254, + "grad_norm": 0.7257800698280334, + "learning_rate": 7.299756233573125e-06, + "loss": 0.6468, + "step": 12706 + }, + { + "epoch": 0.699378061533381, + "grad_norm": 0.7536096572875977, + "learning_rate": 7.299371330775558e-06, + "loss": 0.7782, + "step": 12707 + }, + { + "epoch": 0.6994331003357367, + "grad_norm": 0.7504379153251648, + "learning_rate": 7.298986410696529e-06, + "loss": 0.7097, + "step": 12708 + }, + { + "epoch": 0.6994881391380924, + "grad_norm": 0.7340306043624878, + "learning_rate": 7.298601473338936e-06, + "loss": 0.8165, + "step": 12709 + }, + { + "epoch": 0.6995431779404481, + "grad_norm": 0.6928045749664307, + "learning_rate": 7.298216518705667e-06, + "loss": 0.777, + "step": 12710 + }, + { + "epoch": 0.6995982167428036, + "grad_norm": 0.6942496299743652, + "learning_rate": 7.29783154679962e-06, + "loss": 0.6607, + "step": 12711 + }, + { + "epoch": 0.6996532555451593, + "grad_norm": 0.6646896600723267, + "learning_rate": 7.297446557623684e-06, + "loss": 0.712, + "step": 12712 + }, + { + "epoch": 0.699708294347515, + "grad_norm": 0.6828078627586365, + "learning_rate": 7.297061551180758e-06, + "loss": 0.7251, + "step": 12713 + }, + { + "epoch": 0.6997633331498707, + "grad_norm": 0.7554219365119934, + "learning_rate": 7.296676527473729e-06, + "loss": 0.8279, + "step": 12714 + }, + { + "epoch": 0.6998183719522263, + "grad_norm": 0.8122106194496155, + "learning_rate": 7.296291486505495e-06, + "loss": 0.8039, + "step": 12715 + }, + { + "epoch": 0.699873410754582, + "grad_norm": 0.6602222323417664, + "learning_rate": 7.295906428278949e-06, + "loss": 0.7149, + "step": 12716 + }, + { + "epoch": 0.6999284495569377, + "grad_norm": 0.8341954350471497, + "learning_rate": 7.2955213527969845e-06, + "loss": 0.7868, + "step": 12717 + }, + { + "epoch": 0.6999834883592932, + "grad_norm": 0.7157256603240967, + "learning_rate": 7.295136260062496e-06, + "loss": 0.745, + "step": 12718 + }, + { + "epoch": 0.7000385271616489, + "grad_norm": 0.5845672488212585, + "learning_rate": 7.294751150078379e-06, + "loss": 0.657, + "step": 12719 + }, + { + "epoch": 0.7000935659640046, + "grad_norm": 0.7370786070823669, + "learning_rate": 7.2943660228475265e-06, + "loss": 0.7883, + "step": 12720 + }, + { + "epoch": 0.7001486047663603, + "grad_norm": 0.6687451004981995, + "learning_rate": 7.293980878372833e-06, + "loss": 0.7945, + "step": 12721 + }, + { + "epoch": 0.7002036435687159, + "grad_norm": 0.6352105736732483, + "learning_rate": 7.293595716657192e-06, + "loss": 0.6581, + "step": 12722 + }, + { + "epoch": 0.7002586823710716, + "grad_norm": 0.7371370196342468, + "learning_rate": 7.293210537703499e-06, + "loss": 0.7859, + "step": 12723 + }, + { + "epoch": 0.7003137211734273, + "grad_norm": 0.6885504722595215, + "learning_rate": 7.292825341514651e-06, + "loss": 0.7355, + "step": 12724 + }, + { + "epoch": 0.700368759975783, + "grad_norm": 0.6930849552154541, + "learning_rate": 7.292440128093542e-06, + "loss": 0.8145, + "step": 12725 + }, + { + "epoch": 0.7004237987781385, + "grad_norm": 0.6767199635505676, + "learning_rate": 7.292054897443065e-06, + "loss": 0.7136, + "step": 12726 + }, + { + "epoch": 0.7004788375804942, + "grad_norm": 0.6672216653823853, + "learning_rate": 7.291669649566117e-06, + "loss": 0.6131, + "step": 12727 + }, + { + "epoch": 0.7005338763828499, + "grad_norm": 0.6618815064430237, + "learning_rate": 7.291284384465595e-06, + "loss": 0.7633, + "step": 12728 + }, + { + "epoch": 0.7005889151852056, + "grad_norm": 0.6573876142501831, + "learning_rate": 7.290899102144392e-06, + "loss": 0.7621, + "step": 12729 + }, + { + "epoch": 0.7006439539875612, + "grad_norm": 0.7449564337730408, + "learning_rate": 7.290513802605405e-06, + "loss": 0.6488, + "step": 12730 + }, + { + "epoch": 0.7006989927899169, + "grad_norm": 0.7307295203208923, + "learning_rate": 7.290128485851529e-06, + "loss": 0.7095, + "step": 12731 + }, + { + "epoch": 0.7007540315922726, + "grad_norm": 0.698699951171875, + "learning_rate": 7.2897431518856596e-06, + "loss": 0.7428, + "step": 12732 + }, + { + "epoch": 0.7008090703946283, + "grad_norm": 0.6334750056266785, + "learning_rate": 7.289357800710695e-06, + "loss": 0.6977, + "step": 12733 + }, + { + "epoch": 0.7008641091969838, + "grad_norm": 0.6526468396186829, + "learning_rate": 7.288972432329529e-06, + "loss": 0.6375, + "step": 12734 + }, + { + "epoch": 0.7009191479993395, + "grad_norm": 0.7282149791717529, + "learning_rate": 7.288587046745059e-06, + "loss": 0.7494, + "step": 12735 + }, + { + "epoch": 0.7009741868016952, + "grad_norm": 0.8511056900024414, + "learning_rate": 7.288201643960182e-06, + "loss": 0.7494, + "step": 12736 + }, + { + "epoch": 0.7010292256040509, + "grad_norm": 0.6908526420593262, + "learning_rate": 7.287816223977793e-06, + "loss": 0.6861, + "step": 12737 + }, + { + "epoch": 0.7010842644064065, + "grad_norm": 0.7582982182502747, + "learning_rate": 7.2874307868007896e-06, + "loss": 0.7758, + "step": 12738 + }, + { + "epoch": 0.7011393032087622, + "grad_norm": 0.9717779159545898, + "learning_rate": 7.2870453324320685e-06, + "loss": 0.7221, + "step": 12739 + }, + { + "epoch": 0.7011943420111179, + "grad_norm": 0.6532751321792603, + "learning_rate": 7.286659860874529e-06, + "loss": 0.8009, + "step": 12740 + }, + { + "epoch": 0.7012493808134735, + "grad_norm": 0.6708540320396423, + "learning_rate": 7.286274372131065e-06, + "loss": 0.7177, + "step": 12741 + }, + { + "epoch": 0.7013044196158291, + "grad_norm": 0.7624804973602295, + "learning_rate": 7.285888866204575e-06, + "loss": 0.7878, + "step": 12742 + }, + { + "epoch": 0.7013594584181848, + "grad_norm": 0.7167851328849792, + "learning_rate": 7.285503343097955e-06, + "loss": 0.7276, + "step": 12743 + }, + { + "epoch": 0.7014144972205405, + "grad_norm": 0.6592209935188293, + "learning_rate": 7.2851178028141045e-06, + "loss": 0.7665, + "step": 12744 + }, + { + "epoch": 0.7014695360228962, + "grad_norm": 0.684847354888916, + "learning_rate": 7.284732245355921e-06, + "loss": 0.7358, + "step": 12745 + }, + { + "epoch": 0.7015245748252518, + "grad_norm": 0.6852415800094604, + "learning_rate": 7.2843466707262985e-06, + "loss": 0.7805, + "step": 12746 + }, + { + "epoch": 0.7015796136276075, + "grad_norm": 0.6422114968299866, + "learning_rate": 7.283961078928141e-06, + "loss": 0.7386, + "step": 12747 + }, + { + "epoch": 0.7016346524299631, + "grad_norm": 0.7538495659828186, + "learning_rate": 7.283575469964343e-06, + "loss": 0.798, + "step": 12748 + }, + { + "epoch": 0.7016896912323188, + "grad_norm": 0.6646687984466553, + "learning_rate": 7.2831898438378025e-06, + "loss": 0.7048, + "step": 12749 + }, + { + "epoch": 0.7017447300346744, + "grad_norm": 0.8338429927825928, + "learning_rate": 7.2828042005514176e-06, + "loss": 0.8585, + "step": 12750 + }, + { + "epoch": 0.7017997688370301, + "grad_norm": 0.7086663842201233, + "learning_rate": 7.282418540108088e-06, + "loss": 0.8011, + "step": 12751 + }, + { + "epoch": 0.7018548076393858, + "grad_norm": 0.6040074229240417, + "learning_rate": 7.282032862510712e-06, + "loss": 0.6327, + "step": 12752 + }, + { + "epoch": 0.7019098464417415, + "grad_norm": 0.7030978798866272, + "learning_rate": 7.281647167762187e-06, + "loss": 0.6373, + "step": 12753 + }, + { + "epoch": 0.7019648852440971, + "grad_norm": 0.662308394908905, + "learning_rate": 7.281261455865414e-06, + "loss": 0.7283, + "step": 12754 + }, + { + "epoch": 0.7020199240464527, + "grad_norm": 0.7369368672370911, + "learning_rate": 7.28087572682329e-06, + "loss": 0.7632, + "step": 12755 + }, + { + "epoch": 0.7020749628488084, + "grad_norm": 0.6887282729148865, + "learning_rate": 7.280489980638714e-06, + "loss": 0.7629, + "step": 12756 + }, + { + "epoch": 0.702130001651164, + "grad_norm": 0.656512975692749, + "learning_rate": 7.280104217314587e-06, + "loss": 0.8028, + "step": 12757 + }, + { + "epoch": 0.7021850404535197, + "grad_norm": 0.7006264328956604, + "learning_rate": 7.279718436853805e-06, + "loss": 0.7025, + "step": 12758 + }, + { + "epoch": 0.7022400792558754, + "grad_norm": 0.675585925579071, + "learning_rate": 7.279332639259271e-06, + "loss": 0.8001, + "step": 12759 + }, + { + "epoch": 0.7022951180582311, + "grad_norm": 0.7105827331542969, + "learning_rate": 7.278946824533883e-06, + "loss": 0.7767, + "step": 12760 + }, + { + "epoch": 0.7023501568605867, + "grad_norm": 0.8310064673423767, + "learning_rate": 7.27856099268054e-06, + "loss": 0.7828, + "step": 12761 + }, + { + "epoch": 0.7024051956629423, + "grad_norm": 0.6885055899620056, + "learning_rate": 7.278175143702142e-06, + "loss": 0.7018, + "step": 12762 + }, + { + "epoch": 0.702460234465298, + "grad_norm": 0.6542866826057434, + "learning_rate": 7.27778927760159e-06, + "loss": 0.7118, + "step": 12763 + }, + { + "epoch": 0.7025152732676537, + "grad_norm": 0.9102655053138733, + "learning_rate": 7.277403394381784e-06, + "loss": 0.8381, + "step": 12764 + }, + { + "epoch": 0.7025703120700093, + "grad_norm": 0.6538355946540833, + "learning_rate": 7.277017494045624e-06, + "loss": 0.7766, + "step": 12765 + }, + { + "epoch": 0.702625350872365, + "grad_norm": 0.6691237092018127, + "learning_rate": 7.27663157659601e-06, + "loss": 0.8077, + "step": 12766 + }, + { + "epoch": 0.7026803896747207, + "grad_norm": 0.7159995436668396, + "learning_rate": 7.2762456420358414e-06, + "loss": 0.8333, + "step": 12767 + }, + { + "epoch": 0.7027354284770764, + "grad_norm": 0.6518422365188599, + "learning_rate": 7.275859690368022e-06, + "loss": 0.7634, + "step": 12768 + }, + { + "epoch": 0.702790467279432, + "grad_norm": 0.6969057321548462, + "learning_rate": 7.275473721595449e-06, + "loss": 0.7481, + "step": 12769 + }, + { + "epoch": 0.7028455060817876, + "grad_norm": 0.6788915395736694, + "learning_rate": 7.2750877357210225e-06, + "loss": 0.7402, + "step": 12770 + }, + { + "epoch": 0.7029005448841433, + "grad_norm": 0.7323998212814331, + "learning_rate": 7.274701732747649e-06, + "loss": 0.7122, + "step": 12771 + }, + { + "epoch": 0.702955583686499, + "grad_norm": 0.7224077582359314, + "learning_rate": 7.274315712678224e-06, + "loss": 0.7333, + "step": 12772 + }, + { + "epoch": 0.7030106224888546, + "grad_norm": 0.9009444117546082, + "learning_rate": 7.273929675515652e-06, + "loss": 0.6912, + "step": 12773 + }, + { + "epoch": 0.7030656612912103, + "grad_norm": 0.7076312899589539, + "learning_rate": 7.273543621262832e-06, + "loss": 0.7651, + "step": 12774 + }, + { + "epoch": 0.703120700093566, + "grad_norm": 0.78575599193573, + "learning_rate": 7.273157549922668e-06, + "loss": 0.7443, + "step": 12775 + }, + { + "epoch": 0.7031757388959217, + "grad_norm": 0.6957094669342041, + "learning_rate": 7.27277146149806e-06, + "loss": 0.7684, + "step": 12776 + }, + { + "epoch": 0.7032307776982772, + "grad_norm": 1.177878975868225, + "learning_rate": 7.27238535599191e-06, + "loss": 0.9033, + "step": 12777 + }, + { + "epoch": 0.7032858165006329, + "grad_norm": 0.6929007768630981, + "learning_rate": 7.27199923340712e-06, + "loss": 0.7411, + "step": 12778 + }, + { + "epoch": 0.7033408553029886, + "grad_norm": 0.7725315093994141, + "learning_rate": 7.2716130937465926e-06, + "loss": 0.7833, + "step": 12779 + }, + { + "epoch": 0.7033958941053443, + "grad_norm": 0.6512928605079651, + "learning_rate": 7.271226937013228e-06, + "loss": 0.7918, + "step": 12780 + }, + { + "epoch": 0.7034509329076999, + "grad_norm": 0.7033893465995789, + "learning_rate": 7.270840763209931e-06, + "loss": 0.843, + "step": 12781 + }, + { + "epoch": 0.7035059717100556, + "grad_norm": 0.7596432566642761, + "learning_rate": 7.2704545723396e-06, + "loss": 0.7916, + "step": 12782 + }, + { + "epoch": 0.7035610105124113, + "grad_norm": 0.6256046891212463, + "learning_rate": 7.270068364405143e-06, + "loss": 0.6531, + "step": 12783 + }, + { + "epoch": 0.703616049314767, + "grad_norm": 0.8107615113258362, + "learning_rate": 7.26968213940946e-06, + "loss": 0.7755, + "step": 12784 + }, + { + "epoch": 0.7036710881171225, + "grad_norm": 0.6742845177650452, + "learning_rate": 7.269295897355451e-06, + "loss": 0.834, + "step": 12785 + }, + { + "epoch": 0.7037261269194782, + "grad_norm": 0.6665072441101074, + "learning_rate": 7.268909638246024e-06, + "loss": 0.6864, + "step": 12786 + }, + { + "epoch": 0.7037811657218339, + "grad_norm": 0.68357914686203, + "learning_rate": 7.268523362084078e-06, + "loss": 0.7789, + "step": 12787 + }, + { + "epoch": 0.7038362045241896, + "grad_norm": 0.6878114938735962, + "learning_rate": 7.268137068872519e-06, + "loss": 0.7277, + "step": 12788 + }, + { + "epoch": 0.7038912433265452, + "grad_norm": 0.7173313498497009, + "learning_rate": 7.267750758614247e-06, + "loss": 0.8156, + "step": 12789 + }, + { + "epoch": 0.7039462821289009, + "grad_norm": 0.6523084044456482, + "learning_rate": 7.267364431312169e-06, + "loss": 0.7143, + "step": 12790 + }, + { + "epoch": 0.7040013209312566, + "grad_norm": 0.7403815388679504, + "learning_rate": 7.2669780869691865e-06, + "loss": 0.8196, + "step": 12791 + }, + { + "epoch": 0.7040563597336122, + "grad_norm": 0.6411255598068237, + "learning_rate": 7.266591725588204e-06, + "loss": 0.6645, + "step": 12792 + }, + { + "epoch": 0.7041113985359678, + "grad_norm": 0.9094020128250122, + "learning_rate": 7.266205347172124e-06, + "loss": 0.8023, + "step": 12793 + }, + { + "epoch": 0.7041664373383235, + "grad_norm": 1.1041208505630493, + "learning_rate": 7.265818951723851e-06, + "loss": 0.7011, + "step": 12794 + }, + { + "epoch": 0.7042214761406792, + "grad_norm": 0.7339954376220703, + "learning_rate": 7.265432539246289e-06, + "loss": 0.7467, + "step": 12795 + }, + { + "epoch": 0.7042765149430349, + "grad_norm": 0.7055865526199341, + "learning_rate": 7.265046109742344e-06, + "loss": 0.7364, + "step": 12796 + }, + { + "epoch": 0.7043315537453905, + "grad_norm": 0.7052320241928101, + "learning_rate": 7.264659663214917e-06, + "loss": 0.7611, + "step": 12797 + }, + { + "epoch": 0.7043865925477462, + "grad_norm": 0.7374194860458374, + "learning_rate": 7.264273199666915e-06, + "loss": 0.7612, + "step": 12798 + }, + { + "epoch": 0.7044416313501018, + "grad_norm": 0.634986162185669, + "learning_rate": 7.263886719101242e-06, + "loss": 0.8001, + "step": 12799 + }, + { + "epoch": 0.7044966701524574, + "grad_norm": 0.8178644180297852, + "learning_rate": 7.2635002215208014e-06, + "loss": 0.8404, + "step": 12800 + }, + { + "epoch": 0.7045517089548131, + "grad_norm": 0.7743822336196899, + "learning_rate": 7.263113706928501e-06, + "loss": 0.7297, + "step": 12801 + }, + { + "epoch": 0.7046067477571688, + "grad_norm": 0.6558601260185242, + "learning_rate": 7.262727175327242e-06, + "loss": 0.6933, + "step": 12802 + }, + { + "epoch": 0.7046617865595245, + "grad_norm": 1.0608787536621094, + "learning_rate": 7.262340626719933e-06, + "loss": 0.8792, + "step": 12803 + }, + { + "epoch": 0.7047168253618801, + "grad_norm": 0.7488270401954651, + "learning_rate": 7.261954061109475e-06, + "loss": 0.7755, + "step": 12804 + }, + { + "epoch": 0.7047718641642358, + "grad_norm": 0.8960574865341187, + "learning_rate": 7.261567478498778e-06, + "loss": 0.7274, + "step": 12805 + }, + { + "epoch": 0.7048269029665915, + "grad_norm": 0.6289944648742676, + "learning_rate": 7.2611808788907436e-06, + "loss": 0.6469, + "step": 12806 + }, + { + "epoch": 0.7048819417689471, + "grad_norm": 0.6488339900970459, + "learning_rate": 7.26079426228828e-06, + "loss": 0.7581, + "step": 12807 + }, + { + "epoch": 0.7049369805713027, + "grad_norm": 0.7354650497436523, + "learning_rate": 7.260407628694292e-06, + "loss": 0.7596, + "step": 12808 + }, + { + "epoch": 0.7049920193736584, + "grad_norm": 0.8163169026374817, + "learning_rate": 7.2600209781116834e-06, + "loss": 0.8291, + "step": 12809 + }, + { + "epoch": 0.7050470581760141, + "grad_norm": 0.8223916292190552, + "learning_rate": 7.259634310543364e-06, + "loss": 0.7089, + "step": 12810 + }, + { + "epoch": 0.7051020969783698, + "grad_norm": 0.7815924286842346, + "learning_rate": 7.2592476259922374e-06, + "loss": 0.8098, + "step": 12811 + }, + { + "epoch": 0.7051571357807254, + "grad_norm": 0.7027734518051147, + "learning_rate": 7.2588609244612105e-06, + "loss": 0.7276, + "step": 12812 + }, + { + "epoch": 0.705212174583081, + "grad_norm": 0.7345930337905884, + "learning_rate": 7.2584742059531894e-06, + "loss": 0.803, + "step": 12813 + }, + { + "epoch": 0.7052672133854367, + "grad_norm": 0.6998127102851868, + "learning_rate": 7.258087470471081e-06, + "loss": 0.7938, + "step": 12814 + }, + { + "epoch": 0.7053222521877924, + "grad_norm": 0.6418118476867676, + "learning_rate": 7.257700718017793e-06, + "loss": 0.66, + "step": 12815 + }, + { + "epoch": 0.705377290990148, + "grad_norm": 0.6774695515632629, + "learning_rate": 7.257313948596228e-06, + "loss": 0.7143, + "step": 12816 + }, + { + "epoch": 0.7054323297925037, + "grad_norm": 0.7107009291648865, + "learning_rate": 7.256927162209298e-06, + "loss": 0.8378, + "step": 12817 + }, + { + "epoch": 0.7054873685948594, + "grad_norm": 0.7287374138832092, + "learning_rate": 7.256540358859906e-06, + "loss": 0.88, + "step": 12818 + }, + { + "epoch": 0.7055424073972151, + "grad_norm": 0.651221752166748, + "learning_rate": 7.256153538550961e-06, + "loss": 0.7092, + "step": 12819 + }, + { + "epoch": 0.7055974461995707, + "grad_norm": 0.6549085974693298, + "learning_rate": 7.255766701285371e-06, + "loss": 0.6697, + "step": 12820 + }, + { + "epoch": 0.7056524850019263, + "grad_norm": 0.6617292165756226, + "learning_rate": 7.255379847066041e-06, + "loss": 0.7779, + "step": 12821 + }, + { + "epoch": 0.705707523804282, + "grad_norm": 0.6677221655845642, + "learning_rate": 7.254992975895879e-06, + "loss": 0.7821, + "step": 12822 + }, + { + "epoch": 0.7057625626066377, + "grad_norm": 0.8183515667915344, + "learning_rate": 7.2546060877777945e-06, + "loss": 0.7727, + "step": 12823 + }, + { + "epoch": 0.7058176014089933, + "grad_norm": 0.6574132442474365, + "learning_rate": 7.2542191827146945e-06, + "loss": 0.7118, + "step": 12824 + }, + { + "epoch": 0.705872640211349, + "grad_norm": 0.6874130964279175, + "learning_rate": 7.253832260709487e-06, + "loss": 0.7677, + "step": 12825 + }, + { + "epoch": 0.7059276790137047, + "grad_norm": 0.6460297107696533, + "learning_rate": 7.253445321765079e-06, + "loss": 0.725, + "step": 12826 + }, + { + "epoch": 0.7059827178160604, + "grad_norm": 0.6618219614028931, + "learning_rate": 7.253058365884379e-06, + "loss": 0.7504, + "step": 12827 + }, + { + "epoch": 0.706037756618416, + "grad_norm": 0.6519019603729248, + "learning_rate": 7.252671393070295e-06, + "loss": 0.7382, + "step": 12828 + }, + { + "epoch": 0.7060927954207716, + "grad_norm": 0.7114588022232056, + "learning_rate": 7.252284403325737e-06, + "loss": 0.8364, + "step": 12829 + }, + { + "epoch": 0.7061478342231273, + "grad_norm": 0.6304726600646973, + "learning_rate": 7.251897396653611e-06, + "loss": 0.6972, + "step": 12830 + }, + { + "epoch": 0.706202873025483, + "grad_norm": 0.6728807687759399, + "learning_rate": 7.251510373056827e-06, + "loss": 0.671, + "step": 12831 + }, + { + "epoch": 0.7062579118278386, + "grad_norm": 0.690641462802887, + "learning_rate": 7.251123332538295e-06, + "loss": 0.7381, + "step": 12832 + }, + { + "epoch": 0.7063129506301943, + "grad_norm": 0.7018027305603027, + "learning_rate": 7.2507362751009226e-06, + "loss": 0.7546, + "step": 12833 + }, + { + "epoch": 0.70636798943255, + "grad_norm": 0.7203684449195862, + "learning_rate": 7.250349200747617e-06, + "loss": 0.7534, + "step": 12834 + }, + { + "epoch": 0.7064230282349057, + "grad_norm": 0.6936585903167725, + "learning_rate": 7.24996210948129e-06, + "loss": 0.7716, + "step": 12835 + }, + { + "epoch": 0.7064780670372612, + "grad_norm": 0.7421281337738037, + "learning_rate": 7.249575001304851e-06, + "loss": 0.7517, + "step": 12836 + }, + { + "epoch": 0.7065331058396169, + "grad_norm": 0.6622288227081299, + "learning_rate": 7.249187876221207e-06, + "loss": 0.6799, + "step": 12837 + }, + { + "epoch": 0.7065881446419726, + "grad_norm": 0.7267055511474609, + "learning_rate": 7.24880073423327e-06, + "loss": 0.7871, + "step": 12838 + }, + { + "epoch": 0.7066431834443283, + "grad_norm": 0.6978085041046143, + "learning_rate": 7.2484135753439485e-06, + "loss": 0.7812, + "step": 12839 + }, + { + "epoch": 0.7066982222466839, + "grad_norm": 0.8353652358055115, + "learning_rate": 7.248026399556153e-06, + "loss": 0.7481, + "step": 12840 + }, + { + "epoch": 0.7067532610490396, + "grad_norm": 0.8402471542358398, + "learning_rate": 7.247639206872792e-06, + "loss": 0.783, + "step": 12841 + }, + { + "epoch": 0.7068082998513953, + "grad_norm": 0.8279419541358948, + "learning_rate": 7.247251997296777e-06, + "loss": 0.8177, + "step": 12842 + }, + { + "epoch": 0.7068633386537508, + "grad_norm": 0.6850735545158386, + "learning_rate": 7.246864770831017e-06, + "loss": 0.7586, + "step": 12843 + }, + { + "epoch": 0.7069183774561065, + "grad_norm": 0.7327665090560913, + "learning_rate": 7.246477527478422e-06, + "loss": 0.9327, + "step": 12844 + }, + { + "epoch": 0.7069734162584622, + "grad_norm": 0.6343075037002563, + "learning_rate": 7.246090267241905e-06, + "loss": 0.6957, + "step": 12845 + }, + { + "epoch": 0.7070284550608179, + "grad_norm": 0.7028965353965759, + "learning_rate": 7.245702990124373e-06, + "loss": 0.7524, + "step": 12846 + }, + { + "epoch": 0.7070834938631735, + "grad_norm": 0.7578299045562744, + "learning_rate": 7.24531569612874e-06, + "loss": 0.7302, + "step": 12847 + }, + { + "epoch": 0.7071385326655292, + "grad_norm": 0.8113438487052917, + "learning_rate": 7.2449283852579146e-06, + "loss": 0.7658, + "step": 12848 + }, + { + "epoch": 0.7071935714678849, + "grad_norm": 0.6442512273788452, + "learning_rate": 7.244541057514809e-06, + "loss": 0.6742, + "step": 12849 + }, + { + "epoch": 0.7072486102702406, + "grad_norm": 0.8595272898674011, + "learning_rate": 7.244153712902333e-06, + "loss": 0.7944, + "step": 12850 + }, + { + "epoch": 0.7073036490725961, + "grad_norm": 0.6565983891487122, + "learning_rate": 7.243766351423398e-06, + "loss": 0.7411, + "step": 12851 + }, + { + "epoch": 0.7073586878749518, + "grad_norm": 0.7935337424278259, + "learning_rate": 7.243378973080917e-06, + "loss": 0.8109, + "step": 12852 + }, + { + "epoch": 0.7074137266773075, + "grad_norm": 0.7083927392959595, + "learning_rate": 7.242991577877799e-06, + "loss": 0.8405, + "step": 12853 + }, + { + "epoch": 0.7074687654796632, + "grad_norm": 0.7452830672264099, + "learning_rate": 7.242604165816958e-06, + "loss": 0.7972, + "step": 12854 + }, + { + "epoch": 0.7075238042820188, + "grad_norm": 0.6775808334350586, + "learning_rate": 7.242216736901302e-06, + "loss": 0.7114, + "step": 12855 + }, + { + "epoch": 0.7075788430843745, + "grad_norm": 0.8069992661476135, + "learning_rate": 7.241829291133748e-06, + "loss": 0.6606, + "step": 12856 + }, + { + "epoch": 0.7076338818867302, + "grad_norm": 0.6690802574157715, + "learning_rate": 7.241441828517203e-06, + "loss": 0.742, + "step": 12857 + }, + { + "epoch": 0.7076889206890858, + "grad_norm": 0.8077805638313293, + "learning_rate": 7.2410543490545814e-06, + "loss": 0.7786, + "step": 12858 + }, + { + "epoch": 0.7077439594914414, + "grad_norm": 0.6906875967979431, + "learning_rate": 7.240666852748795e-06, + "loss": 0.7445, + "step": 12859 + }, + { + "epoch": 0.7077989982937971, + "grad_norm": 0.6830704808235168, + "learning_rate": 7.2402793396027585e-06, + "loss": 0.7664, + "step": 12860 + }, + { + "epoch": 0.7078540370961528, + "grad_norm": 0.8118640780448914, + "learning_rate": 7.23989180961938e-06, + "loss": 0.7654, + "step": 12861 + }, + { + "epoch": 0.7079090758985085, + "grad_norm": 0.6819882392883301, + "learning_rate": 7.2395042628015755e-06, + "loss": 0.649, + "step": 12862 + }, + { + "epoch": 0.7079641147008641, + "grad_norm": 0.6543441414833069, + "learning_rate": 7.239116699152256e-06, + "loss": 0.8054, + "step": 12863 + }, + { + "epoch": 0.7080191535032198, + "grad_norm": 0.8613989353179932, + "learning_rate": 7.238729118674335e-06, + "loss": 0.7283, + "step": 12864 + }, + { + "epoch": 0.7080741923055754, + "grad_norm": 0.6993124485015869, + "learning_rate": 7.238341521370725e-06, + "loss": 0.8145, + "step": 12865 + }, + { + "epoch": 0.7081292311079311, + "grad_norm": 0.7047560811042786, + "learning_rate": 7.237953907244339e-06, + "loss": 0.6729, + "step": 12866 + }, + { + "epoch": 0.7081842699102867, + "grad_norm": 0.7923689484596252, + "learning_rate": 7.237566276298091e-06, + "loss": 0.7615, + "step": 12867 + }, + { + "epoch": 0.7082393087126424, + "grad_norm": 0.6873850226402283, + "learning_rate": 7.237178628534894e-06, + "loss": 0.7638, + "step": 12868 + }, + { + "epoch": 0.7082943475149981, + "grad_norm": 0.6483134031295776, + "learning_rate": 7.236790963957661e-06, + "loss": 0.6366, + "step": 12869 + }, + { + "epoch": 0.7083493863173538, + "grad_norm": 0.6623784899711609, + "learning_rate": 7.236403282569305e-06, + "loss": 0.7032, + "step": 12870 + }, + { + "epoch": 0.7084044251197094, + "grad_norm": 0.7004366517066956, + "learning_rate": 7.236015584372741e-06, + "loss": 0.6436, + "step": 12871 + }, + { + "epoch": 0.708459463922065, + "grad_norm": 0.5676529407501221, + "learning_rate": 7.235627869370883e-06, + "loss": 0.6395, + "step": 12872 + }, + { + "epoch": 0.7085145027244207, + "grad_norm": 0.6909729838371277, + "learning_rate": 7.235240137566644e-06, + "loss": 0.7063, + "step": 12873 + }, + { + "epoch": 0.7085695415267764, + "grad_norm": 0.7635348439216614, + "learning_rate": 7.234852388962939e-06, + "loss": 0.7518, + "step": 12874 + }, + { + "epoch": 0.708624580329132, + "grad_norm": 0.7217742204666138, + "learning_rate": 7.2344646235626815e-06, + "loss": 0.7782, + "step": 12875 + }, + { + "epoch": 0.7086796191314877, + "grad_norm": 0.6506509184837341, + "learning_rate": 7.2340768413687855e-06, + "loss": 0.7456, + "step": 12876 + }, + { + "epoch": 0.7087346579338434, + "grad_norm": 0.6537386775016785, + "learning_rate": 7.2336890423841664e-06, + "loss": 0.7395, + "step": 12877 + }, + { + "epoch": 0.7087896967361991, + "grad_norm": 0.7759900689125061, + "learning_rate": 7.233301226611737e-06, + "loss": 0.8098, + "step": 12878 + }, + { + "epoch": 0.7088447355385546, + "grad_norm": 0.8476354479789734, + "learning_rate": 7.232913394054415e-06, + "loss": 0.8241, + "step": 12879 + }, + { + "epoch": 0.7088997743409103, + "grad_norm": 0.6770507097244263, + "learning_rate": 7.232525544715114e-06, + "loss": 0.6966, + "step": 12880 + }, + { + "epoch": 0.708954813143266, + "grad_norm": 0.7750027775764465, + "learning_rate": 7.232137678596747e-06, + "loss": 0.8038, + "step": 12881 + }, + { + "epoch": 0.7090098519456217, + "grad_norm": 0.6507213711738586, + "learning_rate": 7.231749795702232e-06, + "loss": 0.6446, + "step": 12882 + }, + { + "epoch": 0.7090648907479773, + "grad_norm": 0.7554625272750854, + "learning_rate": 7.231361896034481e-06, + "loss": 0.7769, + "step": 12883 + }, + { + "epoch": 0.709119929550333, + "grad_norm": 0.8175020813941956, + "learning_rate": 7.230973979596414e-06, + "loss": 0.8283, + "step": 12884 + }, + { + "epoch": 0.7091749683526887, + "grad_norm": 0.7528663873672485, + "learning_rate": 7.2305860463909416e-06, + "loss": 0.7737, + "step": 12885 + }, + { + "epoch": 0.7092300071550443, + "grad_norm": 0.9242768883705139, + "learning_rate": 7.230198096420983e-06, + "loss": 0.647, + "step": 12886 + }, + { + "epoch": 0.7092850459573999, + "grad_norm": 0.899874746799469, + "learning_rate": 7.229810129689452e-06, + "loss": 0.8952, + "step": 12887 + }, + { + "epoch": 0.7093400847597556, + "grad_norm": 0.8221275806427002, + "learning_rate": 7.229422146199266e-06, + "loss": 0.6845, + "step": 12888 + }, + { + "epoch": 0.7093951235621113, + "grad_norm": 0.6964027285575867, + "learning_rate": 7.229034145953338e-06, + "loss": 0.7153, + "step": 12889 + }, + { + "epoch": 0.7094501623644669, + "grad_norm": 0.8018684387207031, + "learning_rate": 7.228646128954588e-06, + "loss": 0.6421, + "step": 12890 + }, + { + "epoch": 0.7095052011668226, + "grad_norm": 0.6874614953994751, + "learning_rate": 7.228258095205928e-06, + "loss": 0.8024, + "step": 12891 + }, + { + "epoch": 0.7095602399691783, + "grad_norm": 0.7141417860984802, + "learning_rate": 7.227870044710277e-06, + "loss": 0.7746, + "step": 12892 + }, + { + "epoch": 0.709615278771534, + "grad_norm": 0.7109399437904358, + "learning_rate": 7.227481977470552e-06, + "loss": 0.7826, + "step": 12893 + }, + { + "epoch": 0.7096703175738895, + "grad_norm": 0.7021867036819458, + "learning_rate": 7.227093893489669e-06, + "loss": 0.7196, + "step": 12894 + }, + { + "epoch": 0.7097253563762452, + "grad_norm": 0.6896560788154602, + "learning_rate": 7.226705792770543e-06, + "loss": 0.6925, + "step": 12895 + }, + { + "epoch": 0.7097803951786009, + "grad_norm": 0.7138262987136841, + "learning_rate": 7.226317675316094e-06, + "loss": 0.7417, + "step": 12896 + }, + { + "epoch": 0.7098354339809566, + "grad_norm": 0.6789212226867676, + "learning_rate": 7.225929541129236e-06, + "loss": 0.7095, + "step": 12897 + }, + { + "epoch": 0.7098904727833122, + "grad_norm": 0.8102045059204102, + "learning_rate": 7.225541390212889e-06, + "loss": 0.9252, + "step": 12898 + }, + { + "epoch": 0.7099455115856679, + "grad_norm": 0.6220358610153198, + "learning_rate": 7.2251532225699674e-06, + "loss": 0.7205, + "step": 12899 + }, + { + "epoch": 0.7100005503880236, + "grad_norm": 0.6375265121459961, + "learning_rate": 7.224765038203391e-06, + "loss": 0.7974, + "step": 12900 + }, + { + "epoch": 0.7100555891903793, + "grad_norm": 0.7457360029220581, + "learning_rate": 7.224376837116075e-06, + "loss": 0.7083, + "step": 12901 + }, + { + "epoch": 0.7101106279927348, + "grad_norm": 0.7012878060340881, + "learning_rate": 7.2239886193109374e-06, + "loss": 0.7334, + "step": 12902 + }, + { + "epoch": 0.7101656667950905, + "grad_norm": 0.7437683343887329, + "learning_rate": 7.223600384790898e-06, + "loss": 0.82, + "step": 12903 + }, + { + "epoch": 0.7102207055974462, + "grad_norm": 0.6727370619773865, + "learning_rate": 7.223212133558872e-06, + "loss": 0.7339, + "step": 12904 + }, + { + "epoch": 0.7102757443998019, + "grad_norm": 0.9253849983215332, + "learning_rate": 7.222823865617781e-06, + "loss": 0.7398, + "step": 12905 + }, + { + "epoch": 0.7103307832021575, + "grad_norm": 0.6664100885391235, + "learning_rate": 7.222435580970539e-06, + "loss": 0.7519, + "step": 12906 + }, + { + "epoch": 0.7103858220045132, + "grad_norm": 0.7452943325042725, + "learning_rate": 7.222047279620066e-06, + "loss": 0.7382, + "step": 12907 + }, + { + "epoch": 0.7104408608068689, + "grad_norm": 0.7235015630722046, + "learning_rate": 7.22165896156928e-06, + "loss": 0.7726, + "step": 12908 + }, + { + "epoch": 0.7104958996092245, + "grad_norm": 0.6324653029441833, + "learning_rate": 7.221270626821102e-06, + "loss": 0.7451, + "step": 12909 + }, + { + "epoch": 0.7105509384115801, + "grad_norm": 0.789829432964325, + "learning_rate": 7.220882275378447e-06, + "loss": 0.7375, + "step": 12910 + }, + { + "epoch": 0.7106059772139358, + "grad_norm": 0.9090244174003601, + "learning_rate": 7.220493907244236e-06, + "loss": 0.8935, + "step": 12911 + }, + { + "epoch": 0.7106610160162915, + "grad_norm": 0.6570677757263184, + "learning_rate": 7.220105522421388e-06, + "loss": 0.7259, + "step": 12912 + }, + { + "epoch": 0.7107160548186472, + "grad_norm": 0.7142132520675659, + "learning_rate": 7.219717120912819e-06, + "loss": 0.7862, + "step": 12913 + }, + { + "epoch": 0.7107710936210028, + "grad_norm": 0.7359404563903809, + "learning_rate": 7.219328702721452e-06, + "loss": 0.7074, + "step": 12914 + }, + { + "epoch": 0.7108261324233585, + "grad_norm": 0.7118046283721924, + "learning_rate": 7.218940267850203e-06, + "loss": 0.8151, + "step": 12915 + }, + { + "epoch": 0.7108811712257141, + "grad_norm": 0.8301580548286438, + "learning_rate": 7.218551816301994e-06, + "loss": 0.7031, + "step": 12916 + }, + { + "epoch": 0.7109362100280698, + "grad_norm": 0.6647501587867737, + "learning_rate": 7.218163348079743e-06, + "loss": 0.8309, + "step": 12917 + }, + { + "epoch": 0.7109912488304254, + "grad_norm": 0.6546997427940369, + "learning_rate": 7.217774863186371e-06, + "loss": 0.717, + "step": 12918 + }, + { + "epoch": 0.7110462876327811, + "grad_norm": 0.6639735102653503, + "learning_rate": 7.217386361624795e-06, + "loss": 0.7308, + "step": 12919 + }, + { + "epoch": 0.7111013264351368, + "grad_norm": 0.724433183670044, + "learning_rate": 7.216997843397938e-06, + "loss": 0.7576, + "step": 12920 + }, + { + "epoch": 0.7111563652374925, + "grad_norm": 0.750253438949585, + "learning_rate": 7.216609308508719e-06, + "loss": 0.7014, + "step": 12921 + }, + { + "epoch": 0.7112114040398481, + "grad_norm": 0.7010897397994995, + "learning_rate": 7.216220756960058e-06, + "loss": 0.6951, + "step": 12922 + }, + { + "epoch": 0.7112664428422037, + "grad_norm": 0.7739251852035522, + "learning_rate": 7.215832188754873e-06, + "loss": 0.7392, + "step": 12923 + }, + { + "epoch": 0.7113214816445594, + "grad_norm": 0.6893059015274048, + "learning_rate": 7.215443603896088e-06, + "loss": 0.7029, + "step": 12924 + }, + { + "epoch": 0.7113765204469151, + "grad_norm": 0.8061872124671936, + "learning_rate": 7.215055002386622e-06, + "loss": 0.7557, + "step": 12925 + }, + { + "epoch": 0.7114315592492707, + "grad_norm": 1.089525580406189, + "learning_rate": 7.214666384229395e-06, + "loss": 0.6701, + "step": 12926 + }, + { + "epoch": 0.7114865980516264, + "grad_norm": 0.7601733207702637, + "learning_rate": 7.2142777494273275e-06, + "loss": 0.8113, + "step": 12927 + }, + { + "epoch": 0.7115416368539821, + "grad_norm": 0.7863540649414062, + "learning_rate": 7.213889097983342e-06, + "loss": 0.7945, + "step": 12928 + }, + { + "epoch": 0.7115966756563377, + "grad_norm": 0.7722556591033936, + "learning_rate": 7.21350042990036e-06, + "loss": 0.9492, + "step": 12929 + }, + { + "epoch": 0.7116517144586934, + "grad_norm": 0.6834682822227478, + "learning_rate": 7.213111745181299e-06, + "loss": 0.7138, + "step": 12930 + }, + { + "epoch": 0.711706753261049, + "grad_norm": 0.6974432468414307, + "learning_rate": 7.212723043829083e-06, + "loss": 0.7654, + "step": 12931 + }, + { + "epoch": 0.7117617920634047, + "grad_norm": 0.9797543883323669, + "learning_rate": 7.2123343258466334e-06, + "loss": 0.7786, + "step": 12932 + }, + { + "epoch": 0.7118168308657603, + "grad_norm": 0.6337804794311523, + "learning_rate": 7.211945591236872e-06, + "loss": 0.7147, + "step": 12933 + }, + { + "epoch": 0.711871869668116, + "grad_norm": 0.7450474500656128, + "learning_rate": 7.211556840002718e-06, + "loss": 0.8516, + "step": 12934 + }, + { + "epoch": 0.7119269084704717, + "grad_norm": 0.7786532640457153, + "learning_rate": 7.2111680721470965e-06, + "loss": 0.837, + "step": 12935 + }, + { + "epoch": 0.7119819472728274, + "grad_norm": 0.666020393371582, + "learning_rate": 7.210779287672927e-06, + "loss": 0.7646, + "step": 12936 + }, + { + "epoch": 0.712036986075183, + "grad_norm": 0.622648298740387, + "learning_rate": 7.210390486583132e-06, + "loss": 0.7102, + "step": 12937 + }, + { + "epoch": 0.7120920248775386, + "grad_norm": 0.7175952792167664, + "learning_rate": 7.210001668880634e-06, + "loss": 0.7043, + "step": 12938 + }, + { + "epoch": 0.7121470636798943, + "grad_norm": 0.8019681572914124, + "learning_rate": 7.209612834568353e-06, + "loss": 0.8166, + "step": 12939 + }, + { + "epoch": 0.71220210248225, + "grad_norm": 0.804457426071167, + "learning_rate": 7.209223983649216e-06, + "loss": 0.7182, + "step": 12940 + }, + { + "epoch": 0.7122571412846056, + "grad_norm": 0.7261730432510376, + "learning_rate": 7.208835116126143e-06, + "loss": 0.6634, + "step": 12941 + }, + { + "epoch": 0.7123121800869613, + "grad_norm": 0.7461307644844055, + "learning_rate": 7.208446232002055e-06, + "loss": 0.709, + "step": 12942 + }, + { + "epoch": 0.712367218889317, + "grad_norm": 0.6730383634567261, + "learning_rate": 7.208057331279877e-06, + "loss": 0.7111, + "step": 12943 + }, + { + "epoch": 0.7124222576916727, + "grad_norm": 0.829530656337738, + "learning_rate": 7.207668413962531e-06, + "loss": 0.729, + "step": 12944 + }, + { + "epoch": 0.7124772964940282, + "grad_norm": 0.5997991561889648, + "learning_rate": 7.20727948005294e-06, + "loss": 0.6385, + "step": 12945 + }, + { + "epoch": 0.7125323352963839, + "grad_norm": 0.9590086936950684, + "learning_rate": 7.206890529554027e-06, + "loss": 0.7217, + "step": 12946 + }, + { + "epoch": 0.7125873740987396, + "grad_norm": 0.7818330526351929, + "learning_rate": 7.206501562468717e-06, + "loss": 0.7276, + "step": 12947 + }, + { + "epoch": 0.7126424129010953, + "grad_norm": 0.6033679842948914, + "learning_rate": 7.206112578799931e-06, + "loss": 0.5935, + "step": 12948 + }, + { + "epoch": 0.7126974517034509, + "grad_norm": 0.7431650757789612, + "learning_rate": 7.205723578550593e-06, + "loss": 0.8649, + "step": 12949 + }, + { + "epoch": 0.7127524905058066, + "grad_norm": 0.7026848793029785, + "learning_rate": 7.205334561723627e-06, + "loss": 0.7484, + "step": 12950 + }, + { + "epoch": 0.7128075293081623, + "grad_norm": 0.6328058242797852, + "learning_rate": 7.204945528321956e-06, + "loss": 0.6994, + "step": 12951 + }, + { + "epoch": 0.712862568110518, + "grad_norm": 0.6806536912918091, + "learning_rate": 7.204556478348507e-06, + "loss": 0.7461, + "step": 12952 + }, + { + "epoch": 0.7129176069128735, + "grad_norm": 0.6822162866592407, + "learning_rate": 7.2041674118062e-06, + "loss": 0.7947, + "step": 12953 + }, + { + "epoch": 0.7129726457152292, + "grad_norm": 0.7283263802528381, + "learning_rate": 7.203778328697962e-06, + "loss": 0.7559, + "step": 12954 + }, + { + "epoch": 0.7130276845175849, + "grad_norm": 0.663564920425415, + "learning_rate": 7.203389229026714e-06, + "loss": 0.6898, + "step": 12955 + }, + { + "epoch": 0.7130827233199406, + "grad_norm": 0.7218708395957947, + "learning_rate": 7.203000112795383e-06, + "loss": 0.8095, + "step": 12956 + }, + { + "epoch": 0.7131377621222962, + "grad_norm": 0.6931518912315369, + "learning_rate": 7.202610980006893e-06, + "loss": 0.7591, + "step": 12957 + }, + { + "epoch": 0.7131928009246519, + "grad_norm": 0.6982918381690979, + "learning_rate": 7.2022218306641704e-06, + "loss": 0.7651, + "step": 12958 + }, + { + "epoch": 0.7132478397270076, + "grad_norm": 0.8033974170684814, + "learning_rate": 7.201832664770135e-06, + "loss": 0.8857, + "step": 12959 + }, + { + "epoch": 0.7133028785293632, + "grad_norm": 0.6625493764877319, + "learning_rate": 7.201443482327717e-06, + "loss": 0.752, + "step": 12960 + }, + { + "epoch": 0.7133579173317188, + "grad_norm": 0.8149683475494385, + "learning_rate": 7.201054283339838e-06, + "loss": 0.8528, + "step": 12961 + }, + { + "epoch": 0.7134129561340745, + "grad_norm": 0.7894958257675171, + "learning_rate": 7.200665067809425e-06, + "loss": 0.8554, + "step": 12962 + }, + { + "epoch": 0.7134679949364302, + "grad_norm": 0.7613523602485657, + "learning_rate": 7.200275835739401e-06, + "loss": 0.7435, + "step": 12963 + }, + { + "epoch": 0.7135230337387859, + "grad_norm": 0.665985643863678, + "learning_rate": 7.199886587132693e-06, + "loss": 0.7072, + "step": 12964 + }, + { + "epoch": 0.7135780725411415, + "grad_norm": 0.7523592710494995, + "learning_rate": 7.199497321992227e-06, + "loss": 0.7945, + "step": 12965 + }, + { + "epoch": 0.7136331113434972, + "grad_norm": 0.8894450664520264, + "learning_rate": 7.199108040320928e-06, + "loss": 0.7885, + "step": 12966 + }, + { + "epoch": 0.7136881501458529, + "grad_norm": 0.639108419418335, + "learning_rate": 7.198718742121722e-06, + "loss": 0.6975, + "step": 12967 + }, + { + "epoch": 0.7137431889482085, + "grad_norm": 0.670013964176178, + "learning_rate": 7.198329427397532e-06, + "loss": 0.7441, + "step": 12968 + }, + { + "epoch": 0.7137982277505641, + "grad_norm": 0.7695425748825073, + "learning_rate": 7.197940096151289e-06, + "loss": 0.7616, + "step": 12969 + }, + { + "epoch": 0.7138532665529198, + "grad_norm": 0.9098057150840759, + "learning_rate": 7.197550748385917e-06, + "loss": 0.9028, + "step": 12970 + }, + { + "epoch": 0.7139083053552755, + "grad_norm": 0.7677769660949707, + "learning_rate": 7.197161384104341e-06, + "loss": 0.7926, + "step": 12971 + }, + { + "epoch": 0.7139633441576311, + "grad_norm": 0.7020674347877502, + "learning_rate": 7.196772003309487e-06, + "loss": 0.7248, + "step": 12972 + }, + { + "epoch": 0.7140183829599868, + "grad_norm": 0.6616366505622864, + "learning_rate": 7.196382606004283e-06, + "loss": 0.7137, + "step": 12973 + }, + { + "epoch": 0.7140734217623425, + "grad_norm": 0.7174738645553589, + "learning_rate": 7.195993192191656e-06, + "loss": 0.8167, + "step": 12974 + }, + { + "epoch": 0.7141284605646981, + "grad_norm": 0.6672176122665405, + "learning_rate": 7.1956037618745325e-06, + "loss": 0.6516, + "step": 12975 + }, + { + "epoch": 0.7141834993670537, + "grad_norm": 0.714790403842926, + "learning_rate": 7.195214315055837e-06, + "loss": 0.865, + "step": 12976 + }, + { + "epoch": 0.7142385381694094, + "grad_norm": 0.6637690663337708, + "learning_rate": 7.1948248517385e-06, + "loss": 0.7328, + "step": 12977 + }, + { + "epoch": 0.7142935769717651, + "grad_norm": 0.8998367786407471, + "learning_rate": 7.194435371925446e-06, + "loss": 0.7097, + "step": 12978 + }, + { + "epoch": 0.7143486157741208, + "grad_norm": 0.7472445964813232, + "learning_rate": 7.194045875619604e-06, + "loss": 0.7556, + "step": 12979 + }, + { + "epoch": 0.7144036545764764, + "grad_norm": 0.7897135019302368, + "learning_rate": 7.1936563628239e-06, + "loss": 0.8728, + "step": 12980 + }, + { + "epoch": 0.714458693378832, + "grad_norm": 0.6520817279815674, + "learning_rate": 7.193266833541261e-06, + "loss": 0.6824, + "step": 12981 + }, + { + "epoch": 0.7145137321811877, + "grad_norm": 0.833849310874939, + "learning_rate": 7.192877287774618e-06, + "loss": 0.8877, + "step": 12982 + }, + { + "epoch": 0.7145687709835434, + "grad_norm": 0.7105151414871216, + "learning_rate": 7.192487725526896e-06, + "loss": 0.7799, + "step": 12983 + }, + { + "epoch": 0.714623809785899, + "grad_norm": 0.7515869140625, + "learning_rate": 7.192098146801021e-06, + "loss": 0.7012, + "step": 12984 + }, + { + "epoch": 0.7146788485882547, + "grad_norm": 0.7447199821472168, + "learning_rate": 7.191708551599923e-06, + "loss": 0.7545, + "step": 12985 + }, + { + "epoch": 0.7147338873906104, + "grad_norm": 0.8502823114395142, + "learning_rate": 7.191318939926532e-06, + "loss": 0.7232, + "step": 12986 + }, + { + "epoch": 0.7147889261929661, + "grad_norm": 0.7193031907081604, + "learning_rate": 7.190929311783774e-06, + "loss": 0.762, + "step": 12987 + }, + { + "epoch": 0.7148439649953217, + "grad_norm": 0.8479939699172974, + "learning_rate": 7.190539667174576e-06, + "loss": 0.7238, + "step": 12988 + }, + { + "epoch": 0.7148990037976773, + "grad_norm": 0.8313719630241394, + "learning_rate": 7.1901500061018704e-06, + "loss": 0.8145, + "step": 12989 + }, + { + "epoch": 0.714954042600033, + "grad_norm": 0.7019978165626526, + "learning_rate": 7.189760328568584e-06, + "loss": 0.6461, + "step": 12990 + }, + { + "epoch": 0.7150090814023887, + "grad_norm": 0.897280216217041, + "learning_rate": 7.1893706345776436e-06, + "loss": 0.818, + "step": 12991 + }, + { + "epoch": 0.7150641202047443, + "grad_norm": 0.7495617866516113, + "learning_rate": 7.1889809241319795e-06, + "loss": 0.7533, + "step": 12992 + }, + { + "epoch": 0.7151191590071, + "grad_norm": 0.733496904373169, + "learning_rate": 7.188591197234522e-06, + "loss": 0.7405, + "step": 12993 + }, + { + "epoch": 0.7151741978094557, + "grad_norm": 0.8873284459114075, + "learning_rate": 7.1882014538882e-06, + "loss": 0.7525, + "step": 12994 + }, + { + "epoch": 0.7152292366118114, + "grad_norm": 0.6693230271339417, + "learning_rate": 7.187811694095939e-06, + "loss": 0.7509, + "step": 12995 + }, + { + "epoch": 0.715284275414167, + "grad_norm": 0.8513357043266296, + "learning_rate": 7.187421917860671e-06, + "loss": 0.8111, + "step": 12996 + }, + { + "epoch": 0.7153393142165226, + "grad_norm": 0.6986566185951233, + "learning_rate": 7.187032125185326e-06, + "loss": 0.8013, + "step": 12997 + }, + { + "epoch": 0.7153943530188783, + "grad_norm": 0.7062557339668274, + "learning_rate": 7.1866423160728335e-06, + "loss": 0.7266, + "step": 12998 + }, + { + "epoch": 0.715449391821234, + "grad_norm": 0.6329573392868042, + "learning_rate": 7.186252490526122e-06, + "loss": 0.6753, + "step": 12999 + }, + { + "epoch": 0.7155044306235896, + "grad_norm": 0.6740719079971313, + "learning_rate": 7.185862648548122e-06, + "loss": 0.7197, + "step": 13000 + }, + { + "epoch": 0.7155594694259453, + "grad_norm": 0.7911732196807861, + "learning_rate": 7.185472790141764e-06, + "loss": 0.6939, + "step": 13001 + }, + { + "epoch": 0.715614508228301, + "grad_norm": 0.7368680238723755, + "learning_rate": 7.185082915309978e-06, + "loss": 0.6919, + "step": 13002 + }, + { + "epoch": 0.7156695470306567, + "grad_norm": 0.6374472975730896, + "learning_rate": 7.1846930240556925e-06, + "loss": 0.6645, + "step": 13003 + }, + { + "epoch": 0.7157245858330122, + "grad_norm": 0.6727073192596436, + "learning_rate": 7.184303116381839e-06, + "loss": 0.5995, + "step": 13004 + }, + { + "epoch": 0.7157796246353679, + "grad_norm": 0.6122208833694458, + "learning_rate": 7.183913192291348e-06, + "loss": 0.6755, + "step": 13005 + }, + { + "epoch": 0.7158346634377236, + "grad_norm": 0.7095892429351807, + "learning_rate": 7.1835232517871525e-06, + "loss": 0.8009, + "step": 13006 + }, + { + "epoch": 0.7158897022400793, + "grad_norm": 0.6828192472457886, + "learning_rate": 7.1831332948721786e-06, + "loss": 0.7755, + "step": 13007 + }, + { + "epoch": 0.7159447410424349, + "grad_norm": 0.7997334003448486, + "learning_rate": 7.182743321549359e-06, + "loss": 0.7259, + "step": 13008 + }, + { + "epoch": 0.7159997798447906, + "grad_norm": 0.7431252002716064, + "learning_rate": 7.182353331821626e-06, + "loss": 0.7765, + "step": 13009 + }, + { + "epoch": 0.7160548186471463, + "grad_norm": 0.7202625870704651, + "learning_rate": 7.181963325691907e-06, + "loss": 0.7638, + "step": 13010 + }, + { + "epoch": 0.716109857449502, + "grad_norm": 0.7617568373680115, + "learning_rate": 7.181573303163139e-06, + "loss": 0.825, + "step": 13011 + }, + { + "epoch": 0.7161648962518575, + "grad_norm": 0.7382665276527405, + "learning_rate": 7.181183264238247e-06, + "loss": 0.8005, + "step": 13012 + }, + { + "epoch": 0.7162199350542132, + "grad_norm": 0.7782611846923828, + "learning_rate": 7.180793208920167e-06, + "loss": 0.7044, + "step": 13013 + }, + { + "epoch": 0.7162749738565689, + "grad_norm": 0.7020898461341858, + "learning_rate": 7.18040313721183e-06, + "loss": 0.8059, + "step": 13014 + }, + { + "epoch": 0.7163300126589245, + "grad_norm": 1.2005099058151245, + "learning_rate": 7.1800130491161656e-06, + "loss": 0.6663, + "step": 13015 + }, + { + "epoch": 0.7163850514612802, + "grad_norm": 0.6663569211959839, + "learning_rate": 7.1796229446361066e-06, + "loss": 0.7046, + "step": 13016 + }, + { + "epoch": 0.7164400902636359, + "grad_norm": 0.7010110020637512, + "learning_rate": 7.1792328237745845e-06, + "loss": 0.6433, + "step": 13017 + }, + { + "epoch": 0.7164951290659916, + "grad_norm": 0.6447514891624451, + "learning_rate": 7.178842686534534e-06, + "loss": 0.7794, + "step": 13018 + }, + { + "epoch": 0.7165501678683471, + "grad_norm": 0.6813021302223206, + "learning_rate": 7.1784525329188835e-06, + "loss": 0.7413, + "step": 13019 + }, + { + "epoch": 0.7166052066707028, + "grad_norm": 0.6894733905792236, + "learning_rate": 7.178062362930567e-06, + "loss": 0.7896, + "step": 13020 + }, + { + "epoch": 0.7166602454730585, + "grad_norm": 0.6717034578323364, + "learning_rate": 7.177672176572517e-06, + "loss": 0.7599, + "step": 13021 + }, + { + "epoch": 0.7167152842754142, + "grad_norm": 0.7861666083335876, + "learning_rate": 7.177281973847665e-06, + "loss": 0.9068, + "step": 13022 + }, + { + "epoch": 0.7167703230777698, + "grad_norm": 0.6784214973449707, + "learning_rate": 7.176891754758946e-06, + "loss": 0.8319, + "step": 13023 + }, + { + "epoch": 0.7168253618801255, + "grad_norm": 0.7053580284118652, + "learning_rate": 7.176501519309289e-06, + "loss": 0.8085, + "step": 13024 + }, + { + "epoch": 0.7168804006824812, + "grad_norm": 0.9643208980560303, + "learning_rate": 7.176111267501631e-06, + "loss": 0.7799, + "step": 13025 + }, + { + "epoch": 0.7169354394848368, + "grad_norm": 0.8921111822128296, + "learning_rate": 7.175720999338902e-06, + "loss": 0.6465, + "step": 13026 + }, + { + "epoch": 0.7169904782871924, + "grad_norm": 0.7356166839599609, + "learning_rate": 7.1753307148240385e-06, + "loss": 0.7862, + "step": 13027 + }, + { + "epoch": 0.7170455170895481, + "grad_norm": 0.6906836628913879, + "learning_rate": 7.174940413959968e-06, + "loss": 0.7341, + "step": 13028 + }, + { + "epoch": 0.7171005558919038, + "grad_norm": 0.6229632496833801, + "learning_rate": 7.174550096749632e-06, + "loss": 0.721, + "step": 13029 + }, + { + "epoch": 0.7171555946942595, + "grad_norm": 0.6832499504089355, + "learning_rate": 7.174159763195958e-06, + "loss": 0.6733, + "step": 13030 + }, + { + "epoch": 0.7172106334966151, + "grad_norm": 0.8304060697555542, + "learning_rate": 7.1737694133018806e-06, + "loss": 0.7732, + "step": 13031 + }, + { + "epoch": 0.7172656722989708, + "grad_norm": 0.6813186407089233, + "learning_rate": 7.173379047070333e-06, + "loss": 0.7742, + "step": 13032 + }, + { + "epoch": 0.7173207111013264, + "grad_norm": 0.6671963930130005, + "learning_rate": 7.172988664504252e-06, + "loss": 0.6516, + "step": 13033 + }, + { + "epoch": 0.7173757499036821, + "grad_norm": 0.661108136177063, + "learning_rate": 7.172598265606569e-06, + "loss": 0.7361, + "step": 13034 + }, + { + "epoch": 0.7174307887060377, + "grad_norm": 0.7097620368003845, + "learning_rate": 7.1722078503802196e-06, + "loss": 0.8142, + "step": 13035 + }, + { + "epoch": 0.7174858275083934, + "grad_norm": 0.7663383483886719, + "learning_rate": 7.1718174188281365e-06, + "loss": 0.8149, + "step": 13036 + }, + { + "epoch": 0.7175408663107491, + "grad_norm": 0.7142401337623596, + "learning_rate": 7.171426970953256e-06, + "loss": 0.7539, + "step": 13037 + }, + { + "epoch": 0.7175959051131048, + "grad_norm": 0.667346715927124, + "learning_rate": 7.171036506758512e-06, + "loss": 0.7517, + "step": 13038 + }, + { + "epoch": 0.7176509439154604, + "grad_norm": 0.5933231711387634, + "learning_rate": 7.170646026246838e-06, + "loss": 0.6852, + "step": 13039 + }, + { + "epoch": 0.717705982717816, + "grad_norm": 0.730015218257904, + "learning_rate": 7.170255529421168e-06, + "loss": 0.7316, + "step": 13040 + }, + { + "epoch": 0.7177610215201717, + "grad_norm": 0.6146146059036255, + "learning_rate": 7.169865016284442e-06, + "loss": 0.6715, + "step": 13041 + }, + { + "epoch": 0.7178160603225274, + "grad_norm": 0.694131076335907, + "learning_rate": 7.16947448683959e-06, + "loss": 0.7944, + "step": 13042 + }, + { + "epoch": 0.717871099124883, + "grad_norm": 0.6736807823181152, + "learning_rate": 7.169083941089547e-06, + "loss": 0.7922, + "step": 13043 + }, + { + "epoch": 0.7179261379272387, + "grad_norm": 0.6748425364494324, + "learning_rate": 7.16869337903725e-06, + "loss": 0.6738, + "step": 13044 + }, + { + "epoch": 0.7179811767295944, + "grad_norm": 0.6807510852813721, + "learning_rate": 7.168302800685635e-06, + "loss": 0.7291, + "step": 13045 + }, + { + "epoch": 0.7180362155319501, + "grad_norm": 0.6613160371780396, + "learning_rate": 7.167912206037637e-06, + "loss": 0.6839, + "step": 13046 + }, + { + "epoch": 0.7180912543343057, + "grad_norm": 0.7184692621231079, + "learning_rate": 7.16752159509619e-06, + "loss": 0.6748, + "step": 13047 + }, + { + "epoch": 0.7181462931366613, + "grad_norm": 0.6938989758491516, + "learning_rate": 7.167130967864231e-06, + "loss": 0.7926, + "step": 13048 + }, + { + "epoch": 0.718201331939017, + "grad_norm": 0.6871020793914795, + "learning_rate": 7.166740324344696e-06, + "loss": 0.8229, + "step": 13049 + }, + { + "epoch": 0.7182563707413727, + "grad_norm": 0.8003624081611633, + "learning_rate": 7.166349664540521e-06, + "loss": 0.8488, + "step": 13050 + }, + { + "epoch": 0.7183114095437283, + "grad_norm": 0.7309357523918152, + "learning_rate": 7.165958988454642e-06, + "loss": 0.7442, + "step": 13051 + }, + { + "epoch": 0.718366448346084, + "grad_norm": 0.7462141513824463, + "learning_rate": 7.165568296089993e-06, + "loss": 0.8014, + "step": 13052 + }, + { + "epoch": 0.7184214871484397, + "grad_norm": 0.8335661292076111, + "learning_rate": 7.165177587449516e-06, + "loss": 0.6773, + "step": 13053 + }, + { + "epoch": 0.7184765259507954, + "grad_norm": 0.6996884346008301, + "learning_rate": 7.164786862536142e-06, + "loss": 0.7491, + "step": 13054 + }, + { + "epoch": 0.7185315647531509, + "grad_norm": 0.7203043103218079, + "learning_rate": 7.164396121352809e-06, + "loss": 0.7196, + "step": 13055 + }, + { + "epoch": 0.7185866035555066, + "grad_norm": 0.7109461426734924, + "learning_rate": 7.164005363902453e-06, + "loss": 0.7336, + "step": 13056 + }, + { + "epoch": 0.7186416423578623, + "grad_norm": 0.7057282328605652, + "learning_rate": 7.1636145901880135e-06, + "loss": 0.734, + "step": 13057 + }, + { + "epoch": 0.7186966811602179, + "grad_norm": 0.7288782000541687, + "learning_rate": 7.163223800212427e-06, + "loss": 0.8141, + "step": 13058 + }, + { + "epoch": 0.7187517199625736, + "grad_norm": 0.6812320947647095, + "learning_rate": 7.162832993978628e-06, + "loss": 0.7525, + "step": 13059 + }, + { + "epoch": 0.7188067587649293, + "grad_norm": 0.6782627105712891, + "learning_rate": 7.1624421714895546e-06, + "loss": 0.7647, + "step": 13060 + }, + { + "epoch": 0.718861797567285, + "grad_norm": 0.7361965775489807, + "learning_rate": 7.162051332748146e-06, + "loss": 0.7774, + "step": 13061 + }, + { + "epoch": 0.7189168363696405, + "grad_norm": 0.68894362449646, + "learning_rate": 7.161660477757337e-06, + "loss": 0.767, + "step": 13062 + }, + { + "epoch": 0.7189718751719962, + "grad_norm": 0.6440854668617249, + "learning_rate": 7.161269606520067e-06, + "loss": 0.7062, + "step": 13063 + }, + { + "epoch": 0.7190269139743519, + "grad_norm": 0.8411546945571899, + "learning_rate": 7.160878719039273e-06, + "loss": 0.728, + "step": 13064 + }, + { + "epoch": 0.7190819527767076, + "grad_norm": 0.6895145177841187, + "learning_rate": 7.160487815317895e-06, + "loss": 0.6667, + "step": 13065 + }, + { + "epoch": 0.7191369915790632, + "grad_norm": 0.6943626403808594, + "learning_rate": 7.160096895358866e-06, + "loss": 0.7579, + "step": 13066 + }, + { + "epoch": 0.7191920303814189, + "grad_norm": 0.7940205335617065, + "learning_rate": 7.1597059591651294e-06, + "loss": 0.7286, + "step": 13067 + }, + { + "epoch": 0.7192470691837746, + "grad_norm": 0.7350896000862122, + "learning_rate": 7.159315006739619e-06, + "loss": 0.7174, + "step": 13068 + }, + { + "epoch": 0.7193021079861303, + "grad_norm": 0.7663372159004211, + "learning_rate": 7.158924038085275e-06, + "loss": 0.7871, + "step": 13069 + }, + { + "epoch": 0.7193571467884858, + "grad_norm": 0.7368965744972229, + "learning_rate": 7.1585330532050375e-06, + "loss": 0.7356, + "step": 13070 + }, + { + "epoch": 0.7194121855908415, + "grad_norm": 0.7345212697982788, + "learning_rate": 7.158142052101843e-06, + "loss": 0.7784, + "step": 13071 + }, + { + "epoch": 0.7194672243931972, + "grad_norm": 0.7847188711166382, + "learning_rate": 7.157751034778629e-06, + "loss": 0.7899, + "step": 13072 + }, + { + "epoch": 0.7195222631955529, + "grad_norm": 0.757514476776123, + "learning_rate": 7.157360001238337e-06, + "loss": 0.8899, + "step": 13073 + }, + { + "epoch": 0.7195773019979085, + "grad_norm": 0.73405522108078, + "learning_rate": 7.156968951483905e-06, + "loss": 0.7283, + "step": 13074 + }, + { + "epoch": 0.7196323408002642, + "grad_norm": 0.7950206398963928, + "learning_rate": 7.156577885518271e-06, + "loss": 0.7338, + "step": 13075 + }, + { + "epoch": 0.7196873796026199, + "grad_norm": 0.8082411289215088, + "learning_rate": 7.156186803344374e-06, + "loss": 0.711, + "step": 13076 + }, + { + "epoch": 0.7197424184049755, + "grad_norm": 0.6868693828582764, + "learning_rate": 7.1557957049651574e-06, + "loss": 0.7583, + "step": 13077 + }, + { + "epoch": 0.7197974572073311, + "grad_norm": 0.7226251363754272, + "learning_rate": 7.155404590383554e-06, + "loss": 0.746, + "step": 13078 + }, + { + "epoch": 0.7198524960096868, + "grad_norm": 0.7437220811843872, + "learning_rate": 7.155013459602509e-06, + "loss": 0.6884, + "step": 13079 + }, + { + "epoch": 0.7199075348120425, + "grad_norm": 0.7486164569854736, + "learning_rate": 7.154622312624958e-06, + "loss": 0.6968, + "step": 13080 + }, + { + "epoch": 0.7199625736143982, + "grad_norm": 0.7709106802940369, + "learning_rate": 7.154231149453843e-06, + "loss": 0.838, + "step": 13081 + }, + { + "epoch": 0.7200176124167538, + "grad_norm": 0.6962981224060059, + "learning_rate": 7.153839970092104e-06, + "loss": 0.7186, + "step": 13082 + }, + { + "epoch": 0.7200726512191095, + "grad_norm": 0.8195380568504333, + "learning_rate": 7.15344877454268e-06, + "loss": 0.7949, + "step": 13083 + }, + { + "epoch": 0.7201276900214651, + "grad_norm": 0.735285758972168, + "learning_rate": 7.15305756280851e-06, + "loss": 0.7477, + "step": 13084 + }, + { + "epoch": 0.7201827288238208, + "grad_norm": 0.6121101379394531, + "learning_rate": 7.1526663348925375e-06, + "loss": 0.6686, + "step": 13085 + }, + { + "epoch": 0.7202377676261764, + "grad_norm": 0.7204885482788086, + "learning_rate": 7.1522750907977e-06, + "loss": 0.8013, + "step": 13086 + }, + { + "epoch": 0.7202928064285321, + "grad_norm": 0.6808584332466125, + "learning_rate": 7.15188383052694e-06, + "loss": 0.7847, + "step": 13087 + }, + { + "epoch": 0.7203478452308878, + "grad_norm": 0.7049086093902588, + "learning_rate": 7.151492554083195e-06, + "loss": 0.7563, + "step": 13088 + }, + { + "epoch": 0.7204028840332435, + "grad_norm": 0.765708327293396, + "learning_rate": 7.151101261469411e-06, + "loss": 0.7648, + "step": 13089 + }, + { + "epoch": 0.7204579228355991, + "grad_norm": 0.6810007095336914, + "learning_rate": 7.150709952688525e-06, + "loss": 0.731, + "step": 13090 + }, + { + "epoch": 0.7205129616379548, + "grad_norm": 0.7242745757102966, + "learning_rate": 7.150318627743478e-06, + "loss": 0.8027, + "step": 13091 + }, + { + "epoch": 0.7205680004403104, + "grad_norm": 0.7452220916748047, + "learning_rate": 7.14992728663721e-06, + "loss": 0.7848, + "step": 13092 + }, + { + "epoch": 0.7206230392426661, + "grad_norm": 0.6333943605422974, + "learning_rate": 7.149535929372667e-06, + "loss": 0.7105, + "step": 13093 + }, + { + "epoch": 0.7206780780450217, + "grad_norm": 0.7565333247184753, + "learning_rate": 7.149144555952785e-06, + "loss": 0.8006, + "step": 13094 + }, + { + "epoch": 0.7207331168473774, + "grad_norm": 0.7703632712364197, + "learning_rate": 7.14875316638051e-06, + "loss": 0.7323, + "step": 13095 + }, + { + "epoch": 0.7207881556497331, + "grad_norm": 0.6275011301040649, + "learning_rate": 7.148361760658779e-06, + "loss": 0.6817, + "step": 13096 + }, + { + "epoch": 0.7208431944520888, + "grad_norm": 0.7363598942756653, + "learning_rate": 7.147970338790537e-06, + "loss": 0.7641, + "step": 13097 + }, + { + "epoch": 0.7208982332544444, + "grad_norm": 0.6284294724464417, + "learning_rate": 7.147578900778727e-06, + "loss": 0.7117, + "step": 13098 + }, + { + "epoch": 0.7209532720568, + "grad_norm": 0.7878503203392029, + "learning_rate": 7.147187446626287e-06, + "loss": 0.8184, + "step": 13099 + }, + { + "epoch": 0.7210083108591557, + "grad_norm": 0.6973691582679749, + "learning_rate": 7.146795976336159e-06, + "loss": 0.7815, + "step": 13100 + }, + { + "epoch": 0.7210633496615113, + "grad_norm": 0.7018479704856873, + "learning_rate": 7.146404489911291e-06, + "loss": 0.7305, + "step": 13101 + }, + { + "epoch": 0.721118388463867, + "grad_norm": 0.6903830766677856, + "learning_rate": 7.14601298735462e-06, + "loss": 0.7074, + "step": 13102 + }, + { + "epoch": 0.7211734272662227, + "grad_norm": 0.7612621188163757, + "learning_rate": 7.145621468669089e-06, + "loss": 0.8189, + "step": 13103 + }, + { + "epoch": 0.7212284660685784, + "grad_norm": 0.7256856560707092, + "learning_rate": 7.145229933857643e-06, + "loss": 0.5959, + "step": 13104 + }, + { + "epoch": 0.721283504870934, + "grad_norm": 0.6632323265075684, + "learning_rate": 7.1448383829232205e-06, + "loss": 0.7519, + "step": 13105 + }, + { + "epoch": 0.7213385436732896, + "grad_norm": 0.6320651769638062, + "learning_rate": 7.144446815868768e-06, + "loss": 0.7259, + "step": 13106 + }, + { + "epoch": 0.7213935824756453, + "grad_norm": 0.6883212924003601, + "learning_rate": 7.144055232697227e-06, + "loss": 0.7776, + "step": 13107 + }, + { + "epoch": 0.721448621278001, + "grad_norm": 0.7159759402275085, + "learning_rate": 7.1436636334115415e-06, + "loss": 0.6915, + "step": 13108 + }, + { + "epoch": 0.7215036600803566, + "grad_norm": 0.7108080983161926, + "learning_rate": 7.1432720180146535e-06, + "loss": 0.731, + "step": 13109 + }, + { + "epoch": 0.7215586988827123, + "grad_norm": 0.7765033841133118, + "learning_rate": 7.142880386509506e-06, + "loss": 0.6965, + "step": 13110 + }, + { + "epoch": 0.721613737685068, + "grad_norm": 0.7205119132995605, + "learning_rate": 7.142488738899045e-06, + "loss": 0.7262, + "step": 13111 + }, + { + "epoch": 0.7216687764874237, + "grad_norm": 0.6786921620368958, + "learning_rate": 7.142097075186212e-06, + "loss": 0.805, + "step": 13112 + }, + { + "epoch": 0.7217238152897792, + "grad_norm": 0.7947409152984619, + "learning_rate": 7.141705395373949e-06, + "loss": 0.7701, + "step": 13113 + }, + { + "epoch": 0.7217788540921349, + "grad_norm": 0.6672971844673157, + "learning_rate": 7.141313699465204e-06, + "loss": 0.7325, + "step": 13114 + }, + { + "epoch": 0.7218338928944906, + "grad_norm": 0.641765296459198, + "learning_rate": 7.140921987462916e-06, + "loss": 0.7902, + "step": 13115 + }, + { + "epoch": 0.7218889316968463, + "grad_norm": 0.6675699353218079, + "learning_rate": 7.140530259370032e-06, + "loss": 0.7422, + "step": 13116 + }, + { + "epoch": 0.7219439704992019, + "grad_norm": 0.6940729022026062, + "learning_rate": 7.140138515189495e-06, + "loss": 0.6978, + "step": 13117 + }, + { + "epoch": 0.7219990093015576, + "grad_norm": 0.6805779337882996, + "learning_rate": 7.1397467549242514e-06, + "loss": 0.7498, + "step": 13118 + }, + { + "epoch": 0.7220540481039133, + "grad_norm": 0.6231662631034851, + "learning_rate": 7.139354978577243e-06, + "loss": 0.7344, + "step": 13119 + }, + { + "epoch": 0.722109086906269, + "grad_norm": 0.6883575916290283, + "learning_rate": 7.138963186151416e-06, + "loss": 0.835, + "step": 13120 + }, + { + "epoch": 0.7221641257086245, + "grad_norm": 0.6902666687965393, + "learning_rate": 7.138571377649712e-06, + "loss": 0.7427, + "step": 13121 + }, + { + "epoch": 0.7222191645109802, + "grad_norm": 0.7156440019607544, + "learning_rate": 7.1381795530750805e-06, + "loss": 0.7661, + "step": 13122 + }, + { + "epoch": 0.7222742033133359, + "grad_norm": 0.6727150678634644, + "learning_rate": 7.137787712430464e-06, + "loss": 0.7872, + "step": 13123 + }, + { + "epoch": 0.7223292421156916, + "grad_norm": 0.6200405359268188, + "learning_rate": 7.137395855718806e-06, + "loss": 0.6108, + "step": 13124 + }, + { + "epoch": 0.7223842809180472, + "grad_norm": 0.6384756565093994, + "learning_rate": 7.137003982943054e-06, + "loss": 0.698, + "step": 13125 + }, + { + "epoch": 0.7224393197204029, + "grad_norm": 0.7212089896202087, + "learning_rate": 7.1366120941061515e-06, + "loss": 0.7679, + "step": 13126 + }, + { + "epoch": 0.7224943585227586, + "grad_norm": 0.737352192401886, + "learning_rate": 7.136220189211044e-06, + "loss": 0.8173, + "step": 13127 + }, + { + "epoch": 0.7225493973251143, + "grad_norm": 0.6244099736213684, + "learning_rate": 7.135828268260679e-06, + "loss": 0.7224, + "step": 13128 + }, + { + "epoch": 0.7226044361274698, + "grad_norm": 0.8191885948181152, + "learning_rate": 7.135436331257997e-06, + "loss": 0.8122, + "step": 13129 + }, + { + "epoch": 0.7226594749298255, + "grad_norm": 0.7069095373153687, + "learning_rate": 7.135044378205949e-06, + "loss": 0.7844, + "step": 13130 + }, + { + "epoch": 0.7227145137321812, + "grad_norm": 0.6094380021095276, + "learning_rate": 7.13465240910748e-06, + "loss": 0.7093, + "step": 13131 + }, + { + "epoch": 0.7227695525345369, + "grad_norm": 0.7075843811035156, + "learning_rate": 7.134260423965534e-06, + "loss": 0.8109, + "step": 13132 + }, + { + "epoch": 0.7228245913368925, + "grad_norm": 0.6684398651123047, + "learning_rate": 7.133868422783057e-06, + "loss": 0.7224, + "step": 13133 + }, + { + "epoch": 0.7228796301392482, + "grad_norm": 0.6574007272720337, + "learning_rate": 7.133476405562998e-06, + "loss": 0.6763, + "step": 13134 + }, + { + "epoch": 0.7229346689416039, + "grad_norm": 0.7124022841453552, + "learning_rate": 7.133084372308301e-06, + "loss": 0.8047, + "step": 13135 + }, + { + "epoch": 0.7229897077439595, + "grad_norm": 0.7035976648330688, + "learning_rate": 7.1326923230219124e-06, + "loss": 0.7544, + "step": 13136 + }, + { + "epoch": 0.7230447465463151, + "grad_norm": 0.7007604241371155, + "learning_rate": 7.132300257706779e-06, + "loss": 0.7584, + "step": 13137 + }, + { + "epoch": 0.7230997853486708, + "grad_norm": 0.6917324066162109, + "learning_rate": 7.131908176365848e-06, + "loss": 0.6846, + "step": 13138 + }, + { + "epoch": 0.7231548241510265, + "grad_norm": 0.6857448816299438, + "learning_rate": 7.1315160790020666e-06, + "loss": 0.8142, + "step": 13139 + }, + { + "epoch": 0.7232098629533822, + "grad_norm": 0.8381820321083069, + "learning_rate": 7.13112396561838e-06, + "loss": 0.8132, + "step": 13140 + }, + { + "epoch": 0.7232649017557378, + "grad_norm": 0.7024879455566406, + "learning_rate": 7.130731836217735e-06, + "loss": 0.7157, + "step": 13141 + }, + { + "epoch": 0.7233199405580935, + "grad_norm": 0.7313332557678223, + "learning_rate": 7.130339690803081e-06, + "loss": 0.7623, + "step": 13142 + }, + { + "epoch": 0.7233749793604491, + "grad_norm": 0.697536051273346, + "learning_rate": 7.129947529377364e-06, + "loss": 0.7202, + "step": 13143 + }, + { + "epoch": 0.7234300181628047, + "grad_norm": 0.6946722865104675, + "learning_rate": 7.129555351943533e-06, + "loss": 0.7862, + "step": 13144 + }, + { + "epoch": 0.7234850569651604, + "grad_norm": 0.6643924117088318, + "learning_rate": 7.129163158504532e-06, + "loss": 0.7055, + "step": 13145 + }, + { + "epoch": 0.7235400957675161, + "grad_norm": 0.7285693287849426, + "learning_rate": 7.1287709490633104e-06, + "loss": 0.6815, + "step": 13146 + }, + { + "epoch": 0.7235951345698718, + "grad_norm": 1.2701799869537354, + "learning_rate": 7.128378723622818e-06, + "loss": 0.8596, + "step": 13147 + }, + { + "epoch": 0.7236501733722274, + "grad_norm": 0.7067306041717529, + "learning_rate": 7.127986482186e-06, + "loss": 0.7077, + "step": 13148 + }, + { + "epoch": 0.7237052121745831, + "grad_norm": 0.8863486051559448, + "learning_rate": 7.127594224755805e-06, + "loss": 0.8961, + "step": 13149 + }, + { + "epoch": 0.7237602509769387, + "grad_norm": 0.7286190986633301, + "learning_rate": 7.127201951335182e-06, + "loss": 0.7941, + "step": 13150 + }, + { + "epoch": 0.7238152897792944, + "grad_norm": 0.8756779432296753, + "learning_rate": 7.126809661927079e-06, + "loss": 0.7862, + "step": 13151 + }, + { + "epoch": 0.72387032858165, + "grad_norm": 0.7780876755714417, + "learning_rate": 7.126417356534443e-06, + "loss": 0.7095, + "step": 13152 + }, + { + "epoch": 0.7239253673840057, + "grad_norm": 0.6332812905311584, + "learning_rate": 7.1260250351602225e-06, + "loss": 0.7057, + "step": 13153 + }, + { + "epoch": 0.7239804061863614, + "grad_norm": 0.8350435495376587, + "learning_rate": 7.125632697807368e-06, + "loss": 0.7695, + "step": 13154 + }, + { + "epoch": 0.7240354449887171, + "grad_norm": 0.8306411504745483, + "learning_rate": 7.125240344478827e-06, + "loss": 0.6605, + "step": 13155 + }, + { + "epoch": 0.7240904837910727, + "grad_norm": 0.7495117783546448, + "learning_rate": 7.124847975177548e-06, + "loss": 0.8078, + "step": 13156 + }, + { + "epoch": 0.7241455225934283, + "grad_norm": 0.6481010317802429, + "learning_rate": 7.12445558990648e-06, + "loss": 0.8094, + "step": 13157 + }, + { + "epoch": 0.724200561395784, + "grad_norm": 0.7742613554000854, + "learning_rate": 7.124063188668573e-06, + "loss": 0.78, + "step": 13158 + }, + { + "epoch": 0.7242556001981397, + "grad_norm": 0.8394206762313843, + "learning_rate": 7.123670771466776e-06, + "loss": 0.8983, + "step": 13159 + }, + { + "epoch": 0.7243106390004953, + "grad_norm": 0.7196840047836304, + "learning_rate": 7.123278338304038e-06, + "loss": 0.7203, + "step": 13160 + }, + { + "epoch": 0.724365677802851, + "grad_norm": 0.5964440107345581, + "learning_rate": 7.122885889183309e-06, + "loss": 0.6251, + "step": 13161 + }, + { + "epoch": 0.7244207166052067, + "grad_norm": 0.7394048571586609, + "learning_rate": 7.1224934241075375e-06, + "loss": 0.7755, + "step": 13162 + }, + { + "epoch": 0.7244757554075624, + "grad_norm": 0.6427145004272461, + "learning_rate": 7.1221009430796724e-06, + "loss": 0.74, + "step": 13163 + }, + { + "epoch": 0.724530794209918, + "grad_norm": 0.7084387540817261, + "learning_rate": 7.121708446102667e-06, + "loss": 0.7464, + "step": 13164 + }, + { + "epoch": 0.7245858330122736, + "grad_norm": 0.6623230576515198, + "learning_rate": 7.121315933179466e-06, + "loss": 0.7237, + "step": 13165 + }, + { + "epoch": 0.7246408718146293, + "grad_norm": 0.9234243631362915, + "learning_rate": 7.120923404313024e-06, + "loss": 0.8238, + "step": 13166 + }, + { + "epoch": 0.724695910616985, + "grad_norm": 0.6458896994590759, + "learning_rate": 7.120530859506289e-06, + "loss": 0.8105, + "step": 13167 + }, + { + "epoch": 0.7247509494193406, + "grad_norm": 0.7160854935646057, + "learning_rate": 7.1201382987622115e-06, + "loss": 0.7954, + "step": 13168 + }, + { + "epoch": 0.7248059882216963, + "grad_norm": 0.6896069645881653, + "learning_rate": 7.119745722083742e-06, + "loss": 0.7281, + "step": 13169 + }, + { + "epoch": 0.724861027024052, + "grad_norm": 0.6609574556350708, + "learning_rate": 7.119353129473831e-06, + "loss": 0.7682, + "step": 13170 + }, + { + "epoch": 0.7249160658264077, + "grad_norm": 0.6477035880088806, + "learning_rate": 7.118960520935429e-06, + "loss": 0.8183, + "step": 13171 + }, + { + "epoch": 0.7249711046287632, + "grad_norm": 1.4488556385040283, + "learning_rate": 7.1185678964714885e-06, + "loss": 0.8321, + "step": 13172 + }, + { + "epoch": 0.7250261434311189, + "grad_norm": 0.8502382040023804, + "learning_rate": 7.118175256084958e-06, + "loss": 0.7881, + "step": 13173 + }, + { + "epoch": 0.7250811822334746, + "grad_norm": 0.6969912648200989, + "learning_rate": 7.117782599778788e-06, + "loss": 0.7598, + "step": 13174 + }, + { + "epoch": 0.7251362210358303, + "grad_norm": 0.7254889011383057, + "learning_rate": 7.117389927555933e-06, + "loss": 0.8473, + "step": 13175 + }, + { + "epoch": 0.7251912598381859, + "grad_norm": 0.9958444237709045, + "learning_rate": 7.116997239419341e-06, + "loss": 0.7558, + "step": 13176 + }, + { + "epoch": 0.7252462986405416, + "grad_norm": 0.6694881916046143, + "learning_rate": 7.116604535371963e-06, + "loss": 0.7072, + "step": 13177 + }, + { + "epoch": 0.7253013374428973, + "grad_norm": 1.0730634927749634, + "learning_rate": 7.116211815416754e-06, + "loss": 0.7607, + "step": 13178 + }, + { + "epoch": 0.725356376245253, + "grad_norm": 0.6770226359367371, + "learning_rate": 7.115819079556663e-06, + "loss": 0.7213, + "step": 13179 + }, + { + "epoch": 0.7254114150476085, + "grad_norm": 0.866215705871582, + "learning_rate": 7.115426327794642e-06, + "loss": 0.7273, + "step": 13180 + }, + { + "epoch": 0.7254664538499642, + "grad_norm": 0.7303730845451355, + "learning_rate": 7.115033560133642e-06, + "loss": 0.764, + "step": 13181 + }, + { + "epoch": 0.7255214926523199, + "grad_norm": 0.6900389194488525, + "learning_rate": 7.114640776576617e-06, + "loss": 0.6958, + "step": 13182 + }, + { + "epoch": 0.7255765314546756, + "grad_norm": 0.7255710959434509, + "learning_rate": 7.114247977126518e-06, + "loss": 0.6507, + "step": 13183 + }, + { + "epoch": 0.7256315702570312, + "grad_norm": 0.6848479509353638, + "learning_rate": 7.113855161786297e-06, + "loss": 0.6848, + "step": 13184 + }, + { + "epoch": 0.7256866090593869, + "grad_norm": 0.6800528764724731, + "learning_rate": 7.113462330558907e-06, + "loss": 0.7354, + "step": 13185 + }, + { + "epoch": 0.7257416478617426, + "grad_norm": 0.7271339297294617, + "learning_rate": 7.113069483447299e-06, + "loss": 0.7695, + "step": 13186 + }, + { + "epoch": 0.7257966866640981, + "grad_norm": 0.8212381601333618, + "learning_rate": 7.112676620454427e-06, + "loss": 0.7348, + "step": 13187 + }, + { + "epoch": 0.7258517254664538, + "grad_norm": 0.6714771389961243, + "learning_rate": 7.112283741583242e-06, + "loss": 0.75, + "step": 13188 + }, + { + "epoch": 0.7259067642688095, + "grad_norm": 0.7834941148757935, + "learning_rate": 7.111890846836699e-06, + "loss": 0.6914, + "step": 13189 + }, + { + "epoch": 0.7259618030711652, + "grad_norm": 0.8107824325561523, + "learning_rate": 7.111497936217748e-06, + "loss": 0.803, + "step": 13190 + }, + { + "epoch": 0.7260168418735208, + "grad_norm": 0.6306549906730652, + "learning_rate": 7.1111050097293464e-06, + "loss": 0.7915, + "step": 13191 + }, + { + "epoch": 0.7260718806758765, + "grad_norm": 0.7030252814292908, + "learning_rate": 7.110712067374444e-06, + "loss": 0.7091, + "step": 13192 + }, + { + "epoch": 0.7261269194782322, + "grad_norm": 0.7625641226768494, + "learning_rate": 7.110319109155992e-06, + "loss": 0.774, + "step": 13193 + }, + { + "epoch": 0.7261819582805878, + "grad_norm": 0.6382628083229065, + "learning_rate": 7.109926135076949e-06, + "loss": 0.6774, + "step": 13194 + }, + { + "epoch": 0.7262369970829434, + "grad_norm": 0.6594563722610474, + "learning_rate": 7.109533145140265e-06, + "loss": 0.7977, + "step": 13195 + }, + { + "epoch": 0.7262920358852991, + "grad_norm": 0.7177248001098633, + "learning_rate": 7.109140139348895e-06, + "loss": 0.6771, + "step": 13196 + }, + { + "epoch": 0.7263470746876548, + "grad_norm": 0.6631305813789368, + "learning_rate": 7.108747117705792e-06, + "loss": 0.6877, + "step": 13197 + }, + { + "epoch": 0.7264021134900105, + "grad_norm": 0.6783736944198608, + "learning_rate": 7.10835408021391e-06, + "loss": 0.8048, + "step": 13198 + }, + { + "epoch": 0.7264571522923661, + "grad_norm": 0.7368303537368774, + "learning_rate": 7.107961026876204e-06, + "loss": 0.7962, + "step": 13199 + }, + { + "epoch": 0.7265121910947218, + "grad_norm": 0.7697044014930725, + "learning_rate": 7.107567957695627e-06, + "loss": 0.769, + "step": 13200 + }, + { + "epoch": 0.7265672298970774, + "grad_norm": 0.639934241771698, + "learning_rate": 7.1071748726751325e-06, + "loss": 0.722, + "step": 13201 + }, + { + "epoch": 0.7266222686994331, + "grad_norm": 0.8410669565200806, + "learning_rate": 7.106781771817676e-06, + "loss": 0.8861, + "step": 13202 + }, + { + "epoch": 0.7266773075017887, + "grad_norm": 0.654924213886261, + "learning_rate": 7.106388655126212e-06, + "loss": 0.7463, + "step": 13203 + }, + { + "epoch": 0.7267323463041444, + "grad_norm": 0.719714879989624, + "learning_rate": 7.105995522603695e-06, + "loss": 0.759, + "step": 13204 + }, + { + "epoch": 0.7267873851065001, + "grad_norm": 0.7019139528274536, + "learning_rate": 7.105602374253078e-06, + "loss": 0.7965, + "step": 13205 + }, + { + "epoch": 0.7268424239088558, + "grad_norm": 0.7289487719535828, + "learning_rate": 7.105209210077318e-06, + "loss": 0.8591, + "step": 13206 + }, + { + "epoch": 0.7268974627112114, + "grad_norm": 0.670274019241333, + "learning_rate": 7.104816030079369e-06, + "loss": 0.7707, + "step": 13207 + }, + { + "epoch": 0.726952501513567, + "grad_norm": 0.7156813740730286, + "learning_rate": 7.104422834262187e-06, + "loss": 0.7724, + "step": 13208 + }, + { + "epoch": 0.7270075403159227, + "grad_norm": 0.6776198148727417, + "learning_rate": 7.104029622628726e-06, + "loss": 0.7331, + "step": 13209 + }, + { + "epoch": 0.7270625791182784, + "grad_norm": 0.8008358478546143, + "learning_rate": 7.103636395181941e-06, + "loss": 0.8279, + "step": 13210 + }, + { + "epoch": 0.727117617920634, + "grad_norm": 0.6622886061668396, + "learning_rate": 7.1032431519247876e-06, + "loss": 0.6646, + "step": 13211 + }, + { + "epoch": 0.7271726567229897, + "grad_norm": 0.6834877729415894, + "learning_rate": 7.102849892860223e-06, + "loss": 0.75, + "step": 13212 + }, + { + "epoch": 0.7272276955253454, + "grad_norm": 0.7659596800804138, + "learning_rate": 7.1024566179912e-06, + "loss": 0.6999, + "step": 13213 + }, + { + "epoch": 0.7272827343277011, + "grad_norm": 0.7368002533912659, + "learning_rate": 7.102063327320677e-06, + "loss": 0.7376, + "step": 13214 + }, + { + "epoch": 0.7273377731300567, + "grad_norm": 0.7286058664321899, + "learning_rate": 7.101670020851609e-06, + "loss": 0.8139, + "step": 13215 + }, + { + "epoch": 0.7273928119324123, + "grad_norm": 1.0521546602249146, + "learning_rate": 7.101276698586951e-06, + "loss": 0.8545, + "step": 13216 + }, + { + "epoch": 0.727447850734768, + "grad_norm": 0.6940305233001709, + "learning_rate": 7.100883360529659e-06, + "loss": 0.7534, + "step": 13217 + }, + { + "epoch": 0.7275028895371237, + "grad_norm": 0.8279024362564087, + "learning_rate": 7.100490006682691e-06, + "loss": 0.852, + "step": 13218 + }, + { + "epoch": 0.7275579283394793, + "grad_norm": 0.63093501329422, + "learning_rate": 7.100096637049002e-06, + "loss": 0.6728, + "step": 13219 + }, + { + "epoch": 0.727612967141835, + "grad_norm": 0.7576018571853638, + "learning_rate": 7.099703251631549e-06, + "loss": 0.6343, + "step": 13220 + }, + { + "epoch": 0.7276680059441907, + "grad_norm": 0.9493140578269958, + "learning_rate": 7.0993098504332894e-06, + "loss": 0.82, + "step": 13221 + }, + { + "epoch": 0.7277230447465464, + "grad_norm": 0.7279804944992065, + "learning_rate": 7.098916433457177e-06, + "loss": 0.8149, + "step": 13222 + }, + { + "epoch": 0.7277780835489019, + "grad_norm": 0.7660531401634216, + "learning_rate": 7.0985230007061725e-06, + "loss": 0.8278, + "step": 13223 + }, + { + "epoch": 0.7278331223512576, + "grad_norm": 0.6468318104743958, + "learning_rate": 7.09812955218323e-06, + "loss": 0.7193, + "step": 13224 + }, + { + "epoch": 0.7278881611536133, + "grad_norm": 0.6389151811599731, + "learning_rate": 7.097736087891306e-06, + "loss": 0.6744, + "step": 13225 + }, + { + "epoch": 0.727943199955969, + "grad_norm": 0.6565649509429932, + "learning_rate": 7.097342607833361e-06, + "loss": 0.7586, + "step": 13226 + }, + { + "epoch": 0.7279982387583246, + "grad_norm": 0.6867381930351257, + "learning_rate": 7.09694911201235e-06, + "loss": 0.684, + "step": 13227 + }, + { + "epoch": 0.7280532775606803, + "grad_norm": 0.7509286403656006, + "learning_rate": 7.096555600431229e-06, + "loss": 0.8242, + "step": 13228 + }, + { + "epoch": 0.728108316363036, + "grad_norm": 0.6997731328010559, + "learning_rate": 7.096162073092959e-06, + "loss": 0.8182, + "step": 13229 + }, + { + "epoch": 0.7281633551653915, + "grad_norm": 0.6698907017707825, + "learning_rate": 7.095768530000496e-06, + "loss": 0.7752, + "step": 13230 + }, + { + "epoch": 0.7282183939677472, + "grad_norm": 0.7219094634056091, + "learning_rate": 7.095374971156799e-06, + "loss": 0.792, + "step": 13231 + }, + { + "epoch": 0.7282734327701029, + "grad_norm": 0.6479744911193848, + "learning_rate": 7.094981396564822e-06, + "loss": 0.7556, + "step": 13232 + }, + { + "epoch": 0.7283284715724586, + "grad_norm": 0.6795497536659241, + "learning_rate": 7.094587806227527e-06, + "loss": 0.7611, + "step": 13233 + }, + { + "epoch": 0.7283835103748142, + "grad_norm": 0.7145074605941772, + "learning_rate": 7.094194200147871e-06, + "loss": 0.8064, + "step": 13234 + }, + { + "epoch": 0.7284385491771699, + "grad_norm": 0.6750605702400208, + "learning_rate": 7.093800578328811e-06, + "loss": 0.7054, + "step": 13235 + }, + { + "epoch": 0.7284935879795256, + "grad_norm": 0.7574751377105713, + "learning_rate": 7.093406940773307e-06, + "loss": 0.7878, + "step": 13236 + }, + { + "epoch": 0.7285486267818813, + "grad_norm": 0.7836418747901917, + "learning_rate": 7.093013287484316e-06, + "loss": 0.7445, + "step": 13237 + }, + { + "epoch": 0.7286036655842368, + "grad_norm": 0.7658870220184326, + "learning_rate": 7.092619618464799e-06, + "loss": 0.7513, + "step": 13238 + }, + { + "epoch": 0.7286587043865925, + "grad_norm": 1.1127573251724243, + "learning_rate": 7.092225933717711e-06, + "loss": 0.7601, + "step": 13239 + }, + { + "epoch": 0.7287137431889482, + "grad_norm": 0.7003853917121887, + "learning_rate": 7.091832233246015e-06, + "loss": 0.8533, + "step": 13240 + }, + { + "epoch": 0.7287687819913039, + "grad_norm": 0.6513979434967041, + "learning_rate": 7.091438517052667e-06, + "loss": 0.7285, + "step": 13241 + }, + { + "epoch": 0.7288238207936595, + "grad_norm": 0.7072234153747559, + "learning_rate": 7.091044785140626e-06, + "loss": 0.7741, + "step": 13242 + }, + { + "epoch": 0.7288788595960152, + "grad_norm": 0.8117190599441528, + "learning_rate": 7.090651037512854e-06, + "loss": 0.6851, + "step": 13243 + }, + { + "epoch": 0.7289338983983709, + "grad_norm": 0.6876427531242371, + "learning_rate": 7.090257274172306e-06, + "loss": 0.7162, + "step": 13244 + }, + { + "epoch": 0.7289889372007266, + "grad_norm": 0.7128324508666992, + "learning_rate": 7.0898634951219455e-06, + "loss": 0.7302, + "step": 13245 + }, + { + "epoch": 0.7290439760030821, + "grad_norm": 0.6918201446533203, + "learning_rate": 7.089469700364731e-06, + "loss": 0.8582, + "step": 13246 + }, + { + "epoch": 0.7290990148054378, + "grad_norm": 0.6172242164611816, + "learning_rate": 7.08907588990362e-06, + "loss": 0.6846, + "step": 13247 + }, + { + "epoch": 0.7291540536077935, + "grad_norm": 0.6799596548080444, + "learning_rate": 7.088682063741575e-06, + "loss": 0.7174, + "step": 13248 + }, + { + "epoch": 0.7292090924101492, + "grad_norm": 0.6663293838500977, + "learning_rate": 7.088288221881554e-06, + "loss": 0.7237, + "step": 13249 + }, + { + "epoch": 0.7292641312125048, + "grad_norm": 0.6758549213409424, + "learning_rate": 7.0878943643265175e-06, + "loss": 0.7912, + "step": 13250 + }, + { + "epoch": 0.7293191700148605, + "grad_norm": 0.6937153339385986, + "learning_rate": 7.087500491079427e-06, + "loss": 0.742, + "step": 13251 + }, + { + "epoch": 0.7293742088172162, + "grad_norm": 0.6441238522529602, + "learning_rate": 7.087106602143241e-06, + "loss": 0.7676, + "step": 13252 + }, + { + "epoch": 0.7294292476195718, + "grad_norm": 0.6615588068962097, + "learning_rate": 7.08671269752092e-06, + "loss": 0.7069, + "step": 13253 + }, + { + "epoch": 0.7294842864219274, + "grad_norm": 0.8052160739898682, + "learning_rate": 7.086318777215424e-06, + "loss": 0.811, + "step": 13254 + }, + { + "epoch": 0.7295393252242831, + "grad_norm": 0.7293280363082886, + "learning_rate": 7.085924841229716e-06, + "loss": 0.7127, + "step": 13255 + }, + { + "epoch": 0.7295943640266388, + "grad_norm": 0.7104617953300476, + "learning_rate": 7.085530889566756e-06, + "loss": 0.716, + "step": 13256 + }, + { + "epoch": 0.7296494028289945, + "grad_norm": 0.72947758436203, + "learning_rate": 7.085136922229503e-06, + "loss": 0.8144, + "step": 13257 + }, + { + "epoch": 0.7297044416313501, + "grad_norm": 0.7993913292884827, + "learning_rate": 7.08474293922092e-06, + "loss": 0.7609, + "step": 13258 + }, + { + "epoch": 0.7297594804337058, + "grad_norm": 0.7810680270195007, + "learning_rate": 7.0843489405439656e-06, + "loss": 0.8107, + "step": 13259 + }, + { + "epoch": 0.7298145192360614, + "grad_norm": 0.6383776664733887, + "learning_rate": 7.083954926201604e-06, + "loss": 0.7842, + "step": 13260 + }, + { + "epoch": 0.7298695580384171, + "grad_norm": 0.7653967142105103, + "learning_rate": 7.083560896196795e-06, + "loss": 0.729, + "step": 13261 + }, + { + "epoch": 0.7299245968407727, + "grad_norm": 0.6693821549415588, + "learning_rate": 7.083166850532498e-06, + "loss": 0.6901, + "step": 13262 + }, + { + "epoch": 0.7299796356431284, + "grad_norm": 0.7408621907234192, + "learning_rate": 7.082772789211678e-06, + "loss": 0.7415, + "step": 13263 + }, + { + "epoch": 0.7300346744454841, + "grad_norm": 0.6693123579025269, + "learning_rate": 7.082378712237295e-06, + "loss": 0.8102, + "step": 13264 + }, + { + "epoch": 0.7300897132478398, + "grad_norm": 0.6572727560997009, + "learning_rate": 7.081984619612311e-06, + "loss": 0.6595, + "step": 13265 + }, + { + "epoch": 0.7301447520501954, + "grad_norm": 0.7934693694114685, + "learning_rate": 7.081590511339687e-06, + "loss": 0.8024, + "step": 13266 + }, + { + "epoch": 0.730199790852551, + "grad_norm": 1.0663061141967773, + "learning_rate": 7.081196387422388e-06, + "loss": 0.7844, + "step": 13267 + }, + { + "epoch": 0.7302548296549067, + "grad_norm": 0.8005035519599915, + "learning_rate": 7.080802247863372e-06, + "loss": 0.751, + "step": 13268 + }, + { + "epoch": 0.7303098684572624, + "grad_norm": 0.6480177044868469, + "learning_rate": 7.0804080926656046e-06, + "loss": 0.7745, + "step": 13269 + }, + { + "epoch": 0.730364907259618, + "grad_norm": 0.7026820182800293, + "learning_rate": 7.080013921832047e-06, + "loss": 0.7545, + "step": 13270 + }, + { + "epoch": 0.7304199460619737, + "grad_norm": 0.673954427242279, + "learning_rate": 7.079619735365662e-06, + "loss": 0.7142, + "step": 13271 + }, + { + "epoch": 0.7304749848643294, + "grad_norm": 0.7296637296676636, + "learning_rate": 7.079225533269411e-06, + "loss": 0.8493, + "step": 13272 + }, + { + "epoch": 0.730530023666685, + "grad_norm": 0.7147308588027954, + "learning_rate": 7.0788313155462576e-06, + "loss": 0.7638, + "step": 13273 + }, + { + "epoch": 0.7305850624690406, + "grad_norm": 0.7531922459602356, + "learning_rate": 7.078437082199163e-06, + "loss": 0.8644, + "step": 13274 + }, + { + "epoch": 0.7306401012713963, + "grad_norm": 0.6581404805183411, + "learning_rate": 7.078042833231092e-06, + "loss": 0.7555, + "step": 13275 + }, + { + "epoch": 0.730695140073752, + "grad_norm": 0.6781187057495117, + "learning_rate": 7.0776485686450095e-06, + "loss": 0.7536, + "step": 13276 + }, + { + "epoch": 0.7307501788761076, + "grad_norm": 0.7164949774742126, + "learning_rate": 7.077254288443874e-06, + "loss": 0.7275, + "step": 13277 + }, + { + "epoch": 0.7308052176784633, + "grad_norm": 0.8158305287361145, + "learning_rate": 7.076859992630652e-06, + "loss": 0.6821, + "step": 13278 + }, + { + "epoch": 0.730860256480819, + "grad_norm": 0.7101448178291321, + "learning_rate": 7.076465681208307e-06, + "loss": 0.69, + "step": 13279 + }, + { + "epoch": 0.7309152952831747, + "grad_norm": 0.6844518780708313, + "learning_rate": 7.076071354179802e-06, + "loss": 0.7577, + "step": 13280 + }, + { + "epoch": 0.7309703340855302, + "grad_norm": 0.6564158797264099, + "learning_rate": 7.0756770115481e-06, + "loss": 0.6752, + "step": 13281 + }, + { + "epoch": 0.7310253728878859, + "grad_norm": 0.7444283962249756, + "learning_rate": 7.0752826533161655e-06, + "loss": 0.8118, + "step": 13282 + }, + { + "epoch": 0.7310804116902416, + "grad_norm": 0.7657533884048462, + "learning_rate": 7.074888279486962e-06, + "loss": 0.8819, + "step": 13283 + }, + { + "epoch": 0.7311354504925973, + "grad_norm": 0.6924453973770142, + "learning_rate": 7.074493890063453e-06, + "loss": 0.7674, + "step": 13284 + }, + { + "epoch": 0.7311904892949529, + "grad_norm": 0.676188588142395, + "learning_rate": 7.074099485048603e-06, + "loss": 0.7266, + "step": 13285 + }, + { + "epoch": 0.7312455280973086, + "grad_norm": 0.6325914263725281, + "learning_rate": 7.073705064445378e-06, + "loss": 0.6856, + "step": 13286 + }, + { + "epoch": 0.7313005668996643, + "grad_norm": 0.662558913230896, + "learning_rate": 7.073310628256739e-06, + "loss": 0.751, + "step": 13287 + }, + { + "epoch": 0.73135560570202, + "grad_norm": 0.8313137292861938, + "learning_rate": 7.072916176485654e-06, + "loss": 0.7187, + "step": 13288 + }, + { + "epoch": 0.7314106445043755, + "grad_norm": 0.7033550143241882, + "learning_rate": 7.072521709135084e-06, + "loss": 0.8132, + "step": 13289 + }, + { + "epoch": 0.7314656833067312, + "grad_norm": 0.715242862701416, + "learning_rate": 7.0721272262079965e-06, + "loss": 0.8551, + "step": 13290 + }, + { + "epoch": 0.7315207221090869, + "grad_norm": 0.7545164227485657, + "learning_rate": 7.071732727707356e-06, + "loss": 0.7772, + "step": 13291 + }, + { + "epoch": 0.7315757609114426, + "grad_norm": 0.7181825637817383, + "learning_rate": 7.071338213636126e-06, + "loss": 0.7378, + "step": 13292 + }, + { + "epoch": 0.7316307997137982, + "grad_norm": 0.7793779969215393, + "learning_rate": 7.070943683997273e-06, + "loss": 0.7801, + "step": 13293 + }, + { + "epoch": 0.7316858385161539, + "grad_norm": 0.7456476092338562, + "learning_rate": 7.070549138793762e-06, + "loss": 0.8038, + "step": 13294 + }, + { + "epoch": 0.7317408773185096, + "grad_norm": 0.652519702911377, + "learning_rate": 7.0701545780285576e-06, + "loss": 0.746, + "step": 13295 + }, + { + "epoch": 0.7317959161208653, + "grad_norm": 0.784450888633728, + "learning_rate": 7.069760001704625e-06, + "loss": 0.8065, + "step": 13296 + }, + { + "epoch": 0.7318509549232208, + "grad_norm": 0.8052587509155273, + "learning_rate": 7.069365409824931e-06, + "loss": 0.8098, + "step": 13297 + }, + { + "epoch": 0.7319059937255765, + "grad_norm": 0.6890794038772583, + "learning_rate": 7.06897080239244e-06, + "loss": 0.783, + "step": 13298 + }, + { + "epoch": 0.7319610325279322, + "grad_norm": 0.7470653057098389, + "learning_rate": 7.068576179410119e-06, + "loss": 0.7658, + "step": 13299 + }, + { + "epoch": 0.7320160713302879, + "grad_norm": 0.6831437945365906, + "learning_rate": 7.068181540880932e-06, + "loss": 0.7864, + "step": 13300 + }, + { + "epoch": 0.7320711101326435, + "grad_norm": 0.7058265209197998, + "learning_rate": 7.067786886807847e-06, + "loss": 0.8254, + "step": 13301 + }, + { + "epoch": 0.7321261489349992, + "grad_norm": 0.7938248515129089, + "learning_rate": 7.067392217193828e-06, + "loss": 0.7291, + "step": 13302 + }, + { + "epoch": 0.7321811877373549, + "grad_norm": 0.7261865735054016, + "learning_rate": 7.066997532041844e-06, + "loss": 0.8115, + "step": 13303 + }, + { + "epoch": 0.7322362265397105, + "grad_norm": 0.6971743702888489, + "learning_rate": 7.0666028313548586e-06, + "loss": 0.7504, + "step": 13304 + }, + { + "epoch": 0.7322912653420661, + "grad_norm": 0.844879150390625, + "learning_rate": 7.0662081151358405e-06, + "loss": 0.7903, + "step": 13305 + }, + { + "epoch": 0.7323463041444218, + "grad_norm": 0.6670572757720947, + "learning_rate": 7.065813383387755e-06, + "loss": 0.7597, + "step": 13306 + }, + { + "epoch": 0.7324013429467775, + "grad_norm": 0.669711172580719, + "learning_rate": 7.06541863611357e-06, + "loss": 0.7179, + "step": 13307 + }, + { + "epoch": 0.7324563817491332, + "grad_norm": 0.7176600098609924, + "learning_rate": 7.0650238733162506e-06, + "loss": 0.8157, + "step": 13308 + }, + { + "epoch": 0.7325114205514888, + "grad_norm": 0.7230100631713867, + "learning_rate": 7.064629094998765e-06, + "loss": 0.7902, + "step": 13309 + }, + { + "epoch": 0.7325664593538445, + "grad_norm": 0.8811234831809998, + "learning_rate": 7.064234301164078e-06, + "loss": 0.7746, + "step": 13310 + }, + { + "epoch": 0.7326214981562001, + "grad_norm": 0.6777653098106384, + "learning_rate": 7.06383949181516e-06, + "loss": 0.7708, + "step": 13311 + }, + { + "epoch": 0.7326765369585558, + "grad_norm": 0.6692547798156738, + "learning_rate": 7.063444666954977e-06, + "loss": 0.7103, + "step": 13312 + }, + { + "epoch": 0.7327315757609114, + "grad_norm": 1.2304950952529907, + "learning_rate": 7.063049826586496e-06, + "loss": 0.7878, + "step": 13313 + }, + { + "epoch": 0.7327866145632671, + "grad_norm": 0.7073930501937866, + "learning_rate": 7.0626549707126834e-06, + "loss": 0.7546, + "step": 13314 + }, + { + "epoch": 0.7328416533656228, + "grad_norm": 0.7184866070747375, + "learning_rate": 7.06226009933651e-06, + "loss": 0.7207, + "step": 13315 + }, + { + "epoch": 0.7328966921679784, + "grad_norm": 0.7098046541213989, + "learning_rate": 7.061865212460941e-06, + "loss": 0.6415, + "step": 13316 + }, + { + "epoch": 0.7329517309703341, + "grad_norm": 0.714379608631134, + "learning_rate": 7.0614703100889445e-06, + "loss": 0.7305, + "step": 13317 + }, + { + "epoch": 0.7330067697726897, + "grad_norm": 0.655060887336731, + "learning_rate": 7.061075392223491e-06, + "loss": 0.6125, + "step": 13318 + }, + { + "epoch": 0.7330618085750454, + "grad_norm": 0.6481055617332458, + "learning_rate": 7.060680458867545e-06, + "loss": 0.7059, + "step": 13319 + }, + { + "epoch": 0.733116847377401, + "grad_norm": 0.7123916745185852, + "learning_rate": 7.060285510024076e-06, + "loss": 0.8007, + "step": 13320 + }, + { + "epoch": 0.7331718861797567, + "grad_norm": 0.7231262922286987, + "learning_rate": 7.059890545696053e-06, + "loss": 0.7781, + "step": 13321 + }, + { + "epoch": 0.7332269249821124, + "grad_norm": 0.8415369391441345, + "learning_rate": 7.0594955658864435e-06, + "loss": 0.6649, + "step": 13322 + }, + { + "epoch": 0.7332819637844681, + "grad_norm": 0.7243070006370544, + "learning_rate": 7.059100570598217e-06, + "loss": 0.6588, + "step": 13323 + }, + { + "epoch": 0.7333370025868237, + "grad_norm": 0.6581026315689087, + "learning_rate": 7.058705559834342e-06, + "loss": 0.7938, + "step": 13324 + }, + { + "epoch": 0.7333920413891793, + "grad_norm": 0.6213739514350891, + "learning_rate": 7.058310533597787e-06, + "loss": 0.7092, + "step": 13325 + }, + { + "epoch": 0.733447080191535, + "grad_norm": 0.6857954859733582, + "learning_rate": 7.057915491891522e-06, + "loss": 0.698, + "step": 13326 + }, + { + "epoch": 0.7335021189938907, + "grad_norm": 0.7528544068336487, + "learning_rate": 7.0575204347185135e-06, + "loss": 0.7234, + "step": 13327 + }, + { + "epoch": 0.7335571577962463, + "grad_norm": 0.6449099779129028, + "learning_rate": 7.057125362081733e-06, + "loss": 0.7391, + "step": 13328 + }, + { + "epoch": 0.733612196598602, + "grad_norm": 0.640689492225647, + "learning_rate": 7.0567302739841495e-06, + "loss": 0.5316, + "step": 13329 + }, + { + "epoch": 0.7336672354009577, + "grad_norm": 0.6686868071556091, + "learning_rate": 7.056335170428731e-06, + "loss": 0.7713, + "step": 13330 + }, + { + "epoch": 0.7337222742033134, + "grad_norm": 0.7627772688865662, + "learning_rate": 7.055940051418447e-06, + "loss": 0.7706, + "step": 13331 + }, + { + "epoch": 0.733777313005669, + "grad_norm": 0.7421852350234985, + "learning_rate": 7.055544916956269e-06, + "loss": 0.6418, + "step": 13332 + }, + { + "epoch": 0.7338323518080246, + "grad_norm": 0.7414699196815491, + "learning_rate": 7.0551497670451666e-06, + "loss": 0.811, + "step": 13333 + }, + { + "epoch": 0.7338873906103803, + "grad_norm": 0.7054136991500854, + "learning_rate": 7.0547546016881064e-06, + "loss": 0.8005, + "step": 13334 + }, + { + "epoch": 0.733942429412736, + "grad_norm": 0.670174241065979, + "learning_rate": 7.054359420888062e-06, + "loss": 0.6136, + "step": 13335 + }, + { + "epoch": 0.7339974682150916, + "grad_norm": 0.728255033493042, + "learning_rate": 7.053964224648001e-06, + "loss": 0.848, + "step": 13336 + }, + { + "epoch": 0.7340525070174473, + "grad_norm": 0.729815661907196, + "learning_rate": 7.053569012970896e-06, + "loss": 0.6985, + "step": 13337 + }, + { + "epoch": 0.734107545819803, + "grad_norm": 0.7564244866371155, + "learning_rate": 7.053173785859715e-06, + "loss": 0.7995, + "step": 13338 + }, + { + "epoch": 0.7341625846221587, + "grad_norm": 0.7746061682701111, + "learning_rate": 7.05277854331743e-06, + "loss": 0.7663, + "step": 13339 + }, + { + "epoch": 0.7342176234245142, + "grad_norm": 0.6878651976585388, + "learning_rate": 7.052383285347011e-06, + "loss": 0.8624, + "step": 13340 + }, + { + "epoch": 0.7342726622268699, + "grad_norm": 0.6989734768867493, + "learning_rate": 7.051988011951428e-06, + "loss": 0.7221, + "step": 13341 + }, + { + "epoch": 0.7343277010292256, + "grad_norm": 0.6854223012924194, + "learning_rate": 7.051592723133654e-06, + "loss": 0.7878, + "step": 13342 + }, + { + "epoch": 0.7343827398315813, + "grad_norm": 0.746696949005127, + "learning_rate": 7.051197418896657e-06, + "loss": 0.7074, + "step": 13343 + }, + { + "epoch": 0.7344377786339369, + "grad_norm": 0.6933150887489319, + "learning_rate": 7.050802099243409e-06, + "loss": 0.7587, + "step": 13344 + }, + { + "epoch": 0.7344928174362926, + "grad_norm": 0.7285788655281067, + "learning_rate": 7.050406764176882e-06, + "loss": 0.6589, + "step": 13345 + }, + { + "epoch": 0.7345478562386483, + "grad_norm": 0.6834994554519653, + "learning_rate": 7.050011413700046e-06, + "loss": 0.7196, + "step": 13346 + }, + { + "epoch": 0.734602895041004, + "grad_norm": 0.6504353880882263, + "learning_rate": 7.049616047815873e-06, + "loss": 0.7675, + "step": 13347 + }, + { + "epoch": 0.7346579338433595, + "grad_norm": 0.7009296417236328, + "learning_rate": 7.049220666527335e-06, + "loss": 0.7638, + "step": 13348 + }, + { + "epoch": 0.7347129726457152, + "grad_norm": 0.6210034489631653, + "learning_rate": 7.0488252698374024e-06, + "loss": 0.6872, + "step": 13349 + }, + { + "epoch": 0.7347680114480709, + "grad_norm": 0.6280165910720825, + "learning_rate": 7.0484298577490485e-06, + "loss": 0.7084, + "step": 13350 + }, + { + "epoch": 0.7348230502504266, + "grad_norm": 0.8055418133735657, + "learning_rate": 7.048034430265242e-06, + "loss": 0.8202, + "step": 13351 + }, + { + "epoch": 0.7348780890527822, + "grad_norm": 0.6674166917800903, + "learning_rate": 7.047638987388959e-06, + "loss": 0.6368, + "step": 13352 + }, + { + "epoch": 0.7349331278551379, + "grad_norm": 0.9182783961296082, + "learning_rate": 7.04724352912317e-06, + "loss": 0.6734, + "step": 13353 + }, + { + "epoch": 0.7349881666574936, + "grad_norm": 0.6371243596076965, + "learning_rate": 7.046848055470845e-06, + "loss": 0.7308, + "step": 13354 + }, + { + "epoch": 0.7350432054598492, + "grad_norm": 0.6454519033432007, + "learning_rate": 7.046452566434959e-06, + "loss": 0.6882, + "step": 13355 + }, + { + "epoch": 0.7350982442622048, + "grad_norm": 0.648970365524292, + "learning_rate": 7.046057062018483e-06, + "loss": 0.7247, + "step": 13356 + }, + { + "epoch": 0.7351532830645605, + "grad_norm": 0.668886661529541, + "learning_rate": 7.04566154222439e-06, + "loss": 0.7379, + "step": 13357 + }, + { + "epoch": 0.7352083218669162, + "grad_norm": 0.6593654751777649, + "learning_rate": 7.045266007055651e-06, + "loss": 0.7473, + "step": 13358 + }, + { + "epoch": 0.7352633606692718, + "grad_norm": 0.8418927192687988, + "learning_rate": 7.044870456515241e-06, + "loss": 0.7949, + "step": 13359 + }, + { + "epoch": 0.7353183994716275, + "grad_norm": 0.7350470423698425, + "learning_rate": 7.044474890606132e-06, + "loss": 0.7545, + "step": 13360 + }, + { + "epoch": 0.7353734382739832, + "grad_norm": 0.7786250114440918, + "learning_rate": 7.044079309331298e-06, + "loss": 0.8587, + "step": 13361 + }, + { + "epoch": 0.7354284770763388, + "grad_norm": 0.6345693469047546, + "learning_rate": 7.04368371269371e-06, + "loss": 0.77, + "step": 13362 + }, + { + "epoch": 0.7354835158786944, + "grad_norm": 0.7030417919158936, + "learning_rate": 7.043288100696343e-06, + "loss": 0.7624, + "step": 13363 + }, + { + "epoch": 0.7355385546810501, + "grad_norm": 0.7526041865348816, + "learning_rate": 7.042892473342169e-06, + "loss": 0.8018, + "step": 13364 + }, + { + "epoch": 0.7355935934834058, + "grad_norm": 0.6419941782951355, + "learning_rate": 7.042496830634162e-06, + "loss": 0.6788, + "step": 13365 + }, + { + "epoch": 0.7356486322857615, + "grad_norm": 0.6952203512191772, + "learning_rate": 7.042101172575297e-06, + "loss": 0.7747, + "step": 13366 + }, + { + "epoch": 0.7357036710881171, + "grad_norm": 0.8046327829360962, + "learning_rate": 7.041705499168544e-06, + "loss": 0.8216, + "step": 13367 + }, + { + "epoch": 0.7357587098904728, + "grad_norm": 0.6641537547111511, + "learning_rate": 7.041309810416881e-06, + "loss": 0.7313, + "step": 13368 + }, + { + "epoch": 0.7358137486928285, + "grad_norm": 0.6824444532394409, + "learning_rate": 7.040914106323278e-06, + "loss": 0.7179, + "step": 13369 + }, + { + "epoch": 0.7358687874951841, + "grad_norm": 0.6469557285308838, + "learning_rate": 7.040518386890711e-06, + "loss": 0.7671, + "step": 13370 + }, + { + "epoch": 0.7359238262975397, + "grad_norm": 0.6826488971710205, + "learning_rate": 7.040122652122156e-06, + "loss": 0.7, + "step": 13371 + }, + { + "epoch": 0.7359788650998954, + "grad_norm": 0.6931618452072144, + "learning_rate": 7.039726902020583e-06, + "loss": 0.7641, + "step": 13372 + }, + { + "epoch": 0.7360339039022511, + "grad_norm": 0.7445465922355652, + "learning_rate": 7.039331136588971e-06, + "loss": 0.7458, + "step": 13373 + }, + { + "epoch": 0.7360889427046068, + "grad_norm": 0.6358756422996521, + "learning_rate": 7.038935355830289e-06, + "loss": 0.6125, + "step": 13374 + }, + { + "epoch": 0.7361439815069624, + "grad_norm": 0.6966063380241394, + "learning_rate": 7.038539559747517e-06, + "loss": 0.6812, + "step": 13375 + }, + { + "epoch": 0.736199020309318, + "grad_norm": 0.9898090362548828, + "learning_rate": 7.038143748343626e-06, + "loss": 0.707, + "step": 13376 + }, + { + "epoch": 0.7362540591116737, + "grad_norm": 0.685951828956604, + "learning_rate": 7.0377479216215935e-06, + "loss": 0.7932, + "step": 13377 + }, + { + "epoch": 0.7363090979140294, + "grad_norm": 0.7056856751441956, + "learning_rate": 7.037352079584392e-06, + "loss": 0.7432, + "step": 13378 + }, + { + "epoch": 0.736364136716385, + "grad_norm": 0.7802489995956421, + "learning_rate": 7.036956222234999e-06, + "loss": 0.8275, + "step": 13379 + }, + { + "epoch": 0.7364191755187407, + "grad_norm": 0.7990192770957947, + "learning_rate": 7.036560349576387e-06, + "loss": 0.893, + "step": 13380 + }, + { + "epoch": 0.7364742143210964, + "grad_norm": 0.6454586386680603, + "learning_rate": 7.0361644616115334e-06, + "loss": 0.751, + "step": 13381 + }, + { + "epoch": 0.7365292531234521, + "grad_norm": 0.7071009278297424, + "learning_rate": 7.035768558343412e-06, + "loss": 0.7771, + "step": 13382 + }, + { + "epoch": 0.7365842919258077, + "grad_norm": 0.6530466079711914, + "learning_rate": 7.035372639774999e-06, + "loss": 0.7529, + "step": 13383 + }, + { + "epoch": 0.7366393307281633, + "grad_norm": 0.728689968585968, + "learning_rate": 7.03497670590927e-06, + "loss": 0.7862, + "step": 13384 + }, + { + "epoch": 0.736694369530519, + "grad_norm": 0.6640015244483948, + "learning_rate": 7.034580756749202e-06, + "loss": 0.6876, + "step": 13385 + }, + { + "epoch": 0.7367494083328747, + "grad_norm": 0.7388426661491394, + "learning_rate": 7.034184792297769e-06, + "loss": 0.8168, + "step": 13386 + }, + { + "epoch": 0.7368044471352303, + "grad_norm": 0.6543731093406677, + "learning_rate": 7.0337888125579465e-06, + "loss": 0.7555, + "step": 13387 + }, + { + "epoch": 0.736859485937586, + "grad_norm": 0.7783555388450623, + "learning_rate": 7.0333928175327125e-06, + "loss": 0.755, + "step": 13388 + }, + { + "epoch": 0.7369145247399417, + "grad_norm": 0.6275887489318848, + "learning_rate": 7.032996807225043e-06, + "loss": 0.7187, + "step": 13389 + }, + { + "epoch": 0.7369695635422974, + "grad_norm": 0.7007517218589783, + "learning_rate": 7.032600781637913e-06, + "loss": 0.6993, + "step": 13390 + }, + { + "epoch": 0.737024602344653, + "grad_norm": 0.6322247385978699, + "learning_rate": 7.0322047407743e-06, + "loss": 0.7178, + "step": 13391 + }, + { + "epoch": 0.7370796411470086, + "grad_norm": 0.7160976529121399, + "learning_rate": 7.0318086846371804e-06, + "loss": 0.6884, + "step": 13392 + }, + { + "epoch": 0.7371346799493643, + "grad_norm": 0.6056101322174072, + "learning_rate": 7.03141261322953e-06, + "loss": 0.6672, + "step": 13393 + }, + { + "epoch": 0.73718971875172, + "grad_norm": 0.8779410123825073, + "learning_rate": 7.0310165265543264e-06, + "loss": 0.7564, + "step": 13394 + }, + { + "epoch": 0.7372447575540756, + "grad_norm": 0.6868176460266113, + "learning_rate": 7.030620424614546e-06, + "loss": 0.7658, + "step": 13395 + }, + { + "epoch": 0.7372997963564313, + "grad_norm": 0.7611618041992188, + "learning_rate": 7.030224307413166e-06, + "loss": 0.6445, + "step": 13396 + }, + { + "epoch": 0.737354835158787, + "grad_norm": 0.7688242793083191, + "learning_rate": 7.0298281749531636e-06, + "loss": 0.8061, + "step": 13397 + }, + { + "epoch": 0.7374098739611427, + "grad_norm": 0.6781700849533081, + "learning_rate": 7.029432027237518e-06, + "loss": 0.6374, + "step": 13398 + }, + { + "epoch": 0.7374649127634982, + "grad_norm": 0.6719028353691101, + "learning_rate": 7.0290358642692e-06, + "loss": 0.7585, + "step": 13399 + }, + { + "epoch": 0.7375199515658539, + "grad_norm": 0.704429030418396, + "learning_rate": 7.028639686051195e-06, + "loss": 0.7052, + "step": 13400 + }, + { + "epoch": 0.7375749903682096, + "grad_norm": 0.714914083480835, + "learning_rate": 7.028243492586478e-06, + "loss": 0.7785, + "step": 13401 + }, + { + "epoch": 0.7376300291705652, + "grad_norm": 0.7732700705528259, + "learning_rate": 7.027847283878023e-06, + "loss": 0.7812, + "step": 13402 + }, + { + "epoch": 0.7376850679729209, + "grad_norm": 0.6849464178085327, + "learning_rate": 7.027451059928813e-06, + "loss": 0.7657, + "step": 13403 + }, + { + "epoch": 0.7377401067752766, + "grad_norm": 0.6924402117729187, + "learning_rate": 7.027054820741822e-06, + "loss": 0.677, + "step": 13404 + }, + { + "epoch": 0.7377951455776323, + "grad_norm": 0.7142716646194458, + "learning_rate": 7.02665856632003e-06, + "loss": 0.7071, + "step": 13405 + }, + { + "epoch": 0.7378501843799878, + "grad_norm": 0.7227265238761902, + "learning_rate": 7.0262622966664154e-06, + "loss": 0.6986, + "step": 13406 + }, + { + "epoch": 0.7379052231823435, + "grad_norm": 0.6387726664543152, + "learning_rate": 7.025866011783954e-06, + "loss": 0.6563, + "step": 13407 + }, + { + "epoch": 0.7379602619846992, + "grad_norm": 0.6411992311477661, + "learning_rate": 7.025469711675628e-06, + "loss": 0.5842, + "step": 13408 + }, + { + "epoch": 0.7380153007870549, + "grad_norm": 0.6811027526855469, + "learning_rate": 7.025073396344413e-06, + "loss": 0.6746, + "step": 13409 + }, + { + "epoch": 0.7380703395894105, + "grad_norm": 1.0705479383468628, + "learning_rate": 7.024677065793289e-06, + "loss": 0.7457, + "step": 13410 + }, + { + "epoch": 0.7381253783917662, + "grad_norm": 0.6920849084854126, + "learning_rate": 7.024280720025232e-06, + "loss": 0.6838, + "step": 13411 + }, + { + "epoch": 0.7381804171941219, + "grad_norm": 0.8089182376861572, + "learning_rate": 7.0238843590432236e-06, + "loss": 0.6682, + "step": 13412 + }, + { + "epoch": 0.7382354559964776, + "grad_norm": 0.6140334010124207, + "learning_rate": 7.023487982850244e-06, + "loss": 0.6992, + "step": 13413 + }, + { + "epoch": 0.7382904947988331, + "grad_norm": 0.8564643263816833, + "learning_rate": 7.023091591449269e-06, + "loss": 0.8512, + "step": 13414 + }, + { + "epoch": 0.7383455336011888, + "grad_norm": 0.655516505241394, + "learning_rate": 7.02269518484328e-06, + "loss": 0.7291, + "step": 13415 + }, + { + "epoch": 0.7384005724035445, + "grad_norm": 0.6373177766799927, + "learning_rate": 7.022298763035255e-06, + "loss": 0.7553, + "step": 13416 + }, + { + "epoch": 0.7384556112059002, + "grad_norm": 0.7023805379867554, + "learning_rate": 7.021902326028174e-06, + "loss": 0.7562, + "step": 13417 + }, + { + "epoch": 0.7385106500082558, + "grad_norm": 0.654181182384491, + "learning_rate": 7.021505873825016e-06, + "loss": 0.7153, + "step": 13418 + }, + { + "epoch": 0.7385656888106115, + "grad_norm": 0.6633459329605103, + "learning_rate": 7.02110940642876e-06, + "loss": 0.6779, + "step": 13419 + }, + { + "epoch": 0.7386207276129672, + "grad_norm": 0.7050659656524658, + "learning_rate": 7.020712923842388e-06, + "loss": 0.741, + "step": 13420 + }, + { + "epoch": 0.7386757664153228, + "grad_norm": 0.7241182327270508, + "learning_rate": 7.020316426068879e-06, + "loss": 0.7479, + "step": 13421 + }, + { + "epoch": 0.7387308052176784, + "grad_norm": 1.0262155532836914, + "learning_rate": 7.019919913111212e-06, + "loss": 0.8418, + "step": 13422 + }, + { + "epoch": 0.7387858440200341, + "grad_norm": 0.6765457391738892, + "learning_rate": 7.019523384972366e-06, + "loss": 0.727, + "step": 13423 + }, + { + "epoch": 0.7388408828223898, + "grad_norm": 0.6871724724769592, + "learning_rate": 7.0191268416553245e-06, + "loss": 0.8273, + "step": 13424 + }, + { + "epoch": 0.7388959216247455, + "grad_norm": 0.8085252046585083, + "learning_rate": 7.018730283163067e-06, + "loss": 0.7306, + "step": 13425 + }, + { + "epoch": 0.7389509604271011, + "grad_norm": 0.6822873950004578, + "learning_rate": 7.018333709498572e-06, + "loss": 0.7454, + "step": 13426 + }, + { + "epoch": 0.7390059992294568, + "grad_norm": 0.7210521697998047, + "learning_rate": 7.01793712066482e-06, + "loss": 0.8306, + "step": 13427 + }, + { + "epoch": 0.7390610380318124, + "grad_norm": 0.6404997110366821, + "learning_rate": 7.017540516664795e-06, + "loss": 0.7151, + "step": 13428 + }, + { + "epoch": 0.7391160768341681, + "grad_norm": 0.6662821769714355, + "learning_rate": 7.017143897501475e-06, + "loss": 0.7446, + "step": 13429 + }, + { + "epoch": 0.7391711156365237, + "grad_norm": 0.8048129081726074, + "learning_rate": 7.0167472631778415e-06, + "loss": 0.7953, + "step": 13430 + }, + { + "epoch": 0.7392261544388794, + "grad_norm": 0.7215000987052917, + "learning_rate": 7.016350613696873e-06, + "loss": 0.8373, + "step": 13431 + }, + { + "epoch": 0.7392811932412351, + "grad_norm": 0.7309150099754333, + "learning_rate": 7.015953949061555e-06, + "loss": 0.7654, + "step": 13432 + }, + { + "epoch": 0.7393362320435908, + "grad_norm": 0.6487464904785156, + "learning_rate": 7.0155572692748665e-06, + "loss": 0.6473, + "step": 13433 + }, + { + "epoch": 0.7393912708459464, + "grad_norm": 0.6172077059745789, + "learning_rate": 7.01516057433979e-06, + "loss": 0.6672, + "step": 13434 + }, + { + "epoch": 0.739446309648302, + "grad_norm": 0.7569651007652283, + "learning_rate": 7.014763864259304e-06, + "loss": 0.8501, + "step": 13435 + }, + { + "epoch": 0.7395013484506577, + "grad_norm": 0.824669599533081, + "learning_rate": 7.014367139036393e-06, + "loss": 0.8596, + "step": 13436 + }, + { + "epoch": 0.7395563872530134, + "grad_norm": 0.6904401183128357, + "learning_rate": 7.013970398674038e-06, + "loss": 0.7403, + "step": 13437 + }, + { + "epoch": 0.739611426055369, + "grad_norm": 0.7999581098556519, + "learning_rate": 7.013573643175221e-06, + "loss": 0.8879, + "step": 13438 + }, + { + "epoch": 0.7396664648577247, + "grad_norm": 0.6600533723831177, + "learning_rate": 7.0131768725429236e-06, + "loss": 0.7324, + "step": 13439 + }, + { + "epoch": 0.7397215036600804, + "grad_norm": 0.7174191474914551, + "learning_rate": 7.0127800867801275e-06, + "loss": 0.7474, + "step": 13440 + }, + { + "epoch": 0.7397765424624361, + "grad_norm": 0.7023884654045105, + "learning_rate": 7.012383285889814e-06, + "loss": 0.7826, + "step": 13441 + }, + { + "epoch": 0.7398315812647916, + "grad_norm": 0.6486913561820984, + "learning_rate": 7.011986469874969e-06, + "loss": 0.6553, + "step": 13442 + }, + { + "epoch": 0.7398866200671473, + "grad_norm": 0.7238486409187317, + "learning_rate": 7.011589638738569e-06, + "loss": 0.6759, + "step": 13443 + }, + { + "epoch": 0.739941658869503, + "grad_norm": 0.7879656553268433, + "learning_rate": 7.011192792483601e-06, + "loss": 0.886, + "step": 13444 + }, + { + "epoch": 0.7399966976718586, + "grad_norm": 0.6592407822608948, + "learning_rate": 7.010795931113047e-06, + "loss": 0.7746, + "step": 13445 + }, + { + "epoch": 0.7400517364742143, + "grad_norm": 0.8274507522583008, + "learning_rate": 7.010399054629889e-06, + "loss": 0.7615, + "step": 13446 + }, + { + "epoch": 0.74010677527657, + "grad_norm": 0.6233614087104797, + "learning_rate": 7.010002163037109e-06, + "loss": 0.695, + "step": 13447 + }, + { + "epoch": 0.7401618140789257, + "grad_norm": 0.7082701921463013, + "learning_rate": 7.00960525633769e-06, + "loss": 0.6677, + "step": 13448 + }, + { + "epoch": 0.7402168528812813, + "grad_norm": 1.0694652795791626, + "learning_rate": 7.009208334534618e-06, + "loss": 0.7792, + "step": 13449 + }, + { + "epoch": 0.7402718916836369, + "grad_norm": 0.7189109325408936, + "learning_rate": 7.008811397630874e-06, + "loss": 0.8606, + "step": 13450 + }, + { + "epoch": 0.7403269304859926, + "grad_norm": 0.7136901617050171, + "learning_rate": 7.00841444562944e-06, + "loss": 0.7142, + "step": 13451 + }, + { + "epoch": 0.7403819692883483, + "grad_norm": 0.6508508920669556, + "learning_rate": 7.008017478533301e-06, + "loss": 0.6748, + "step": 13452 + }, + { + "epoch": 0.7404370080907039, + "grad_norm": 0.6560903191566467, + "learning_rate": 7.007620496345441e-06, + "loss": 0.7929, + "step": 13453 + }, + { + "epoch": 0.7404920468930596, + "grad_norm": 0.6909067034721375, + "learning_rate": 7.007223499068841e-06, + "loss": 0.6118, + "step": 13454 + }, + { + "epoch": 0.7405470856954153, + "grad_norm": 0.6554582715034485, + "learning_rate": 7.0068264867064874e-06, + "loss": 0.7687, + "step": 13455 + }, + { + "epoch": 0.740602124497771, + "grad_norm": 0.7788346409797668, + "learning_rate": 7.006429459261363e-06, + "loss": 0.7535, + "step": 13456 + }, + { + "epoch": 0.7406571633001265, + "grad_norm": 0.7702943682670593, + "learning_rate": 7.006032416736452e-06, + "loss": 0.833, + "step": 13457 + }, + { + "epoch": 0.7407122021024822, + "grad_norm": 0.6860190033912659, + "learning_rate": 7.005635359134738e-06, + "loss": 0.6643, + "step": 13458 + }, + { + "epoch": 0.7407672409048379, + "grad_norm": 0.7470136880874634, + "learning_rate": 7.005238286459205e-06, + "loss": 0.7811, + "step": 13459 + }, + { + "epoch": 0.7408222797071936, + "grad_norm": 0.6769132614135742, + "learning_rate": 7.004841198712839e-06, + "loss": 0.7322, + "step": 13460 + }, + { + "epoch": 0.7408773185095492, + "grad_norm": 0.7865259647369385, + "learning_rate": 7.004444095898623e-06, + "loss": 0.817, + "step": 13461 + }, + { + "epoch": 0.7409323573119049, + "grad_norm": 0.7352784276008606, + "learning_rate": 7.004046978019542e-06, + "loss": 0.7373, + "step": 13462 + }, + { + "epoch": 0.7409873961142606, + "grad_norm": 0.7647448182106018, + "learning_rate": 7.00364984507858e-06, + "loss": 0.7129, + "step": 13463 + }, + { + "epoch": 0.7410424349166163, + "grad_norm": 0.6979989409446716, + "learning_rate": 7.003252697078722e-06, + "loss": 0.7833, + "step": 13464 + }, + { + "epoch": 0.7410974737189718, + "grad_norm": 0.6117465496063232, + "learning_rate": 7.002855534022953e-06, + "loss": 0.6732, + "step": 13465 + }, + { + "epoch": 0.7411525125213275, + "grad_norm": 0.6754159331321716, + "learning_rate": 7.002458355914258e-06, + "loss": 0.6939, + "step": 13466 + }, + { + "epoch": 0.7412075513236832, + "grad_norm": 0.6713566184043884, + "learning_rate": 7.002061162755621e-06, + "loss": 0.7459, + "step": 13467 + }, + { + "epoch": 0.7412625901260389, + "grad_norm": 0.6475394368171692, + "learning_rate": 7.001663954550029e-06, + "loss": 0.7912, + "step": 13468 + }, + { + "epoch": 0.7413176289283945, + "grad_norm": 0.6577908992767334, + "learning_rate": 7.001266731300467e-06, + "loss": 0.6903, + "step": 13469 + }, + { + "epoch": 0.7413726677307502, + "grad_norm": 0.8129748106002808, + "learning_rate": 7.00086949300992e-06, + "loss": 0.8277, + "step": 13470 + }, + { + "epoch": 0.7414277065331059, + "grad_norm": 0.6730444431304932, + "learning_rate": 7.000472239681372e-06, + "loss": 0.7357, + "step": 13471 + }, + { + "epoch": 0.7414827453354615, + "grad_norm": 0.7166460156440735, + "learning_rate": 7.000074971317812e-06, + "loss": 0.7544, + "step": 13472 + }, + { + "epoch": 0.7415377841378171, + "grad_norm": 0.6668731570243835, + "learning_rate": 6.9996776879222225e-06, + "loss": 0.7073, + "step": 13473 + }, + { + "epoch": 0.7415928229401728, + "grad_norm": 0.7031315565109253, + "learning_rate": 6.999280389497591e-06, + "loss": 0.7262, + "step": 13474 + }, + { + "epoch": 0.7416478617425285, + "grad_norm": 0.7426775693893433, + "learning_rate": 6.998883076046904e-06, + "loss": 0.7394, + "step": 13475 + }, + { + "epoch": 0.7417029005448842, + "grad_norm": 0.665226399898529, + "learning_rate": 6.9984857475731475e-06, + "loss": 0.7365, + "step": 13476 + }, + { + "epoch": 0.7417579393472398, + "grad_norm": 0.7762128114700317, + "learning_rate": 6.998088404079306e-06, + "loss": 0.8551, + "step": 13477 + }, + { + "epoch": 0.7418129781495955, + "grad_norm": 0.7129524350166321, + "learning_rate": 6.997691045568366e-06, + "loss": 0.7646, + "step": 13478 + }, + { + "epoch": 0.7418680169519511, + "grad_norm": 0.7199442386627197, + "learning_rate": 6.997293672043316e-06, + "loss": 0.6879, + "step": 13479 + }, + { + "epoch": 0.7419230557543068, + "grad_norm": 0.6559237241744995, + "learning_rate": 6.9968962835071415e-06, + "loss": 0.6965, + "step": 13480 + }, + { + "epoch": 0.7419780945566624, + "grad_norm": 0.7428768277168274, + "learning_rate": 6.996498879962829e-06, + "loss": 0.7748, + "step": 13481 + }, + { + "epoch": 0.7420331333590181, + "grad_norm": 0.7344076633453369, + "learning_rate": 6.996101461413365e-06, + "loss": 0.6554, + "step": 13482 + }, + { + "epoch": 0.7420881721613738, + "grad_norm": 0.7080272436141968, + "learning_rate": 6.995704027861736e-06, + "loss": 0.7335, + "step": 13483 + }, + { + "epoch": 0.7421432109637295, + "grad_norm": 0.6296887397766113, + "learning_rate": 6.9953065793109306e-06, + "loss": 0.6411, + "step": 13484 + }, + { + "epoch": 0.7421982497660851, + "grad_norm": 0.7597532868385315, + "learning_rate": 6.994909115763935e-06, + "loss": 0.8281, + "step": 13485 + }, + { + "epoch": 0.7422532885684407, + "grad_norm": 0.7059680819511414, + "learning_rate": 6.994511637223737e-06, + "loss": 0.8075, + "step": 13486 + }, + { + "epoch": 0.7423083273707964, + "grad_norm": 0.8097653388977051, + "learning_rate": 6.994114143693323e-06, + "loss": 0.772, + "step": 13487 + }, + { + "epoch": 0.742363366173152, + "grad_norm": 0.7609913945198059, + "learning_rate": 6.993716635175681e-06, + "loss": 0.8265, + "step": 13488 + }, + { + "epoch": 0.7424184049755077, + "grad_norm": 0.6209948062896729, + "learning_rate": 6.993319111673799e-06, + "loss": 0.6266, + "step": 13489 + }, + { + "epoch": 0.7424734437778634, + "grad_norm": 0.6655107140541077, + "learning_rate": 6.992921573190663e-06, + "loss": 0.7519, + "step": 13490 + }, + { + "epoch": 0.7425284825802191, + "grad_norm": 1.1243617534637451, + "learning_rate": 6.992524019729262e-06, + "loss": 0.7707, + "step": 13491 + }, + { + "epoch": 0.7425835213825747, + "grad_norm": 0.6680326461791992, + "learning_rate": 6.9921264512925845e-06, + "loss": 0.7344, + "step": 13492 + }, + { + "epoch": 0.7426385601849304, + "grad_norm": 0.7689213156700134, + "learning_rate": 6.991728867883618e-06, + "loss": 0.7591, + "step": 13493 + }, + { + "epoch": 0.742693598987286, + "grad_norm": 0.8587394952774048, + "learning_rate": 6.99133126950535e-06, + "loss": 0.6991, + "step": 13494 + }, + { + "epoch": 0.7427486377896417, + "grad_norm": 0.6736756563186646, + "learning_rate": 6.990933656160768e-06, + "loss": 0.7604, + "step": 13495 + }, + { + "epoch": 0.7428036765919973, + "grad_norm": 0.6538887023925781, + "learning_rate": 6.990536027852864e-06, + "loss": 0.7332, + "step": 13496 + }, + { + "epoch": 0.742858715394353, + "grad_norm": 0.6578357815742493, + "learning_rate": 6.990138384584623e-06, + "loss": 0.7238, + "step": 13497 + }, + { + "epoch": 0.7429137541967087, + "grad_norm": 0.6865534782409668, + "learning_rate": 6.989740726359035e-06, + "loss": 0.7012, + "step": 13498 + }, + { + "epoch": 0.7429687929990644, + "grad_norm": 0.6198129057884216, + "learning_rate": 6.989343053179088e-06, + "loss": 0.7391, + "step": 13499 + }, + { + "epoch": 0.74302383180142, + "grad_norm": 0.6929547786712646, + "learning_rate": 6.98894536504777e-06, + "loss": 0.8498, + "step": 13500 + }, + { + "epoch": 0.7430788706037756, + "grad_norm": 0.6863006353378296, + "learning_rate": 6.988547661968072e-06, + "loss": 0.6589, + "step": 13501 + }, + { + "epoch": 0.7431339094061313, + "grad_norm": 0.7490457892417908, + "learning_rate": 6.988149943942982e-06, + "loss": 0.8145, + "step": 13502 + }, + { + "epoch": 0.743188948208487, + "grad_norm": 0.6597211360931396, + "learning_rate": 6.987752210975489e-06, + "loss": 0.7786, + "step": 13503 + }, + { + "epoch": 0.7432439870108426, + "grad_norm": 0.7211003303527832, + "learning_rate": 6.987354463068583e-06, + "loss": 0.7668, + "step": 13504 + }, + { + "epoch": 0.7432990258131983, + "grad_norm": 0.6257827877998352, + "learning_rate": 6.9869567002252526e-06, + "loss": 0.7378, + "step": 13505 + }, + { + "epoch": 0.743354064615554, + "grad_norm": 0.656944751739502, + "learning_rate": 6.986558922448488e-06, + "loss": 0.6408, + "step": 13506 + }, + { + "epoch": 0.7434091034179097, + "grad_norm": 0.6862110495567322, + "learning_rate": 6.986161129741276e-06, + "loss": 0.7648, + "step": 13507 + }, + { + "epoch": 0.7434641422202652, + "grad_norm": 0.6216374039649963, + "learning_rate": 6.985763322106612e-06, + "loss": 0.6826, + "step": 13508 + }, + { + "epoch": 0.7435191810226209, + "grad_norm": 0.7959128618240356, + "learning_rate": 6.985365499547479e-06, + "loss": 0.7554, + "step": 13509 + }, + { + "epoch": 0.7435742198249766, + "grad_norm": 0.5882300734519958, + "learning_rate": 6.984967662066875e-06, + "loss": 0.6523, + "step": 13510 + }, + { + "epoch": 0.7436292586273323, + "grad_norm": 0.8529833555221558, + "learning_rate": 6.9845698096677805e-06, + "loss": 0.7871, + "step": 13511 + }, + { + "epoch": 0.7436842974296879, + "grad_norm": 1.2988953590393066, + "learning_rate": 6.9841719423531925e-06, + "loss": 0.708, + "step": 13512 + }, + { + "epoch": 0.7437393362320436, + "grad_norm": 0.6735696792602539, + "learning_rate": 6.983774060126101e-06, + "loss": 0.7962, + "step": 13513 + }, + { + "epoch": 0.7437943750343993, + "grad_norm": 0.8145982623100281, + "learning_rate": 6.9833761629894925e-06, + "loss": 0.9067, + "step": 13514 + }, + { + "epoch": 0.743849413836755, + "grad_norm": 0.7107387781143188, + "learning_rate": 6.98297825094636e-06, + "loss": 0.7986, + "step": 13515 + }, + { + "epoch": 0.7439044526391105, + "grad_norm": 0.7350436449050903, + "learning_rate": 6.9825803239996934e-06, + "loss": 0.7724, + "step": 13516 + }, + { + "epoch": 0.7439594914414662, + "grad_norm": 0.7300962805747986, + "learning_rate": 6.982182382152485e-06, + "loss": 0.734, + "step": 13517 + }, + { + "epoch": 0.7440145302438219, + "grad_norm": 0.7088475823402405, + "learning_rate": 6.981784425407724e-06, + "loss": 0.818, + "step": 13518 + }, + { + "epoch": 0.7440695690461776, + "grad_norm": 0.6911785006523132, + "learning_rate": 6.981386453768402e-06, + "loss": 0.6857, + "step": 13519 + }, + { + "epoch": 0.7441246078485332, + "grad_norm": 0.794143795967102, + "learning_rate": 6.980988467237508e-06, + "loss": 0.7496, + "step": 13520 + }, + { + "epoch": 0.7441796466508889, + "grad_norm": 0.7116371989250183, + "learning_rate": 6.980590465818037e-06, + "loss": 0.7082, + "step": 13521 + }, + { + "epoch": 0.7442346854532446, + "grad_norm": 0.6306180953979492, + "learning_rate": 6.980192449512978e-06, + "loss": 0.7227, + "step": 13522 + }, + { + "epoch": 0.7442897242556002, + "grad_norm": 0.6662481427192688, + "learning_rate": 6.979794418325323e-06, + "loss": 0.7323, + "step": 13523 + }, + { + "epoch": 0.7443447630579558, + "grad_norm": 0.6824387907981873, + "learning_rate": 6.97939637225806e-06, + "loss": 0.7188, + "step": 13524 + }, + { + "epoch": 0.7443998018603115, + "grad_norm": 0.7429190278053284, + "learning_rate": 6.9789983113141865e-06, + "loss": 0.7818, + "step": 13525 + }, + { + "epoch": 0.7444548406626672, + "grad_norm": 0.7148364782333374, + "learning_rate": 6.978600235496692e-06, + "loss": 0.7665, + "step": 13526 + }, + { + "epoch": 0.7445098794650229, + "grad_norm": 0.711482584476471, + "learning_rate": 6.978202144808567e-06, + "loss": 0.7865, + "step": 13527 + }, + { + "epoch": 0.7445649182673785, + "grad_norm": 0.6913465857505798, + "learning_rate": 6.977804039252802e-06, + "loss": 0.8206, + "step": 13528 + }, + { + "epoch": 0.7446199570697342, + "grad_norm": 0.9090713858604431, + "learning_rate": 6.977405918832394e-06, + "loss": 0.7243, + "step": 13529 + }, + { + "epoch": 0.7446749958720899, + "grad_norm": 0.7680408954620361, + "learning_rate": 6.977007783550331e-06, + "loss": 0.847, + "step": 13530 + }, + { + "epoch": 0.7447300346744454, + "grad_norm": 0.6486232876777649, + "learning_rate": 6.976609633409608e-06, + "loss": 0.7258, + "step": 13531 + }, + { + "epoch": 0.7447850734768011, + "grad_norm": 0.7612336277961731, + "learning_rate": 6.976211468413214e-06, + "loss": 0.7452, + "step": 13532 + }, + { + "epoch": 0.7448401122791568, + "grad_norm": 0.7539309859275818, + "learning_rate": 6.975813288564146e-06, + "loss": 0.8292, + "step": 13533 + }, + { + "epoch": 0.7448951510815125, + "grad_norm": 0.64984530210495, + "learning_rate": 6.975415093865394e-06, + "loss": 0.6818, + "step": 13534 + }, + { + "epoch": 0.7449501898838681, + "grad_norm": 0.6415309309959412, + "learning_rate": 6.9750168843199506e-06, + "loss": 0.7369, + "step": 13535 + }, + { + "epoch": 0.7450052286862238, + "grad_norm": 0.7107319235801697, + "learning_rate": 6.974618659930807e-06, + "loss": 0.7364, + "step": 13536 + }, + { + "epoch": 0.7450602674885795, + "grad_norm": 0.7358448505401611, + "learning_rate": 6.9742204207009605e-06, + "loss": 0.7784, + "step": 13537 + }, + { + "epoch": 0.7451153062909351, + "grad_norm": 0.6950068473815918, + "learning_rate": 6.9738221666334e-06, + "loss": 0.792, + "step": 13538 + }, + { + "epoch": 0.7451703450932907, + "grad_norm": 0.7355311512947083, + "learning_rate": 6.973423897731122e-06, + "loss": 0.7631, + "step": 13539 + }, + { + "epoch": 0.7452253838956464, + "grad_norm": 0.6813983917236328, + "learning_rate": 6.9730256139971175e-06, + "loss": 0.7397, + "step": 13540 + }, + { + "epoch": 0.7452804226980021, + "grad_norm": 0.7698497772216797, + "learning_rate": 6.9726273154343806e-06, + "loss": 0.7769, + "step": 13541 + }, + { + "epoch": 0.7453354615003578, + "grad_norm": 0.7406428456306458, + "learning_rate": 6.972229002045905e-06, + "loss": 0.6502, + "step": 13542 + }, + { + "epoch": 0.7453905003027134, + "grad_norm": 0.6976667046546936, + "learning_rate": 6.9718306738346846e-06, + "loss": 0.773, + "step": 13543 + }, + { + "epoch": 0.745445539105069, + "grad_norm": 0.6932592391967773, + "learning_rate": 6.9714323308037115e-06, + "loss": 0.7315, + "step": 13544 + }, + { + "epoch": 0.7455005779074247, + "grad_norm": 0.7329851984977722, + "learning_rate": 6.971033972955981e-06, + "loss": 0.7432, + "step": 13545 + }, + { + "epoch": 0.7455556167097804, + "grad_norm": 0.6262860298156738, + "learning_rate": 6.970635600294489e-06, + "loss": 0.6368, + "step": 13546 + }, + { + "epoch": 0.745610655512136, + "grad_norm": 0.7157273292541504, + "learning_rate": 6.970237212822225e-06, + "loss": 0.7209, + "step": 13547 + }, + { + "epoch": 0.7456656943144917, + "grad_norm": 0.7256374955177307, + "learning_rate": 6.9698388105421855e-06, + "loss": 0.794, + "step": 13548 + }, + { + "epoch": 0.7457207331168474, + "grad_norm": 0.7763124704360962, + "learning_rate": 6.969440393457365e-06, + "loss": 0.7211, + "step": 13549 + }, + { + "epoch": 0.7457757719192031, + "grad_norm": 0.7139148712158203, + "learning_rate": 6.9690419615707585e-06, + "loss": 0.6612, + "step": 13550 + }, + { + "epoch": 0.7458308107215587, + "grad_norm": 0.7532974481582642, + "learning_rate": 6.968643514885359e-06, + "loss": 0.6952, + "step": 13551 + }, + { + "epoch": 0.7458858495239143, + "grad_norm": 0.6845714449882507, + "learning_rate": 6.968245053404161e-06, + "loss": 0.6972, + "step": 13552 + }, + { + "epoch": 0.74594088832627, + "grad_norm": 0.7445462346076965, + "learning_rate": 6.967846577130162e-06, + "loss": 0.7826, + "step": 13553 + }, + { + "epoch": 0.7459959271286257, + "grad_norm": 0.7269366383552551, + "learning_rate": 6.967448086066353e-06, + "loss": 0.7353, + "step": 13554 + }, + { + "epoch": 0.7460509659309813, + "grad_norm": 0.7366362810134888, + "learning_rate": 6.967049580215732e-06, + "loss": 0.7955, + "step": 13555 + }, + { + "epoch": 0.746106004733337, + "grad_norm": 0.6456870436668396, + "learning_rate": 6.966651059581292e-06, + "loss": 0.7467, + "step": 13556 + }, + { + "epoch": 0.7461610435356927, + "grad_norm": 0.7196624279022217, + "learning_rate": 6.966252524166031e-06, + "loss": 0.6621, + "step": 13557 + }, + { + "epoch": 0.7462160823380484, + "grad_norm": 0.6776413917541504, + "learning_rate": 6.965853973972941e-06, + "loss": 0.7647, + "step": 13558 + }, + { + "epoch": 0.746271121140404, + "grad_norm": 0.7319629192352295, + "learning_rate": 6.9654554090050195e-06, + "loss": 0.8172, + "step": 13559 + }, + { + "epoch": 0.7463261599427596, + "grad_norm": 0.6995210647583008, + "learning_rate": 6.96505682926526e-06, + "loss": 0.7252, + "step": 13560 + }, + { + "epoch": 0.7463811987451153, + "grad_norm": 0.6520518064498901, + "learning_rate": 6.964658234756659e-06, + "loss": 0.6856, + "step": 13561 + }, + { + "epoch": 0.746436237547471, + "grad_norm": 0.7562724947929382, + "learning_rate": 6.964259625482215e-06, + "loss": 0.7088, + "step": 13562 + }, + { + "epoch": 0.7464912763498266, + "grad_norm": 0.788045346736908, + "learning_rate": 6.963861001444919e-06, + "loss": 0.7183, + "step": 13563 + }, + { + "epoch": 0.7465463151521823, + "grad_norm": 0.7461729049682617, + "learning_rate": 6.96346236264777e-06, + "loss": 0.6725, + "step": 13564 + }, + { + "epoch": 0.746601353954538, + "grad_norm": 0.7283952832221985, + "learning_rate": 6.963063709093764e-06, + "loss": 0.7765, + "step": 13565 + }, + { + "epoch": 0.7466563927568937, + "grad_norm": 0.7947741150856018, + "learning_rate": 6.962665040785896e-06, + "loss": 0.8423, + "step": 13566 + }, + { + "epoch": 0.7467114315592492, + "grad_norm": 0.7964398264884949, + "learning_rate": 6.962266357727164e-06, + "loss": 0.7589, + "step": 13567 + }, + { + "epoch": 0.7467664703616049, + "grad_norm": 0.7807595133781433, + "learning_rate": 6.961867659920563e-06, + "loss": 0.7843, + "step": 13568 + }, + { + "epoch": 0.7468215091639606, + "grad_norm": 0.678011417388916, + "learning_rate": 6.961468947369089e-06, + "loss": 0.6664, + "step": 13569 + }, + { + "epoch": 0.7468765479663163, + "grad_norm": 0.6768447756767273, + "learning_rate": 6.961070220075741e-06, + "loss": 0.7531, + "step": 13570 + }, + { + "epoch": 0.7469315867686719, + "grad_norm": 0.7405245304107666, + "learning_rate": 6.960671478043514e-06, + "loss": 0.8278, + "step": 13571 + }, + { + "epoch": 0.7469866255710276, + "grad_norm": 0.605675458908081, + "learning_rate": 6.960272721275403e-06, + "loss": 0.7167, + "step": 13572 + }, + { + "epoch": 0.7470416643733833, + "grad_norm": 0.7406657338142395, + "learning_rate": 6.959873949774409e-06, + "loss": 0.8191, + "step": 13573 + }, + { + "epoch": 0.7470967031757388, + "grad_norm": 0.6163522601127625, + "learning_rate": 6.959475163543526e-06, + "loss": 0.6711, + "step": 13574 + }, + { + "epoch": 0.7471517419780945, + "grad_norm": 0.6036590337753296, + "learning_rate": 6.9590763625857525e-06, + "loss": 0.7029, + "step": 13575 + }, + { + "epoch": 0.7472067807804502, + "grad_norm": 0.8638957738876343, + "learning_rate": 6.9586775469040845e-06, + "loss": 0.6288, + "step": 13576 + }, + { + "epoch": 0.7472618195828059, + "grad_norm": 0.7490845322608948, + "learning_rate": 6.958278716501521e-06, + "loss": 0.7375, + "step": 13577 + }, + { + "epoch": 0.7473168583851615, + "grad_norm": 0.7788114547729492, + "learning_rate": 6.957879871381059e-06, + "loss": 0.814, + "step": 13578 + }, + { + "epoch": 0.7473718971875172, + "grad_norm": 0.7247292995452881, + "learning_rate": 6.957481011545697e-06, + "loss": 0.6187, + "step": 13579 + }, + { + "epoch": 0.7474269359898729, + "grad_norm": 0.9642785787582397, + "learning_rate": 6.95708213699843e-06, + "loss": 0.8745, + "step": 13580 + }, + { + "epoch": 0.7474819747922286, + "grad_norm": 0.701675295829773, + "learning_rate": 6.956683247742259e-06, + "loss": 0.8474, + "step": 13581 + }, + { + "epoch": 0.7475370135945841, + "grad_norm": 0.6338050961494446, + "learning_rate": 6.9562843437801795e-06, + "loss": 0.7346, + "step": 13582 + }, + { + "epoch": 0.7475920523969398, + "grad_norm": 0.6954126358032227, + "learning_rate": 6.955885425115191e-06, + "loss": 0.8083, + "step": 13583 + }, + { + "epoch": 0.7476470911992955, + "grad_norm": 0.7316300272941589, + "learning_rate": 6.95548649175029e-06, + "loss": 0.8009, + "step": 13584 + }, + { + "epoch": 0.7477021300016512, + "grad_norm": 0.6314196586608887, + "learning_rate": 6.955087543688477e-06, + "loss": 0.6375, + "step": 13585 + }, + { + "epoch": 0.7477571688040068, + "grad_norm": 0.6604906320571899, + "learning_rate": 6.9546885809327495e-06, + "loss": 0.7081, + "step": 13586 + }, + { + "epoch": 0.7478122076063625, + "grad_norm": 0.8251973986625671, + "learning_rate": 6.9542896034861064e-06, + "loss": 0.7483, + "step": 13587 + }, + { + "epoch": 0.7478672464087182, + "grad_norm": 0.6946399211883545, + "learning_rate": 6.953890611351544e-06, + "loss": 0.8849, + "step": 13588 + }, + { + "epoch": 0.7479222852110738, + "grad_norm": 0.7713609933853149, + "learning_rate": 6.953491604532063e-06, + "loss": 0.7913, + "step": 13589 + }, + { + "epoch": 0.7479773240134294, + "grad_norm": 0.734355092048645, + "learning_rate": 6.953092583030664e-06, + "loss": 0.7216, + "step": 13590 + }, + { + "epoch": 0.7480323628157851, + "grad_norm": 0.6147064566612244, + "learning_rate": 6.952693546850342e-06, + "loss": 0.6894, + "step": 13591 + }, + { + "epoch": 0.7480874016181408, + "grad_norm": 0.7472255229949951, + "learning_rate": 6.9522944959940986e-06, + "loss": 0.7941, + "step": 13592 + }, + { + "epoch": 0.7481424404204965, + "grad_norm": 0.6478431224822998, + "learning_rate": 6.951895430464935e-06, + "loss": 0.6995, + "step": 13593 + }, + { + "epoch": 0.7481974792228521, + "grad_norm": 0.6956225633621216, + "learning_rate": 6.951496350265844e-06, + "loss": 0.7637, + "step": 13594 + }, + { + "epoch": 0.7482525180252078, + "grad_norm": 1.0637938976287842, + "learning_rate": 6.95109725539983e-06, + "loss": 0.7448, + "step": 13595 + }, + { + "epoch": 0.7483075568275634, + "grad_norm": 0.6948299407958984, + "learning_rate": 6.9506981458698916e-06, + "loss": 0.7343, + "step": 13596 + }, + { + "epoch": 0.7483625956299191, + "grad_norm": 0.9034255743026733, + "learning_rate": 6.950299021679028e-06, + "loss": 0.6481, + "step": 13597 + }, + { + "epoch": 0.7484176344322747, + "grad_norm": 0.7901731729507446, + "learning_rate": 6.949899882830239e-06, + "loss": 0.8368, + "step": 13598 + }, + { + "epoch": 0.7484726732346304, + "grad_norm": 0.7791730761528015, + "learning_rate": 6.949500729326525e-06, + "loss": 0.7912, + "step": 13599 + }, + { + "epoch": 0.7485277120369861, + "grad_norm": 0.7678626179695129, + "learning_rate": 6.949101561170883e-06, + "loss": 0.7514, + "step": 13600 + }, + { + "epoch": 0.7485827508393418, + "grad_norm": 0.709762454032898, + "learning_rate": 6.948702378366318e-06, + "loss": 0.6809, + "step": 13601 + }, + { + "epoch": 0.7486377896416974, + "grad_norm": 0.706031084060669, + "learning_rate": 6.948303180915827e-06, + "loss": 0.7454, + "step": 13602 + }, + { + "epoch": 0.748692828444053, + "grad_norm": 0.658869743347168, + "learning_rate": 6.9479039688224105e-06, + "loss": 0.6498, + "step": 13603 + }, + { + "epoch": 0.7487478672464087, + "grad_norm": 0.7253865599632263, + "learning_rate": 6.9475047420890685e-06, + "loss": 0.8063, + "step": 13604 + }, + { + "epoch": 0.7488029060487644, + "grad_norm": 0.752839207649231, + "learning_rate": 6.947105500718804e-06, + "loss": 0.7708, + "step": 13605 + }, + { + "epoch": 0.74885794485112, + "grad_norm": 0.6694571375846863, + "learning_rate": 6.946706244714615e-06, + "loss": 0.7121, + "step": 13606 + }, + { + "epoch": 0.7489129836534757, + "grad_norm": 0.751380443572998, + "learning_rate": 6.946306974079503e-06, + "loss": 0.8797, + "step": 13607 + }, + { + "epoch": 0.7489680224558314, + "grad_norm": 0.8001984357833862, + "learning_rate": 6.9459076888164676e-06, + "loss": 0.8963, + "step": 13608 + }, + { + "epoch": 0.7490230612581871, + "grad_norm": 0.7149432301521301, + "learning_rate": 6.945508388928511e-06, + "loss": 0.8311, + "step": 13609 + }, + { + "epoch": 0.7490781000605427, + "grad_norm": 0.8295183777809143, + "learning_rate": 6.945109074418635e-06, + "loss": 0.7466, + "step": 13610 + }, + { + "epoch": 0.7491331388628983, + "grad_norm": 0.7480556964874268, + "learning_rate": 6.94470974528984e-06, + "loss": 0.8277, + "step": 13611 + }, + { + "epoch": 0.749188177665254, + "grad_norm": 0.7962234616279602, + "learning_rate": 6.944310401545127e-06, + "loss": 0.7143, + "step": 13612 + }, + { + "epoch": 0.7492432164676097, + "grad_norm": 0.7722699642181396, + "learning_rate": 6.943911043187497e-06, + "loss": 0.6619, + "step": 13613 + }, + { + "epoch": 0.7492982552699653, + "grad_norm": 0.8495624661445618, + "learning_rate": 6.943511670219952e-06, + "loss": 0.8475, + "step": 13614 + }, + { + "epoch": 0.749353294072321, + "grad_norm": 0.7702826261520386, + "learning_rate": 6.943112282645494e-06, + "loss": 0.826, + "step": 13615 + }, + { + "epoch": 0.7494083328746767, + "grad_norm": 0.7435297966003418, + "learning_rate": 6.942712880467124e-06, + "loss": 0.8121, + "step": 13616 + }, + { + "epoch": 0.7494633716770323, + "grad_norm": 0.8108325600624084, + "learning_rate": 6.942313463687844e-06, + "loss": 0.7282, + "step": 13617 + }, + { + "epoch": 0.7495184104793879, + "grad_norm": 0.6840381622314453, + "learning_rate": 6.9419140323106574e-06, + "loss": 0.7446, + "step": 13618 + }, + { + "epoch": 0.7495734492817436, + "grad_norm": 0.7155357599258423, + "learning_rate": 6.941514586338562e-06, + "loss": 0.7598, + "step": 13619 + }, + { + "epoch": 0.7496284880840993, + "grad_norm": 0.7693290114402771, + "learning_rate": 6.941115125774564e-06, + "loss": 0.7666, + "step": 13620 + }, + { + "epoch": 0.7496835268864549, + "grad_norm": 0.6918750405311584, + "learning_rate": 6.940715650621665e-06, + "loss": 0.6831, + "step": 13621 + }, + { + "epoch": 0.7497385656888106, + "grad_norm": 0.8241471648216248, + "learning_rate": 6.9403161608828654e-06, + "loss": 0.6753, + "step": 13622 + }, + { + "epoch": 0.7497936044911663, + "grad_norm": 0.6659193634986877, + "learning_rate": 6.93991665656117e-06, + "loss": 0.6988, + "step": 13623 + }, + { + "epoch": 0.749848643293522, + "grad_norm": 0.8012998700141907, + "learning_rate": 6.9395171376595795e-06, + "loss": 0.7922, + "step": 13624 + }, + { + "epoch": 0.7499036820958775, + "grad_norm": 0.783018946647644, + "learning_rate": 6.9391176041810974e-06, + "loss": 0.7062, + "step": 13625 + }, + { + "epoch": 0.7499587208982332, + "grad_norm": 0.8228014707565308, + "learning_rate": 6.938718056128726e-06, + "loss": 0.7762, + "step": 13626 + }, + { + "epoch": 0.7500137597005889, + "grad_norm": 0.783525288105011, + "learning_rate": 6.9383184935054705e-06, + "loss": 0.7517, + "step": 13627 + }, + { + "epoch": 0.7500687985029446, + "grad_norm": 0.6686612963676453, + "learning_rate": 6.93791891631433e-06, + "loss": 0.7372, + "step": 13628 + }, + { + "epoch": 0.7501238373053002, + "grad_norm": 0.7089647054672241, + "learning_rate": 6.937519324558312e-06, + "loss": 0.7847, + "step": 13629 + }, + { + "epoch": 0.7501788761076559, + "grad_norm": 0.7674399018287659, + "learning_rate": 6.937119718240415e-06, + "loss": 0.7414, + "step": 13630 + }, + { + "epoch": 0.7502339149100116, + "grad_norm": 0.6331565380096436, + "learning_rate": 6.936720097363646e-06, + "loss": 0.7603, + "step": 13631 + }, + { + "epoch": 0.7502889537123673, + "grad_norm": 0.7084798812866211, + "learning_rate": 6.9363204619310065e-06, + "loss": 0.6844, + "step": 13632 + }, + { + "epoch": 0.7503439925147228, + "grad_norm": 0.8624362945556641, + "learning_rate": 6.9359208119455015e-06, + "loss": 0.7098, + "step": 13633 + }, + { + "epoch": 0.7503990313170785, + "grad_norm": 0.7681849598884583, + "learning_rate": 6.935521147410134e-06, + "loss": 0.7896, + "step": 13634 + }, + { + "epoch": 0.7504540701194342, + "grad_norm": 0.7494263052940369, + "learning_rate": 6.935121468327907e-06, + "loss": 0.7858, + "step": 13635 } ], "logging_steps": 1, @@ -82745,7 +95471,7 @@ "attributes": {} } }, - "total_flos": 3.4872691283585925e+19, + "total_flos": 4.023772071182991e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null