| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.999000999000999, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003996003996003996, |
| "grad_norm": 45.397716048013955, |
| "learning_rate": 4.9998026105095405e-05, |
| "loss": 3.2856, |
| "num_input_tokens_seen": 2097152, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.007992007992007992, |
| "grad_norm": 16.720700695798918, |
| "learning_rate": 4.99921047320825e-05, |
| "loss": 4.9341, |
| "num_input_tokens_seen": 4194304, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.011988011988011988, |
| "grad_norm": 7.237461510911915, |
| "learning_rate": 4.998223681601473e-05, |
| "loss": 3.1436, |
| "num_input_tokens_seen": 6291456, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.015984015984015984, |
| "grad_norm": 2.2135290807336365, |
| "learning_rate": 4.996842391515044e-05, |
| "loss": 2.9181, |
| "num_input_tokens_seen": 8388608, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.01998001998001998, |
| "grad_norm": 3.1777625146803534, |
| "learning_rate": 4.995066821070679e-05, |
| "loss": 2.8071, |
| "num_input_tokens_seen": 10485760, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.023976023976023976, |
| "grad_norm": 2.0926007787793646, |
| "learning_rate": 4.992897250651535e-05, |
| "loss": 2.8332, |
| "num_input_tokens_seen": 12582912, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.027972027972027972, |
| "grad_norm": 2.412968925864246, |
| "learning_rate": 4.990334022857932e-05, |
| "loss": 2.7195, |
| "num_input_tokens_seen": 14680064, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.03196803196803197, |
| "grad_norm": 1.1410800182016025, |
| "learning_rate": 4.987377542453251e-05, |
| "loss": 2.7389, |
| "num_input_tokens_seen": 16777216, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.03596403596403597, |
| "grad_norm": 1.3576408063131136, |
| "learning_rate": 4.984028276300021e-05, |
| "loss": 2.7014, |
| "num_input_tokens_seen": 18874368, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.03996003996003996, |
| "grad_norm": 0.9551004260961587, |
| "learning_rate": 4.980286753286195e-05, |
| "loss": 2.6864, |
| "num_input_tokens_seen": 20971520, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04395604395604396, |
| "grad_norm": 1.2917881872858297, |
| "learning_rate": 4.976153564241628e-05, |
| "loss": 2.6969, |
| "num_input_tokens_seen": 23068672, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.04795204795204795, |
| "grad_norm": 1.2325808705376222, |
| "learning_rate": 4.971629361844785e-05, |
| "loss": 2.531, |
| "num_input_tokens_seen": 25165824, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.05194805194805195, |
| "grad_norm": 0.8577945730620076, |
| "learning_rate": 4.96671486051967e-05, |
| "loss": 2.6493, |
| "num_input_tokens_seen": 27262976, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.055944055944055944, |
| "grad_norm": 1.0732342099108947, |
| "learning_rate": 4.9614108363230135e-05, |
| "loss": 2.6148, |
| "num_input_tokens_seen": 29360128, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.059940059940059943, |
| "grad_norm": 0.7845706780773702, |
| "learning_rate": 4.9557181268217227e-05, |
| "loss": 2.6009, |
| "num_input_tokens_seen": 31457280, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06393606393606394, |
| "grad_norm": 1.6165807283344802, |
| "learning_rate": 4.949637630960617e-05, |
| "loss": 2.5976, |
| "num_input_tokens_seen": 33554432, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.06793206793206794, |
| "grad_norm": 2.8144498870504027, |
| "learning_rate": 4.943170308920484e-05, |
| "loss": 2.6129, |
| "num_input_tokens_seen": 35651584, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.07192807192807193, |
| "grad_norm": 1.8049857582717068, |
| "learning_rate": 4.9363171819664434e-05, |
| "loss": 2.567, |
| "num_input_tokens_seen": 37748736, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.07592407592407592, |
| "grad_norm": 1.826683609901875, |
| "learning_rate": 4.929079332286685e-05, |
| "loss": 2.5708, |
| "num_input_tokens_seen": 39845888, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.07992007992007992, |
| "grad_norm": 1.6842175771710053, |
| "learning_rate": 4.9214579028215776e-05, |
| "loss": 2.5386, |
| "num_input_tokens_seen": 41943040, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08391608391608392, |
| "grad_norm": 2.664220035464748, |
| "learning_rate": 4.913454097083185e-05, |
| "loss": 2.5569, |
| "num_input_tokens_seen": 44040192, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.08791208791208792, |
| "grad_norm": 1.903388897596229, |
| "learning_rate": 4.905069178965215e-05, |
| "loss": 2.5432, |
| "num_input_tokens_seen": 46137344, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0919080919080919, |
| "grad_norm": 6.989496095818957, |
| "learning_rate": 4.89630447254344e-05, |
| "loss": 2.482, |
| "num_input_tokens_seen": 48234496, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.0959040959040959, |
| "grad_norm": 2.0181196594962465, |
| "learning_rate": 4.887161361866608e-05, |
| "loss": 2.6001, |
| "num_input_tokens_seen": 50331648, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.0999000999000999, |
| "grad_norm": 2.901110037229965, |
| "learning_rate": 4.877641290737884e-05, |
| "loss": 2.5543, |
| "num_input_tokens_seen": 52428800, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1038961038961039, |
| "grad_norm": 3.043139566878998, |
| "learning_rate": 4.867745762486861e-05, |
| "loss": 2.5969, |
| "num_input_tokens_seen": 54525952, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.10789210789210789, |
| "grad_norm": 1.3915435125167144, |
| "learning_rate": 4.8574763397321614e-05, |
| "loss": 2.5838, |
| "num_input_tokens_seen": 56623104, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.11188811188811189, |
| "grad_norm": 1.9093668488910387, |
| "learning_rate": 4.846834644134686e-05, |
| "loss": 2.5429, |
| "num_input_tokens_seen": 58720256, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.11588411588411589, |
| "grad_norm": 3.01181526806804, |
| "learning_rate": 4.8358223561415304e-05, |
| "loss": 2.6161, |
| "num_input_tokens_seen": 60817408, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.11988011988011989, |
| "grad_norm": 1.624336721197139, |
| "learning_rate": 4.8244412147206284e-05, |
| "loss": 2.5881, |
| "num_input_tokens_seen": 62914560, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12387612387612387, |
| "grad_norm": 1.8720615635790336, |
| "learning_rate": 4.812693017086145e-05, |
| "loss": 2.546, |
| "num_input_tokens_seen": 65011712, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.12787212787212787, |
| "grad_norm": 1.9333264726490988, |
| "learning_rate": 4.800579618414676e-05, |
| "loss": 2.5099, |
| "num_input_tokens_seen": 67108864, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.13186813186813187, |
| "grad_norm": 1.3173676324547374, |
| "learning_rate": 4.788102931552294e-05, |
| "loss": 2.529, |
| "num_input_tokens_seen": 69206016, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.13586413586413587, |
| "grad_norm": 1.4145858951847954, |
| "learning_rate": 4.775264926712489e-05, |
| "loss": 2.5496, |
| "num_input_tokens_seen": 71303168, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.13986013986013987, |
| "grad_norm": 1.2459777044350313, |
| "learning_rate": 4.762067631165049e-05, |
| "loss": 2.5767, |
| "num_input_tokens_seen": 73400320, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.14385614385614387, |
| "grad_norm": 0.7204082348835508, |
| "learning_rate": 4.7485131289159276e-05, |
| "loss": 2.451, |
| "num_input_tokens_seen": 75497472, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.14785214785214784, |
| "grad_norm": 0.7970916518174539, |
| "learning_rate": 4.73460356037816e-05, |
| "loss": 2.5332, |
| "num_input_tokens_seen": 77594624, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.15184815184815184, |
| "grad_norm": 1.1114254341260015, |
| "learning_rate": 4.720341122033862e-05, |
| "loss": 2.5736, |
| "num_input_tokens_seen": 79691776, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.15584415584415584, |
| "grad_norm": 0.8457897383835116, |
| "learning_rate": 4.7057280660873835e-05, |
| "loss": 2.4711, |
| "num_input_tokens_seen": 81788928, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.15984015984015984, |
| "grad_norm": 0.7884656103158801, |
| "learning_rate": 4.690766700109659e-05, |
| "loss": 2.5646, |
| "num_input_tokens_seen": 83886080, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.16383616383616384, |
| "grad_norm": 0.8582096491361969, |
| "learning_rate": 4.675459386673815e-05, |
| "loss": 2.4781, |
| "num_input_tokens_seen": 85983232, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.16783216783216784, |
| "grad_norm": 0.7464874872363706, |
| "learning_rate": 4.659808542982088e-05, |
| "loss": 2.4963, |
| "num_input_tokens_seen": 88080384, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.17182817182817184, |
| "grad_norm": 0.7533013060343199, |
| "learning_rate": 4.643816640484131e-05, |
| "loss": 2.5023, |
| "num_input_tokens_seen": 90177536, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.17582417582417584, |
| "grad_norm": 0.7198555133144876, |
| "learning_rate": 4.6274862044867304e-05, |
| "loss": 2.4608, |
| "num_input_tokens_seen": 92274688, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.1798201798201798, |
| "grad_norm": 0.7979263068427599, |
| "learning_rate": 4.610819813755038e-05, |
| "loss": 2.5307, |
| "num_input_tokens_seen": 94371840, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.1838161838161838, |
| "grad_norm": 0.6866827015012487, |
| "learning_rate": 4.593820100105355e-05, |
| "loss": 2.4924, |
| "num_input_tokens_seen": 96468992, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.1878121878121878, |
| "grad_norm": 0.7414434038225337, |
| "learning_rate": 4.5764897479895317e-05, |
| "loss": 2.4959, |
| "num_input_tokens_seen": 98566144, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.1918081918081918, |
| "grad_norm": 0.6080104398924819, |
| "learning_rate": 4.558831494071069e-05, |
| "loss": 2.4251, |
| "num_input_tokens_seen": 100663296, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.1958041958041958, |
| "grad_norm": 0.6138336762931118, |
| "learning_rate": 4.5408481267929605e-05, |
| "loss": 2.4906, |
| "num_input_tokens_seen": 102760448, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.1998001998001998, |
| "grad_norm": 1.0920278537803145, |
| "learning_rate": 4.522542485937369e-05, |
| "loss": 2.3702, |
| "num_input_tokens_seen": 104857600, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2037962037962038, |
| "grad_norm": 0.7688299543749751, |
| "learning_rate": 4.503917462177192e-05, |
| "loss": 2.434, |
| "num_input_tokens_seen": 106954752, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.2077922077922078, |
| "grad_norm": 0.8771468943104782, |
| "learning_rate": 4.484975996619589e-05, |
| "loss": 2.4492, |
| "num_input_tokens_seen": 109051904, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.21178821178821178, |
| "grad_norm": 0.533345425491502, |
| "learning_rate": 4.465721080341547e-05, |
| "loss": 2.4919, |
| "num_input_tokens_seen": 111149056, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.21578421578421578, |
| "grad_norm": 2.6249379157784087, |
| "learning_rate": 4.4461557539175594e-05, |
| "loss": 2.4276, |
| "num_input_tokens_seen": 113246208, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.21978021978021978, |
| "grad_norm": 2.179059812675857, |
| "learning_rate": 4.426283106939474e-05, |
| "loss": 2.4055, |
| "num_input_tokens_seen": 115343360, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.22377622377622378, |
| "grad_norm": 0.9411076208822224, |
| "learning_rate": 4.40610627752862e-05, |
| "loss": 2.4613, |
| "num_input_tokens_seen": 117440512, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.22777222777222778, |
| "grad_norm": 1.377881364411232, |
| "learning_rate": 4.3856284518402594e-05, |
| "loss": 2.486, |
| "num_input_tokens_seen": 119537664, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.23176823176823177, |
| "grad_norm": 0.7750081918877708, |
| "learning_rate": 4.3648528635604556e-05, |
| "loss": 2.4829, |
| "num_input_tokens_seen": 121634816, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.23576423576423577, |
| "grad_norm": 2.4636679019402297, |
| "learning_rate": 4.343782793395435e-05, |
| "loss": 2.4698, |
| "num_input_tokens_seen": 123731968, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.23976023976023977, |
| "grad_norm": 1.0339493481387614, |
| "learning_rate": 4.3224215685535294e-05, |
| "loss": 2.4442, |
| "num_input_tokens_seen": 125829120, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.24375624375624375, |
| "grad_norm": 0.8689702977756828, |
| "learning_rate": 4.3007725622197674e-05, |
| "loss": 2.4808, |
| "num_input_tokens_seen": 127926272, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.24775224775224775, |
| "grad_norm": 0.8503808400068437, |
| "learning_rate": 4.278839193023214e-05, |
| "loss": 2.3803, |
| "num_input_tokens_seen": 130023424, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.2517482517482518, |
| "grad_norm": 1.4097904765459457, |
| "learning_rate": 4.256624924497123e-05, |
| "loss": 2.4473, |
| "num_input_tokens_seen": 132120576, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.25574425574425574, |
| "grad_norm": 2.488257561603659, |
| "learning_rate": 4.234133264532012e-05, |
| "loss": 2.4532, |
| "num_input_tokens_seen": 134217728, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.2597402597402597, |
| "grad_norm": 1.4664859777576438, |
| "learning_rate": 4.211367764821722e-05, |
| "loss": 2.4389, |
| "num_input_tokens_seen": 136314880, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.26373626373626374, |
| "grad_norm": 1.125154728992796, |
| "learning_rate": 4.188332020302561e-05, |
| "loss": 2.4051, |
| "num_input_tokens_seen": 138412032, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.2677322677322677, |
| "grad_norm": 0.6661249575175722, |
| "learning_rate": 4.165029668585629e-05, |
| "loss": 2.4608, |
| "num_input_tokens_seen": 140509184, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.27172827172827174, |
| "grad_norm": 2.89273237433577, |
| "learning_rate": 4.1414643893823914e-05, |
| "loss": 2.4266, |
| "num_input_tokens_seen": 142606336, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.2757242757242757, |
| "grad_norm": 1.6375945671111662, |
| "learning_rate": 4.1176399039236116e-05, |
| "loss": 2.4137, |
| "num_input_tokens_seen": 144703488, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.27972027972027974, |
| "grad_norm": 1.9952689171371465, |
| "learning_rate": 4.093559974371725e-05, |
| "loss": 2.4269, |
| "num_input_tokens_seen": 146800640, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2837162837162837, |
| "grad_norm": 1.8749543255823333, |
| "learning_rate": 4.0692284032267516e-05, |
| "loss": 2.4848, |
| "num_input_tokens_seen": 148897792, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.28771228771228774, |
| "grad_norm": 1.290679930782177, |
| "learning_rate": 4.044649032725836e-05, |
| "loss": 2.4403, |
| "num_input_tokens_seen": 150994944, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.2917082917082917, |
| "grad_norm": 2.326385846144409, |
| "learning_rate": 4.019825744236514e-05, |
| "loss": 2.367, |
| "num_input_tokens_seen": 153092096, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.2957042957042957, |
| "grad_norm": 1.1124688690335176, |
| "learning_rate": 3.9947624576437975e-05, |
| "loss": 2.4078, |
| "num_input_tokens_seen": 155189248, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.2997002997002997, |
| "grad_norm": 1.5859029362273107, |
| "learning_rate": 3.969463130731183e-05, |
| "loss": 2.3746, |
| "num_input_tokens_seen": 157286400, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3036963036963037, |
| "grad_norm": 2.535299707595878, |
| "learning_rate": 3.943931758555669e-05, |
| "loss": 2.4477, |
| "num_input_tokens_seen": 159383552, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.8511054127383315, |
| "learning_rate": 3.9181723728168916e-05, |
| "loss": 2.3503, |
| "num_input_tokens_seen": 161480704, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.3116883116883117, |
| "grad_norm": 2.2096000842841605, |
| "learning_rate": 3.8921890412204705e-05, |
| "loss": 2.4519, |
| "num_input_tokens_seen": 163577856, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.3156843156843157, |
| "grad_norm": 0.6762489736454366, |
| "learning_rate": 3.865985866835673e-05, |
| "loss": 2.4244, |
| "num_input_tokens_seen": 165675008, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.3196803196803197, |
| "grad_norm": 1.174734752764483, |
| "learning_rate": 3.8395669874474915e-05, |
| "loss": 2.3877, |
| "num_input_tokens_seen": 167772160, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.32367632367632365, |
| "grad_norm": 0.8893848412621251, |
| "learning_rate": 3.81293657490324e-05, |
| "loss": 2.4656, |
| "num_input_tokens_seen": 169869312, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.3276723276723277, |
| "grad_norm": 0.8008763444280786, |
| "learning_rate": 3.786098834453766e-05, |
| "loss": 2.3584, |
| "num_input_tokens_seen": 171966464, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.33166833166833165, |
| "grad_norm": 0.8682214888286258, |
| "learning_rate": 3.759058004089402e-05, |
| "loss": 2.3319, |
| "num_input_tokens_seen": 174063616, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.3356643356643357, |
| "grad_norm": 1.8753245134159358, |
| "learning_rate": 3.731818353870729e-05, |
| "loss": 2.4093, |
| "num_input_tokens_seen": 176160768, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.33966033966033965, |
| "grad_norm": 1.4539019981614403, |
| "learning_rate": 3.704384185254288e-05, |
| "loss": 2.4119, |
| "num_input_tokens_seen": 178257920, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3436563436563437, |
| "grad_norm": 0.7008538588533324, |
| "learning_rate": 3.6767598304133324e-05, |
| "loss": 2.4248, |
| "num_input_tokens_seen": 180355072, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.34765234765234765, |
| "grad_norm": 0.8384215400783821, |
| "learning_rate": 3.6489496515537204e-05, |
| "loss": 2.4342, |
| "num_input_tokens_seen": 182452224, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.3516483516483517, |
| "grad_norm": 2.4158703170559694, |
| "learning_rate": 3.6209580402250815e-05, |
| "loss": 2.3808, |
| "num_input_tokens_seen": 184549376, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.35564435564435565, |
| "grad_norm": 0.6965715476379709, |
| "learning_rate": 3.592789416627332e-05, |
| "loss": 2.3784, |
| "num_input_tokens_seen": 186646528, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.3596403596403596, |
| "grad_norm": 0.6080689676257547, |
| "learning_rate": 3.564448228912682e-05, |
| "loss": 2.3808, |
| "num_input_tokens_seen": 188743680, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 1.5053035133397736, |
| "learning_rate": 3.535938952483211e-05, |
| "loss": 2.3535, |
| "num_input_tokens_seen": 190840832, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.3676323676323676, |
| "grad_norm": 0.5969182915785666, |
| "learning_rate": 3.507266089284157e-05, |
| "loss": 2.3879, |
| "num_input_tokens_seen": 192937984, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.37162837162837165, |
| "grad_norm": 0.6614317168865398, |
| "learning_rate": 3.4784341670930065e-05, |
| "loss": 2.4265, |
| "num_input_tokens_seen": 195035136, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.3756243756243756, |
| "grad_norm": 0.6396901885259411, |
| "learning_rate": 3.4494477388045035e-05, |
| "loss": 2.376, |
| "num_input_tokens_seen": 197132288, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.37962037962037964, |
| "grad_norm": 0.5015447365282152, |
| "learning_rate": 3.4203113817116957e-05, |
| "loss": 2.44, |
| "num_input_tokens_seen": 199229440, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3836163836163836, |
| "grad_norm": 0.517602362869134, |
| "learning_rate": 3.3910296967831266e-05, |
| "loss": 2.4048, |
| "num_input_tokens_seen": 201326592, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.3876123876123876, |
| "grad_norm": 0.5367124231293549, |
| "learning_rate": 3.3616073079362926e-05, |
| "loss": 2.4968, |
| "num_input_tokens_seen": 203423744, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.3916083916083916, |
| "grad_norm": 0.44163794718107957, |
| "learning_rate": 3.332048861307467e-05, |
| "loss": 2.3913, |
| "num_input_tokens_seen": 205520896, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.3956043956043956, |
| "grad_norm": 0.4453294122902193, |
| "learning_rate": 3.302359024518024e-05, |
| "loss": 2.3615, |
| "num_input_tokens_seen": 207618048, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.3996003996003996, |
| "grad_norm": 36.96663608221555, |
| "learning_rate": 3.272542485937369e-05, |
| "loss": 2.4242, |
| "num_input_tokens_seen": 209715200, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4035964035964036, |
| "grad_norm": 1.0158667426862753, |
| "learning_rate": 3.2426039539425876e-05, |
| "loss": 2.4347, |
| "num_input_tokens_seen": 211812352, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.4075924075924076, |
| "grad_norm": 0.6166463156099811, |
| "learning_rate": 3.21254815617494e-05, |
| "loss": 2.298, |
| "num_input_tokens_seen": 213909504, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.4115884115884116, |
| "grad_norm": 0.6259451796209666, |
| "learning_rate": 3.1823798387933134e-05, |
| "loss": 2.3888, |
| "num_input_tokens_seen": 216006656, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.4155844155844156, |
| "grad_norm": 1.0489453670708535, |
| "learning_rate": 3.152103765724743e-05, |
| "loss": 2.3289, |
| "num_input_tokens_seen": 218103808, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.4195804195804196, |
| "grad_norm": 0.4908844589531169, |
| "learning_rate": 3.121724717912138e-05, |
| "loss": 2.348, |
| "num_input_tokens_seen": 220200960, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.42357642357642356, |
| "grad_norm": 0.4631786561281481, |
| "learning_rate": 3.091247492559312e-05, |
| "loss": 2.3266, |
| "num_input_tokens_seen": 222298112, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.4275724275724276, |
| "grad_norm": 1.5774889656207558, |
| "learning_rate": 3.0606769023734536e-05, |
| "loss": 2.277, |
| "num_input_tokens_seen": 224395264, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.43156843156843155, |
| "grad_norm": 0.5054931421528289, |
| "learning_rate": 3.0300177748051373e-05, |
| "loss": 2.4185, |
| "num_input_tokens_seen": 226492416, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.4355644355644356, |
| "grad_norm": 0.6752074324680739, |
| "learning_rate": 2.9992749512860173e-05, |
| "loss": 2.3338, |
| "num_input_tokens_seen": 228589568, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.43956043956043955, |
| "grad_norm": 0.5210827434198482, |
| "learning_rate": 2.9684532864643122e-05, |
| "loss": 2.3148, |
| "num_input_tokens_seen": 230686720, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4435564435564436, |
| "grad_norm": 0.8108950341145076, |
| "learning_rate": 2.9375576474381905e-05, |
| "loss": 2.3176, |
| "num_input_tokens_seen": 232783872, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.44755244755244755, |
| "grad_norm": 0.44812690973573566, |
| "learning_rate": 2.9065929129872094e-05, |
| "loss": 2.3399, |
| "num_input_tokens_seen": 234881024, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.4515484515484515, |
| "grad_norm": 1.4306421391522692, |
| "learning_rate": 2.875563972801893e-05, |
| "loss": 2.3927, |
| "num_input_tokens_seen": 236978176, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.45554445554445555, |
| "grad_norm": 0.473266657367917, |
| "learning_rate": 2.844475726711595e-05, |
| "loss": 2.3542, |
| "num_input_tokens_seen": 239075328, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.4595404595404595, |
| "grad_norm": 0.49871390173536145, |
| "learning_rate": 2.8133330839107608e-05, |
| "loss": 2.335, |
| "num_input_tokens_seen": 241172480, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.46353646353646355, |
| "grad_norm": 0.46530478050660184, |
| "learning_rate": 2.782140962183704e-05, |
| "loss": 2.343, |
| "num_input_tokens_seen": 243269632, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.4675324675324675, |
| "grad_norm": 1.0618225465218298, |
| "learning_rate": 2.7509042871280372e-05, |
| "loss": 2.3028, |
| "num_input_tokens_seen": 245366784, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.47152847152847155, |
| "grad_norm": 0.6365579201710051, |
| "learning_rate": 2.7196279913768584e-05, |
| "loss": 2.3386, |
| "num_input_tokens_seen": 247463936, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.4755244755244755, |
| "grad_norm": 0.517454879712277, |
| "learning_rate": 2.6883170138198323e-05, |
| "loss": 2.366, |
| "num_input_tokens_seen": 249561088, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.47952047952047955, |
| "grad_norm": 0.5489103583269549, |
| "learning_rate": 2.656976298823284e-05, |
| "loss": 2.3358, |
| "num_input_tokens_seen": 251658240, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4835164835164835, |
| "grad_norm": 1.5497443025841122, |
| "learning_rate": 2.6256107954494242e-05, |
| "loss": 2.3482, |
| "num_input_tokens_seen": 253755392, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.4875124875124875, |
| "grad_norm": 0.3890946667445295, |
| "learning_rate": 2.594225456674837e-05, |
| "loss": 2.3389, |
| "num_input_tokens_seen": 255852544, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.4915084915084915, |
| "grad_norm": 0.8995091738499138, |
| "learning_rate": 2.562825238608344e-05, |
| "loss": 2.325, |
| "num_input_tokens_seen": 257949696, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.4955044955044955, |
| "grad_norm": 0.6057590450303696, |
| "learning_rate": 2.531415099708382e-05, |
| "loss": 2.3447, |
| "num_input_tokens_seen": 260046848, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.4995004995004995, |
| "grad_norm": 0.40513537407679856, |
| "learning_rate": 2.5e-05, |
| "loss": 2.3578, |
| "num_input_tokens_seen": 262144000, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5034965034965035, |
| "grad_norm": 0.45174009842666113, |
| "learning_rate": 2.4685849002916183e-05, |
| "loss": 2.3141, |
| "num_input_tokens_seen": 264241152, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.5074925074925075, |
| "grad_norm": 0.4797548555921774, |
| "learning_rate": 2.4371747613916566e-05, |
| "loss": 2.3213, |
| "num_input_tokens_seen": 266338304, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.5114885114885115, |
| "grad_norm": 0.4407379540989733, |
| "learning_rate": 2.4057745433251635e-05, |
| "loss": 2.3199, |
| "num_input_tokens_seen": 268435456, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.5154845154845155, |
| "grad_norm": 0.44590516535808167, |
| "learning_rate": 2.3743892045505764e-05, |
| "loss": 2.2893, |
| "num_input_tokens_seen": 270532608, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.5194805194805194, |
| "grad_norm": 1.066996789800445, |
| "learning_rate": 2.3430237011767167e-05, |
| "loss": 2.3784, |
| "num_input_tokens_seen": 272629760, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5234765234765235, |
| "grad_norm": 0.4721535837300895, |
| "learning_rate": 2.3116829861801686e-05, |
| "loss": 2.3312, |
| "num_input_tokens_seen": 274726912, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.5274725274725275, |
| "grad_norm": 1.4135381841204495, |
| "learning_rate": 2.280372008623142e-05, |
| "loss": 2.2565, |
| "num_input_tokens_seen": 276824064, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.5314685314685315, |
| "grad_norm": 0.46724481789578903, |
| "learning_rate": 2.2490957128719624e-05, |
| "loss": 2.33, |
| "num_input_tokens_seen": 278921216, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.5354645354645354, |
| "grad_norm": 0.49590353093665784, |
| "learning_rate": 2.217859037816296e-05, |
| "loss": 2.2589, |
| "num_input_tokens_seen": 281018368, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.5394605394605395, |
| "grad_norm": 0.4426696356932668, |
| "learning_rate": 2.186666916089239e-05, |
| "loss": 2.291, |
| "num_input_tokens_seen": 283115520, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5434565434565435, |
| "grad_norm": 0.4212409336589665, |
| "learning_rate": 2.155524273288405e-05, |
| "loss": 2.2541, |
| "num_input_tokens_seen": 285212672, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.5474525474525475, |
| "grad_norm": 1.9969240684352454, |
| "learning_rate": 2.1244360271981073e-05, |
| "loss": 2.2777, |
| "num_input_tokens_seen": 287309824, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.5514485514485514, |
| "grad_norm": 0.42561998512597665, |
| "learning_rate": 2.0934070870127912e-05, |
| "loss": 2.3222, |
| "num_input_tokens_seen": 289406976, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.5554445554445554, |
| "grad_norm": 0.43570923645323345, |
| "learning_rate": 2.0624423525618098e-05, |
| "loss": 2.2762, |
| "num_input_tokens_seen": 291504128, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.5594405594405595, |
| "grad_norm": 0.42444476798188513, |
| "learning_rate": 2.031546713535688e-05, |
| "loss": 2.2721, |
| "num_input_tokens_seen": 293601280, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5634365634365635, |
| "grad_norm": 0.40938463612957693, |
| "learning_rate": 2.000725048713983e-05, |
| "loss": 2.2734, |
| "num_input_tokens_seen": 295698432, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.5674325674325674, |
| "grad_norm": 0.4212666953068437, |
| "learning_rate": 1.969982225194864e-05, |
| "loss": 2.3372, |
| "num_input_tokens_seen": 297795584, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.4315458369408749, |
| "learning_rate": 1.9393230976265473e-05, |
| "loss": 2.3148, |
| "num_input_tokens_seen": 299892736, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.5754245754245755, |
| "grad_norm": 0.4272214181115824, |
| "learning_rate": 1.908752507440689e-05, |
| "loss": 2.2485, |
| "num_input_tokens_seen": 301989888, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.5794205794205795, |
| "grad_norm": 0.4208712412248795, |
| "learning_rate": 1.8782752820878634e-05, |
| "loss": 2.3022, |
| "num_input_tokens_seen": 304087040, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5834165834165834, |
| "grad_norm": 0.4413709962140258, |
| "learning_rate": 1.8478962342752583e-05, |
| "loss": 2.2754, |
| "num_input_tokens_seen": 306184192, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.5874125874125874, |
| "grad_norm": 0.3977832437481733, |
| "learning_rate": 1.817620161206687e-05, |
| "loss": 2.3479, |
| "num_input_tokens_seen": 308281344, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.5914085914085914, |
| "grad_norm": 2.7720827054186454, |
| "learning_rate": 1.7874518438250597e-05, |
| "loss": 2.3408, |
| "num_input_tokens_seen": 310378496, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.5954045954045954, |
| "grad_norm": 1.1890799010288586, |
| "learning_rate": 1.7573960460574133e-05, |
| "loss": 2.2974, |
| "num_input_tokens_seen": 312475648, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.5994005994005994, |
| "grad_norm": 0.39608616820146314, |
| "learning_rate": 1.7274575140626318e-05, |
| "loss": 2.3153, |
| "num_input_tokens_seen": 314572800, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6033966033966034, |
| "grad_norm": 0.38134650113731394, |
| "learning_rate": 1.6976409754819767e-05, |
| "loss": 2.2548, |
| "num_input_tokens_seen": 316669952, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.6073926073926074, |
| "grad_norm": 0.3743488140608569, |
| "learning_rate": 1.6679511386925337e-05, |
| "loss": 2.2596, |
| "num_input_tokens_seen": 318767104, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.6113886113886113, |
| "grad_norm": 0.3828025671771742, |
| "learning_rate": 1.6383926920637077e-05, |
| "loss": 2.2723, |
| "num_input_tokens_seen": 320864256, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.40159034575781394, |
| "learning_rate": 1.6089703032168733e-05, |
| "loss": 2.3033, |
| "num_input_tokens_seen": 322961408, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.6193806193806194, |
| "grad_norm": 0.3933271871286809, |
| "learning_rate": 1.5796886182883053e-05, |
| "loss": 2.2521, |
| "num_input_tokens_seen": 325058560, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6233766233766234, |
| "grad_norm": 0.3794803538189512, |
| "learning_rate": 1.5505522611954975e-05, |
| "loss": 2.3233, |
| "num_input_tokens_seen": 327155712, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.6273726273726273, |
| "grad_norm": 0.4209446387405617, |
| "learning_rate": 1.521565832906994e-05, |
| "loss": 2.27, |
| "num_input_tokens_seen": 329252864, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.6313686313686314, |
| "grad_norm": 1.0074900430168887, |
| "learning_rate": 1.4927339107158437e-05, |
| "loss": 2.268, |
| "num_input_tokens_seen": 331350016, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.6353646353646354, |
| "grad_norm": 0.41630443418379, |
| "learning_rate": 1.4640610475167898e-05, |
| "loss": 2.316, |
| "num_input_tokens_seen": 333447168, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.6393606393606394, |
| "grad_norm": 0.4047604480679634, |
| "learning_rate": 1.4355517710873184e-05, |
| "loss": 2.2582, |
| "num_input_tokens_seen": 335544320, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6433566433566433, |
| "grad_norm": 1.9494737149304817, |
| "learning_rate": 1.4072105833726684e-05, |
| "loss": 2.2726, |
| "num_input_tokens_seen": 337641472, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.6473526473526473, |
| "grad_norm": 0.39399670610977866, |
| "learning_rate": 1.3790419597749199e-05, |
| "loss": 2.2892, |
| "num_input_tokens_seen": 339738624, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.6513486513486514, |
| "grad_norm": 0.4451047482328129, |
| "learning_rate": 1.3510503484462805e-05, |
| "loss": 2.3082, |
| "num_input_tokens_seen": 341835776, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.6553446553446554, |
| "grad_norm": 0.4533609634065945, |
| "learning_rate": 1.3232401695866687e-05, |
| "loss": 2.2171, |
| "num_input_tokens_seen": 343932928, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.6593406593406593, |
| "grad_norm": 0.42100229748743384, |
| "learning_rate": 1.2956158147457115e-05, |
| "loss": 2.2627, |
| "num_input_tokens_seen": 346030080, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.6633366633366633, |
| "grad_norm": 0.45019542546347174, |
| "learning_rate": 1.2681816461292715e-05, |
| "loss": 2.2401, |
| "num_input_tokens_seen": 348127232, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.6673326673326674, |
| "grad_norm": 0.3844311288159986, |
| "learning_rate": 1.2409419959105981e-05, |
| "loss": 2.3032, |
| "num_input_tokens_seen": 350224384, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.6713286713286714, |
| "grad_norm": 1.0270704429019066, |
| "learning_rate": 1.2139011655462337e-05, |
| "loss": 2.2389, |
| "num_input_tokens_seen": 352321536, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.6753246753246753, |
| "grad_norm": 0.8976658936990556, |
| "learning_rate": 1.1870634250967605e-05, |
| "loss": 2.2195, |
| "num_input_tokens_seen": 354418688, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.6793206793206793, |
| "grad_norm": 0.3523904307422417, |
| "learning_rate": 1.1604330125525079e-05, |
| "loss": 2.3188, |
| "num_input_tokens_seen": 356515840, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6833166833166833, |
| "grad_norm": 0.3678325920469417, |
| "learning_rate": 1.1340141331643276e-05, |
| "loss": 2.3015, |
| "num_input_tokens_seen": 358612992, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.6873126873126874, |
| "grad_norm": 0.3948745677403049, |
| "learning_rate": 1.107810958779531e-05, |
| "loss": 2.2556, |
| "num_input_tokens_seen": 360710144, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.6913086913086913, |
| "grad_norm": 0.36847213053456956, |
| "learning_rate": 1.0818276271831093e-05, |
| "loss": 2.2802, |
| "num_input_tokens_seen": 362807296, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.6953046953046953, |
| "grad_norm": 0.37215082870987926, |
| "learning_rate": 1.0560682414443315e-05, |
| "loss": 2.3612, |
| "num_input_tokens_seen": 364904448, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.6993006993006993, |
| "grad_norm": 0.3578522330518854, |
| "learning_rate": 1.0305368692688174e-05, |
| "loss": 2.251, |
| "num_input_tokens_seen": 367001600, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7032967032967034, |
| "grad_norm": 0.35812347992126276, |
| "learning_rate": 1.0052375423562038e-05, |
| "loss": 2.3016, |
| "num_input_tokens_seen": 369098752, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.7072927072927073, |
| "grad_norm": 1.168071672392917, |
| "learning_rate": 9.801742557634872e-06, |
| "loss": 2.344, |
| "num_input_tokens_seen": 371195904, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.7112887112887113, |
| "grad_norm": 0.36137582326423007, |
| "learning_rate": 9.553509672741645e-06, |
| "loss": 2.2647, |
| "num_input_tokens_seen": 373293056, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.7152847152847153, |
| "grad_norm": 0.377052279774602, |
| "learning_rate": 9.307715967732491e-06, |
| "loss": 2.2306, |
| "num_input_tokens_seen": 375390208, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.7192807192807192, |
| "grad_norm": 0.3495969961337015, |
| "learning_rate": 9.064400256282757e-06, |
| "loss": 2.2993, |
| "num_input_tokens_seen": 377487360, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7232767232767233, |
| "grad_norm": 0.3905042359103258, |
| "learning_rate": 8.8236009607639e-06, |
| "loss": 2.2201, |
| "num_input_tokens_seen": 379584512, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.35991200170219206, |
| "learning_rate": 8.585356106176094e-06, |
| "loss": 2.2431, |
| "num_input_tokens_seen": 381681664, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.7312687312687313, |
| "grad_norm": 0.4747746623605877, |
| "learning_rate": 8.34970331414371e-06, |
| "loss": 2.2521, |
| "num_input_tokens_seen": 383778816, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.7352647352647352, |
| "grad_norm": 0.3513858375488768, |
| "learning_rate": 8.116679796974388e-06, |
| "loss": 2.226, |
| "num_input_tokens_seen": 385875968, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.7392607392607392, |
| "grad_norm": 1.487313594856455, |
| "learning_rate": 7.886322351782783e-06, |
| "loss": 2.3006, |
| "num_input_tokens_seen": 387973120, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.7432567432567433, |
| "grad_norm": 0.3613902906975422, |
| "learning_rate": 7.65866735467988e-06, |
| "loss": 2.2603, |
| "num_input_tokens_seen": 390070272, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.7472527472527473, |
| "grad_norm": 0.39453939305159935, |
| "learning_rate": 7.433750755028773e-06, |
| "loss": 2.2391, |
| "num_input_tokens_seen": 392167424, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.7512487512487512, |
| "grad_norm": 0.38139090397827197, |
| "learning_rate": 7.211608069767867e-06, |
| "loss": 2.2556, |
| "num_input_tokens_seen": 394264576, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.7552447552447552, |
| "grad_norm": 0.34293302336613524, |
| "learning_rate": 6.992274377802327e-06, |
| "loss": 2.2651, |
| "num_input_tokens_seen": 396361728, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.7592407592407593, |
| "grad_norm": 0.3529801238138127, |
| "learning_rate": 6.775784314464717e-06, |
| "loss": 2.3033, |
| "num_input_tokens_seen": 398458880, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7632367632367633, |
| "grad_norm": 0.3596835450126056, |
| "learning_rate": 6.562172066045655e-06, |
| "loss": 2.2464, |
| "num_input_tokens_seen": 400556032, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.7672327672327672, |
| "grad_norm": 0.3529032326443918, |
| "learning_rate": 6.3514713643954475e-06, |
| "loss": 2.3033, |
| "num_input_tokens_seen": 402653184, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.7712287712287712, |
| "grad_norm": 0.35479404201249337, |
| "learning_rate": 6.143715481597404e-06, |
| "loss": 2.2197, |
| "num_input_tokens_seen": 404750336, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.7752247752247752, |
| "grad_norm": 0.41764946351033705, |
| "learning_rate": 5.9389372247138e-06, |
| "loss": 2.2838, |
| "num_input_tokens_seen": 406847488, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.7792207792207793, |
| "grad_norm": 0.37035230076352776, |
| "learning_rate": 5.737168930605272e-06, |
| "loss": 2.338, |
| "num_input_tokens_seen": 408944640, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7832167832167832, |
| "grad_norm": 0.3611144507014344, |
| "learning_rate": 5.538442460824417e-06, |
| "loss": 2.3085, |
| "num_input_tokens_seen": 411041792, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.7872127872127872, |
| "grad_norm": 0.34270832986796484, |
| "learning_rate": 5.342789196584527e-06, |
| "loss": 2.2701, |
| "num_input_tokens_seen": 413138944, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.7912087912087912, |
| "grad_norm": 0.3499855740079436, |
| "learning_rate": 5.150240033804116e-06, |
| "loss": 2.2596, |
| "num_input_tokens_seen": 415236096, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.7952047952047953, |
| "grad_norm": 0.35091235500304685, |
| "learning_rate": 4.960825378228082e-06, |
| "loss": 2.2454, |
| "num_input_tokens_seen": 417333248, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.7992007992007992, |
| "grad_norm": 0.35572491493138064, |
| "learning_rate": 4.7745751406263165e-06, |
| "loss": 2.2892, |
| "num_input_tokens_seen": 419430400, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8031968031968032, |
| "grad_norm": 0.3656185720616, |
| "learning_rate": 4.591518732070402e-06, |
| "loss": 2.2484, |
| "num_input_tokens_seen": 421527552, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.8071928071928072, |
| "grad_norm": 0.35958523758269045, |
| "learning_rate": 4.411685059289314e-06, |
| "loss": 2.2715, |
| "num_input_tokens_seen": 423624704, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.8111888111888111, |
| "grad_norm": 0.34502433460955273, |
| "learning_rate": 4.235102520104681e-06, |
| "loss": 2.3122, |
| "num_input_tokens_seen": 425721856, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.8151848151848152, |
| "grad_norm": 0.3333844525910081, |
| "learning_rate": 4.061798998946459e-06, |
| "loss": 2.2649, |
| "num_input_tokens_seen": 427819008, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.8191808191808192, |
| "grad_norm": 0.34568730328624064, |
| "learning_rate": 3.891801862449629e-06, |
| "loss": 2.2721, |
| "num_input_tokens_seen": 429916160, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8231768231768232, |
| "grad_norm": 0.3376181415187624, |
| "learning_rate": 3.725137955132707e-06, |
| "loss": 2.2464, |
| "num_input_tokens_seen": 432013312, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.8271728271728271, |
| "grad_norm": 1.1666087674229744, |
| "learning_rate": 3.561833595158698e-06, |
| "loss": 2.2386, |
| "num_input_tokens_seen": 434110464, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.8311688311688312, |
| "grad_norm": 0.36376699370653237, |
| "learning_rate": 3.4019145701791184e-06, |
| "loss": 2.2312, |
| "num_input_tokens_seen": 436207616, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.8351648351648352, |
| "grad_norm": 0.34063507681263155, |
| "learning_rate": 3.245406133261858e-06, |
| "loss": 2.2347, |
| "num_input_tokens_seen": 438304768, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.8391608391608392, |
| "grad_norm": 0.34280358693585106, |
| "learning_rate": 3.092332998903416e-06, |
| "loss": 2.2765, |
| "num_input_tokens_seen": 440401920, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8431568431568431, |
| "grad_norm": 0.8610621578087045, |
| "learning_rate": 2.942719339126171e-06, |
| "loss": 2.2549, |
| "num_input_tokens_seen": 442499072, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.8471528471528471, |
| "grad_norm": 0.3619308517828453, |
| "learning_rate": 2.7965887796613884e-06, |
| "loss": 2.3026, |
| "num_input_tokens_seen": 444596224, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.8511488511488512, |
| "grad_norm": 0.5817614011276511, |
| "learning_rate": 2.6539643962184057e-06, |
| "loss": 2.2588, |
| "num_input_tokens_seen": 446693376, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.8551448551448552, |
| "grad_norm": 0.334222822712159, |
| "learning_rate": 2.514868710840723e-06, |
| "loss": 2.2712, |
| "num_input_tokens_seen": 448790528, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.8591408591408591, |
| "grad_norm": 1.4792940966004624, |
| "learning_rate": 2.379323688349516e-06, |
| "loss": 2.282, |
| "num_input_tokens_seen": 450887680, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8631368631368631, |
| "grad_norm": 0.34644597736640836, |
| "learning_rate": 2.2473507328751086e-06, |
| "loss": 2.2378, |
| "num_input_tokens_seen": 452984832, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.8671328671328671, |
| "grad_norm": 0.3365961839686141, |
| "learning_rate": 2.118970684477062e-06, |
| "loss": 2.1886, |
| "num_input_tokens_seen": 455081984, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.8711288711288712, |
| "grad_norm": 0.3636539876420837, |
| "learning_rate": 1.9942038158532407e-06, |
| "loss": 2.1706, |
| "num_input_tokens_seen": 457179136, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.8751248751248751, |
| "grad_norm": 0.32607457279943014, |
| "learning_rate": 1.8730698291385518e-06, |
| "loss": 2.276, |
| "num_input_tokens_seen": 459276288, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.8791208791208791, |
| "grad_norm": 0.33134848014564655, |
| "learning_rate": 1.7555878527937164e-06, |
| "loss": 2.24, |
| "num_input_tokens_seen": 461373440, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8831168831168831, |
| "grad_norm": 0.32978098587865856, |
| "learning_rate": 1.6417764385846996e-06, |
| "loss": 2.2343, |
| "num_input_tokens_seen": 463470592, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.8871128871128872, |
| "grad_norm": 0.331016575114396, |
| "learning_rate": 1.5316535586531483e-06, |
| "loss": 2.2695, |
| "num_input_tokens_seen": 465567744, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.8911088911088911, |
| "grad_norm": 0.35011048945385426, |
| "learning_rate": 1.425236602678387e-06, |
| "loss": 2.2426, |
| "num_input_tokens_seen": 467664896, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.8951048951048951, |
| "grad_norm": 0.3315052440372763, |
| "learning_rate": 1.3225423751313942e-06, |
| "loss": 2.338, |
| "num_input_tokens_seen": 469762048, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.8991008991008991, |
| "grad_norm": 4.3691224366012404, |
| "learning_rate": 1.2235870926211619e-06, |
| "loss": 2.2712, |
| "num_input_tokens_seen": 471859200, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.903096903096903, |
| "grad_norm": 0.35367270915680155, |
| "learning_rate": 1.1283863813339263e-06, |
| "loss": 2.2091, |
| "num_input_tokens_seen": 473956352, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.9070929070929071, |
| "grad_norm": 0.33425375011918684, |
| "learning_rate": 1.0369552745656013e-06, |
| "loss": 2.2301, |
| "num_input_tokens_seen": 476053504, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.9110889110889111, |
| "grad_norm": 0.3236353912937987, |
| "learning_rate": 9.493082103478517e-07, |
| "loss": 2.2485, |
| "num_input_tokens_seen": 478150656, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.9150849150849151, |
| "grad_norm": 0.32004900093967087, |
| "learning_rate": 8.65459029168153e-07, |
| "loss": 2.229, |
| "num_input_tokens_seen": 480247808, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.919080919080919, |
| "grad_norm": 0.3285834593094262, |
| "learning_rate": 7.854209717842231e-07, |
| "loss": 2.2467, |
| "num_input_tokens_seen": 482344960, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.3274324393769599, |
| "learning_rate": 7.092066771331507e-07, |
| "loss": 2.2881, |
| "num_input_tokens_seen": 484442112, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.9270729270729271, |
| "grad_norm": 0.323377738085483, |
| "learning_rate": 6.368281803355691e-07, |
| "loss": 2.2366, |
| "num_input_tokens_seen": 486539264, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.9310689310689311, |
| "grad_norm": 0.3394540923040774, |
| "learning_rate": 5.68296910795163e-07, |
| "loss": 2.217, |
| "num_input_tokens_seen": 488636416, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.935064935064935, |
| "grad_norm": 0.3214753885461269, |
| "learning_rate": 5.036236903938285e-07, |
| "loss": 2.2773, |
| "num_input_tokens_seen": 490733568, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.939060939060939, |
| "grad_norm": 0.3264220662536442, |
| "learning_rate": 4.4281873178278475e-07, |
| "loss": 2.248, |
| "num_input_tokens_seen": 492830720, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.9430569430569431, |
| "grad_norm": 0.34531517443289306, |
| "learning_rate": 3.8589163676986674e-07, |
| "loss": 2.2461, |
| "num_input_tokens_seen": 494927872, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.9470529470529471, |
| "grad_norm": 0.6927568998454873, |
| "learning_rate": 3.328513948032991e-07, |
| "loss": 2.2763, |
| "num_input_tokens_seen": 497025024, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.951048951048951, |
| "grad_norm": 1.9898788697751348, |
| "learning_rate": 2.8370638155215123e-07, |
| "loss": 2.2534, |
| "num_input_tokens_seen": 499122176, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.955044955044955, |
| "grad_norm": 0.3214426319279153, |
| "learning_rate": 2.384643575837203e-07, |
| "loss": 2.2104, |
| "num_input_tokens_seen": 501219328, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.9590409590409591, |
| "grad_norm": 0.32987014767880857, |
| "learning_rate": 1.9713246713805588e-07, |
| "loss": 2.2754, |
| "num_input_tokens_seen": 503316480, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9630369630369631, |
| "grad_norm": 0.32633907338773177, |
| "learning_rate": 1.5971723699979013e-07, |
| "loss": 2.1888, |
| "num_input_tokens_seen": 505413632, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.967032967032967, |
| "grad_norm": 0.3309525910484035, |
| "learning_rate": 1.2622457546749567e-07, |
| "loss": 2.2745, |
| "num_input_tokens_seen": 507510784, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.971028971028971, |
| "grad_norm": 0.32728796033384344, |
| "learning_rate": 9.665977142068738e-08, |
| "loss": 2.1886, |
| "num_input_tokens_seen": 509607936, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.975024975024975, |
| "grad_norm": 0.337955114826972, |
| "learning_rate": 7.102749348465165e-08, |
| "loss": 2.2253, |
| "num_input_tokens_seen": 511705088, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.9790209790209791, |
| "grad_norm": 0.31617587266422914, |
| "learning_rate": 4.9331789293211026e-08, |
| "loss": 2.2652, |
| "num_input_tokens_seen": 513802240, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.983016983016983, |
| "grad_norm": 0.31833465687238105, |
| "learning_rate": 3.157608484956332e-08, |
| "loss": 2.2791, |
| "num_input_tokens_seen": 515899392, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.987012987012987, |
| "grad_norm": 0.3291062130462829, |
| "learning_rate": 1.7763183985269883e-08, |
| "loss": 2.2311, |
| "num_input_tokens_seen": 517996544, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.991008991008991, |
| "grad_norm": 0.32088692616380343, |
| "learning_rate": 7.895267917501504e-09, |
| "loss": 2.2188, |
| "num_input_tokens_seen": 520093696, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.995004995004995, |
| "grad_norm": 0.7923180204462165, |
| "learning_rate": 1.973894904597207e-09, |
| "loss": 2.2549, |
| "num_input_tokens_seen": 522190848, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.999000999000999, |
| "grad_norm": 0.558944032911619, |
| "learning_rate": 0.0, |
| "loss": 2.2687, |
| "num_input_tokens_seen": 524288000, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 500, |
| "num_input_tokens_seen": 524288000, |
| "num_train_epochs": 1, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1048246763913216.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|