| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.994059405940594, | |
| "eval_steps": 500, | |
| "global_step": 378, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.015841584158415842, | |
| "grad_norm": 9.02661650060573, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 1.9636, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.031683168316831684, | |
| "grad_norm": 6.243135269339089, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.9061, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.047524752475247525, | |
| "grad_norm": 5.193323051829328, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6579, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.06336633663366337, | |
| "grad_norm": 5.161379651013384, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 1.4226, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.07920792079207921, | |
| "grad_norm": 3.1813181834190916, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 1.2698, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.09504950495049505, | |
| "grad_norm": 3.948535232635907, | |
| "learning_rate": 2e-05, | |
| "loss": 1.1848, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.11089108910891089, | |
| "grad_norm": 2.6512379356069657, | |
| "learning_rate": 1.999852647705027e-05, | |
| "loss": 1.2354, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.12673267326732673, | |
| "grad_norm": 3.032346402915996, | |
| "learning_rate": 1.9994106342455053e-05, | |
| "loss": 1.161, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.14257425742574256, | |
| "grad_norm": 2.1212186467865726, | |
| "learning_rate": 1.9986740898848306e-05, | |
| "loss": 1.1529, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.15841584158415842, | |
| "grad_norm": 2.135193559072075, | |
| "learning_rate": 1.9976432316860065e-05, | |
| "loss": 1.0521, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.17425742574257425, | |
| "grad_norm": 2.1958764723179547, | |
| "learning_rate": 1.9963183634476757e-05, | |
| "loss": 1.0078, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.1900990099009901, | |
| "grad_norm": 2.2450349863339, | |
| "learning_rate": 1.9946998756145894e-05, | |
| "loss": 0.9967, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.20594059405940593, | |
| "grad_norm": 2.509752007569588, | |
| "learning_rate": 1.99278824516254e-05, | |
| "loss": 1.0667, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.22178217821782178, | |
| "grad_norm": 2.201210356151739, | |
| "learning_rate": 1.990584035457797e-05, | |
| "loss": 1.0156, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.2376237623762376, | |
| "grad_norm": 2.1135504562735723, | |
| "learning_rate": 1.9880878960910772e-05, | |
| "loss": 0.9989, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.25346534653465347, | |
| "grad_norm": 2.0708467368043655, | |
| "learning_rate": 1.985300562686109e-05, | |
| "loss": 0.9244, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2693069306930693, | |
| "grad_norm": 1.9881447087099158, | |
| "learning_rate": 1.982222856682841e-05, | |
| "loss": 0.912, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.2851485148514851, | |
| "grad_norm": 2.1711952522187965, | |
| "learning_rate": 1.978855685095358e-05, | |
| "loss": 0.9678, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.300990099009901, | |
| "grad_norm": 1.8520892574995997, | |
| "learning_rate": 1.9752000402445824e-05, | |
| "loss": 0.9405, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.31683168316831684, | |
| "grad_norm": 2.330675399944507, | |
| "learning_rate": 1.9712569994658315e-05, | |
| "loss": 1.0241, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3326732673267327, | |
| "grad_norm": 1.9912264091727097, | |
| "learning_rate": 1.9670277247913205e-05, | |
| "loss": 0.9204, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.3485148514851485, | |
| "grad_norm": 2.123562111523868, | |
| "learning_rate": 1.9625134626077084e-05, | |
| "loss": 0.9179, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.36435643564356435, | |
| "grad_norm": 1.9790301716804515, | |
| "learning_rate": 1.9577155432887805e-05, | |
| "loss": 0.898, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3801980198019802, | |
| "grad_norm": 2.008624349315442, | |
| "learning_rate": 1.9526353808033827e-05, | |
| "loss": 0.9364, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 1.888190643885973, | |
| "learning_rate": 1.947274472298717e-05, | |
| "loss": 0.8907, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.41188118811881186, | |
| "grad_norm": 2.015566919384652, | |
| "learning_rate": 1.941634397659126e-05, | |
| "loss": 0.9009, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.4277227722772277, | |
| "grad_norm": 1.9768310339741355, | |
| "learning_rate": 1.9357168190404937e-05, | |
| "loss": 0.8975, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.44356435643564357, | |
| "grad_norm": 2.133682758220573, | |
| "learning_rate": 1.9295234803804005e-05, | |
| "loss": 0.9143, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.4594059405940594, | |
| "grad_norm": 1.9749224153125167, | |
| "learning_rate": 1.9230562068841764e-05, | |
| "loss": 0.9031, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.4752475247524752, | |
| "grad_norm": 1.8623068380901364, | |
| "learning_rate": 1.916316904487005e-05, | |
| "loss": 0.8818, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.4910891089108911, | |
| "grad_norm": 1.8392963872778836, | |
| "learning_rate": 1.909307559292236e-05, | |
| "loss": 0.8816, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.5069306930693069, | |
| "grad_norm": 1.8853129700358418, | |
| "learning_rate": 1.9020302369860708e-05, | |
| "loss": 0.9097, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.5227722772277228, | |
| "grad_norm": 2.137270128251829, | |
| "learning_rate": 1.8944870822287957e-05, | |
| "loss": 0.9296, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.5386138613861386, | |
| "grad_norm": 1.7161417851549035, | |
| "learning_rate": 1.8866803180227403e-05, | |
| "loss": 0.8762, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.5544554455445545, | |
| "grad_norm": 2.4673647833737093, | |
| "learning_rate": 1.8786122450571485e-05, | |
| "loss": 0.8757, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5702970297029702, | |
| "grad_norm": 1.9999583978057465, | |
| "learning_rate": 1.8702852410301556e-05, | |
| "loss": 0.9574, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.5861386138613861, | |
| "grad_norm": 1.908768010347383, | |
| "learning_rate": 1.861701759948068e-05, | |
| "loss": 0.864, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.601980198019802, | |
| "grad_norm": 1.9712540583735692, | |
| "learning_rate": 1.85286433140216e-05, | |
| "loss": 0.9225, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.6178217821782178, | |
| "grad_norm": 1.7764975660995597, | |
| "learning_rate": 1.8437755598231857e-05, | |
| "loss": 0.9098, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.6336633663366337, | |
| "grad_norm": 1.8462120353772475, | |
| "learning_rate": 1.8344381237138473e-05, | |
| "loss": 0.9059, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.6495049504950495, | |
| "grad_norm": 1.7841160221457657, | |
| "learning_rate": 1.8248547748594246e-05, | |
| "loss": 0.8664, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.6653465346534654, | |
| "grad_norm": 1.8561876013842784, | |
| "learning_rate": 1.8150283375168112e-05, | |
| "loss": 0.8621, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.6811881188118812, | |
| "grad_norm": 2.203573345708881, | |
| "learning_rate": 1.8049617075821962e-05, | |
| "loss": 0.8719, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.697029702970297, | |
| "grad_norm": 1.7796878917747134, | |
| "learning_rate": 1.794657851737625e-05, | |
| "loss": 0.8964, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.7128712871287128, | |
| "grad_norm": 1.838149293961002, | |
| "learning_rate": 1.7841198065767107e-05, | |
| "loss": 0.898, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.7287128712871287, | |
| "grad_norm": 1.903112272121249, | |
| "learning_rate": 1.77335067770973e-05, | |
| "loss": 0.8394, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.7445544554455445, | |
| "grad_norm": 1.968601238697486, | |
| "learning_rate": 1.7623536388483902e-05, | |
| "loss": 0.8807, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.7603960396039604, | |
| "grad_norm": 1.975435786088584, | |
| "learning_rate": 1.7511319308705198e-05, | |
| "loss": 0.829, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.7762376237623763, | |
| "grad_norm": 2.008008309376045, | |
| "learning_rate": 1.7396888608649673e-05, | |
| "loss": 0.8512, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.7920792079207921, | |
| "grad_norm": 1.8389509044704804, | |
| "learning_rate": 1.7280278011569848e-05, | |
| "loss": 0.8546, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.807920792079208, | |
| "grad_norm": 1.842123818268232, | |
| "learning_rate": 1.7161521883143936e-05, | |
| "loss": 0.8344, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.8237623762376237, | |
| "grad_norm": 1.7379472653976658, | |
| "learning_rate": 1.7040655221348057e-05, | |
| "loss": 0.8621, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.8396039603960396, | |
| "grad_norm": 1.8262243602907722, | |
| "learning_rate": 1.6917713646142222e-05, | |
| "loss": 0.8454, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.8554455445544554, | |
| "grad_norm": 1.656358917650264, | |
| "learning_rate": 1.679273338897293e-05, | |
| "loss": 0.8517, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.8712871287128713, | |
| "grad_norm": 1.741005790971336, | |
| "learning_rate": 1.6665751282095634e-05, | |
| "loss": 0.8372, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.8871287128712871, | |
| "grad_norm": 1.7417421499425474, | |
| "learning_rate": 1.653680474772006e-05, | |
| "loss": 0.8436, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.902970297029703, | |
| "grad_norm": 1.8414378471720574, | |
| "learning_rate": 1.6405931786981753e-05, | |
| "loss": 0.84, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.9188118811881189, | |
| "grad_norm": 1.710424834244549, | |
| "learning_rate": 1.6273170968742942e-05, | |
| "loss": 0.8252, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.9346534653465347, | |
| "grad_norm": 1.7602252189051653, | |
| "learning_rate": 1.613856141822612e-05, | |
| "loss": 0.8332, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.9504950495049505, | |
| "grad_norm": 1.7838059979323873, | |
| "learning_rate": 1.6002142805483686e-05, | |
| "loss": 0.8228, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.9663366336633663, | |
| "grad_norm": 1.7374417300201197, | |
| "learning_rate": 1.586395533370696e-05, | |
| "loss": 0.8231, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.9821782178217822, | |
| "grad_norm": 1.6853967583594676, | |
| "learning_rate": 1.572403972737815e-05, | |
| "loss": 0.8337, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.998019801980198, | |
| "grad_norm": 1.887105041037784, | |
| "learning_rate": 1.5582437220268648e-05, | |
| "loss": 0.8572, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.0138613861386139, | |
| "grad_norm": 1.6531514365483646, | |
| "learning_rate": 1.5439189543287247e-05, | |
| "loss": 0.6016, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.0297029702970297, | |
| "grad_norm": 1.736528180579246, | |
| "learning_rate": 1.529433891218185e-05, | |
| "loss": 0.5991, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.0455445544554456, | |
| "grad_norm": 1.7551648317268331, | |
| "learning_rate": 1.5147928015098309e-05, | |
| "loss": 0.5635, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.0613861386138614, | |
| "grad_norm": 1.8298936253985936, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.565, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.0772277227722773, | |
| "grad_norm": 1.7317255025108698, | |
| "learning_rate": 1.4850598461951963e-05, | |
| "loss": 0.5592, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.0930693069306932, | |
| "grad_norm": 1.748739410420368, | |
| "learning_rate": 1.4699767430273202e-05, | |
| "loss": 0.574, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.108910891089109, | |
| "grad_norm": 1.7869424929925428, | |
| "learning_rate": 1.454755135556106e-05, | |
| "loss": 0.5741, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1247524752475249, | |
| "grad_norm": 1.8524853975885442, | |
| "learning_rate": 1.4393995096591415e-05, | |
| "loss": 0.5454, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.1405940594059407, | |
| "grad_norm": 1.8345725351471323, | |
| "learning_rate": 1.423914390709861e-05, | |
| "loss": 0.547, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.1564356435643564, | |
| "grad_norm": 1.8060990165115691, | |
| "learning_rate": 1.4083043422438936e-05, | |
| "loss": 0.5953, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.1722772277227722, | |
| "grad_norm": 1.6416315510212192, | |
| "learning_rate": 1.3925739646141721e-05, | |
| "loss": 0.5412, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.188118811881188, | |
| "grad_norm": 1.664162470996372, | |
| "learning_rate": 1.3767278936351853e-05, | |
| "loss": 0.5293, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.203960396039604, | |
| "grad_norm": 1.6113229932234892, | |
| "learning_rate": 1.3607707992167836e-05, | |
| "loss": 0.5276, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.2198019801980198, | |
| "grad_norm": 1.799039577469552, | |
| "learning_rate": 1.3447073839879339e-05, | |
| "loss": 0.5569, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.2356435643564356, | |
| "grad_norm": 1.6604650735978537, | |
| "learning_rate": 1.3285423819108349e-05, | |
| "loss": 0.5435, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.2514851485148515, | |
| "grad_norm": 1.7636186077229687, | |
| "learning_rate": 1.3122805568857948e-05, | |
| "loss": 0.5837, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.2673267326732673, | |
| "grad_norm": 1.6232817216358415, | |
| "learning_rate": 1.2959267013472894e-05, | |
| "loss": 0.546, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.2831683168316832, | |
| "grad_norm": 1.6172712656831925, | |
| "learning_rate": 1.2794856348516095e-05, | |
| "loss": 0.5574, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.299009900990099, | |
| "grad_norm": 1.8102400872571573, | |
| "learning_rate": 1.2629622026565147e-05, | |
| "loss": 0.5796, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.314851485148515, | |
| "grad_norm": 1.7802623079191595, | |
| "learning_rate": 1.2463612742933148e-05, | |
| "loss": 0.5442, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.3306930693069308, | |
| "grad_norm": 1.7022382654116954, | |
| "learning_rate": 1.2296877421317958e-05, | |
| "loss": 0.585, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.3465346534653464, | |
| "grad_norm": 1.6401839574055097, | |
| "learning_rate": 1.2129465199384158e-05, | |
| "loss": 0.5397, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.3623762376237623, | |
| "grad_norm": 1.7101624547819703, | |
| "learning_rate": 1.196142541428197e-05, | |
| "loss": 0.5745, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.378217821782178, | |
| "grad_norm": 1.6535320779874634, | |
| "learning_rate": 1.1792807588107358e-05, | |
| "loss": 0.5357, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.394059405940594, | |
| "grad_norm": 1.6245529400611858, | |
| "learning_rate": 1.1623661413307638e-05, | |
| "loss": 0.5543, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.4099009900990098, | |
| "grad_norm": 1.6629345686182546, | |
| "learning_rate": 1.14540367380369e-05, | |
| "loss": 0.5237, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.4257425742574257, | |
| "grad_norm": 1.68909608575907, | |
| "learning_rate": 1.1283983551465512e-05, | |
| "loss": 0.5746, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.4415841584158415, | |
| "grad_norm": 1.7585724609639268, | |
| "learning_rate": 1.1113551969048088e-05, | |
| "loss": 0.542, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.4574257425742574, | |
| "grad_norm": 1.598454179166171, | |
| "learning_rate": 1.0942792217754245e-05, | |
| "loss": 0.5217, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.4732673267326732, | |
| "grad_norm": 1.720537069197336, | |
| "learning_rate": 1.0771754621266466e-05, | |
| "loss": 0.5567, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.489108910891089, | |
| "grad_norm": 1.6147204766507826, | |
| "learning_rate": 1.0600489585149485e-05, | |
| "loss": 0.5509, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.504950495049505, | |
| "grad_norm": 1.7292799453559213, | |
| "learning_rate": 1.0429047581995547e-05, | |
| "loss": 0.5479, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.5207920792079208, | |
| "grad_norm": 1.7407208786866153, | |
| "learning_rate": 1.0257479136549889e-05, | |
| "loss": 0.5539, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.5366336633663367, | |
| "grad_norm": 1.6829968026665005, | |
| "learning_rate": 1.0085834810820871e-05, | |
| "loss": 0.5236, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.5524752475247525, | |
| "grad_norm": 1.5077533197060748, | |
| "learning_rate": 9.914165189179132e-06, | |
| "loss": 0.5492, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.5683168316831684, | |
| "grad_norm": 1.619225808673778, | |
| "learning_rate": 9.742520863450116e-06, | |
| "loss": 0.5435, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.5841584158415842, | |
| "grad_norm": 1.6946922694133721, | |
| "learning_rate": 9.570952418004455e-06, | |
| "loss": 0.5628, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.7264082675727195, | |
| "learning_rate": 9.399510414850518e-06, | |
| "loss": 0.5405, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.615841584158416, | |
| "grad_norm": 1.668467187331713, | |
| "learning_rate": 9.228245378733537e-06, | |
| "loss": 0.5503, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.6316831683168318, | |
| "grad_norm": 1.6228977289298525, | |
| "learning_rate": 9.057207782245756e-06, | |
| "loss": 0.552, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.6475247524752477, | |
| "grad_norm": 1.6720968963932732, | |
| "learning_rate": 8.886448030951912e-06, | |
| "loss": 0.5266, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.6633663366336635, | |
| "grad_norm": 1.6117561664476598, | |
| "learning_rate": 8.71601644853449e-06, | |
| "loss": 0.5439, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.6792079207920794, | |
| "grad_norm": 1.7309748707062824, | |
| "learning_rate": 8.545963261963102e-06, | |
| "loss": 0.5518, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.695049504950495, | |
| "grad_norm": 1.727253391380127, | |
| "learning_rate": 8.376338586692367e-06, | |
| "loss": 0.5689, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.7108910891089109, | |
| "grad_norm": 1.6465767825228266, | |
| "learning_rate": 8.207192411892645e-06, | |
| "loss": 0.5263, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.7267326732673267, | |
| "grad_norm": 1.7061559526686028, | |
| "learning_rate": 8.038574585718032e-06, | |
| "loss": 0.561, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.7425742574257426, | |
| "grad_norm": 1.641091211390163, | |
| "learning_rate": 7.870534800615845e-06, | |
| "loss": 0.5279, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.7584158415841584, | |
| "grad_norm": 1.703349596429662, | |
| "learning_rate": 7.703122578682047e-06, | |
| "loss": 0.5496, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.7742574257425743, | |
| "grad_norm": 1.6149104022531628, | |
| "learning_rate": 7.536387257066854e-06, | |
| "loss": 0.5592, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.7900990099009901, | |
| "grad_norm": 1.5562454704981712, | |
| "learning_rate": 7.370377973434854e-06, | |
| "loss": 0.5439, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.805940594059406, | |
| "grad_norm": 1.8575887871163193, | |
| "learning_rate": 7.2051436514839064e-06, | |
| "loss": 0.5488, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.8217821782178216, | |
| "grad_norm": 1.584715339525317, | |
| "learning_rate": 7.040732986527108e-06, | |
| "loss": 0.5308, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8376237623762375, | |
| "grad_norm": 1.501398258461206, | |
| "learning_rate": 6.877194431142055e-06, | |
| "loss": 0.5239, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.8534653465346533, | |
| "grad_norm": 1.6132499536201124, | |
| "learning_rate": 6.714576180891653e-06, | |
| "loss": 0.5522, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.8693069306930692, | |
| "grad_norm": 1.6315843930884224, | |
| "learning_rate": 6.552926160120663e-06, | |
| "loss": 0.5077, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.885148514851485, | |
| "grad_norm": 1.7272400167492237, | |
| "learning_rate": 6.3922920078321685e-06, | |
| "loss": 0.5633, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.900990099009901, | |
| "grad_norm": 1.6689075327553573, | |
| "learning_rate": 6.232721063648148e-06, | |
| "loss": 0.5055, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.9168316831683168, | |
| "grad_norm": 1.578818325825261, | |
| "learning_rate": 6.074260353858283e-06, | |
| "loss": 0.5389, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.9326732673267326, | |
| "grad_norm": 1.7205801010677875, | |
| "learning_rate": 5.916956577561066e-06, | |
| "loss": 0.5562, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.9485148514851485, | |
| "grad_norm": 1.6477307933447243, | |
| "learning_rate": 5.760856092901394e-06, | |
| "loss": 0.5193, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.9643564356435643, | |
| "grad_norm": 1.5685239373769044, | |
| "learning_rate": 5.6060049034085815e-06, | |
| "loss": 0.5512, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.9801980198019802, | |
| "grad_norm": 1.6936425396642782, | |
| "learning_rate": 5.4524486444389455e-06, | |
| "loss": 0.5584, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.996039603960396, | |
| "grad_norm": 1.7086317273357263, | |
| "learning_rate": 5.300232569726805e-06, | |
| "loss": 0.5644, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.011881188118812, | |
| "grad_norm": 1.5710974953396362, | |
| "learning_rate": 5.14940153804804e-06, | |
| "loss": 0.3551, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.0277227722772277, | |
| "grad_norm": 1.3779991120357036, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.3204, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.0435643564356436, | |
| "grad_norm": 1.3532509589808863, | |
| "learning_rate": 4.852071984901696e-06, | |
| "loss": 0.3381, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.0594059405940595, | |
| "grad_norm": 1.5655956293278384, | |
| "learning_rate": 4.705661087818149e-06, | |
| "loss": 0.3114, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.0752475247524753, | |
| "grad_norm": 1.8047288146207374, | |
| "learning_rate": 4.560810456712754e-06, | |
| "loss": 0.3167, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.091089108910891, | |
| "grad_norm": 1.5969295392705383, | |
| "learning_rate": 4.417562779731355e-06, | |
| "loss": 0.2968, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.106930693069307, | |
| "grad_norm": 1.5043862632074243, | |
| "learning_rate": 4.275960272621852e-06, | |
| "loss": 0.2873, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.122772277227723, | |
| "grad_norm": 1.4566588428064557, | |
| "learning_rate": 4.1360446662930445e-06, | |
| "loss": 0.2906, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.1386138613861387, | |
| "grad_norm": 1.4148675515775622, | |
| "learning_rate": 3.997857194516319e-06, | |
| "loss": 0.2902, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.1544554455445546, | |
| "grad_norm": 1.4039039705486787, | |
| "learning_rate": 3.86143858177388e-06, | |
| "loss": 0.3055, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.1702970297029704, | |
| "grad_norm": 1.4590478348079479, | |
| "learning_rate": 3.7268290312570622e-06, | |
| "loss": 0.2887, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.1861386138613863, | |
| "grad_norm": 1.4904023110394045, | |
| "learning_rate": 3.594068213018249e-06, | |
| "loss": 0.2838, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.201980198019802, | |
| "grad_norm": 1.4223228096195222, | |
| "learning_rate": 3.4631952522799396e-06, | |
| "loss": 0.2816, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.217821782178218, | |
| "grad_norm": 1.4645691354886492, | |
| "learning_rate": 3.334248717904368e-06, | |
| "loss": 0.304, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.233663366336634, | |
| "grad_norm": 1.446048636808659, | |
| "learning_rate": 3.207266611027069e-06, | |
| "loss": 0.3098, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.2495049504950497, | |
| "grad_norm": 1.4585412000734919, | |
| "learning_rate": 3.082286353857782e-06, | |
| "loss": 0.2881, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.2653465346534656, | |
| "grad_norm": 1.4508547813683121, | |
| "learning_rate": 2.9593447786519424e-06, | |
| "loss": 0.2961, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.2811881188118814, | |
| "grad_norm": 1.4305009349332931, | |
| "learning_rate": 2.8384781168560693e-06, | |
| "loss": 0.3015, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.297029702970297, | |
| "grad_norm": 1.3742813840340038, | |
| "learning_rate": 2.719721988430153e-06, | |
| "loss": 0.2826, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.3128712871287127, | |
| "grad_norm": 1.4476247412504035, | |
| "learning_rate": 2.6031113913503337e-06, | |
| "loss": 0.2768, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.3287128712871286, | |
| "grad_norm": 1.3974509182841102, | |
| "learning_rate": 2.4886806912948034e-06, | |
| "loss": 0.2669, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.3445544554455444, | |
| "grad_norm": 1.465709495723541, | |
| "learning_rate": 2.376463611516098e-06, | |
| "loss": 0.2952, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.3603960396039603, | |
| "grad_norm": 1.4155568971087533, | |
| "learning_rate": 2.2664932229027025e-06, | |
| "loss": 0.2863, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.376237623762376, | |
| "grad_norm": 1.4443007558100314, | |
| "learning_rate": 2.158801934232897e-06, | |
| "loss": 0.286, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.392079207920792, | |
| "grad_norm": 1.3965088698230241, | |
| "learning_rate": 2.0534214826237486e-06, | |
| "loss": 0.2783, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.407920792079208, | |
| "grad_norm": 1.4407127199864882, | |
| "learning_rate": 1.9503829241780416e-06, | |
| "loss": 0.2852, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.4237623762376237, | |
| "grad_norm": 1.4265187454796464, | |
| "learning_rate": 1.8497166248318876e-06, | |
| "loss": 0.2945, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.4396039603960396, | |
| "grad_norm": 1.5393606880372226, | |
| "learning_rate": 1.7514522514057552e-06, | |
| "loss": 0.2906, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.4554455445544554, | |
| "grad_norm": 1.509234066590099, | |
| "learning_rate": 1.6556187628615273e-06, | |
| "loss": 0.2803, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.4712871287128713, | |
| "grad_norm": 1.4716057289372597, | |
| "learning_rate": 1.5622444017681438e-06, | |
| "loss": 0.2834, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.487128712871287, | |
| "grad_norm": 1.4208546193648086, | |
| "learning_rate": 1.4713566859784045e-06, | |
| "loss": 0.2807, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.502970297029703, | |
| "grad_norm": 1.4813272843019751, | |
| "learning_rate": 1.3829824005193183e-06, | |
| "loss": 0.2606, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.518811881188119, | |
| "grad_norm": 1.4317127596411277, | |
| "learning_rate": 1.2971475896984475e-06, | |
| "loss": 0.2872, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.5346534653465347, | |
| "grad_norm": 1.4190350255812814, | |
| "learning_rate": 1.2138775494285181e-06, | |
| "loss": 0.2849, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.5504950495049505, | |
| "grad_norm": 1.3786354752806598, | |
| "learning_rate": 1.1331968197725985e-06, | |
| "loss": 0.2748, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.5663366336633664, | |
| "grad_norm": 1.3687349820385863, | |
| "learning_rate": 1.0551291777120465e-06, | |
| "loss": 0.2637, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.5821782178217823, | |
| "grad_norm": 1.5296662542963093, | |
| "learning_rate": 9.796976301392935e-07, | |
| "loss": 0.2763, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.598019801980198, | |
| "grad_norm": 1.438465874033189, | |
| "learning_rate": 9.069244070776428e-07, | |
| "loss": 0.3135, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.613861386138614, | |
| "grad_norm": 1.423122514861154, | |
| "learning_rate": 8.368309551299536e-07, | |
| "loss": 0.2867, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.62970297029703, | |
| "grad_norm": 1.479495787675627, | |
| "learning_rate": 7.694379311582401e-07, | |
| "loss": 0.284, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.6455445544554457, | |
| "grad_norm": 1.4083216396155038, | |
| "learning_rate": 7.047651961959978e-07, | |
| "loss": 0.2815, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.6613861386138615, | |
| "grad_norm": 1.3782207183520463, | |
| "learning_rate": 6.428318095950648e-07, | |
| "loss": 0.2746, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.6772277227722774, | |
| "grad_norm": 1.4227248154028977, | |
| "learning_rate": 5.836560234087418e-07, | |
| "loss": 0.2646, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.693069306930693, | |
| "grad_norm": 1.4574818091415243, | |
| "learning_rate": 5.272552770128314e-07, | |
| "loss": 0.2725, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7089108910891087, | |
| "grad_norm": 1.3601121324659562, | |
| "learning_rate": 4.73646191966175e-07, | |
| "loss": 0.2609, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.7247524752475245, | |
| "grad_norm": 1.4223626310719977, | |
| "learning_rate": 4.2284456711219723e-07, | |
| "loss": 0.2949, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.7405940594059404, | |
| "grad_norm": 1.3931852574014099, | |
| "learning_rate": 3.748653739229191e-07, | |
| "loss": 0.2818, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.756435643564356, | |
| "grad_norm": 1.4108773420640486, | |
| "learning_rate": 3.2972275208679625e-07, | |
| "loss": 0.2888, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.772277227722772, | |
| "grad_norm": 1.3496089808630627, | |
| "learning_rate": 2.8743000534168673e-07, | |
| "loss": 0.2651, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.788118811881188, | |
| "grad_norm": 1.4057990507827347, | |
| "learning_rate": 2.479995975541749e-07, | |
| "loss": 0.2895, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.803960396039604, | |
| "grad_norm": 1.4102665625188138, | |
| "learning_rate": 2.1144314904642194e-07, | |
| "loss": 0.2609, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.8198019801980196, | |
| "grad_norm": 1.3292831585550606, | |
| "learning_rate": 1.7777143317159407e-07, | |
| "loss": 0.2717, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.8356435643564355, | |
| "grad_norm": 1.3815722866503821, | |
| "learning_rate": 1.4699437313891007e-07, | |
| "loss": 0.2941, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.8514851485148514, | |
| "grad_norm": 1.4033894606032193, | |
| "learning_rate": 1.1912103908922945e-07, | |
| "loss": 0.2747, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.867326732673267, | |
| "grad_norm": 1.463974792809012, | |
| "learning_rate": 9.415964542203059e-08, | |
| "loss": 0.2759, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.883168316831683, | |
| "grad_norm": 1.3556200240858571, | |
| "learning_rate": 7.21175483745995e-08, | |
| "loss": 0.2728, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.899009900990099, | |
| "grad_norm": 1.4407004492072946, | |
| "learning_rate": 5.300124385410943e-08, | |
| "loss": 0.255, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.9148514851485148, | |
| "grad_norm": 1.3953279840487336, | |
| "learning_rate": 3.681636552324452e-08, | |
| "loss": 0.2831, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.9306930693069306, | |
| "grad_norm": 1.3987888464947156, | |
| "learning_rate": 2.3567683139936736e-08, | |
| "loss": 0.2607, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.9465346534653465, | |
| "grad_norm": 1.4508777672512516, | |
| "learning_rate": 1.325910115169471e-08, | |
| "loss": 0.2599, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.9623762376237623, | |
| "grad_norm": 1.3800670482596784, | |
| "learning_rate": 5.8936575449475284e-09, | |
| "loss": 0.2661, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.978217821782178, | |
| "grad_norm": 1.394978304799224, | |
| "learning_rate": 1.47352294973091e-09, | |
| "loss": 0.286, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.994059405940594, | |
| "grad_norm": 1.4850311019997162, | |
| "learning_rate": 0.0, | |
| "loss": 0.285, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.994059405940594, | |
| "step": 378, | |
| "total_flos": 1243685281333248.0, | |
| "train_loss": 0.6043043333701986, | |
| "train_runtime": 10201.0989, | |
| "train_samples_per_second": 1.188, | |
| "train_steps_per_second": 0.037 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 378, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 32, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1243685281333248.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |