| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 249, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012048192771084338, | |
| "grad_norm": 40.75222396850586, | |
| "learning_rate": 9.99960204377842e-06, | |
| "loss": 0.8283, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.024096385542168676, | |
| "grad_norm": 7.6217546463012695, | |
| "learning_rate": 9.99840823846134e-06, | |
| "loss": 0.7463, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03614457831325301, | |
| "grad_norm": 4.7737321853637695, | |
| "learning_rate": 9.996418774081658e-06, | |
| "loss": 0.6773, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04819277108433735, | |
| "grad_norm": 7.380457401275635, | |
| "learning_rate": 9.99363396732727e-06, | |
| "loss": 0.7069, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.060240963855421686, | |
| "grad_norm": 6.07143497467041, | |
| "learning_rate": 9.990054261490643e-06, | |
| "loss": 0.8157, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.07228915662650602, | |
| "grad_norm": 32.628204345703125, | |
| "learning_rate": 9.985680226398261e-06, | |
| "loss": 0.7604, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.08433734939759036, | |
| "grad_norm": 7.0896759033203125, | |
| "learning_rate": 9.980512558319915e-06, | |
| "loss": 0.6947, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0963855421686747, | |
| "grad_norm": 4.166346549987793, | |
| "learning_rate": 9.974552079857873e-06, | |
| "loss": 0.5901, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.10843373493975904, | |
| "grad_norm": 5.307025909423828, | |
| "learning_rate": 9.967799739815925e-06, | |
| "loss": 0.6768, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.12048192771084337, | |
| "grad_norm": 5.205200672149658, | |
| "learning_rate": 9.960256613048367e-06, | |
| "loss": 0.7401, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.13253012048192772, | |
| "grad_norm": 4.746809482574463, | |
| "learning_rate": 9.951923900288888e-06, | |
| "loss": 0.603, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.14457831325301204, | |
| "grad_norm": 4.554243087768555, | |
| "learning_rate": 9.942802927959444e-06, | |
| "loss": 0.5951, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.1566265060240964, | |
| "grad_norm": 4.13496732711792, | |
| "learning_rate": 9.932895147959106e-06, | |
| "loss": 0.6246, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.1686746987951807, | |
| "grad_norm": 5.653600692749023, | |
| "learning_rate": 9.922202137432954e-06, | |
| "loss": 0.8116, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.18072289156626506, | |
| "grad_norm": 4.084902286529541, | |
| "learning_rate": 9.910725598521014e-06, | |
| "loss": 0.5243, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1927710843373494, | |
| "grad_norm": 4.84393835067749, | |
| "learning_rate": 9.89846735808731e-06, | |
| "loss": 0.5911, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.20481927710843373, | |
| "grad_norm": 3.5985801219940186, | |
| "learning_rate": 9.885429367429062e-06, | |
| "loss": 0.5873, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.21686746987951808, | |
| "grad_norm": 4.133760452270508, | |
| "learning_rate": 9.871613701966067e-06, | |
| "loss": 0.58, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.2289156626506024, | |
| "grad_norm": 5.736385345458984, | |
| "learning_rate": 9.857022560910338e-06, | |
| "loss": 0.6884, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.24096385542168675, | |
| "grad_norm": 5.400482177734375, | |
| "learning_rate": 9.84165826691602e-06, | |
| "loss": 0.7507, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.25301204819277107, | |
| "grad_norm": 3.2082321643829346, | |
| "learning_rate": 9.825523265709667e-06, | |
| "loss": 0.4539, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.26506024096385544, | |
| "grad_norm": 3.9605965614318848, | |
| "learning_rate": 9.808620125700925e-06, | |
| "loss": 0.5744, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.27710843373493976, | |
| "grad_norm": 3.652902603149414, | |
| "learning_rate": 9.790951537573686e-06, | |
| "loss": 0.4361, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2891566265060241, | |
| "grad_norm": 3.5659713745117188, | |
| "learning_rate": 9.772520313857777e-06, | |
| "loss": 0.4565, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.30120481927710846, | |
| "grad_norm": 5.866443157196045, | |
| "learning_rate": 9.753329388481261e-06, | |
| "loss": 0.6564, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.3132530120481928, | |
| "grad_norm": 5.043295383453369, | |
| "learning_rate": 9.733381816303395e-06, | |
| "loss": 0.5905, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.3253012048192771, | |
| "grad_norm": 4.576389789581299, | |
| "learning_rate": 9.712680772628365e-06, | |
| "loss": 0.5458, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.3373493975903614, | |
| "grad_norm": 2.964594602584839, | |
| "learning_rate": 9.691229552699817e-06, | |
| "loss": 0.4196, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.3493975903614458, | |
| "grad_norm": 3.668825387954712, | |
| "learning_rate": 9.669031571176322e-06, | |
| "loss": 0.5939, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.3614457831325301, | |
| "grad_norm": 3.4135804176330566, | |
| "learning_rate": 9.646090361587828e-06, | |
| "loss": 0.4942, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.37349397590361444, | |
| "grad_norm": 3.3271186351776123, | |
| "learning_rate": 9.622409575773162e-06, | |
| "loss": 0.4447, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3855421686746988, | |
| "grad_norm": 3.8561484813690186, | |
| "learning_rate": 9.597992983298748e-06, | |
| "loss": 0.5443, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.39759036144578314, | |
| "grad_norm": 3.4959912300109863, | |
| "learning_rate": 9.572844470858537e-06, | |
| "loss": 0.5228, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.40963855421686746, | |
| "grad_norm": 4.416797637939453, | |
| "learning_rate": 9.546968041655326e-06, | |
| "loss": 0.5745, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.42168674698795183, | |
| "grad_norm": 3.1685950756073, | |
| "learning_rate": 9.520367814763514e-06, | |
| "loss": 0.5249, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.43373493975903615, | |
| "grad_norm": 3.5792479515075684, | |
| "learning_rate": 9.493048024473413e-06, | |
| "loss": 0.5533, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4457831325301205, | |
| "grad_norm": 3.136587619781494, | |
| "learning_rate": 9.46501301961723e-06, | |
| "loss": 0.5707, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.4578313253012048, | |
| "grad_norm": 6.66333532333374, | |
| "learning_rate": 9.436267262876808e-06, | |
| "loss": 0.5537, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.46987951807228917, | |
| "grad_norm": 3.710054397583008, | |
| "learning_rate": 9.406815330073244e-06, | |
| "loss": 0.5072, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.4819277108433735, | |
| "grad_norm": 2.439741611480713, | |
| "learning_rate": 9.376661909438496e-06, | |
| "loss": 0.4088, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4939759036144578, | |
| "grad_norm": 2.6984646320343018, | |
| "learning_rate": 9.3458118008691e-06, | |
| "loss": 0.548, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.5060240963855421, | |
| "grad_norm": 2.626049757003784, | |
| "learning_rate": 9.314269915162115e-06, | |
| "loss": 0.541, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.5180722891566265, | |
| "grad_norm": 31.189899444580078, | |
| "learning_rate": 9.282041273233402e-06, | |
| "loss": 0.5461, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.5301204819277109, | |
| "grad_norm": 4.356227397918701, | |
| "learning_rate": 9.249131005318388e-06, | |
| "loss": 0.6082, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.5421686746987951, | |
| "grad_norm": 10.281394958496094, | |
| "learning_rate": 9.215544350155423e-06, | |
| "loss": 0.6193, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5542168674698795, | |
| "grad_norm": 81.10453796386719, | |
| "learning_rate": 9.18128665415186e-06, | |
| "loss": 0.4795, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.5662650602409639, | |
| "grad_norm": 136.2274932861328, | |
| "learning_rate": 9.146363370533004e-06, | |
| "loss": 0.5669, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.5783132530120482, | |
| "grad_norm": 24.73008155822754, | |
| "learning_rate": 9.110780058474052e-06, | |
| "loss": 0.4712, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.5903614457831325, | |
| "grad_norm": 3.0569868087768555, | |
| "learning_rate": 9.07454238221517e-06, | |
| "loss": 0.4934, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.6024096385542169, | |
| "grad_norm": 3.192237615585327, | |
| "learning_rate": 9.03765611015985e-06, | |
| "loss": 0.5427, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6144578313253012, | |
| "grad_norm": 1.920320749282837, | |
| "learning_rate": 9.000127113956673e-06, | |
| "loss": 0.4182, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.6265060240963856, | |
| "grad_norm": 3.1197104454040527, | |
| "learning_rate": 8.961961367564652e-06, | |
| "loss": 0.5577, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.6385542168674698, | |
| "grad_norm": 2.1309397220611572, | |
| "learning_rate": 8.923164946302274e-06, | |
| "loss": 0.5111, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6506024096385542, | |
| "grad_norm": 2.3042995929718018, | |
| "learning_rate": 8.883744025880429e-06, | |
| "loss": 0.5015, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.6626506024096386, | |
| "grad_norm": 2.4492433071136475, | |
| "learning_rate": 8.843704881419333e-06, | |
| "loss": 0.3826, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6746987951807228, | |
| "grad_norm": 2.3031723499298096, | |
| "learning_rate": 8.803053886449644e-06, | |
| "loss": 0.4694, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.6867469879518072, | |
| "grad_norm": 3.1464896202087402, | |
| "learning_rate": 8.761797511897907e-06, | |
| "loss": 0.5708, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.6987951807228916, | |
| "grad_norm": 2.5254249572753906, | |
| "learning_rate": 8.719942325056496e-06, | |
| "loss": 0.5605, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.7108433734939759, | |
| "grad_norm": 2.614318370819092, | |
| "learning_rate": 8.67749498853821e-06, | |
| "loss": 0.5702, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.7228915662650602, | |
| "grad_norm": 2.1782386302948, | |
| "learning_rate": 8.634462259215719e-06, | |
| "loss": 0.5409, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7349397590361446, | |
| "grad_norm": 2.084237813949585, | |
| "learning_rate": 8.590850987145964e-06, | |
| "loss": 0.4923, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.7469879518072289, | |
| "grad_norm": 2.4142396450042725, | |
| "learning_rate": 8.546668114479769e-06, | |
| "loss": 0.6142, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.7590361445783133, | |
| "grad_norm": 1.6900039911270142, | |
| "learning_rate": 8.501920674356755e-06, | |
| "loss": 0.4445, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.7710843373493976, | |
| "grad_norm": 1.9757111072540283, | |
| "learning_rate": 8.456615789785804e-06, | |
| "loss": 0.491, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.7831325301204819, | |
| "grad_norm": 2.328930139541626, | |
| "learning_rate": 8.410760672511188e-06, | |
| "loss": 0.5563, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7951807228915663, | |
| "grad_norm": 2.8067822456359863, | |
| "learning_rate": 8.364362621864595e-06, | |
| "loss": 0.6574, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.8072289156626506, | |
| "grad_norm": 2.0766549110412598, | |
| "learning_rate": 8.31742902360319e-06, | |
| "loss": 0.5063, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.8192771084337349, | |
| "grad_norm": 2.085911989212036, | |
| "learning_rate": 8.269967348733947e-06, | |
| "loss": 0.5504, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.8313253012048193, | |
| "grad_norm": 1.8254350423812866, | |
| "learning_rate": 8.221985152324385e-06, | |
| "loss": 0.4678, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.8433734939759037, | |
| "grad_norm": 2.208496332168579, | |
| "learning_rate": 8.17349007229994e-06, | |
| "loss": 0.5589, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8554216867469879, | |
| "grad_norm": 2.833843469619751, | |
| "learning_rate": 8.124489828228136e-06, | |
| "loss": 0.6464, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.8674698795180723, | |
| "grad_norm": 2.181140661239624, | |
| "learning_rate": 8.07499222008977e-06, | |
| "loss": 0.6037, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.8795180722891566, | |
| "grad_norm": 1.5879639387130737, | |
| "learning_rate": 8.025005127037282e-06, | |
| "loss": 0.4077, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.891566265060241, | |
| "grad_norm": 1.94895601272583, | |
| "learning_rate": 7.974536506140546e-06, | |
| "loss": 0.4523, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.9036144578313253, | |
| "grad_norm": 2.282900810241699, | |
| "learning_rate": 7.923594391120237e-06, | |
| "loss": 0.4889, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.9156626506024096, | |
| "grad_norm": 1.8225998878479004, | |
| "learning_rate": 7.872186891068997e-06, | |
| "loss": 0.4483, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.927710843373494, | |
| "grad_norm": 2.1921205520629883, | |
| "learning_rate": 7.820322189160618e-06, | |
| "loss": 0.4848, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.9397590361445783, | |
| "grad_norm": 1.9695558547973633, | |
| "learning_rate": 7.768008541347423e-06, | |
| "loss": 0.4577, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.9518072289156626, | |
| "grad_norm": 2.367926836013794, | |
| "learning_rate": 7.715254275046062e-06, | |
| "loss": 0.6004, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 1.95900297164917, | |
| "learning_rate": 7.66206778781193e-06, | |
| "loss": 0.5161, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9759036144578314, | |
| "grad_norm": 4.2675557136535645, | |
| "learning_rate": 7.608457546002423e-06, | |
| "loss": 0.4645, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.9879518072289156, | |
| "grad_norm": 2.129870891571045, | |
| "learning_rate": 7.554432083429253e-06, | |
| "loss": 0.5267, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.7695404291152954, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.3909, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.0120481927710843, | |
| "grad_norm": 2.0876364707946777, | |
| "learning_rate": 7.445169960349167e-06, | |
| "loss": 0.3333, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.0240963855421688, | |
| "grad_norm": 1.5992554426193237, | |
| "learning_rate": 7.389950692458916e-06, | |
| "loss": 0.3103, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.036144578313253, | |
| "grad_norm": 2.081721544265747, | |
| "learning_rate": 7.3343509862697295e-06, | |
| "loss": 0.286, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.0481927710843373, | |
| "grad_norm": 1.5453327894210815, | |
| "learning_rate": 7.278379692281209e-06, | |
| "loss": 0.2851, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.0602409638554218, | |
| "grad_norm": 1.6960233449935913, | |
| "learning_rate": 7.22204572014322e-06, | |
| "loss": 0.3118, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.072289156626506, | |
| "grad_norm": 1.6961935758590698, | |
| "learning_rate": 7.165358037237644e-06, | |
| "loss": 0.3024, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.0843373493975903, | |
| "grad_norm": 1.9473631381988525, | |
| "learning_rate": 7.10832566725092e-06, | |
| "loss": 0.3262, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0963855421686748, | |
| "grad_norm": 1.5019605159759521, | |
| "learning_rate": 7.0509576887376375e-06, | |
| "loss": 0.23, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.108433734939759, | |
| "grad_norm": 1.7088998556137085, | |
| "learning_rate": 6.99326323367538e-06, | |
| "loss": 0.2511, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.1204819277108433, | |
| "grad_norm": 2.8957417011260986, | |
| "learning_rate": 6.9352514860110876e-06, | |
| "loss": 0.3191, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.1325301204819278, | |
| "grad_norm": 1.71742844581604, | |
| "learning_rate": 6.876931680199121e-06, | |
| "loss": 0.2792, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.144578313253012, | |
| "grad_norm": 1.615378975868225, | |
| "learning_rate": 6.818313099731308e-06, | |
| "loss": 0.2653, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.1566265060240963, | |
| "grad_norm": 1.4427539110183716, | |
| "learning_rate": 6.759405075659165e-06, | |
| "loss": 0.2909, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.1686746987951806, | |
| "grad_norm": 1.1839165687561035, | |
| "learning_rate": 6.700216985108568e-06, | |
| "loss": 0.1959, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.180722891566265, | |
| "grad_norm": 1.7143460512161255, | |
| "learning_rate": 6.640758249787067e-06, | |
| "loss": 0.2841, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.1927710843373494, | |
| "grad_norm": 1.3873624801635742, | |
| "learning_rate": 6.58103833448412e-06, | |
| "loss": 0.2838, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.2048192771084336, | |
| "grad_norm": 1.8592312335968018, | |
| "learning_rate": 6.521066745564467e-06, | |
| "loss": 0.2963, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.216867469879518, | |
| "grad_norm": 1.608494758605957, | |
| "learning_rate": 6.460853029454879e-06, | |
| "loss": 0.2877, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.2289156626506024, | |
| "grad_norm": 1.8831335306167603, | |
| "learning_rate": 6.4004067711245366e-06, | |
| "loss": 0.3066, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.2409638554216866, | |
| "grad_norm": 1.743905782699585, | |
| "learning_rate": 6.3397375925592675e-06, | |
| "loss": 0.3099, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.2530120481927711, | |
| "grad_norm": 1.8759677410125732, | |
| "learning_rate": 6.2788551512299014e-06, | |
| "loss": 0.2914, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.2650602409638554, | |
| "grad_norm": 1.7082366943359375, | |
| "learning_rate": 6.2177691385549595e-06, | |
| "loss": 0.2931, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.2771084337349397, | |
| "grad_norm": 1.519975185394287, | |
| "learning_rate": 6.156489278357967e-06, | |
| "loss": 0.2499, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.2891566265060241, | |
| "grad_norm": 1.8293309211730957, | |
| "learning_rate": 6.0950253253195656e-06, | |
| "loss": 0.3611, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.3012048192771084, | |
| "grad_norm": 1.728571891784668, | |
| "learning_rate": 6.033387063424765e-06, | |
| "loss": 0.3017, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.3132530120481927, | |
| "grad_norm": 1.6766902208328247, | |
| "learning_rate": 5.971584304405489e-06, | |
| "loss": 0.2823, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.3253012048192772, | |
| "grad_norm": 1.7143419981002808, | |
| "learning_rate": 5.909626886178721e-06, | |
| "loss": 0.2307, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.3373493975903614, | |
| "grad_norm": 1.5373152494430542, | |
| "learning_rate": 5.8475246712804845e-06, | |
| "loss": 0.2963, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.3493975903614457, | |
| "grad_norm": 1.8781455755233765, | |
| "learning_rate": 5.785287545295895e-06, | |
| "loss": 0.2874, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.3614457831325302, | |
| "grad_norm": 1.824504017829895, | |
| "learning_rate": 5.722925415285555e-06, | |
| "loss": 0.2454, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.3734939759036144, | |
| "grad_norm": 1.7806376218795776, | |
| "learning_rate": 5.660448208208513e-06, | |
| "loss": 0.3654, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.3855421686746987, | |
| "grad_norm": 1.5633933544158936, | |
| "learning_rate": 5.597865869342075e-06, | |
| "loss": 0.2931, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3975903614457832, | |
| "grad_norm": 1.8875840902328491, | |
| "learning_rate": 5.535188360698687e-06, | |
| "loss": 0.331, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.4096385542168675, | |
| "grad_norm": 1.404435634613037, | |
| "learning_rate": 5.472425659440157e-06, | |
| "loss": 0.246, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.4216867469879517, | |
| "grad_norm": 1.4050829410552979, | |
| "learning_rate": 5.409587756289462e-06, | |
| "loss": 0.2689, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.4337349397590362, | |
| "grad_norm": 1.5876859426498413, | |
| "learning_rate": 5.346684653940408e-06, | |
| "loss": 0.2645, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.4457831325301205, | |
| "grad_norm": 1.6692218780517578, | |
| "learning_rate": 5.2837263654653715e-06, | |
| "loss": 0.3155, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.4578313253012047, | |
| "grad_norm": 1.2533305883407593, | |
| "learning_rate": 5.2207229127213866e-06, | |
| "loss": 0.2112, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.4698795180722892, | |
| "grad_norm": 1.5980626344680786, | |
| "learning_rate": 5.157684324754858e-06, | |
| "loss": 0.2441, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.4819277108433735, | |
| "grad_norm": 1.6085745096206665, | |
| "learning_rate": 5.094620636205096e-06, | |
| "loss": 0.3087, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.4939759036144578, | |
| "grad_norm": 1.7097792625427246, | |
| "learning_rate": 5.031541885706987e-06, | |
| "loss": 0.2499, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.5060240963855422, | |
| "grad_norm": 1.4703900814056396, | |
| "learning_rate": 4.9684581142930135e-06, | |
| "loss": 0.2413, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.5180722891566265, | |
| "grad_norm": 2.3154144287109375, | |
| "learning_rate": 4.905379363794907e-06, | |
| "loss": 0.3701, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.5301204819277108, | |
| "grad_norm": 1.665852427482605, | |
| "learning_rate": 4.842315675245144e-06, | |
| "loss": 0.2791, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.5421686746987953, | |
| "grad_norm": 1.7872849702835083, | |
| "learning_rate": 4.779277087278615e-06, | |
| "loss": 0.3303, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.5542168674698795, | |
| "grad_norm": 1.4255069494247437, | |
| "learning_rate": 4.71627363453463e-06, | |
| "loss": 0.2462, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.5662650602409638, | |
| "grad_norm": 1.8723397254943848, | |
| "learning_rate": 4.653315346059592e-06, | |
| "loss": 0.3083, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5783132530120483, | |
| "grad_norm": 1.6238393783569336, | |
| "learning_rate": 4.5904122437105384e-06, | |
| "loss": 0.2947, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.5903614457831325, | |
| "grad_norm": 1.5982369184494019, | |
| "learning_rate": 4.527574340559844e-06, | |
| "loss": 0.3114, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.6024096385542168, | |
| "grad_norm": 1.7584006786346436, | |
| "learning_rate": 4.464811639301314e-06, | |
| "loss": 0.3335, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.6144578313253013, | |
| "grad_norm": 1.7169082164764404, | |
| "learning_rate": 4.402134130657925e-06, | |
| "loss": 0.2783, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.6265060240963856, | |
| "grad_norm": 1.6119632720947266, | |
| "learning_rate": 4.33955179179149e-06, | |
| "loss": 0.252, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.6385542168674698, | |
| "grad_norm": 1.5756961107254028, | |
| "learning_rate": 4.277074584714447e-06, | |
| "loss": 0.2825, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.6506024096385543, | |
| "grad_norm": 1.511651873588562, | |
| "learning_rate": 4.214712454704107e-06, | |
| "loss": 0.2479, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.6626506024096386, | |
| "grad_norm": 1.354615330696106, | |
| "learning_rate": 4.152475328719517e-06, | |
| "loss": 0.2192, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.6746987951807228, | |
| "grad_norm": 1.821956753730774, | |
| "learning_rate": 4.090373113821281e-06, | |
| "loss": 0.2735, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.6867469879518073, | |
| "grad_norm": 1.4524273872375488, | |
| "learning_rate": 4.028415695594512e-06, | |
| "loss": 0.2222, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.6987951807228916, | |
| "grad_norm": 1.6997952461242676, | |
| "learning_rate": 3.966612936575235e-06, | |
| "loss": 0.2841, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.7108433734939759, | |
| "grad_norm": 1.5502634048461914, | |
| "learning_rate": 3.904974674680436e-06, | |
| "loss": 0.281, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.7228915662650603, | |
| "grad_norm": 1.6944836378097534, | |
| "learning_rate": 3.843510721642036e-06, | |
| "loss": 0.19, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.7349397590361446, | |
| "grad_norm": 1.958292007446289, | |
| "learning_rate": 3.782230861445041e-06, | |
| "loss": 0.3143, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.7469879518072289, | |
| "grad_norm": 1.9379884004592896, | |
| "learning_rate": 3.7211448487701002e-06, | |
| "loss": 0.2964, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.7590361445783134, | |
| "grad_norm": 1.6362128257751465, | |
| "learning_rate": 3.6602624074407354e-06, | |
| "loss": 0.2749, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.7710843373493976, | |
| "grad_norm": 1.740090250968933, | |
| "learning_rate": 3.5995932288754655e-06, | |
| "loss": 0.2572, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.783132530120482, | |
| "grad_norm": 1.3941646814346313, | |
| "learning_rate": 3.539146970545124e-06, | |
| "loss": 0.2476, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.7951807228915664, | |
| "grad_norm": 1.6419267654418945, | |
| "learning_rate": 3.478933254435534e-06, | |
| "loss": 0.2902, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.8072289156626506, | |
| "grad_norm": 1.825861930847168, | |
| "learning_rate": 3.4189616655158803e-06, | |
| "loss": 0.3345, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.819277108433735, | |
| "grad_norm": 1.749080777168274, | |
| "learning_rate": 3.359241750212934e-06, | |
| "loss": 0.314, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.8313253012048194, | |
| "grad_norm": 1.2390449047088623, | |
| "learning_rate": 3.2997830148914316e-06, | |
| "loss": 0.214, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.8433734939759037, | |
| "grad_norm": 1.6753946542739868, | |
| "learning_rate": 3.240594924340835e-06, | |
| "loss": 0.2988, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.855421686746988, | |
| "grad_norm": 1.6091383695602417, | |
| "learning_rate": 3.181686900268694e-06, | |
| "loss": 0.2481, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.8674698795180724, | |
| "grad_norm": 1.438892126083374, | |
| "learning_rate": 3.1230683198008817e-06, | |
| "loss": 0.2702, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.8795180722891565, | |
| "grad_norm": 1.931443691253662, | |
| "learning_rate": 3.0647485139889145e-06, | |
| "loss": 0.2957, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.891566265060241, | |
| "grad_norm": 1.5713204145431519, | |
| "learning_rate": 3.006736766324623e-06, | |
| "loss": 0.2815, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.9036144578313254, | |
| "grad_norm": 1.5962169170379639, | |
| "learning_rate": 2.9490423112623646e-06, | |
| "loss": 0.2791, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.9156626506024095, | |
| "grad_norm": 2.0020360946655273, | |
| "learning_rate": 2.89167433274908e-06, | |
| "loss": 0.3897, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.927710843373494, | |
| "grad_norm": 1.6599327325820923, | |
| "learning_rate": 2.834641962762358e-06, | |
| "loss": 0.2742, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.9397590361445785, | |
| "grad_norm": 1.6006088256835938, | |
| "learning_rate": 2.7779542798567804e-06, | |
| "loss": 0.2678, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.9518072289156625, | |
| "grad_norm": 1.5215158462524414, | |
| "learning_rate": 2.721620307718793e-06, | |
| "loss": 0.3035, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.963855421686747, | |
| "grad_norm": 1.8756093978881836, | |
| "learning_rate": 2.66564901373027e-06, | |
| "loss": 0.3407, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.9759036144578315, | |
| "grad_norm": 1.5014938116073608, | |
| "learning_rate": 2.610049307541085e-06, | |
| "loss": 0.2533, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.9879518072289155, | |
| "grad_norm": 1.6140003204345703, | |
| "learning_rate": 2.554830039650834e-06, | |
| "loss": 0.2369, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.5059895515441895, | |
| "learning_rate": 2.5000000000000015e-06, | |
| "loss": 0.1612, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 2.0120481927710845, | |
| "grad_norm": 1.2760642766952515, | |
| "learning_rate": 2.4455679165707473e-06, | |
| "loss": 0.1247, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 2.0240963855421685, | |
| "grad_norm": 1.3720568418502808, | |
| "learning_rate": 2.391542453997578e-06, | |
| "loss": 0.1618, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.036144578313253, | |
| "grad_norm": 1.4044466018676758, | |
| "learning_rate": 2.337932212188073e-06, | |
| "loss": 0.1427, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 2.0481927710843375, | |
| "grad_norm": 1.2212741374969482, | |
| "learning_rate": 2.284745724953939e-06, | |
| "loss": 0.1587, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.0602409638554215, | |
| "grad_norm": 1.1166741847991943, | |
| "learning_rate": 2.2319914586525776e-06, | |
| "loss": 0.1169, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.072289156626506, | |
| "grad_norm": 1.2007352113723755, | |
| "learning_rate": 2.1796778108393824e-06, | |
| "loss": 0.1232, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 2.0843373493975905, | |
| "grad_norm": 1.4228880405426025, | |
| "learning_rate": 2.127813108931007e-06, | |
| "loss": 0.1646, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 2.0963855421686746, | |
| "grad_norm": 1.2214866876602173, | |
| "learning_rate": 2.0764056088797646e-06, | |
| "loss": 0.1058, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.108433734939759, | |
| "grad_norm": 1.8072195053100586, | |
| "learning_rate": 2.0254634938594555e-06, | |
| "loss": 0.1579, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.1204819277108435, | |
| "grad_norm": 1.872309684753418, | |
| "learning_rate": 1.9749948729627188e-06, | |
| "loss": 0.138, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 2.1325301204819276, | |
| "grad_norm": 1.8318668603897095, | |
| "learning_rate": 1.9250077799102323e-06, | |
| "loss": 0.1331, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.144578313253012, | |
| "grad_norm": 2.1385316848754883, | |
| "learning_rate": 1.875510171771865e-06, | |
| "loss": 0.1635, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.1566265060240966, | |
| "grad_norm": 1.719831943511963, | |
| "learning_rate": 1.8265099277000614e-06, | |
| "loss": 0.1561, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.1686746987951806, | |
| "grad_norm": 1.7940328121185303, | |
| "learning_rate": 1.7780148476756148e-06, | |
| "loss": 0.14, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.180722891566265, | |
| "grad_norm": 1.3721911907196045, | |
| "learning_rate": 1.7300326512660542e-06, | |
| "loss": 0.1233, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.1927710843373496, | |
| "grad_norm": 1.2797173261642456, | |
| "learning_rate": 1.6825709763968112e-06, | |
| "loss": 0.0936, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.2048192771084336, | |
| "grad_norm": 1.5839323997497559, | |
| "learning_rate": 1.6356373781354058e-06, | |
| "loss": 0.1648, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.216867469879518, | |
| "grad_norm": 1.3700120449066162, | |
| "learning_rate": 1.589239327488812e-06, | |
| "loss": 0.126, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.2289156626506026, | |
| "grad_norm": 1.5171151161193848, | |
| "learning_rate": 1.543384210214196e-06, | |
| "loss": 0.1212, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.2409638554216866, | |
| "grad_norm": 1.6373289823532104, | |
| "learning_rate": 1.4980793256432474e-06, | |
| "loss": 0.1509, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.253012048192771, | |
| "grad_norm": 1.30360746383667, | |
| "learning_rate": 1.453331885520234e-06, | |
| "loss": 0.12, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.2650602409638556, | |
| "grad_norm": 1.395431399345398, | |
| "learning_rate": 1.4091490128540374e-06, | |
| "loss": 0.1406, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.2771084337349397, | |
| "grad_norm": 1.3656375408172607, | |
| "learning_rate": 1.3655377407842813e-06, | |
| "loss": 0.1706, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.289156626506024, | |
| "grad_norm": 1.189477562904358, | |
| "learning_rate": 1.32250501146179e-06, | |
| "loss": 0.1243, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.3012048192771086, | |
| "grad_norm": 1.273803949356079, | |
| "learning_rate": 1.2800576749435068e-06, | |
| "loss": 0.1132, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.3132530120481927, | |
| "grad_norm": 1.249987244606018, | |
| "learning_rate": 1.2382024881020937e-06, | |
| "loss": 0.133, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.325301204819277, | |
| "grad_norm": 1.2117363214492798, | |
| "learning_rate": 1.1969461135503573e-06, | |
| "loss": 0.1153, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 2.337349397590361, | |
| "grad_norm": 1.115524172782898, | |
| "learning_rate": 1.1562951185806675e-06, | |
| "loss": 0.1068, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 2.3493975903614457, | |
| "grad_norm": 1.2410939931869507, | |
| "learning_rate": 1.1162559741195733e-06, | |
| "loss": 0.0926, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.36144578313253, | |
| "grad_norm": 1.0989357233047485, | |
| "learning_rate": 1.076835053697728e-06, | |
| "loss": 0.1147, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 2.3734939759036147, | |
| "grad_norm": 1.2773900032043457, | |
| "learning_rate": 1.0380386324353508e-06, | |
| "loss": 0.131, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 2.3855421686746987, | |
| "grad_norm": 1.2643158435821533, | |
| "learning_rate": 9.998728860433277e-07, | |
| "loss": 0.1377, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.397590361445783, | |
| "grad_norm": 1.3157423734664917, | |
| "learning_rate": 9.62343889840151e-07, | |
| "loss": 0.127, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.4096385542168672, | |
| "grad_norm": 1.1823986768722534, | |
| "learning_rate": 9.254576177848313e-07, | |
| "loss": 0.1039, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.4216867469879517, | |
| "grad_norm": 1.2062366008758545, | |
| "learning_rate": 8.892199415259501e-07, | |
| "loss": 0.1137, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.433734939759036, | |
| "grad_norm": 1.358426570892334, | |
| "learning_rate": 8.536366294669979e-07, | |
| "loss": 0.1188, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.4457831325301207, | |
| "grad_norm": 1.4414290189743042, | |
| "learning_rate": 8.187133458481416e-07, | |
| "loss": 0.1393, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.4578313253012047, | |
| "grad_norm": 1.2111995220184326, | |
| "learning_rate": 7.844556498445788e-07, | |
| "loss": 0.1088, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.4698795180722892, | |
| "grad_norm": 1.0797122716903687, | |
| "learning_rate": 7.508689946816128e-07, | |
| "loss": 0.1012, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.4819277108433733, | |
| "grad_norm": 1.2910206317901611, | |
| "learning_rate": 7.179587267665999e-07, | |
| "loss": 0.1283, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.4939759036144578, | |
| "grad_norm": 1.76227867603302, | |
| "learning_rate": 6.857300848378857e-07, | |
| "loss": 0.1773, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.5060240963855422, | |
| "grad_norm": 1.2892178297042847, | |
| "learning_rate": 6.541881991309013e-07, | |
| "loss": 0.1003, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.5180722891566267, | |
| "grad_norm": 1.2142372131347656, | |
| "learning_rate": 6.233380905615049e-07, | |
| "loss": 0.1059, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.5301204819277108, | |
| "grad_norm": 1.3028932809829712, | |
| "learning_rate": 5.931846699267558e-07, | |
| "loss": 0.0997, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.5421686746987953, | |
| "grad_norm": 1.2703701257705688, | |
| "learning_rate": 5.637327371231921e-07, | |
| "loss": 0.1074, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.5542168674698793, | |
| "grad_norm": 1.6055101156234741, | |
| "learning_rate": 5.349869803827717e-07, | |
| "loss": 0.1635, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.566265060240964, | |
| "grad_norm": 1.2764710187911987, | |
| "learning_rate": 5.0695197552659e-07, | |
| "loss": 0.1394, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.5783132530120483, | |
| "grad_norm": 1.3518632650375366, | |
| "learning_rate": 4.796321852364877e-07, | |
| "loss": 0.1363, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.5903614457831328, | |
| "grad_norm": 1.3571412563323975, | |
| "learning_rate": 4.5303195834467463e-07, | |
| "loss": 0.1326, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.602409638554217, | |
| "grad_norm": 1.4019449949264526, | |
| "learning_rate": 4.271555291414636e-07, | |
| "loss": 0.1222, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.6144578313253013, | |
| "grad_norm": 1.184061050415039, | |
| "learning_rate": 4.020070167012541e-07, | |
| "loss": 0.0845, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.6265060240963853, | |
| "grad_norm": 1.5907222032546997, | |
| "learning_rate": 3.775904242268391e-07, | |
| "loss": 0.1353, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.63855421686747, | |
| "grad_norm": 1.3479151725769043, | |
| "learning_rate": 3.539096384121743e-07, | |
| "loss": 0.1445, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.6506024096385543, | |
| "grad_norm": 1.36601722240448, | |
| "learning_rate": 3.309684288236775e-07, | |
| "loss": 0.1386, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.662650602409639, | |
| "grad_norm": 1.6582006216049194, | |
| "learning_rate": 3.0877044730018515e-07, | |
| "loss": 0.1237, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.674698795180723, | |
| "grad_norm": 1.505927324295044, | |
| "learning_rate": 2.873192273716369e-07, | |
| "loss": 0.153, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.6867469879518073, | |
| "grad_norm": 1.2739795446395874, | |
| "learning_rate": 2.666181836966053e-07, | |
| "loss": 0.1038, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.6987951807228914, | |
| "grad_norm": 1.3373569250106812, | |
| "learning_rate": 2.466706115187406e-07, | |
| "loss": 0.1208, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.710843373493976, | |
| "grad_norm": 1.3513188362121582, | |
| "learning_rate": 2.274796861422246e-07, | |
| "loss": 0.1209, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.7228915662650603, | |
| "grad_norm": 1.4020378589630127, | |
| "learning_rate": 2.090484624263167e-07, | |
| "loss": 0.1323, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.734939759036145, | |
| "grad_norm": 1.4146372079849243, | |
| "learning_rate": 1.9137987429907635e-07, | |
| "loss": 0.1304, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.746987951807229, | |
| "grad_norm": 1.3225347995758057, | |
| "learning_rate": 1.7447673429033361e-07, | |
| "loss": 0.1149, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.7590361445783134, | |
| "grad_norm": 1.3890403509140015, | |
| "learning_rate": 1.583417330839798e-07, | |
| "loss": 0.1557, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.7710843373493974, | |
| "grad_norm": 1.466339349746704, | |
| "learning_rate": 1.4297743908966212e-07, | |
| "loss": 0.1489, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.783132530120482, | |
| "grad_norm": 1.2367849349975586, | |
| "learning_rate": 1.2838629803393343e-07, | |
| "loss": 0.0997, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.7951807228915664, | |
| "grad_norm": 1.390717625617981, | |
| "learning_rate": 1.1457063257093892e-07, | |
| "loss": 0.1218, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.807228915662651, | |
| "grad_norm": 1.2239187955856323, | |
| "learning_rate": 1.0153264191269052e-07, | |
| "loss": 0.1135, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.819277108433735, | |
| "grad_norm": 1.0472311973571777, | |
| "learning_rate": 8.927440147898703e-08, | |
| "loss": 0.1065, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.8313253012048194, | |
| "grad_norm": 1.2322300672531128, | |
| "learning_rate": 7.779786256704669e-08, | |
| "loss": 0.1016, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.8433734939759034, | |
| "grad_norm": 1.3203635215759277, | |
| "learning_rate": 6.710485204089456e-08, | |
| "loss": 0.1239, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.855421686746988, | |
| "grad_norm": 1.2621276378631592, | |
| "learning_rate": 5.7197072040557356e-08, | |
| "loss": 0.1358, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.8674698795180724, | |
| "grad_norm": 1.3743759393692017, | |
| "learning_rate": 4.807609971111238e-08, | |
| "loss": 0.1337, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.8795180722891565, | |
| "grad_norm": 1.0450865030288696, | |
| "learning_rate": 3.974338695163393e-08, | |
| "loss": 0.0945, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.891566265060241, | |
| "grad_norm": 1.5270490646362305, | |
| "learning_rate": 3.220026018407541e-08, | |
| "loss": 0.0994, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.9036144578313254, | |
| "grad_norm": 1.694990873336792, | |
| "learning_rate": 2.5447920142128712e-08, | |
| "loss": 0.1689, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.9156626506024095, | |
| "grad_norm": 1.4968199729919434, | |
| "learning_rate": 1.9487441680084983e-08, | |
| "loss": 0.1219, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.927710843373494, | |
| "grad_norm": 1.332356572151184, | |
| "learning_rate": 1.431977360173975e-08, | |
| "loss": 0.1137, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.9397590361445785, | |
| "grad_norm": 1.4295134544372559, | |
| "learning_rate": 9.945738509358205e-09, | |
| "loss": 0.1498, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.9518072289156625, | |
| "grad_norm": 1.1830252408981323, | |
| "learning_rate": 6.366032672731059e-09, | |
| "loss": 0.1002, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.963855421686747, | |
| "grad_norm": 1.3982295989990234, | |
| "learning_rate": 3.5812259183426457e-09, | |
| "loss": 0.1247, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.9759036144578315, | |
| "grad_norm": 1.467788577079773, | |
| "learning_rate": 1.591761538662362e-09, | |
| "loss": 0.1098, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.9879518072289155, | |
| "grad_norm": 1.2710858583450317, | |
| "learning_rate": 3.9795622158111945e-10, | |
| "loss": 0.1335, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.9258020520210266, | |
| "learning_rate": 0.0, | |
| "loss": 0.0612, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 249, | |
| "total_flos": 3.4614492313052774e+17, | |
| "train_loss": 0.32225324711706266, | |
| "train_runtime": 433.6826, | |
| "train_samples_per_second": 4.545, | |
| "train_steps_per_second": 0.574 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 249, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.4614492313052774e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |