{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 241, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004149377593360996, "grad_norm": 0.5114469528198242, "learning_rate": 2e-05, "loss": 0.7995174527168274, "step": 1 }, { "epoch": 0.008298755186721992, "grad_norm": 0.5205491185188293, "learning_rate": 2e-05, "loss": 0.8812965750694275, "step": 2 }, { "epoch": 0.012448132780082987, "grad_norm": 0.6057224273681641, "learning_rate": 2e-05, "loss": 0.8402022123336792, "step": 3 }, { "epoch": 0.016597510373443983, "grad_norm": 0.5623906254768372, "learning_rate": 2e-05, "loss": 0.8188848495483398, "step": 4 }, { "epoch": 0.02074688796680498, "grad_norm": 0.574876606464386, "learning_rate": 2e-05, "loss": 0.8380811214447021, "step": 5 }, { "epoch": 0.024896265560165973, "grad_norm": 0.4625989496707916, "learning_rate": 2e-05, "loss": 0.7132218480110168, "step": 6 }, { "epoch": 0.029045643153526972, "grad_norm": 0.5183306336402893, "learning_rate": 2e-05, "loss": 0.8268325328826904, "step": 7 }, { "epoch": 0.03319502074688797, "grad_norm": 0.4928549826145172, "learning_rate": 2e-05, "loss": 0.7686080932617188, "step": 8 }, { "epoch": 0.03734439834024896, "grad_norm": 0.4636511206626892, "learning_rate": 2e-05, "loss": 0.8444753289222717, "step": 9 }, { "epoch": 0.04149377593360996, "grad_norm": 0.5008803606033325, "learning_rate": 2e-05, "loss": 0.6671140789985657, "step": 10 }, { "epoch": 0.04564315352697095, "grad_norm": 0.49685290455818176, "learning_rate": 2e-05, "loss": 0.7625027894973755, "step": 11 }, { "epoch": 0.04979253112033195, "grad_norm": 0.5161386728286743, "learning_rate": 2e-05, "loss": 0.5999635457992554, "step": 12 }, { "epoch": 0.05394190871369295, "grad_norm": 0.46996110677719116, "learning_rate": 2e-05, "loss": 0.7389070987701416, "step": 13 }, { "epoch": 0.058091286307053944, "grad_norm": 0.45131370425224304, "learning_rate": 2e-05, "loss": 0.6111957430839539, "step": 14 }, { "epoch": 0.06224066390041494, "grad_norm": 0.4911205470561981, "learning_rate": 2e-05, "loss": 0.5750669240951538, "step": 15 }, { "epoch": 0.06639004149377593, "grad_norm": 0.46468034386634827, "learning_rate": 2e-05, "loss": 0.6607809066772461, "step": 16 }, { "epoch": 0.07053941908713693, "grad_norm": 0.5140272378921509, "learning_rate": 2e-05, "loss": 0.8089659214019775, "step": 17 }, { "epoch": 0.07468879668049792, "grad_norm": 0.49761149287223816, "learning_rate": 2e-05, "loss": 0.8017055988311768, "step": 18 }, { "epoch": 0.07883817427385892, "grad_norm": 0.45623964071273804, "learning_rate": 2e-05, "loss": 0.725612223148346, "step": 19 }, { "epoch": 0.08298755186721991, "grad_norm": 0.4778558015823364, "learning_rate": 2e-05, "loss": 0.6465242505073547, "step": 20 }, { "epoch": 0.08713692946058091, "grad_norm": 0.4813624620437622, "learning_rate": 2e-05, "loss": 0.6812542676925659, "step": 21 }, { "epoch": 0.0912863070539419, "grad_norm": 0.45828455686569214, "learning_rate": 2e-05, "loss": 0.6355943083763123, "step": 22 }, { "epoch": 0.0954356846473029, "grad_norm": 0.39770182967185974, "learning_rate": 2e-05, "loss": 0.734164297580719, "step": 23 }, { "epoch": 0.0995850622406639, "grad_norm": 0.515662431716919, "learning_rate": 2e-05, "loss": 0.775545060634613, "step": 24 }, { "epoch": 0.1037344398340249, "grad_norm": 0.4875846207141876, "learning_rate": 2e-05, "loss": 0.7608263492584229, "step": 25 }, { "epoch": 0.1078838174273859, "grad_norm": 0.4272926449775696, "learning_rate": 2e-05, "loss": 0.655767560005188, "step": 26 }, { "epoch": 0.11203319502074689, "grad_norm": 0.47189342975616455, "learning_rate": 2e-05, "loss": 0.6984891295433044, "step": 27 }, { "epoch": 0.11618257261410789, "grad_norm": 0.49677926301956177, "learning_rate": 2e-05, "loss": 0.6952549815177917, "step": 28 }, { "epoch": 0.12033195020746888, "grad_norm": 0.5341811776161194, "learning_rate": 2e-05, "loss": 0.6844781041145325, "step": 29 }, { "epoch": 0.12448132780082988, "grad_norm": 0.49139678478240967, "learning_rate": 2e-05, "loss": 0.7043532729148865, "step": 30 }, { "epoch": 0.12863070539419086, "grad_norm": 0.42113780975341797, "learning_rate": 2e-05, "loss": 0.6791371703147888, "step": 31 }, { "epoch": 0.13278008298755187, "grad_norm": 0.490699827671051, "learning_rate": 2e-05, "loss": 0.66917484998703, "step": 32 }, { "epoch": 0.13692946058091288, "grad_norm": 0.48269012570381165, "learning_rate": 2e-05, "loss": 0.6663049459457397, "step": 33 }, { "epoch": 0.14107883817427386, "grad_norm": 0.4833972454071045, "learning_rate": 2e-05, "loss": 0.7479192018508911, "step": 34 }, { "epoch": 0.14522821576763487, "grad_norm": 0.4521920382976532, "learning_rate": 2e-05, "loss": 0.5006750822067261, "step": 35 }, { "epoch": 0.14937759336099585, "grad_norm": 0.4805753231048584, "learning_rate": 2e-05, "loss": 0.7437685132026672, "step": 36 }, { "epoch": 0.15352697095435686, "grad_norm": 0.4702300429344177, "learning_rate": 2e-05, "loss": 0.7820006608963013, "step": 37 }, { "epoch": 0.15767634854771784, "grad_norm": 0.4416898190975189, "learning_rate": 2e-05, "loss": 0.5911201238632202, "step": 38 }, { "epoch": 0.16182572614107885, "grad_norm": 0.46818608045578003, "learning_rate": 2e-05, "loss": 0.6237752437591553, "step": 39 }, { "epoch": 0.16597510373443983, "grad_norm": 0.38742795586586, "learning_rate": 2e-05, "loss": 0.6044095754623413, "step": 40 }, { "epoch": 0.17012448132780084, "grad_norm": 0.4806065857410431, "learning_rate": 2e-05, "loss": 0.6341798901557922, "step": 41 }, { "epoch": 0.17427385892116182, "grad_norm": 0.4329955279827118, "learning_rate": 2e-05, "loss": 0.621407687664032, "step": 42 }, { "epoch": 0.17842323651452283, "grad_norm": 0.46890074014663696, "learning_rate": 2e-05, "loss": 0.7025566697120667, "step": 43 }, { "epoch": 0.1825726141078838, "grad_norm": 0.4821957051753998, "learning_rate": 2e-05, "loss": 0.6547812819480896, "step": 44 }, { "epoch": 0.18672199170124482, "grad_norm": 0.4716266691684723, "learning_rate": 2e-05, "loss": 0.6434807777404785, "step": 45 }, { "epoch": 0.1908713692946058, "grad_norm": 0.5017584562301636, "learning_rate": 2e-05, "loss": 0.6461539268493652, "step": 46 }, { "epoch": 0.1950207468879668, "grad_norm": 0.4837803244590759, "learning_rate": 2e-05, "loss": 0.6638780236244202, "step": 47 }, { "epoch": 0.1991701244813278, "grad_norm": 0.4523409605026245, "learning_rate": 2e-05, "loss": 0.5731872916221619, "step": 48 }, { "epoch": 0.2033195020746888, "grad_norm": 0.46308189630508423, "learning_rate": 2e-05, "loss": 0.6024616956710815, "step": 49 }, { "epoch": 0.2074688796680498, "grad_norm": 0.4565693140029907, "learning_rate": 2e-05, "loss": 0.5795129537582397, "step": 50 }, { "epoch": 0.21161825726141079, "grad_norm": 0.48081323504447937, "learning_rate": 2e-05, "loss": 0.6645175814628601, "step": 51 }, { "epoch": 0.2157676348547718, "grad_norm": 0.4649989902973175, "learning_rate": 2e-05, "loss": 0.6339988112449646, "step": 52 }, { "epoch": 0.21991701244813278, "grad_norm": 0.45999905467033386, "learning_rate": 2e-05, "loss": 0.6070005297660828, "step": 53 }, { "epoch": 0.22406639004149378, "grad_norm": 0.43405112624168396, "learning_rate": 2e-05, "loss": 0.6078118085861206, "step": 54 }, { "epoch": 0.22821576763485477, "grad_norm": 0.557212233543396, "learning_rate": 2e-05, "loss": 0.6502783894538879, "step": 55 }, { "epoch": 0.23236514522821577, "grad_norm": 0.4206949472427368, "learning_rate": 2e-05, "loss": 0.5604119896888733, "step": 56 }, { "epoch": 0.23651452282157676, "grad_norm": 0.4931945502758026, "learning_rate": 2e-05, "loss": 0.5463195443153381, "step": 57 }, { "epoch": 0.24066390041493776, "grad_norm": 0.44888630509376526, "learning_rate": 2e-05, "loss": 0.49333369731903076, "step": 58 }, { "epoch": 0.24481327800829875, "grad_norm": 0.4515199363231659, "learning_rate": 2e-05, "loss": 0.66854327917099, "step": 59 }, { "epoch": 0.24896265560165975, "grad_norm": 0.46686026453971863, "learning_rate": 2e-05, "loss": 0.5279274582862854, "step": 60 }, { "epoch": 0.25311203319502074, "grad_norm": 0.46663975715637207, "learning_rate": 2e-05, "loss": 0.6141489148139954, "step": 61 }, { "epoch": 0.2572614107883817, "grad_norm": 0.45049089193344116, "learning_rate": 2e-05, "loss": 0.6643646955490112, "step": 62 }, { "epoch": 0.26141078838174275, "grad_norm": 0.49262335896492004, "learning_rate": 2e-05, "loss": 0.6589719653129578, "step": 63 }, { "epoch": 0.26556016597510373, "grad_norm": 0.5234288573265076, "learning_rate": 2e-05, "loss": 0.6250555515289307, "step": 64 }, { "epoch": 0.2697095435684647, "grad_norm": 0.4657873809337616, "learning_rate": 2e-05, "loss": 0.5761417150497437, "step": 65 }, { "epoch": 0.27385892116182575, "grad_norm": 2.8522613048553467, "learning_rate": 2e-05, "loss": 0.6810148358345032, "step": 66 }, { "epoch": 0.27800829875518673, "grad_norm": 0.45667174458503723, "learning_rate": 2e-05, "loss": 0.5667203664779663, "step": 67 }, { "epoch": 0.2821576763485477, "grad_norm": 0.48965880274772644, "learning_rate": 2e-05, "loss": 0.6057634949684143, "step": 68 }, { "epoch": 0.2863070539419087, "grad_norm": 0.4700252115726471, "learning_rate": 2e-05, "loss": 0.5498369932174683, "step": 69 }, { "epoch": 0.29045643153526973, "grad_norm": 0.4457707703113556, "learning_rate": 2e-05, "loss": 0.5500881671905518, "step": 70 }, { "epoch": 0.2946058091286307, "grad_norm": 0.5242801904678345, "learning_rate": 2e-05, "loss": 0.6648991703987122, "step": 71 }, { "epoch": 0.2987551867219917, "grad_norm": 0.4845593273639679, "learning_rate": 2e-05, "loss": 0.6495253443717957, "step": 72 }, { "epoch": 0.3029045643153527, "grad_norm": 0.4535577595233917, "learning_rate": 2e-05, "loss": 0.6440762281417847, "step": 73 }, { "epoch": 0.3070539419087137, "grad_norm": 0.4424896240234375, "learning_rate": 2e-05, "loss": 0.5427602529525757, "step": 74 }, { "epoch": 0.3112033195020747, "grad_norm": 0.4791293144226074, "learning_rate": 2e-05, "loss": 0.6312339901924133, "step": 75 }, { "epoch": 0.3153526970954357, "grad_norm": 0.49440717697143555, "learning_rate": 2e-05, "loss": 0.7304765582084656, "step": 76 }, { "epoch": 0.31950207468879666, "grad_norm": 0.47376683354377747, "learning_rate": 2e-05, "loss": 0.5550855994224548, "step": 77 }, { "epoch": 0.3236514522821577, "grad_norm": 0.5386195182800293, "learning_rate": 2e-05, "loss": 0.7627665996551514, "step": 78 }, { "epoch": 0.3278008298755187, "grad_norm": 0.5139470100402832, "learning_rate": 2e-05, "loss": 0.7294001579284668, "step": 79 }, { "epoch": 0.33195020746887965, "grad_norm": 0.5727441310882568, "learning_rate": 2e-05, "loss": 0.6094337105751038, "step": 80 }, { "epoch": 0.3360995850622407, "grad_norm": 0.4475933313369751, "learning_rate": 2e-05, "loss": 0.6689184904098511, "step": 81 }, { "epoch": 0.34024896265560167, "grad_norm": 0.48615196347236633, "learning_rate": 2e-05, "loss": 0.5170673727989197, "step": 82 }, { "epoch": 0.34439834024896265, "grad_norm": 0.4444977939128876, "learning_rate": 2e-05, "loss": 0.5426638126373291, "step": 83 }, { "epoch": 0.34854771784232363, "grad_norm": 0.4532429873943329, "learning_rate": 2e-05, "loss": 0.5246436595916748, "step": 84 }, { "epoch": 0.35269709543568467, "grad_norm": 0.5425305962562561, "learning_rate": 2e-05, "loss": 0.7444034814834595, "step": 85 }, { "epoch": 0.35684647302904565, "grad_norm": 0.4604993164539337, "learning_rate": 2e-05, "loss": 0.6390590071678162, "step": 86 }, { "epoch": 0.36099585062240663, "grad_norm": 0.4503551423549652, "learning_rate": 2e-05, "loss": 0.7437008023262024, "step": 87 }, { "epoch": 0.3651452282157676, "grad_norm": 0.473531037569046, "learning_rate": 2e-05, "loss": 0.5801289677619934, "step": 88 }, { "epoch": 0.36929460580912865, "grad_norm": 0.43614616990089417, "learning_rate": 2e-05, "loss": 0.5945846438407898, "step": 89 }, { "epoch": 0.37344398340248963, "grad_norm": 0.5157416462898254, "learning_rate": 2e-05, "loss": 0.5870503187179565, "step": 90 }, { "epoch": 0.3775933609958506, "grad_norm": 0.4724714756011963, "learning_rate": 2e-05, "loss": 0.7136172652244568, "step": 91 }, { "epoch": 0.3817427385892116, "grad_norm": 0.49608129262924194, "learning_rate": 2e-05, "loss": 0.5707521438598633, "step": 92 }, { "epoch": 0.38589211618257263, "grad_norm": 0.4372619390487671, "learning_rate": 2e-05, "loss": 0.6751445531845093, "step": 93 }, { "epoch": 0.3900414937759336, "grad_norm": 0.8502039909362793, "learning_rate": 2e-05, "loss": 0.7432682514190674, "step": 94 }, { "epoch": 0.3941908713692946, "grad_norm": 0.43237465620040894, "learning_rate": 2e-05, "loss": 0.5463064908981323, "step": 95 }, { "epoch": 0.3983402489626556, "grad_norm": 0.4683166444301605, "learning_rate": 2e-05, "loss": 0.5722454190254211, "step": 96 }, { "epoch": 0.4024896265560166, "grad_norm": 0.49307140707969666, "learning_rate": 2e-05, "loss": 0.7676360011100769, "step": 97 }, { "epoch": 0.4066390041493776, "grad_norm": 0.45873740315437317, "learning_rate": 2e-05, "loss": 0.7670221328735352, "step": 98 }, { "epoch": 0.4107883817427386, "grad_norm": 0.522739589214325, "learning_rate": 2e-05, "loss": 0.6198732256889343, "step": 99 }, { "epoch": 0.4149377593360996, "grad_norm": 0.513500988483429, "learning_rate": 2e-05, "loss": 0.6557285189628601, "step": 100 }, { "epoch": 0.4190871369294606, "grad_norm": 0.5162559747695923, "learning_rate": 2e-05, "loss": 0.6777411699295044, "step": 101 }, { "epoch": 0.42323651452282157, "grad_norm": 0.4742807447910309, "learning_rate": 2e-05, "loss": 0.5189216732978821, "step": 102 }, { "epoch": 0.42738589211618255, "grad_norm": 0.3864991068840027, "learning_rate": 2e-05, "loss": 0.5397198796272278, "step": 103 }, { "epoch": 0.4315352697095436, "grad_norm": 0.44808462262153625, "learning_rate": 2e-05, "loss": 0.5719993710517883, "step": 104 }, { "epoch": 0.43568464730290457, "grad_norm": 0.5047919154167175, "learning_rate": 2e-05, "loss": 0.7246726751327515, "step": 105 }, { "epoch": 0.43983402489626555, "grad_norm": 0.4501510262489319, "learning_rate": 2e-05, "loss": 0.5421350598335266, "step": 106 }, { "epoch": 0.44398340248962653, "grad_norm": 0.5187399983406067, "learning_rate": 2e-05, "loss": 0.6851190328598022, "step": 107 }, { "epoch": 0.44813278008298757, "grad_norm": 0.4442541003227234, "learning_rate": 2e-05, "loss": 0.7323095798492432, "step": 108 }, { "epoch": 0.45228215767634855, "grad_norm": 0.4546023905277252, "learning_rate": 2e-05, "loss": 0.5949406027793884, "step": 109 }, { "epoch": 0.45643153526970953, "grad_norm": 0.43765076994895935, "learning_rate": 2e-05, "loss": 0.5195109248161316, "step": 110 }, { "epoch": 0.4605809128630705, "grad_norm": 0.6012418866157532, "learning_rate": 2e-05, "loss": 0.5891928672790527, "step": 111 }, { "epoch": 0.46473029045643155, "grad_norm": 0.5350989699363708, "learning_rate": 2e-05, "loss": 0.7073556184768677, "step": 112 }, { "epoch": 0.46887966804979253, "grad_norm": 0.40423402190208435, "learning_rate": 2e-05, "loss": 0.6081284284591675, "step": 113 }, { "epoch": 0.4730290456431535, "grad_norm": 0.48459556698799133, "learning_rate": 2e-05, "loss": 0.7626031637191772, "step": 114 }, { "epoch": 0.47717842323651455, "grad_norm": 0.5132282972335815, "learning_rate": 2e-05, "loss": 0.7070454359054565, "step": 115 }, { "epoch": 0.48132780082987553, "grad_norm": 0.40754643082618713, "learning_rate": 2e-05, "loss": 0.7881268858909607, "step": 116 }, { "epoch": 0.4854771784232365, "grad_norm": 0.46227574348449707, "learning_rate": 2e-05, "loss": 0.5589393973350525, "step": 117 }, { "epoch": 0.4896265560165975, "grad_norm": 0.458891898393631, "learning_rate": 2e-05, "loss": 0.6076244711875916, "step": 118 }, { "epoch": 0.49377593360995853, "grad_norm": 0.4314862787723541, "learning_rate": 2e-05, "loss": 0.58890700340271, "step": 119 }, { "epoch": 0.4979253112033195, "grad_norm": 0.4849430322647095, "learning_rate": 2e-05, "loss": 0.7297042012214661, "step": 120 }, { "epoch": 0.5020746887966805, "grad_norm": 0.4734286963939667, "learning_rate": 2e-05, "loss": 0.7929898500442505, "step": 121 }, { "epoch": 0.5062240663900415, "grad_norm": 0.4982983469963074, "learning_rate": 2e-05, "loss": 0.6973749399185181, "step": 122 }, { "epoch": 0.5103734439834025, "grad_norm": 0.4555007517337799, "learning_rate": 2e-05, "loss": 0.6363988518714905, "step": 123 }, { "epoch": 0.5145228215767634, "grad_norm": 0.469707190990448, "learning_rate": 2e-05, "loss": 0.6936283111572266, "step": 124 }, { "epoch": 0.5186721991701245, "grad_norm": 0.45310160517692566, "learning_rate": 2e-05, "loss": 0.8045607209205627, "step": 125 }, { "epoch": 0.5228215767634855, "grad_norm": 0.5117340087890625, "learning_rate": 2e-05, "loss": 0.5602521300315857, "step": 126 }, { "epoch": 0.5269709543568465, "grad_norm": 0.4890298545360565, "learning_rate": 2e-05, "loss": 0.5749447345733643, "step": 127 }, { "epoch": 0.5311203319502075, "grad_norm": 0.4680368900299072, "learning_rate": 2e-05, "loss": 0.6603504419326782, "step": 128 }, { "epoch": 0.5352697095435685, "grad_norm": 0.4364625811576843, "learning_rate": 2e-05, "loss": 0.6615546941757202, "step": 129 }, { "epoch": 0.5394190871369294, "grad_norm": 0.44393712282180786, "learning_rate": 2e-05, "loss": 0.7206588387489319, "step": 130 }, { "epoch": 0.5435684647302904, "grad_norm": 0.4770648777484894, "learning_rate": 2e-05, "loss": 0.5122599005699158, "step": 131 }, { "epoch": 0.5477178423236515, "grad_norm": 0.4254826307296753, "learning_rate": 2e-05, "loss": 0.5919891595840454, "step": 132 }, { "epoch": 0.5518672199170125, "grad_norm": 0.49948850274086, "learning_rate": 2e-05, "loss": 0.7168218493461609, "step": 133 }, { "epoch": 0.5560165975103735, "grad_norm": 0.46940577030181885, "learning_rate": 2e-05, "loss": 0.559630274772644, "step": 134 }, { "epoch": 0.5601659751037344, "grad_norm": 0.38155895471572876, "learning_rate": 2e-05, "loss": 0.35719043016433716, "step": 135 }, { "epoch": 0.5643153526970954, "grad_norm": 0.446111798286438, "learning_rate": 2e-05, "loss": 0.5944488644599915, "step": 136 }, { "epoch": 0.5684647302904564, "grad_norm": 0.44898721575737, "learning_rate": 2e-05, "loss": 0.6778333187103271, "step": 137 }, { "epoch": 0.5726141078838174, "grad_norm": 0.4727020263671875, "learning_rate": 2e-05, "loss": 0.6683153510093689, "step": 138 }, { "epoch": 0.5767634854771784, "grad_norm": 0.4775353968143463, "learning_rate": 2e-05, "loss": 0.7357037663459778, "step": 139 }, { "epoch": 0.5809128630705395, "grad_norm": 0.5201453566551208, "learning_rate": 2e-05, "loss": 0.5672426819801331, "step": 140 }, { "epoch": 0.5850622406639004, "grad_norm": 0.4446447491645813, "learning_rate": 2e-05, "loss": 0.6665009260177612, "step": 141 }, { "epoch": 0.5892116182572614, "grad_norm": 0.44674625992774963, "learning_rate": 2e-05, "loss": 0.6256436705589294, "step": 142 }, { "epoch": 0.5933609958506224, "grad_norm": 0.48278629779815674, "learning_rate": 2e-05, "loss": 0.652278482913971, "step": 143 }, { "epoch": 0.5975103734439834, "grad_norm": 0.4608626067638397, "learning_rate": 2e-05, "loss": 0.687121570110321, "step": 144 }, { "epoch": 0.6016597510373444, "grad_norm": 0.5146644711494446, "learning_rate": 2e-05, "loss": 0.7759085297584534, "step": 145 }, { "epoch": 0.6058091286307054, "grad_norm": 0.4703519344329834, "learning_rate": 2e-05, "loss": 0.6268375515937805, "step": 146 }, { "epoch": 0.6099585062240664, "grad_norm": 0.4373490512371063, "learning_rate": 2e-05, "loss": 0.7350006699562073, "step": 147 }, { "epoch": 0.6141078838174274, "grad_norm": 0.48525917530059814, "learning_rate": 2e-05, "loss": 0.6609182357788086, "step": 148 }, { "epoch": 0.6182572614107884, "grad_norm": 0.509609043598175, "learning_rate": 2e-05, "loss": 0.7720542550086975, "step": 149 }, { "epoch": 0.6224066390041494, "grad_norm": 0.46813687682151794, "learning_rate": 2e-05, "loss": 0.658400297164917, "step": 150 }, { "epoch": 0.6265560165975104, "grad_norm": 0.48811477422714233, "learning_rate": 2e-05, "loss": 0.6340473890304565, "step": 151 }, { "epoch": 0.6307053941908713, "grad_norm": 0.48529860377311707, "learning_rate": 2e-05, "loss": 0.7543718218803406, "step": 152 }, { "epoch": 0.6348547717842323, "grad_norm": 0.4565221965312958, "learning_rate": 2e-05, "loss": 0.5810791254043579, "step": 153 }, { "epoch": 0.6390041493775933, "grad_norm": 0.4667608141899109, "learning_rate": 2e-05, "loss": 0.5940293669700623, "step": 154 }, { "epoch": 0.6431535269709544, "grad_norm": 0.476724773645401, "learning_rate": 2e-05, "loss": 0.5076797604560852, "step": 155 }, { "epoch": 0.6473029045643154, "grad_norm": 0.48997762799263, "learning_rate": 2e-05, "loss": 0.5588229894638062, "step": 156 }, { "epoch": 0.6514522821576764, "grad_norm": 0.4687066674232483, "learning_rate": 2e-05, "loss": 0.7414963245391846, "step": 157 }, { "epoch": 0.6556016597510373, "grad_norm": 0.5096819400787354, "learning_rate": 2e-05, "loss": 0.6766090393066406, "step": 158 }, { "epoch": 0.6597510373443983, "grad_norm": 0.40396353602409363, "learning_rate": 2e-05, "loss": 0.5890622735023499, "step": 159 }, { "epoch": 0.6639004149377593, "grad_norm": 0.46985870599746704, "learning_rate": 2e-05, "loss": 0.5969380140304565, "step": 160 }, { "epoch": 0.6680497925311203, "grad_norm": 0.49084073305130005, "learning_rate": 2e-05, "loss": 0.6371229887008667, "step": 161 }, { "epoch": 0.6721991701244814, "grad_norm": 0.4466313123703003, "learning_rate": 2e-05, "loss": 0.6732550263404846, "step": 162 }, { "epoch": 0.6763485477178424, "grad_norm": 0.4656016528606415, "learning_rate": 2e-05, "loss": 0.7082672119140625, "step": 163 }, { "epoch": 0.6804979253112033, "grad_norm": 0.43604540824890137, "learning_rate": 2e-05, "loss": 0.5961745977401733, "step": 164 }, { "epoch": 0.6846473029045643, "grad_norm": 0.45962008833885193, "learning_rate": 2e-05, "loss": 0.5974591374397278, "step": 165 }, { "epoch": 0.6887966804979253, "grad_norm": 0.4566839635372162, "learning_rate": 2e-05, "loss": 0.5828849673271179, "step": 166 }, { "epoch": 0.6929460580912863, "grad_norm": 0.38006696105003357, "learning_rate": 2e-05, "loss": 0.6747267246246338, "step": 167 }, { "epoch": 0.6970954356846473, "grad_norm": 0.439981609582901, "learning_rate": 2e-05, "loss": 0.7797038555145264, "step": 168 }, { "epoch": 0.7012448132780082, "grad_norm": 0.47687003016471863, "learning_rate": 2e-05, "loss": 0.570720911026001, "step": 169 }, { "epoch": 0.7053941908713693, "grad_norm": 0.4829600155353546, "learning_rate": 2e-05, "loss": 0.5899892449378967, "step": 170 }, { "epoch": 0.7095435684647303, "grad_norm": 0.4642188847064972, "learning_rate": 2e-05, "loss": 0.6866733431816101, "step": 171 }, { "epoch": 0.7136929460580913, "grad_norm": 0.4619278013706207, "learning_rate": 2e-05, "loss": 0.5310846567153931, "step": 172 }, { "epoch": 0.7178423236514523, "grad_norm": 0.40906423330307007, "learning_rate": 2e-05, "loss": 0.6505522131919861, "step": 173 }, { "epoch": 0.7219917012448133, "grad_norm": 0.47687482833862305, "learning_rate": 2e-05, "loss": 0.6477482318878174, "step": 174 }, { "epoch": 0.7261410788381742, "grad_norm": 0.4249359369277954, "learning_rate": 2e-05, "loss": 0.542078971862793, "step": 175 }, { "epoch": 0.7302904564315352, "grad_norm": 0.4437820315361023, "learning_rate": 2e-05, "loss": 0.7326051592826843, "step": 176 }, { "epoch": 0.7344398340248963, "grad_norm": 0.47250184416770935, "learning_rate": 2e-05, "loss": 0.7204862236976624, "step": 177 }, { "epoch": 0.7385892116182573, "grad_norm": 0.45673149824142456, "learning_rate": 2e-05, "loss": 0.6894567608833313, "step": 178 }, { "epoch": 0.7427385892116183, "grad_norm": 0.4065015912055969, "learning_rate": 2e-05, "loss": 0.5020947456359863, "step": 179 }, { "epoch": 0.7468879668049793, "grad_norm": 0.480761855840683, "learning_rate": 2e-05, "loss": 0.652772843837738, "step": 180 }, { "epoch": 0.7510373443983402, "grad_norm": 0.4796382784843445, "learning_rate": 2e-05, "loss": 0.5466834306716919, "step": 181 }, { "epoch": 0.7551867219917012, "grad_norm": 0.427696168422699, "learning_rate": 2e-05, "loss": 0.46073320508003235, "step": 182 }, { "epoch": 0.7593360995850622, "grad_norm": 0.4324597716331482, "learning_rate": 2e-05, "loss": 0.6211638450622559, "step": 183 }, { "epoch": 0.7634854771784232, "grad_norm": 0.47733691334724426, "learning_rate": 2e-05, "loss": 0.6684774160385132, "step": 184 }, { "epoch": 0.7676348547717843, "grad_norm": 0.431084007024765, "learning_rate": 2e-05, "loss": 0.6145834922790527, "step": 185 }, { "epoch": 0.7717842323651453, "grad_norm": 0.5007755160331726, "learning_rate": 2e-05, "loss": 0.6526326537132263, "step": 186 }, { "epoch": 0.7759336099585062, "grad_norm": 0.4393167793750763, "learning_rate": 2e-05, "loss": 0.6100775599479675, "step": 187 }, { "epoch": 0.7800829875518672, "grad_norm": 0.4865422248840332, "learning_rate": 2e-05, "loss": 0.7980203032493591, "step": 188 }, { "epoch": 0.7842323651452282, "grad_norm": 0.4837598502635956, "learning_rate": 2e-05, "loss": 0.5299490690231323, "step": 189 }, { "epoch": 0.7883817427385892, "grad_norm": 0.5101847052574158, "learning_rate": 2e-05, "loss": 0.636174201965332, "step": 190 }, { "epoch": 0.7925311203319502, "grad_norm": 0.481587678194046, "learning_rate": 2e-05, "loss": 0.584964394569397, "step": 191 }, { "epoch": 0.7966804979253111, "grad_norm": 0.4833771288394928, "learning_rate": 2e-05, "loss": 0.660033643245697, "step": 192 }, { "epoch": 0.8008298755186722, "grad_norm": 0.47723522782325745, "learning_rate": 2e-05, "loss": 0.5514160394668579, "step": 193 }, { "epoch": 0.8049792531120332, "grad_norm": 0.46386954188346863, "learning_rate": 2e-05, "loss": 0.5447302460670471, "step": 194 }, { "epoch": 0.8091286307053942, "grad_norm": 0.47975945472717285, "learning_rate": 2e-05, "loss": 0.6700522303581238, "step": 195 }, { "epoch": 0.8132780082987552, "grad_norm": 0.45628130435943604, "learning_rate": 2e-05, "loss": 0.725788950920105, "step": 196 }, { "epoch": 0.8174273858921162, "grad_norm": 0.5276447534561157, "learning_rate": 2e-05, "loss": 0.4795994460582733, "step": 197 }, { "epoch": 0.8215767634854771, "grad_norm": 0.4197767376899719, "learning_rate": 2e-05, "loss": 0.5689822435379028, "step": 198 }, { "epoch": 0.8257261410788381, "grad_norm": 0.4988608956336975, "learning_rate": 2e-05, "loss": 0.5570112466812134, "step": 199 }, { "epoch": 0.8298755186721992, "grad_norm": 0.43889400362968445, "learning_rate": 2e-05, "loss": 0.5546621680259705, "step": 200 }, { "epoch": 0.8340248962655602, "grad_norm": 0.4966701865196228, "learning_rate": 2e-05, "loss": 0.7806369066238403, "step": 201 }, { "epoch": 0.8381742738589212, "grad_norm": 0.444965124130249, "learning_rate": 2e-05, "loss": 0.6175658702850342, "step": 202 }, { "epoch": 0.8423236514522822, "grad_norm": 0.47721561789512634, "learning_rate": 2e-05, "loss": 0.608608603477478, "step": 203 }, { "epoch": 0.8464730290456431, "grad_norm": 0.41363325715065, "learning_rate": 2e-05, "loss": 0.5362960696220398, "step": 204 }, { "epoch": 0.8506224066390041, "grad_norm": 0.4979526102542877, "learning_rate": 2e-05, "loss": 0.6923606395721436, "step": 205 }, { "epoch": 0.8547717842323651, "grad_norm": 0.4715823829174042, "learning_rate": 2e-05, "loss": 0.5849528312683105, "step": 206 }, { "epoch": 0.8589211618257261, "grad_norm": 0.43941834568977356, "learning_rate": 2e-05, "loss": 0.5507952570915222, "step": 207 }, { "epoch": 0.8630705394190872, "grad_norm": 0.6943396925926208, "learning_rate": 2e-05, "loss": 0.6139302253723145, "step": 208 }, { "epoch": 0.8672199170124482, "grad_norm": 0.4135432541370392, "learning_rate": 2e-05, "loss": 0.6495124697685242, "step": 209 }, { "epoch": 0.8713692946058091, "grad_norm": 0.4735243320465088, "learning_rate": 2e-05, "loss": 0.6073355674743652, "step": 210 }, { "epoch": 0.8755186721991701, "grad_norm": 0.5081479549407959, "learning_rate": 2e-05, "loss": 0.5338884592056274, "step": 211 }, { "epoch": 0.8796680497925311, "grad_norm": 0.44402876496315, "learning_rate": 2e-05, "loss": 0.5649405717849731, "step": 212 }, { "epoch": 0.8838174273858921, "grad_norm": 0.4597266614437103, "learning_rate": 2e-05, "loss": 0.851700484752655, "step": 213 }, { "epoch": 0.8879668049792531, "grad_norm": 0.49691715836524963, "learning_rate": 2e-05, "loss": 0.6800894141197205, "step": 214 }, { "epoch": 0.8921161825726142, "grad_norm": 0.4347255825996399, "learning_rate": 2e-05, "loss": 0.6838465332984924, "step": 215 }, { "epoch": 0.8962655601659751, "grad_norm": 0.4532018303871155, "learning_rate": 2e-05, "loss": 0.6527755856513977, "step": 216 }, { "epoch": 0.9004149377593361, "grad_norm": 0.5003204941749573, "learning_rate": 2e-05, "loss": 0.6630940437316895, "step": 217 }, { "epoch": 0.9045643153526971, "grad_norm": 0.4661204218864441, "learning_rate": 2e-05, "loss": 0.693079948425293, "step": 218 }, { "epoch": 0.9087136929460581, "grad_norm": 0.4552728235721588, "learning_rate": 2e-05, "loss": 0.6484197974205017, "step": 219 }, { "epoch": 0.9128630705394191, "grad_norm": 0.4681585133075714, "learning_rate": 2e-05, "loss": 0.6020994186401367, "step": 220 }, { "epoch": 0.91701244813278, "grad_norm": 0.41022825241088867, "learning_rate": 2e-05, "loss": 0.530207097530365, "step": 221 }, { "epoch": 0.921161825726141, "grad_norm": 0.39006152749061584, "learning_rate": 2e-05, "loss": 0.445180743932724, "step": 222 }, { "epoch": 0.9253112033195021, "grad_norm": 0.4057929217815399, "learning_rate": 2e-05, "loss": 0.5387605428695679, "step": 223 }, { "epoch": 0.9294605809128631, "grad_norm": 0.42876264452934265, "learning_rate": 2e-05, "loss": 0.5825240015983582, "step": 224 }, { "epoch": 0.9336099585062241, "grad_norm": 0.48948875069618225, "learning_rate": 2e-05, "loss": 0.6396217942237854, "step": 225 }, { "epoch": 0.9377593360995851, "grad_norm": 0.4649500548839569, "learning_rate": 2e-05, "loss": 0.4400583505630493, "step": 226 }, { "epoch": 0.941908713692946, "grad_norm": 0.43061113357543945, "learning_rate": 2e-05, "loss": 0.5668185353279114, "step": 227 }, { "epoch": 0.946058091286307, "grad_norm": 0.37659695744514465, "learning_rate": 2e-05, "loss": 0.3734014630317688, "step": 228 }, { "epoch": 0.950207468879668, "grad_norm": 0.5160449743270874, "learning_rate": 2e-05, "loss": 0.7836225032806396, "step": 229 }, { "epoch": 0.9543568464730291, "grad_norm": 0.5332698822021484, "learning_rate": 2e-05, "loss": 0.6564600467681885, "step": 230 }, { "epoch": 0.9585062240663901, "grad_norm": 0.48597726225852966, "learning_rate": 2e-05, "loss": 0.7620537281036377, "step": 231 }, { "epoch": 0.9626556016597511, "grad_norm": 0.437928169965744, "learning_rate": 2e-05, "loss": 0.5499407052993774, "step": 232 }, { "epoch": 0.966804979253112, "grad_norm": 0.4861524701118469, "learning_rate": 2e-05, "loss": 0.6248472332954407, "step": 233 }, { "epoch": 0.970954356846473, "grad_norm": 0.4638573229312897, "learning_rate": 2e-05, "loss": 0.5971051454544067, "step": 234 }, { "epoch": 0.975103734439834, "grad_norm": 0.4368666410446167, "learning_rate": 2e-05, "loss": 0.5971348285675049, "step": 235 }, { "epoch": 0.979253112033195, "grad_norm": 0.4261365830898285, "learning_rate": 2e-05, "loss": 0.5625735521316528, "step": 236 }, { "epoch": 0.983402489626556, "grad_norm": 0.47601279616355896, "learning_rate": 2e-05, "loss": 0.518233597278595, "step": 237 }, { "epoch": 0.9875518672199171, "grad_norm": 0.4935397803783417, "learning_rate": 2e-05, "loss": 0.7158107161521912, "step": 238 }, { "epoch": 0.991701244813278, "grad_norm": 0.456167072057724, "learning_rate": 2e-05, "loss": 0.6627569198608398, "step": 239 }, { "epoch": 0.995850622406639, "grad_norm": 0.4805908799171448, "learning_rate": 2e-05, "loss": 0.6887528896331787, "step": 240 }, { "epoch": 1.0, "grad_norm": 0.6356716156005859, "learning_rate": 2e-05, "loss": 0.65900057554245, "step": 241 } ], "logging_steps": 1, "max_steps": 241, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1048678841008456e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }