{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9958417169684776, "eval_steps": 500, "global_step": 232, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004292421193829644, "grad_norm": 3.342827936598552, "learning_rate": 1.4285714285714286e-06, "loss": 0.679, "step": 1 }, { "epoch": 0.008584842387659289, "grad_norm": 2.896501931914046, "learning_rate": 2.8571428571428573e-06, "loss": 0.6709, "step": 2 }, { "epoch": 0.012877263581488933, "grad_norm": 3.3718581451101177, "learning_rate": 4.2857142857142855e-06, "loss": 0.6619, "step": 3 }, { "epoch": 0.017169684775318578, "grad_norm": 2.529609856453942, "learning_rate": 5.7142857142857145e-06, "loss": 0.7753, "step": 4 }, { "epoch": 0.021462105969148222, "grad_norm": 2.4044812444703103, "learning_rate": 7.1428571428571436e-06, "loss": 0.7833, "step": 5 }, { "epoch": 0.025754527162977867, "grad_norm": 2.152377713157304, "learning_rate": 8.571428571428571e-06, "loss": 0.7411, "step": 6 }, { "epoch": 0.03004694835680751, "grad_norm": 7.163921239673759, "learning_rate": 1e-05, "loss": 0.6751, "step": 7 }, { "epoch": 0.034339369550637155, "grad_norm": 1.9811371567636475, "learning_rate": 9.999512620046523e-06, "loss": 0.6019, "step": 8 }, { "epoch": 0.0386317907444668, "grad_norm": 2.2991427332066663, "learning_rate": 9.998050575201772e-06, "loss": 0.6195, "step": 9 }, { "epoch": 0.042924211938296444, "grad_norm": 2.633277122254043, "learning_rate": 9.995614150494293e-06, "loss": 0.6773, "step": 10 }, { "epoch": 0.04721663313212609, "grad_norm": 0.8137858553380385, "learning_rate": 9.992203820909906e-06, "loss": 0.44, "step": 11 }, { "epoch": 0.05150905432595573, "grad_norm": 2.5789476047477495, "learning_rate": 9.987820251299121e-06, "loss": 0.6856, "step": 12 }, { "epoch": 0.05580147551978538, "grad_norm": 2.325022123665024, "learning_rate": 9.982464296247523e-06, "loss": 0.6573, "step": 13 }, { "epoch": 0.06009389671361502, "grad_norm": 2.1016133484431814, "learning_rate": 9.976136999909156e-06, "loss": 0.5795, "step": 14 }, { "epoch": 0.06438631790744467, "grad_norm": 2.203575792232459, "learning_rate": 9.968839595802982e-06, "loss": 0.5512, "step": 15 }, { "epoch": 0.06867873910127431, "grad_norm": 1.7649770138174061, "learning_rate": 9.960573506572391e-06, "loss": 0.5991, "step": 16 }, { "epoch": 0.07297116029510396, "grad_norm": 2.125017710581645, "learning_rate": 9.951340343707852e-06, "loss": 0.6961, "step": 17 }, { "epoch": 0.0772635814889336, "grad_norm": 1.9648109979705357, "learning_rate": 9.941141907232766e-06, "loss": 0.6274, "step": 18 }, { "epoch": 0.08155600268276325, "grad_norm": 2.1430042253940997, "learning_rate": 9.929980185352525e-06, "loss": 0.5933, "step": 19 }, { "epoch": 0.08584842387659289, "grad_norm": 2.187065629596637, "learning_rate": 9.91785735406693e-06, "loss": 0.6537, "step": 20 }, { "epoch": 0.09014084507042254, "grad_norm": 2.0411377847693366, "learning_rate": 9.904775776745959e-06, "loss": 0.5277, "step": 21 }, { "epoch": 0.09443326626425218, "grad_norm": 1.9222476925913479, "learning_rate": 9.890738003669029e-06, "loss": 0.6009, "step": 22 }, { "epoch": 0.09872568745808183, "grad_norm": 2.202338617043183, "learning_rate": 9.875746771527817e-06, "loss": 0.6808, "step": 23 }, { "epoch": 0.10301810865191147, "grad_norm": 1.8704298128204246, "learning_rate": 9.859805002892733e-06, "loss": 0.5834, "step": 24 }, { "epoch": 0.10731052984574112, "grad_norm": 2.26606517803316, "learning_rate": 9.842915805643156e-06, "loss": 0.6416, "step": 25 }, { "epoch": 0.11160295103957076, "grad_norm": 1.957724168370316, "learning_rate": 9.825082472361558e-06, "loss": 0.5565, "step": 26 }, { "epoch": 0.1158953722334004, "grad_norm": 2.7304041309488367, "learning_rate": 9.806308479691595e-06, "loss": 0.6941, "step": 27 }, { "epoch": 0.12018779342723004, "grad_norm": 2.252924472572446, "learning_rate": 9.786597487660336e-06, "loss": 0.6806, "step": 28 }, { "epoch": 0.1244802146210597, "grad_norm": 2.208805819097276, "learning_rate": 9.765953338964736e-06, "loss": 0.6123, "step": 29 }, { "epoch": 0.12877263581488935, "grad_norm": 2.0330562196938238, "learning_rate": 9.744380058222483e-06, "loss": 0.5966, "step": 30 }, { "epoch": 0.13306505700871898, "grad_norm": 2.4286898255231533, "learning_rate": 9.721881851187406e-06, "loss": 0.6148, "step": 31 }, { "epoch": 0.13735747820254862, "grad_norm": 1.911339918779555, "learning_rate": 9.698463103929542e-06, "loss": 0.5662, "step": 32 }, { "epoch": 0.14164989939637826, "grad_norm": 2.0543803346952583, "learning_rate": 9.674128381980073e-06, "loss": 0.6602, "step": 33 }, { "epoch": 0.14594232059020792, "grad_norm": 2.086982956085146, "learning_rate": 9.648882429441258e-06, "loss": 0.5432, "step": 34 }, { "epoch": 0.15023474178403756, "grad_norm": 2.0901966575723474, "learning_rate": 9.622730168061568e-06, "loss": 0.5274, "step": 35 }, { "epoch": 0.1545271629778672, "grad_norm": 4.381472469442544, "learning_rate": 9.595676696276173e-06, "loss": 0.583, "step": 36 }, { "epoch": 0.15881958417169684, "grad_norm": 2.3265871885088503, "learning_rate": 9.567727288213005e-06, "loss": 0.5623, "step": 37 }, { "epoch": 0.1631120053655265, "grad_norm": 2.0021378156582963, "learning_rate": 9.538887392664544e-06, "loss": 0.6318, "step": 38 }, { "epoch": 0.16740442655935614, "grad_norm": 1.0984958441170714, "learning_rate": 9.50916263202557e-06, "loss": 0.4548, "step": 39 }, { "epoch": 0.17169684775318578, "grad_norm": 1.8318936065536409, "learning_rate": 9.478558801197065e-06, "loss": 0.5985, "step": 40 }, { "epoch": 0.17598926894701541, "grad_norm": 2.7994275799684076, "learning_rate": 9.44708186645649e-06, "loss": 0.5926, "step": 41 }, { "epoch": 0.18028169014084508, "grad_norm": 2.072220455177539, "learning_rate": 9.414737964294636e-06, "loss": 0.5318, "step": 42 }, { "epoch": 0.18457411133467472, "grad_norm": 2.2812531791006645, "learning_rate": 9.381533400219319e-06, "loss": 0.5948, "step": 43 }, { "epoch": 0.18886653252850436, "grad_norm": 1.8766396474219533, "learning_rate": 9.347474647526095e-06, "loss": 0.5514, "step": 44 }, { "epoch": 0.193158953722334, "grad_norm": 1.9908306217286644, "learning_rate": 9.312568346036288e-06, "loss": 0.5282, "step": 45 }, { "epoch": 0.19745137491616366, "grad_norm": 2.2942225048556284, "learning_rate": 9.276821300802535e-06, "loss": 0.5875, "step": 46 }, { "epoch": 0.2017437961099933, "grad_norm": 3.1824151916415087, "learning_rate": 9.24024048078213e-06, "loss": 0.6144, "step": 47 }, { "epoch": 0.20603621730382293, "grad_norm": 0.992721768550404, "learning_rate": 9.202833017478421e-06, "loss": 0.4847, "step": 48 }, { "epoch": 0.21032863849765257, "grad_norm": 1.9310743502843328, "learning_rate": 9.164606203550498e-06, "loss": 0.5974, "step": 49 }, { "epoch": 0.21462105969148224, "grad_norm": 2.1730863843947392, "learning_rate": 9.125567491391476e-06, "loss": 0.6293, "step": 50 }, { "epoch": 0.21891348088531187, "grad_norm": 0.8680338310564839, "learning_rate": 9.085724491675642e-06, "loss": 0.4757, "step": 51 }, { "epoch": 0.2232059020791415, "grad_norm": 2.1915863963781916, "learning_rate": 9.045084971874738e-06, "loss": 0.5925, "step": 52 }, { "epoch": 0.22749832327297115, "grad_norm": 2.3492931220336195, "learning_rate": 9.003656854743667e-06, "loss": 0.6402, "step": 53 }, { "epoch": 0.2317907444668008, "grad_norm": 0.8164522682092396, "learning_rate": 8.961448216775955e-06, "loss": 0.4382, "step": 54 }, { "epoch": 0.23608316566063045, "grad_norm": 2.2933091603858418, "learning_rate": 8.9184672866292e-06, "loss": 0.5367, "step": 55 }, { "epoch": 0.2403755868544601, "grad_norm": 2.051540581160434, "learning_rate": 8.874722443520898e-06, "loss": 0.5736, "step": 56 }, { "epoch": 0.24466800804828973, "grad_norm": 1.9508118237454382, "learning_rate": 8.83022221559489e-06, "loss": 0.6196, "step": 57 }, { "epoch": 0.2489604292421194, "grad_norm": 2.4166528722303036, "learning_rate": 8.784975278258783e-06, "loss": 0.5279, "step": 58 }, { "epoch": 0.25325285043594903, "grad_norm": 1.9719959484618856, "learning_rate": 8.73899045249266e-06, "loss": 0.5248, "step": 59 }, { "epoch": 0.2575452716297787, "grad_norm": 2.088094168914141, "learning_rate": 8.692276703129421e-06, "loss": 0.5436, "step": 60 }, { "epoch": 0.2618376928236083, "grad_norm": 2.347294342285649, "learning_rate": 8.644843137107058e-06, "loss": 0.7, "step": 61 }, { "epoch": 0.26613011401743797, "grad_norm": 2.152662276659629, "learning_rate": 8.596699001693257e-06, "loss": 0.5052, "step": 62 }, { "epoch": 0.2704225352112676, "grad_norm": 2.1260026022790552, "learning_rate": 8.547853682682605e-06, "loss": 0.6121, "step": 63 }, { "epoch": 0.27471495640509724, "grad_norm": 1.9148541801577235, "learning_rate": 8.498316702566828e-06, "loss": 0.5516, "step": 64 }, { "epoch": 0.2790073775989269, "grad_norm": 2.134479085245887, "learning_rate": 8.44809771867835e-06, "loss": 0.6297, "step": 65 }, { "epoch": 0.2832997987927565, "grad_norm": 1.776446730314099, "learning_rate": 8.397206521307584e-06, "loss": 0.5337, "step": 66 }, { "epoch": 0.2875922199865862, "grad_norm": 1.955780895559264, "learning_rate": 8.345653031794292e-06, "loss": 0.6187, "step": 67 }, { "epoch": 0.29188464118041585, "grad_norm": 2.065677438153802, "learning_rate": 8.293447300593402e-06, "loss": 0.4712, "step": 68 }, { "epoch": 0.29617706237424546, "grad_norm": 1.929637625719691, "learning_rate": 8.240599505315656e-06, "loss": 0.5638, "step": 69 }, { "epoch": 0.3004694835680751, "grad_norm": 2.1234613246255294, "learning_rate": 8.18711994874345e-06, "loss": 0.5622, "step": 70 }, { "epoch": 0.3047619047619048, "grad_norm": 2.4974658951008935, "learning_rate": 8.133019056822303e-06, "loss": 0.5656, "step": 71 }, { "epoch": 0.3090543259557344, "grad_norm": 2.1148121496211028, "learning_rate": 8.078307376628292e-06, "loss": 0.5706, "step": 72 }, { "epoch": 0.31334674714956406, "grad_norm": 1.9637387645317304, "learning_rate": 8.022995574311876e-06, "loss": 0.6431, "step": 73 }, { "epoch": 0.3176391683433937, "grad_norm": 2.2321260500467996, "learning_rate": 7.967094433018508e-06, "loss": 0.6038, "step": 74 }, { "epoch": 0.32193158953722334, "grad_norm": 2.1333068986028136, "learning_rate": 7.910614850786448e-06, "loss": 0.5666, "step": 75 }, { "epoch": 0.326224010731053, "grad_norm": 2.195983244813881, "learning_rate": 7.85356783842216e-06, "loss": 0.5877, "step": 76 }, { "epoch": 0.3305164319248826, "grad_norm": 2.9185529975089644, "learning_rate": 7.795964517353734e-06, "loss": 0.5221, "step": 77 }, { "epoch": 0.3348088531187123, "grad_norm": 2.1356064453519363, "learning_rate": 7.737816117462752e-06, "loss": 0.5536, "step": 78 }, { "epoch": 0.33910127431254194, "grad_norm": 0.9888358843543892, "learning_rate": 7.679133974894984e-06, "loss": 0.427, "step": 79 }, { "epoch": 0.34339369550637155, "grad_norm": 0.9532274184167417, "learning_rate": 7.619929529850397e-06, "loss": 0.4607, "step": 80 }, { "epoch": 0.3476861167002012, "grad_norm": 1.969695168470127, "learning_rate": 7.560214324352858e-06, "loss": 0.525, "step": 81 }, { "epoch": 0.35197853789403083, "grad_norm": 2.2331654792734272, "learning_rate": 7.500000000000001e-06, "loss": 0.6041, "step": 82 }, { "epoch": 0.3562709590878605, "grad_norm": 2.4884565474083606, "learning_rate": 7.4392982956936644e-06, "loss": 0.5886, "step": 83 }, { "epoch": 0.36056338028169016, "grad_norm": 1.8314120774170615, "learning_rate": 7.378121045351378e-06, "loss": 0.5349, "step": 84 }, { "epoch": 0.36485580147551977, "grad_norm": 2.1762807073421127, "learning_rate": 7.31648017559931e-06, "loss": 0.5606, "step": 85 }, { "epoch": 0.36914822266934944, "grad_norm": 2.422109142275802, "learning_rate": 7.254387703447154e-06, "loss": 0.5431, "step": 86 }, { "epoch": 0.3734406438631791, "grad_norm": 1.9079552738678454, "learning_rate": 7.191855733945388e-06, "loss": 0.5553, "step": 87 }, { "epoch": 0.3777330650570087, "grad_norm": 1.760125442274873, "learning_rate": 7.128896457825364e-06, "loss": 0.5808, "step": 88 }, { "epoch": 0.3820254862508384, "grad_norm": 0.9979331285164651, "learning_rate": 7.06552214912271e-06, "loss": 0.4579, "step": 89 }, { "epoch": 0.386317907444668, "grad_norm": 1.8496753131089991, "learning_rate": 7.0017451627844765e-06, "loss": 0.591, "step": 90 }, { "epoch": 0.39061032863849765, "grad_norm": 2.3343608471053265, "learning_rate": 6.9375779322605154e-06, "loss": 0.6091, "step": 91 }, { "epoch": 0.3949027498323273, "grad_norm": 1.9668213618430554, "learning_rate": 6.873032967079562e-06, "loss": 0.6944, "step": 92 }, { "epoch": 0.3991951710261569, "grad_norm": 1.8725242772507493, "learning_rate": 6.808122850410461e-06, "loss": 0.6055, "step": 93 }, { "epoch": 0.4034875922199866, "grad_norm": 1.9966003945224369, "learning_rate": 6.7428602366090764e-06, "loss": 0.5595, "step": 94 }, { "epoch": 0.40778001341381626, "grad_norm": 3.053074851635477, "learning_rate": 6.677257848751276e-06, "loss": 0.5857, "step": 95 }, { "epoch": 0.41207243460764587, "grad_norm": 2.010221226340498, "learning_rate": 6.611328476152557e-06, "loss": 0.5995, "step": 96 }, { "epoch": 0.41636485580147553, "grad_norm": 1.742641730594434, "learning_rate": 6.545084971874738e-06, "loss": 0.5389, "step": 97 }, { "epoch": 0.42065727699530514, "grad_norm": 0.7858607450939203, "learning_rate": 6.4785402502202345e-06, "loss": 0.4598, "step": 98 }, { "epoch": 0.4249496981891348, "grad_norm": 1.9026874984032676, "learning_rate": 6.411707284214384e-06, "loss": 0.5824, "step": 99 }, { "epoch": 0.42924211938296447, "grad_norm": 1.7972924537621116, "learning_rate": 6.344599103076329e-06, "loss": 0.5605, "step": 100 }, { "epoch": 0.4335345405767941, "grad_norm": 1.6566505050926905, "learning_rate": 6.277228789678953e-06, "loss": 0.55, "step": 101 }, { "epoch": 0.43782696177062375, "grad_norm": 0.7871200662869098, "learning_rate": 6.209609477998339e-06, "loss": 0.4487, "step": 102 }, { "epoch": 0.4421193829644534, "grad_norm": 2.118924742242862, "learning_rate": 6.141754350553279e-06, "loss": 0.5791, "step": 103 }, { "epoch": 0.446411804158283, "grad_norm": 1.9417216221368725, "learning_rate": 6.073676635835317e-06, "loss": 0.5321, "step": 104 }, { "epoch": 0.4507042253521127, "grad_norm": 0.8301362013675532, "learning_rate": 6.005389605729824e-06, "loss": 0.471, "step": 105 }, { "epoch": 0.4549966465459423, "grad_norm": 1.9216842351031653, "learning_rate": 5.936906572928625e-06, "loss": 0.4981, "step": 106 }, { "epoch": 0.45928906773977196, "grad_norm": 2.1309977101502913, "learning_rate": 5.8682408883346535e-06, "loss": 0.523, "step": 107 }, { "epoch": 0.4635814889336016, "grad_norm": 6.307531137311205, "learning_rate": 5.799405938459175e-06, "loss": 0.5631, "step": 108 }, { "epoch": 0.46787391012743124, "grad_norm": 0.8161016261331794, "learning_rate": 5.730415142812059e-06, "loss": 0.4739, "step": 109 }, { "epoch": 0.4721663313212609, "grad_norm": 1.9009887448608773, "learning_rate": 5.661281951285613e-06, "loss": 0.6272, "step": 110 }, { "epoch": 0.47645875251509057, "grad_norm": 2.2577462172723233, "learning_rate": 5.592019841532507e-06, "loss": 0.5796, "step": 111 }, { "epoch": 0.4807511737089202, "grad_norm": 2.1012817228049814, "learning_rate": 5.522642316338268e-06, "loss": 0.6074, "step": 112 }, { "epoch": 0.48504359490274984, "grad_norm": 3.169512060210574, "learning_rate": 5.453162900988902e-06, "loss": 0.5817, "step": 113 }, { "epoch": 0.48933601609657945, "grad_norm": 0.7965146219233805, "learning_rate": 5.383595140634093e-06, "loss": 0.4628, "step": 114 }, { "epoch": 0.4936284372904091, "grad_norm": 2.528449046345642, "learning_rate": 5.3139525976465675e-06, "loss": 0.6113, "step": 115 }, { "epoch": 0.4979208584842388, "grad_norm": 2.169410644155486, "learning_rate": 5.244248848978067e-06, "loss": 0.6655, "step": 116 }, { "epoch": 0.5022132796780684, "grad_norm": 2.114218695353453, "learning_rate": 5.174497483512506e-06, "loss": 0.5438, "step": 117 }, { "epoch": 0.5065057008718981, "grad_norm": 2.322736697640736, "learning_rate": 5.1047120994167855e-06, "loss": 0.619, "step": 118 }, { "epoch": 0.5107981220657277, "grad_norm": 1.9355642603619068, "learning_rate": 5.034906301489808e-06, "loss": 0.568, "step": 119 }, { "epoch": 0.5150905432595574, "grad_norm": 0.8275929454164218, "learning_rate": 4.965093698510192e-06, "loss": 0.4642, "step": 120 }, { "epoch": 0.5193829644533869, "grad_norm": 0.7826606670717038, "learning_rate": 4.895287900583216e-06, "loss": 0.428, "step": 121 }, { "epoch": 0.5236753856472166, "grad_norm": 2.227565392356624, "learning_rate": 4.825502516487497e-06, "loss": 0.616, "step": 122 }, { "epoch": 0.5279678068410463, "grad_norm": 1.9049065560319947, "learning_rate": 4.755751151021934e-06, "loss": 0.6396, "step": 123 }, { "epoch": 0.5322602280348759, "grad_norm": 2.360278324447153, "learning_rate": 4.686047402353433e-06, "loss": 0.581, "step": 124 }, { "epoch": 0.5365526492287056, "grad_norm": 2.1436744795832463, "learning_rate": 4.6164048593659076e-06, "loss": 0.6027, "step": 125 }, { "epoch": 0.5408450704225352, "grad_norm": 2.0687980774005856, "learning_rate": 4.546837099011101e-06, "loss": 0.5455, "step": 126 }, { "epoch": 0.5451374916163648, "grad_norm": 2.094975644434325, "learning_rate": 4.477357683661734e-06, "loss": 0.5659, "step": 127 }, { "epoch": 0.5494299128101945, "grad_norm": 2.183849966879906, "learning_rate": 4.4079801584674955e-06, "loss": 0.578, "step": 128 }, { "epoch": 0.5537223340040242, "grad_norm": 1.8848396606946802, "learning_rate": 4.3387180487143875e-06, "loss": 0.5052, "step": 129 }, { "epoch": 0.5580147551978538, "grad_norm": 0.843007824971089, "learning_rate": 4.269584857187942e-06, "loss": 0.4997, "step": 130 }, { "epoch": 0.5623071763916835, "grad_norm": 2.763950937739874, "learning_rate": 4.200594061540827e-06, "loss": 0.5819, "step": 131 }, { "epoch": 0.566599597585513, "grad_norm": 1.7630340741332657, "learning_rate": 4.131759111665349e-06, "loss": 0.604, "step": 132 }, { "epoch": 0.5708920187793427, "grad_norm": 3.2094078977714897, "learning_rate": 4.063093427071376e-06, "loss": 0.6265, "step": 133 }, { "epoch": 0.5751844399731724, "grad_norm": 1.8312046959677455, "learning_rate": 3.994610394270178e-06, "loss": 0.5885, "step": 134 }, { "epoch": 0.579476861167002, "grad_norm": 1.8491871843778356, "learning_rate": 3.926323364164684e-06, "loss": 0.634, "step": 135 }, { "epoch": 0.5837692823608317, "grad_norm": 2.6473671182169167, "learning_rate": 3.8582456494467214e-06, "loss": 0.6355, "step": 136 }, { "epoch": 0.5880617035546613, "grad_norm": 2.999849822112049, "learning_rate": 3.790390522001662e-06, "loss": 0.529, "step": 137 }, { "epoch": 0.5923541247484909, "grad_norm": 1.9445772581815945, "learning_rate": 3.7227712103210485e-06, "loss": 0.5575, "step": 138 }, { "epoch": 0.5966465459423206, "grad_norm": 1.8104560751827103, "learning_rate": 3.655400896923672e-06, "loss": 0.5254, "step": 139 }, { "epoch": 0.6009389671361502, "grad_norm": 2.3671152557639346, "learning_rate": 3.5882927157856175e-06, "loss": 0.5583, "step": 140 }, { "epoch": 0.6052313883299799, "grad_norm": 2.074346620633345, "learning_rate": 3.521459749779769e-06, "loss": 0.6084, "step": 141 }, { "epoch": 0.6095238095238096, "grad_norm": 2.035892537869217, "learning_rate": 3.4549150281252635e-06, "loss": 0.5832, "step": 142 }, { "epoch": 0.6138162307176391, "grad_norm": 2.1099675037548966, "learning_rate": 3.3886715238474454e-06, "loss": 0.5579, "step": 143 }, { "epoch": 0.6181086519114688, "grad_norm": 2.0468759829486443, "learning_rate": 3.322742151248726e-06, "loss": 0.5848, "step": 144 }, { "epoch": 0.6224010731052985, "grad_norm": 1.9674918076912449, "learning_rate": 3.2571397633909252e-06, "loss": 0.5398, "step": 145 }, { "epoch": 0.6266934942991281, "grad_norm": 1.7459454556549392, "learning_rate": 3.1918771495895395e-06, "loss": 0.6756, "step": 146 }, { "epoch": 0.6309859154929578, "grad_norm": 0.7735085423697842, "learning_rate": 3.12696703292044e-06, "loss": 0.4194, "step": 147 }, { "epoch": 0.6352783366867873, "grad_norm": 1.820298479603609, "learning_rate": 3.0624220677394854e-06, "loss": 0.5858, "step": 148 }, { "epoch": 0.639570757880617, "grad_norm": 3.1630846211682, "learning_rate": 2.9982548372155264e-06, "loss": 0.5303, "step": 149 }, { "epoch": 0.6438631790744467, "grad_norm": 2.413654673008136, "learning_rate": 2.934477850877292e-06, "loss": 0.5315, "step": 150 }, { "epoch": 0.6481556002682763, "grad_norm": 2.508188149902217, "learning_rate": 2.871103542174637e-06, "loss": 0.5468, "step": 151 }, { "epoch": 0.652448021462106, "grad_norm": 1.801696332460669, "learning_rate": 2.8081442660546126e-06, "loss": 0.5817, "step": 152 }, { "epoch": 0.6567404426559356, "grad_norm": 2.26133822829944, "learning_rate": 2.7456122965528475e-06, "loss": 0.522, "step": 153 }, { "epoch": 0.6610328638497652, "grad_norm": 2.4471397871687834, "learning_rate": 2.683519824400693e-06, "loss": 0.5892, "step": 154 }, { "epoch": 0.6653252850435949, "grad_norm": 1.9131453194028882, "learning_rate": 2.6218789546486235e-06, "loss": 0.5554, "step": 155 }, { "epoch": 0.6696177062374246, "grad_norm": 2.4034600764131606, "learning_rate": 2.560701704306336e-06, "loss": 0.5969, "step": 156 }, { "epoch": 0.6739101274312542, "grad_norm": 2.1929545250468423, "learning_rate": 2.5000000000000015e-06, "loss": 0.5703, "step": 157 }, { "epoch": 0.6782025486250839, "grad_norm": 2.2198350864402348, "learning_rate": 2.4397856756471435e-06, "loss": 0.5812, "step": 158 }, { "epoch": 0.6824949698189134, "grad_norm": 2.0128371852734332, "learning_rate": 2.380070470149605e-06, "loss": 0.6307, "step": 159 }, { "epoch": 0.6867873910127431, "grad_norm": 2.217606056909539, "learning_rate": 2.320866025105016e-06, "loss": 0.5601, "step": 160 }, { "epoch": 0.6910798122065728, "grad_norm": 2.0541472609321065, "learning_rate": 2.2621838825372496e-06, "loss": 0.6326, "step": 161 }, { "epoch": 0.6953722334004024, "grad_norm": 1.7762192766929534, "learning_rate": 2.204035482646267e-06, "loss": 0.501, "step": 162 }, { "epoch": 0.6996646545942321, "grad_norm": 2.4025706210449322, "learning_rate": 2.146432161577842e-06, "loss": 0.5587, "step": 163 }, { "epoch": 0.7039570757880617, "grad_norm": 3.686601475649348, "learning_rate": 2.0893851492135536e-06, "loss": 0.5732, "step": 164 }, { "epoch": 0.7082494969818913, "grad_norm": 1.8617954191026609, "learning_rate": 2.0329055669814936e-06, "loss": 0.5018, "step": 165 }, { "epoch": 0.712541918175721, "grad_norm": 3.1015155924797804, "learning_rate": 1.977004425688126e-06, "loss": 0.6083, "step": 166 }, { "epoch": 0.7168343393695507, "grad_norm": 11.133598367985865, "learning_rate": 1.9216926233717087e-06, "loss": 0.5543, "step": 167 }, { "epoch": 0.7211267605633803, "grad_norm": 1.8192465630367278, "learning_rate": 1.8669809431776991e-06, "loss": 0.5863, "step": 168 }, { "epoch": 0.7254191817572099, "grad_norm": 2.0788969216539757, "learning_rate": 1.8128800512565514e-06, "loss": 0.5291, "step": 169 }, { "epoch": 0.7297116029510395, "grad_norm": 1.8368930044145042, "learning_rate": 1.7594004946843458e-06, "loss": 0.5378, "step": 170 }, { "epoch": 0.7340040241448692, "grad_norm": 2.0502782524895315, "learning_rate": 1.7065526994065973e-06, "loss": 0.5478, "step": 171 }, { "epoch": 0.7382964453386989, "grad_norm": 1.7763584764334766, "learning_rate": 1.6543469682057105e-06, "loss": 0.5448, "step": 172 }, { "epoch": 0.7425888665325285, "grad_norm": 1.7918310960966537, "learning_rate": 1.6027934786924187e-06, "loss": 0.6076, "step": 173 }, { "epoch": 0.7468812877263582, "grad_norm": 2.3563140946836523, "learning_rate": 1.551902281321651e-06, "loss": 0.565, "step": 174 }, { "epoch": 0.7511737089201878, "grad_norm": 1.8987524052104983, "learning_rate": 1.5016832974331725e-06, "loss": 0.5367, "step": 175 }, { "epoch": 0.7554661301140174, "grad_norm": 1.9189932438009185, "learning_rate": 1.4521463173173966e-06, "loss": 0.5435, "step": 176 }, { "epoch": 0.7597585513078471, "grad_norm": 2.42896938145852, "learning_rate": 1.4033009983067454e-06, "loss": 0.5141, "step": 177 }, { "epoch": 0.7640509725016768, "grad_norm": 2.5249548327367766, "learning_rate": 1.3551568628929434e-06, "loss": 0.674, "step": 178 }, { "epoch": 0.7683433936955064, "grad_norm": 2.806361559097513, "learning_rate": 1.3077232968705805e-06, "loss": 0.5517, "step": 179 }, { "epoch": 0.772635814889336, "grad_norm": 2.2153752757917555, "learning_rate": 1.2610095475073415e-06, "loss": 0.6408, "step": 180 }, { "epoch": 0.7769282360831656, "grad_norm": 2.5628201933255124, "learning_rate": 1.2150247217412186e-06, "loss": 0.5957, "step": 181 }, { "epoch": 0.7812206572769953, "grad_norm": 2.079321530887709, "learning_rate": 1.1697777844051105e-06, "loss": 0.6155, "step": 182 }, { "epoch": 0.785513078470825, "grad_norm": 3.2999932447363416, "learning_rate": 1.1252775564791023e-06, "loss": 0.5515, "step": 183 }, { "epoch": 0.7898054996646546, "grad_norm": 0.7905984900012323, "learning_rate": 1.0815327133708015e-06, "loss": 0.4313, "step": 184 }, { "epoch": 0.7940979208584842, "grad_norm": 1.9368152867312256, "learning_rate": 1.0385517832240472e-06, "loss": 0.6071, "step": 185 }, { "epoch": 0.7983903420523139, "grad_norm": 2.366580720258878, "learning_rate": 9.963431452563331e-07, "loss": 0.5578, "step": 186 }, { "epoch": 0.8026827632461435, "grad_norm": 2.3822261396548576, "learning_rate": 9.549150281252633e-07, "loss": 0.499, "step": 187 }, { "epoch": 0.8069751844399732, "grad_norm": 1.7417985644400509, "learning_rate": 9.142755083243577e-07, "loss": 0.5696, "step": 188 }, { "epoch": 0.8112676056338028, "grad_norm": 3.9103317258184376, "learning_rate": 8.744325086085248e-07, "loss": 0.5079, "step": 189 }, { "epoch": 0.8155600268276325, "grad_norm": 3.0678763221012586, "learning_rate": 8.353937964495029e-07, "loss": 0.5418, "step": 190 }, { "epoch": 0.8198524480214621, "grad_norm": 2.042653243276808, "learning_rate": 7.971669825215789e-07, "loss": 0.5748, "step": 191 }, { "epoch": 0.8241448692152917, "grad_norm": 2.39821396059648, "learning_rate": 7.597595192178702e-07, "loss": 0.5428, "step": 192 }, { "epoch": 0.8284372904091214, "grad_norm": 2.762719441820346, "learning_rate": 7.23178699197467e-07, "loss": 0.5673, "step": 193 }, { "epoch": 0.8327297116029511, "grad_norm": 1.9044906325399893, "learning_rate": 6.874316539637127e-07, "loss": 0.5932, "step": 194 }, { "epoch": 0.8370221327967807, "grad_norm": 1.856435008918489, "learning_rate": 6.52525352473905e-07, "loss": 0.5787, "step": 195 }, { "epoch": 0.8413145539906103, "grad_norm": 2.3652417447261995, "learning_rate": 6.184665997806832e-07, "loss": 0.5167, "step": 196 }, { "epoch": 0.84560697518444, "grad_norm": 2.723770526598716, "learning_rate": 5.852620357053651e-07, "loss": 0.571, "step": 197 }, { "epoch": 0.8498993963782696, "grad_norm": 2.4817466195220437, "learning_rate": 5.529181335435124e-07, "loss": 0.58, "step": 198 }, { "epoch": 0.8541918175720993, "grad_norm": 2.0455723374907397, "learning_rate": 5.214411988029355e-07, "loss": 0.5313, "step": 199 }, { "epoch": 0.8584842387659289, "grad_norm": 1.9318379229380933, "learning_rate": 4.908373679744316e-07, "loss": 0.5439, "step": 200 }, { "epoch": 0.8627766599597585, "grad_norm": 0.7959253630064103, "learning_rate": 4.6111260733545714e-07, "loss": 0.4454, "step": 201 }, { "epoch": 0.8670690811535882, "grad_norm": 1.9913907969524394, "learning_rate": 4.322727117869951e-07, "loss": 0.5008, "step": 202 }, { "epoch": 0.8713615023474178, "grad_norm": 2.1458695326057082, "learning_rate": 4.043233037238281e-07, "loss": 0.5459, "step": 203 }, { "epoch": 0.8756539235412475, "grad_norm": 1.9619666249914482, "learning_rate": 3.772698319384349e-07, "loss": 0.4999, "step": 204 }, { "epoch": 0.8799463447350772, "grad_norm": 4.3401484961367744, "learning_rate": 3.511175705587433e-07, "loss": 0.5758, "step": 205 }, { "epoch": 0.8842387659289068, "grad_norm": 2.5430076829964174, "learning_rate": 3.258716180199278e-07, "loss": 0.5761, "step": 206 }, { "epoch": 0.8885311871227364, "grad_norm": 0.767231711493243, "learning_rate": 3.015368960704584e-07, "loss": 0.4614, "step": 207 }, { "epoch": 0.892823608316566, "grad_norm": 2.193097249503329, "learning_rate": 2.7811814881259503e-07, "loss": 0.6247, "step": 208 }, { "epoch": 0.8971160295103957, "grad_norm": 1.9331493033717462, "learning_rate": 2.556199417775174e-07, "loss": 0.56, "step": 209 }, { "epoch": 0.9014084507042254, "grad_norm": 2.057966480432949, "learning_rate": 2.3404666103526542e-07, "loss": 0.5618, "step": 210 }, { "epoch": 0.905700871898055, "grad_norm": 2.351596162489054, "learning_rate": 2.134025123396638e-07, "loss": 0.4834, "step": 211 }, { "epoch": 0.9099932930918846, "grad_norm": 1.9120169610557864, "learning_rate": 1.9369152030840553e-07, "loss": 0.5541, "step": 212 }, { "epoch": 0.9142857142857143, "grad_norm": 2.2049446195768763, "learning_rate": 1.7491752763844294e-07, "loss": 0.5889, "step": 213 }, { "epoch": 0.9185781354795439, "grad_norm": 2.0434712402893145, "learning_rate": 1.5708419435684463e-07, "loss": 0.5638, "step": 214 }, { "epoch": 0.9228705566733736, "grad_norm": 2.1757829918674916, "learning_rate": 1.4019499710726913e-07, "loss": 0.642, "step": 215 }, { "epoch": 0.9271629778672033, "grad_norm": 1.9838844707633234, "learning_rate": 1.2425322847218368e-07, "loss": 0.6747, "step": 216 }, { "epoch": 0.9314553990610329, "grad_norm": 2.3027124358123836, "learning_rate": 1.0926199633097156e-07, "loss": 0.5337, "step": 217 }, { "epoch": 0.9357478202548625, "grad_norm": 1.8834569228162623, "learning_rate": 9.522422325404234e-08, "loss": 0.4917, "step": 218 }, { "epoch": 0.9400402414486921, "grad_norm": 1.8818328820882426, "learning_rate": 8.214264593307097e-08, "loss": 0.5806, "step": 219 }, { "epoch": 0.9443326626425218, "grad_norm": 2.121602640833575, "learning_rate": 7.001981464747565e-08, "loss": 0.6032, "step": 220 }, { "epoch": 0.9486250838363515, "grad_norm": 1.7145222003788387, "learning_rate": 5.8858092767236084e-08, "loss": 0.5455, "step": 221 }, { "epoch": 0.9529175050301811, "grad_norm": 1.9516038348847442, "learning_rate": 4.865965629214819e-08, "loss": 0.4984, "step": 222 }, { "epoch": 0.9572099262240107, "grad_norm": 2.156767335001454, "learning_rate": 3.9426493427611177e-08, "loss": 0.5284, "step": 223 }, { "epoch": 0.9615023474178404, "grad_norm": 1.8089215543018735, "learning_rate": 3.1160404197018155e-08, "loss": 0.5852, "step": 224 }, { "epoch": 0.96579476861167, "grad_norm": 3.2819358158442062, "learning_rate": 2.386300009084408e-08, "loss": 0.535, "step": 225 }, { "epoch": 0.9700871898054997, "grad_norm": 1.7304415336384433, "learning_rate": 1.753570375247815e-08, "loss": 0.5152, "step": 226 }, { "epoch": 0.9743796109993293, "grad_norm": 2.7057456233183785, "learning_rate": 1.2179748700879013e-08, "loss": 0.5592, "step": 227 }, { "epoch": 0.9786720321931589, "grad_norm": 2.0005265863891872, "learning_rate": 7.796179090094891e-09, "loss": 0.548, "step": 228 }, { "epoch": 0.9829644533869886, "grad_norm": 3.129587457460373, "learning_rate": 4.385849505708084e-09, "loss": 0.5154, "step": 229 }, { "epoch": 0.9872568745808182, "grad_norm": 2.082407038336782, "learning_rate": 1.9494247982282386e-09, "loss": 0.5263, "step": 230 }, { "epoch": 0.9915492957746479, "grad_norm": 2.035928140228953, "learning_rate": 4.87379953478806e-10, "loss": 0.5001, "step": 231 }, { "epoch": 0.9958417169684776, "grad_norm": 1.9493320834510575, "learning_rate": 0.0, "loss": 0.5539, "step": 232 }, { "epoch": 0.9958417169684776, "step": 232, "total_flos": 57740051464192.0, "train_loss": 0.5700135146235598, "train_runtime": 22473.305, "train_samples_per_second": 1.327, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 232, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 57740051464192.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }