{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004347826086956522, "grad_norm": 209.3180389404297, "learning_rate": 0.0, "loss": 5.8188, "step": 1 }, { "epoch": 0.008695652173913044, "grad_norm": 215.69874572753906, "learning_rate": 4.347826086956522e-06, "loss": 5.9259, "step": 2 }, { "epoch": 0.013043478260869565, "grad_norm": 62.712825775146484, "learning_rate": 8.695652173913044e-06, "loss": 5.4202, "step": 3 }, { "epoch": 0.017391304347826087, "grad_norm": 85.59194946289062, "learning_rate": 1.3043478260869566e-05, "loss": 5.3079, "step": 4 }, { "epoch": 0.021739130434782608, "grad_norm": 22.901897430419922, "learning_rate": 1.739130434782609e-05, "loss": 5.0196, "step": 5 }, { "epoch": 0.02608695652173913, "grad_norm": 22.081829071044922, "learning_rate": 2.173913043478261e-05, "loss": 4.8222, "step": 6 }, { "epoch": 0.030434782608695653, "grad_norm": 11.022245407104492, "learning_rate": 2.608695652173913e-05, "loss": 4.4617, "step": 7 }, { "epoch": 0.034782608695652174, "grad_norm": 7.274469375610352, "learning_rate": 3.0434782608695656e-05, "loss": 4.335, "step": 8 }, { "epoch": 0.0391304347826087, "grad_norm": 3.8645834922790527, "learning_rate": 3.478260869565218e-05, "loss": 4.0476, "step": 9 }, { "epoch": 0.043478260869565216, "grad_norm": 2.6724016666412354, "learning_rate": 3.91304347826087e-05, "loss": 3.8387, "step": 10 }, { "epoch": 0.04782608695652174, "grad_norm": 2.258195161819458, "learning_rate": 4.347826086956522e-05, "loss": 3.8144, "step": 11 }, { "epoch": 0.05217391304347826, "grad_norm": 1.8822625875473022, "learning_rate": 4.782608695652174e-05, "loss": 3.4008, "step": 12 }, { "epoch": 0.05652173913043478, "grad_norm": 2.047840118408203, "learning_rate": 5.217391304347826e-05, "loss": 3.2554, "step": 13 }, { "epoch": 0.06086956521739131, "grad_norm": 1.8671568632125854, "learning_rate": 5.652173913043478e-05, "loss": 3.2461, "step": 14 }, { "epoch": 0.06521739130434782, "grad_norm": 1.6069483757019043, "learning_rate": 6.086956521739131e-05, "loss": 2.9738, "step": 15 }, { "epoch": 0.06956521739130435, "grad_norm": 1.3096915483474731, "learning_rate": 6.521739130434783e-05, "loss": 2.7823, "step": 16 }, { "epoch": 0.07391304347826087, "grad_norm": 1.3594956398010254, "learning_rate": 6.956521739130436e-05, "loss": 2.6255, "step": 17 }, { "epoch": 0.0782608695652174, "grad_norm": 1.0210895538330078, "learning_rate": 7.391304347826086e-05, "loss": 2.4501, "step": 18 }, { "epoch": 0.08260869565217391, "grad_norm": 0.8942164182662964, "learning_rate": 7.82608695652174e-05, "loss": 2.2934, "step": 19 }, { "epoch": 0.08695652173913043, "grad_norm": 0.8361735343933105, "learning_rate": 8.260869565217392e-05, "loss": 2.2029, "step": 20 }, { "epoch": 0.09130434782608696, "grad_norm": 0.794482409954071, "learning_rate": 8.695652173913044e-05, "loss": 2.0223, "step": 21 }, { "epoch": 0.09565217391304348, "grad_norm": 0.7513137459754944, "learning_rate": 9.130434782608696e-05, "loss": 1.8504, "step": 22 }, { "epoch": 0.1, "grad_norm": 0.76312655210495, "learning_rate": 9.565217391304348e-05, "loss": 1.6577, "step": 23 }, { "epoch": 0.10434782608695652, "grad_norm": 0.8560758829116821, "learning_rate": 0.0001, "loss": 1.5565, "step": 24 }, { "epoch": 0.10869565217391304, "grad_norm": 0.7479954957962036, "learning_rate": 0.00010434782608695653, "loss": 1.4364, "step": 25 }, { "epoch": 0.11304347826086956, "grad_norm": 0.5951140522956848, "learning_rate": 0.00010869565217391305, "loss": 1.2957, "step": 26 }, { "epoch": 0.11739130434782609, "grad_norm": 0.503224790096283, "learning_rate": 0.00011304347826086956, "loss": 1.1799, "step": 27 }, { "epoch": 0.12173913043478261, "grad_norm": 0.47480374574661255, "learning_rate": 0.0001173913043478261, "loss": 1.1277, "step": 28 }, { "epoch": 0.12608695652173912, "grad_norm": 0.38552260398864746, "learning_rate": 0.00012173913043478263, "loss": 1.0744, "step": 29 }, { "epoch": 0.13043478260869565, "grad_norm": 0.35596558451652527, "learning_rate": 0.00012608695652173915, "loss": 1.0023, "step": 30 }, { "epoch": 0.13478260869565217, "grad_norm": 0.32971665263175964, "learning_rate": 0.00013043478260869567, "loss": 0.9691, "step": 31 }, { "epoch": 0.1391304347826087, "grad_norm": 0.37770169973373413, "learning_rate": 0.0001347826086956522, "loss": 0.9116, "step": 32 }, { "epoch": 0.14347826086956522, "grad_norm": 0.22640736401081085, "learning_rate": 0.0001391304347826087, "loss": 0.8613, "step": 33 }, { "epoch": 0.14782608695652175, "grad_norm": 0.20925410091876984, "learning_rate": 0.0001434782608695652, "loss": 0.8836, "step": 34 }, { "epoch": 0.15217391304347827, "grad_norm": 0.20542123913764954, "learning_rate": 0.00014782608695652173, "loss": 0.8502, "step": 35 }, { "epoch": 0.1565217391304348, "grad_norm": 0.16715222597122192, "learning_rate": 0.00015217391304347827, "loss": 0.8292, "step": 36 }, { "epoch": 0.1608695652173913, "grad_norm": 0.1648133248090744, "learning_rate": 0.0001565217391304348, "loss": 0.8189, "step": 37 }, { "epoch": 0.16521739130434782, "grad_norm": 0.13562779128551483, "learning_rate": 0.00016086956521739132, "loss": 0.8078, "step": 38 }, { "epoch": 0.16956521739130434, "grad_norm": 0.1290610432624817, "learning_rate": 0.00016521739130434784, "loss": 0.7712, "step": 39 }, { "epoch": 0.17391304347826086, "grad_norm": 0.11024343967437744, "learning_rate": 0.00016956521739130436, "loss": 0.7448, "step": 40 }, { "epoch": 0.1782608695652174, "grad_norm": 0.12418993562459946, "learning_rate": 0.00017391304347826088, "loss": 0.7633, "step": 41 }, { "epoch": 0.1826086956521739, "grad_norm": 0.10319849103689194, "learning_rate": 0.0001782608695652174, "loss": 0.7463, "step": 42 }, { "epoch": 0.18695652173913044, "grad_norm": 0.10371455550193787, "learning_rate": 0.00018260869565217392, "loss": 0.7516, "step": 43 }, { "epoch": 0.19130434782608696, "grad_norm": 0.09219090640544891, "learning_rate": 0.00018695652173913045, "loss": 0.7265, "step": 44 }, { "epoch": 0.1956521739130435, "grad_norm": 0.09577666968107224, "learning_rate": 0.00019130434782608697, "loss": 0.7382, "step": 45 }, { "epoch": 0.2, "grad_norm": 0.08755916357040405, "learning_rate": 0.0001956521739130435, "loss": 0.7392, "step": 46 }, { "epoch": 0.20434782608695654, "grad_norm": 0.08335893601179123, "learning_rate": 0.0002, "loss": 0.7182, "step": 47 }, { "epoch": 0.20869565217391303, "grad_norm": 0.08622466027736664, "learning_rate": 0.00019999712083215463, "loss": 0.7196, "step": 48 }, { "epoch": 0.21304347826086956, "grad_norm": 0.07222707569599152, "learning_rate": 0.00019998848349441062, "loss": 0.7014, "step": 49 }, { "epoch": 0.21739130434782608, "grad_norm": 0.07286012172698975, "learning_rate": 0.00019997408848413493, "loss": 0.6986, "step": 50 }, { "epoch": 0.2217391304347826, "grad_norm": 0.07811558246612549, "learning_rate": 0.00019995393663024054, "loss": 0.6922, "step": 51 }, { "epoch": 0.22608695652173913, "grad_norm": 0.07095416635274887, "learning_rate": 0.0001999280290931388, "loss": 0.7188, "step": 52 }, { "epoch": 0.23043478260869565, "grad_norm": 0.0705651044845581, "learning_rate": 0.00019989636736467278, "loss": 0.7135, "step": 53 }, { "epoch": 0.23478260869565218, "grad_norm": 0.0649741142988205, "learning_rate": 0.00019985895326803097, "loss": 0.6833, "step": 54 }, { "epoch": 0.2391304347826087, "grad_norm": 0.07023416459560394, "learning_rate": 0.00019981578895764273, "loss": 0.6902, "step": 55 }, { "epoch": 0.24347826086956523, "grad_norm": 0.065043605864048, "learning_rate": 0.00019976687691905393, "loss": 0.6933, "step": 56 }, { "epoch": 0.24782608695652175, "grad_norm": 0.0647321566939354, "learning_rate": 0.00019971221996878394, "loss": 0.6946, "step": 57 }, { "epoch": 0.25217391304347825, "grad_norm": 0.08214448392391205, "learning_rate": 0.0001996518212541634, "loss": 0.6789, "step": 58 }, { "epoch": 0.2565217391304348, "grad_norm": 0.06106014922261238, "learning_rate": 0.00019958568425315314, "loss": 0.6826, "step": 59 }, { "epoch": 0.2608695652173913, "grad_norm": 0.06052952632308006, "learning_rate": 0.0001995138127741436, "loss": 0.6706, "step": 60 }, { "epoch": 0.26521739130434785, "grad_norm": 0.06265316903591156, "learning_rate": 0.00019943621095573586, "loss": 0.6809, "step": 61 }, { "epoch": 0.26956521739130435, "grad_norm": 0.0603368878364563, "learning_rate": 0.00019935288326650312, "loss": 0.6728, "step": 62 }, { "epoch": 0.27391304347826084, "grad_norm": 0.06611189991235733, "learning_rate": 0.00019926383450473344, "loss": 0.6499, "step": 63 }, { "epoch": 0.2782608695652174, "grad_norm": 0.06278355419635773, "learning_rate": 0.00019916906979815347, "loss": 0.6561, "step": 64 }, { "epoch": 0.2826086956521739, "grad_norm": 0.07379094511270523, "learning_rate": 0.00019906859460363307, "loss": 0.6786, "step": 65 }, { "epoch": 0.28695652173913044, "grad_norm": 0.09574166685342789, "learning_rate": 0.0001989624147068713, "loss": 0.6625, "step": 66 }, { "epoch": 0.29130434782608694, "grad_norm": 0.08743462711572647, "learning_rate": 0.00019885053622206304, "loss": 0.648, "step": 67 }, { "epoch": 0.2956521739130435, "grad_norm": 0.08914034813642502, "learning_rate": 0.00019873296559154698, "loss": 0.6561, "step": 68 }, { "epoch": 0.3, "grad_norm": 0.06804706901311874, "learning_rate": 0.0001986097095854347, "loss": 0.658, "step": 69 }, { "epoch": 0.30434782608695654, "grad_norm": 0.09893489629030228, "learning_rate": 0.00019848077530122083, "loss": 0.6708, "step": 70 }, { "epoch": 0.30869565217391304, "grad_norm": 0.07928409427404404, "learning_rate": 0.0001983461701633742, "loss": 0.6407, "step": 71 }, { "epoch": 0.3130434782608696, "grad_norm": 0.07455449551343918, "learning_rate": 0.0001982059019229106, "loss": 0.676, "step": 72 }, { "epoch": 0.3173913043478261, "grad_norm": 0.0770968496799469, "learning_rate": 0.00019805997865694614, "loss": 0.6639, "step": 73 }, { "epoch": 0.3217391304347826, "grad_norm": 0.06771919876337051, "learning_rate": 0.00019790840876823232, "loss": 0.6486, "step": 74 }, { "epoch": 0.32608695652173914, "grad_norm": 0.07457810640335083, "learning_rate": 0.0001977512009846721, "loss": 0.6681, "step": 75 }, { "epoch": 0.33043478260869563, "grad_norm": 0.0826922208070755, "learning_rate": 0.00019758836435881746, "loss": 0.6356, "step": 76 }, { "epoch": 0.3347826086956522, "grad_norm": 0.07923886179924011, "learning_rate": 0.00019741990826734794, "loss": 0.6682, "step": 77 }, { "epoch": 0.3391304347826087, "grad_norm": 0.11045071482658386, "learning_rate": 0.0001972458424105307, "loss": 0.6203, "step": 78 }, { "epoch": 0.34347826086956523, "grad_norm": 0.11731227487325668, "learning_rate": 0.00019706617681166218, "loss": 0.66, "step": 79 }, { "epoch": 0.34782608695652173, "grad_norm": 0.12649305164813995, "learning_rate": 0.00019688092181649065, "loss": 0.6613, "step": 80 }, { "epoch": 0.3521739130434783, "grad_norm": 0.1144268661737442, "learning_rate": 0.00019669008809262062, "loss": 0.6606, "step": 81 }, { "epoch": 0.3565217391304348, "grad_norm": 0.11361440271139145, "learning_rate": 0.00019649368662889855, "loss": 0.629, "step": 82 }, { "epoch": 0.36086956521739133, "grad_norm": 0.12539249658584595, "learning_rate": 0.00019629172873477995, "loss": 0.6676, "step": 83 }, { "epoch": 0.3652173913043478, "grad_norm": 0.11141279339790344, "learning_rate": 0.00019608422603967836, "loss": 0.6376, "step": 84 }, { "epoch": 0.3695652173913043, "grad_norm": 0.09837634861469269, "learning_rate": 0.00019587119049229557, "loss": 0.6503, "step": 85 }, { "epoch": 0.3739130434782609, "grad_norm": 0.15677575767040253, "learning_rate": 0.0001956526343599335, "loss": 0.6638, "step": 86 }, { "epoch": 0.3782608695652174, "grad_norm": 0.252825528383255, "learning_rate": 0.0001954285702277879, "loss": 0.6713, "step": 87 }, { "epoch": 0.3826086956521739, "grad_norm": 0.3602813482284546, "learning_rate": 0.00019519901099822372, "loss": 0.6596, "step": 88 }, { "epoch": 0.3869565217391304, "grad_norm": 0.3970949053764343, "learning_rate": 0.00019496396989003193, "loss": 0.6617, "step": 89 }, { "epoch": 0.391304347826087, "grad_norm": 0.284343421459198, "learning_rate": 0.00019472346043766865, "loss": 0.6229, "step": 90 }, { "epoch": 0.39565217391304347, "grad_norm": 0.19832171499729156, "learning_rate": 0.00019447749649047542, "loss": 0.6665, "step": 91 }, { "epoch": 0.4, "grad_norm": 0.24541743099689484, "learning_rate": 0.00019422609221188207, "loss": 0.6585, "step": 92 }, { "epoch": 0.4043478260869565, "grad_norm": 0.1915537267923355, "learning_rate": 0.00019396926207859084, "loss": 0.6343, "step": 93 }, { "epoch": 0.40869565217391307, "grad_norm": 0.20492875576019287, "learning_rate": 0.00019370702087974302, "loss": 0.6438, "step": 94 }, { "epoch": 0.41304347826086957, "grad_norm": 0.25835996866226196, "learning_rate": 0.00019343938371606712, "loss": 0.6502, "step": 95 }, { "epoch": 0.41739130434782606, "grad_norm": 0.2585464417934418, "learning_rate": 0.00019316636599900946, "loss": 0.6393, "step": 96 }, { "epoch": 0.4217391304347826, "grad_norm": 0.2317182868719101, "learning_rate": 0.00019288798344984672, "loss": 0.6275, "step": 97 }, { "epoch": 0.4260869565217391, "grad_norm": 0.23632416129112244, "learning_rate": 0.00019260425209878052, "loss": 0.6414, "step": 98 }, { "epoch": 0.43043478260869567, "grad_norm": 0.1801244169473648, "learning_rate": 0.00019231518828401458, "loss": 0.6491, "step": 99 }, { "epoch": 0.43478260869565216, "grad_norm": 0.24871514737606049, "learning_rate": 0.00019202080865081368, "loss": 0.6581, "step": 100 }, { "epoch": 0.4391304347826087, "grad_norm": 0.26276353001594543, "learning_rate": 0.00019172113015054532, "loss": 0.644, "step": 101 }, { "epoch": 0.4434782608695652, "grad_norm": 0.19743724167346954, "learning_rate": 0.0001914161700397035, "loss": 0.6519, "step": 102 }, { "epoch": 0.44782608695652176, "grad_norm": 0.31385916471481323, "learning_rate": 0.00019110594587891519, "loss": 0.6462, "step": 103 }, { "epoch": 0.45217391304347826, "grad_norm": 0.2689647674560547, "learning_rate": 0.0001907904755319289, "loss": 0.6517, "step": 104 }, { "epoch": 0.45652173913043476, "grad_norm": 0.17245543003082275, "learning_rate": 0.00019046977716458626, "loss": 0.6245, "step": 105 }, { "epoch": 0.4608695652173913, "grad_norm": 0.4380849003791809, "learning_rate": 0.00019014386924377582, "loss": 0.6519, "step": 106 }, { "epoch": 0.4652173913043478, "grad_norm": 0.305043488740921, "learning_rate": 0.0001898127705363696, "loss": 0.6606, "step": 107 }, { "epoch": 0.46956521739130436, "grad_norm": 0.20340269804000854, "learning_rate": 0.0001894765001081428, "loss": 0.6359, "step": 108 }, { "epoch": 0.47391304347826085, "grad_norm": 0.15703125298023224, "learning_rate": 0.0001891350773226754, "loss": 0.6461, "step": 109 }, { "epoch": 0.4782608695652174, "grad_norm": 0.16932646930217743, "learning_rate": 0.0001887885218402375, "loss": 0.6413, "step": 110 }, { "epoch": 0.4826086956521739, "grad_norm": 0.1790553480386734, "learning_rate": 0.00018843685361665723, "loss": 0.6378, "step": 111 }, { "epoch": 0.48695652173913045, "grad_norm": 0.24903282523155212, "learning_rate": 0.00018808009290217136, "loss": 0.6308, "step": 112 }, { "epoch": 0.49130434782608695, "grad_norm": 0.20529182255268097, "learning_rate": 0.00018771826024025946, "loss": 0.6315, "step": 113 }, { "epoch": 0.4956521739130435, "grad_norm": 0.18206629157066345, "learning_rate": 0.00018735137646646078, "loss": 0.6409, "step": 114 }, { "epoch": 0.5, "grad_norm": 0.22906547784805298, "learning_rate": 0.00018697946270717467, "loss": 0.6522, "step": 115 }, { "epoch": 0.5043478260869565, "grad_norm": 0.23560722172260284, "learning_rate": 0.00018660254037844388, "loss": 0.6424, "step": 116 }, { "epoch": 0.508695652173913, "grad_norm": 0.3479248881340027, "learning_rate": 0.00018622063118472134, "loss": 0.6591, "step": 117 }, { "epoch": 0.5130434782608696, "grad_norm": 0.48405924439430237, "learning_rate": 0.00018583375711762052, "loss": 0.6312, "step": 118 }, { "epoch": 0.5173913043478261, "grad_norm": 0.6660999655723572, "learning_rate": 0.00018544194045464886, "loss": 0.6492, "step": 119 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6070662140846252, "learning_rate": 0.0001850452037579251, "loss": 0.631, "step": 120 }, { "epoch": 0.5260869565217391, "grad_norm": 0.2432556301355362, "learning_rate": 0.00018464356987288013, "loss": 0.6192, "step": 121 }, { "epoch": 0.5304347826086957, "grad_norm": 0.4718700647354126, "learning_rate": 0.00018423706192694116, "loss": 0.6385, "step": 122 }, { "epoch": 0.5347826086956522, "grad_norm": 0.41220200061798096, "learning_rate": 0.00018382570332820043, "loss": 0.6362, "step": 123 }, { "epoch": 0.5391304347826087, "grad_norm": 0.24313992261886597, "learning_rate": 0.00018340951776406694, "loss": 0.659, "step": 124 }, { "epoch": 0.5434782608695652, "grad_norm": 0.42307668924331665, "learning_rate": 0.00018298852919990252, "loss": 0.6484, "step": 125 }, { "epoch": 0.5478260869565217, "grad_norm": 0.2858572006225586, "learning_rate": 0.00018256276187764197, "loss": 0.6437, "step": 126 }, { "epoch": 0.5521739130434783, "grad_norm": 0.2318851351737976, "learning_rate": 0.0001821322403143969, "loss": 0.6191, "step": 127 }, { "epoch": 0.5565217391304348, "grad_norm": 0.3861188292503357, "learning_rate": 0.0001816969893010442, "loss": 0.639, "step": 128 }, { "epoch": 0.5608695652173913, "grad_norm": 0.2969801127910614, "learning_rate": 0.0001812570339007983, "loss": 0.6624, "step": 129 }, { "epoch": 0.5652173913043478, "grad_norm": 0.29341548681259155, "learning_rate": 0.00018081239944776805, "loss": 0.639, "step": 130 }, { "epoch": 0.5695652173913044, "grad_norm": 0.43678849935531616, "learning_rate": 0.00018036311154549784, "loss": 0.6384, "step": 131 }, { "epoch": 0.5739130434782609, "grad_norm": 0.5248069167137146, "learning_rate": 0.00017990919606549328, "loss": 0.6451, "step": 132 }, { "epoch": 0.5782608695652174, "grad_norm": 0.5387030243873596, "learning_rate": 0.00017945067914573146, "loss": 0.6198, "step": 133 }, { "epoch": 0.5826086956521739, "grad_norm": 0.55666184425354, "learning_rate": 0.00017898758718915586, "loss": 0.6391, "step": 134 }, { "epoch": 0.5869565217391305, "grad_norm": 0.4839560389518738, "learning_rate": 0.0001785199468621559, "loss": 0.6411, "step": 135 }, { "epoch": 0.591304347826087, "grad_norm": 0.5173195004463196, "learning_rate": 0.00017804778509303138, "loss": 0.6318, "step": 136 }, { "epoch": 0.5956521739130435, "grad_norm": 0.341448038816452, "learning_rate": 0.000177571129070442, "loss": 0.6427, "step": 137 }, { "epoch": 0.6, "grad_norm": 0.2654604911804199, "learning_rate": 0.00017709000624184162, "loss": 0.616, "step": 138 }, { "epoch": 0.6043478260869565, "grad_norm": 0.4000408351421356, "learning_rate": 0.0001766044443118978, "loss": 0.611, "step": 139 }, { "epoch": 0.6086956521739131, "grad_norm": 0.2812383770942688, "learning_rate": 0.00017611447124089649, "loss": 0.6508, "step": 140 }, { "epoch": 0.6130434782608696, "grad_norm": 0.30483949184417725, "learning_rate": 0.00017562011524313185, "loss": 0.6628, "step": 141 }, { "epoch": 0.6173913043478261, "grad_norm": 0.4457907974720001, "learning_rate": 0.0001751214047852818, "loss": 0.6274, "step": 142 }, { "epoch": 0.6217391304347826, "grad_norm": 0.38395488262176514, "learning_rate": 0.00017461836858476856, "loss": 0.6528, "step": 143 }, { "epoch": 0.6260869565217392, "grad_norm": 0.573344886302948, "learning_rate": 0.00017411103560810526, "loss": 0.6504, "step": 144 }, { "epoch": 0.6304347826086957, "grad_norm": 0.5133661031723022, "learning_rate": 0.00017359943506922774, "loss": 0.6334, "step": 145 }, { "epoch": 0.6347826086956522, "grad_norm": 0.2995568513870239, "learning_rate": 0.00017308359642781242, "loss": 0.6328, "step": 146 }, { "epoch": 0.6391304347826087, "grad_norm": 0.5677820444107056, "learning_rate": 0.0001725635493875799, "loss": 0.639, "step": 147 }, { "epoch": 0.6434782608695652, "grad_norm": 0.4751092791557312, "learning_rate": 0.00017203932389458454, "loss": 0.6229, "step": 148 }, { "epoch": 0.6478260869565218, "grad_norm": 0.4374710023403168, "learning_rate": 0.00017151095013548994, "loss": 0.6377, "step": 149 }, { "epoch": 0.6521739130434783, "grad_norm": 0.4172927439212799, "learning_rate": 0.0001709784585358309, "loss": 0.6277, "step": 150 }, { "epoch": 0.6565217391304348, "grad_norm": 0.3994798958301544, "learning_rate": 0.00017044187975826124, "loss": 0.637, "step": 151 }, { "epoch": 0.6608695652173913, "grad_norm": 0.34366917610168457, "learning_rate": 0.00016990124470078822, "loss": 0.6556, "step": 152 }, { "epoch": 0.6652173913043479, "grad_norm": 0.533347487449646, "learning_rate": 0.0001693565844949933, "loss": 0.6073, "step": 153 }, { "epoch": 0.6695652173913044, "grad_norm": 0.4292946457862854, "learning_rate": 0.0001688079305042395, "loss": 0.6548, "step": 154 }, { "epoch": 0.6739130434782609, "grad_norm": 0.2770076394081116, "learning_rate": 0.00016825531432186543, "loss": 0.6014, "step": 155 }, { "epoch": 0.6782608695652174, "grad_norm": 0.377838134765625, "learning_rate": 0.0001676987677693659, "loss": 0.6406, "step": 156 }, { "epoch": 0.6826086956521739, "grad_norm": 0.421268492937088, "learning_rate": 0.0001671383228945597, "loss": 0.6288, "step": 157 }, { "epoch": 0.6869565217391305, "grad_norm": 0.4219221770763397, "learning_rate": 0.00016657401196974405, "loss": 0.647, "step": 158 }, { "epoch": 0.691304347826087, "grad_norm": 0.3563760221004486, "learning_rate": 0.00016600586748983641, "loss": 0.6307, "step": 159 }, { "epoch": 0.6956521739130435, "grad_norm": 0.39387866854667664, "learning_rate": 0.00016543392217050314, "loss": 0.631, "step": 160 }, { "epoch": 0.7, "grad_norm": 0.36268243193626404, "learning_rate": 0.0001648582089462756, "loss": 0.6429, "step": 161 }, { "epoch": 0.7043478260869566, "grad_norm": 0.3702019155025482, "learning_rate": 0.00016427876096865394, "loss": 0.6338, "step": 162 }, { "epoch": 0.7086956521739131, "grad_norm": 0.44408297538757324, "learning_rate": 0.00016369561160419784, "loss": 0.6416, "step": 163 }, { "epoch": 0.7130434782608696, "grad_norm": 0.5986080765724182, "learning_rate": 0.00016310879443260528, "loss": 0.6187, "step": 164 }, { "epoch": 0.717391304347826, "grad_norm": 0.7963016629219055, "learning_rate": 0.0001625183432447789, "loss": 0.6365, "step": 165 }, { "epoch": 0.7217391304347827, "grad_norm": 1.2156025171279907, "learning_rate": 0.0001619242920408802, "loss": 0.6625, "step": 166 }, { "epoch": 0.7260869565217392, "grad_norm": 0.7924716472625732, "learning_rate": 0.00016132667502837165, "loss": 0.6276, "step": 167 }, { "epoch": 0.7304347826086957, "grad_norm": 0.29551273584365845, "learning_rate": 0.00016072552662004696, "loss": 0.6159, "step": 168 }, { "epoch": 0.7347826086956522, "grad_norm": 0.7566269040107727, "learning_rate": 0.00016012088143204953, "loss": 0.6485, "step": 169 }, { "epoch": 0.7391304347826086, "grad_norm": 1.001354455947876, "learning_rate": 0.00015951277428187898, "loss": 0.6323, "step": 170 }, { "epoch": 0.7434782608695653, "grad_norm": 0.9103027582168579, "learning_rate": 0.00015890124018638638, "loss": 0.6255, "step": 171 }, { "epoch": 0.7478260869565218, "grad_norm": 0.3885137736797333, "learning_rate": 0.00015828631435975784, "loss": 0.6323, "step": 172 }, { "epoch": 0.7521739130434782, "grad_norm": 0.6141281723976135, "learning_rate": 0.00015766803221148673, "loss": 0.6504, "step": 173 }, { "epoch": 0.7565217391304347, "grad_norm": 0.8024821281433105, "learning_rate": 0.0001570464293443346, "loss": 0.641, "step": 174 }, { "epoch": 0.7608695652173914, "grad_norm": 0.43333736062049866, "learning_rate": 0.00015642154155228122, "loss": 0.627, "step": 175 }, { "epoch": 0.7652173913043478, "grad_norm": 0.649389922618866, "learning_rate": 0.00015579340481846336, "loss": 0.6483, "step": 176 }, { "epoch": 0.7695652173913043, "grad_norm": 1.0359424352645874, "learning_rate": 0.00015516205531310273, "loss": 0.6332, "step": 177 }, { "epoch": 0.7739130434782608, "grad_norm": 0.7209396362304688, "learning_rate": 0.00015452752939142328, "loss": 0.6524, "step": 178 }, { "epoch": 0.7782608695652173, "grad_norm": 0.6178513169288635, "learning_rate": 0.00015388986359155758, "loss": 0.645, "step": 179 }, { "epoch": 0.782608695652174, "grad_norm": 0.9886595606803894, "learning_rate": 0.00015324909463244296, "loss": 0.6642, "step": 180 }, { "epoch": 0.7869565217391304, "grad_norm": 0.7466373443603516, "learning_rate": 0.00015260525941170712, "loss": 0.6315, "step": 181 }, { "epoch": 0.7913043478260869, "grad_norm": 0.5552679896354675, "learning_rate": 0.00015195839500354335, "loss": 0.6207, "step": 182 }, { "epoch": 0.7956521739130434, "grad_norm": 0.5576688647270203, "learning_rate": 0.0001513085386565758, "loss": 0.6421, "step": 183 }, { "epoch": 0.8, "grad_norm": 0.4000707268714905, "learning_rate": 0.00015065572779171432, "loss": 0.6398, "step": 184 }, { "epoch": 0.8043478260869565, "grad_norm": 0.4978863298892975, "learning_rate": 0.00015000000000000001, "loss": 0.6456, "step": 185 }, { "epoch": 0.808695652173913, "grad_norm": 0.4530424177646637, "learning_rate": 0.00014934139304044033, "loss": 0.6453, "step": 186 }, { "epoch": 0.8130434782608695, "grad_norm": 0.29163071513175964, "learning_rate": 0.00014867994483783485, "loss": 0.6558, "step": 187 }, { "epoch": 0.8173913043478261, "grad_norm": 0.33445900678634644, "learning_rate": 0.00014801569348059157, "loss": 0.6291, "step": 188 }, { "epoch": 0.8217391304347826, "grad_norm": 0.3891032934188843, "learning_rate": 0.0001473486772185334, "loss": 0.6458, "step": 189 }, { "epoch": 0.8260869565217391, "grad_norm": 0.4320944845676422, "learning_rate": 0.00014667893446069588, "loss": 0.6275, "step": 190 }, { "epoch": 0.8304347826086956, "grad_norm": 0.3652418553829193, "learning_rate": 0.00014600650377311522, "loss": 0.6434, "step": 191 }, { "epoch": 0.8347826086956521, "grad_norm": 0.2939096689224243, "learning_rate": 0.00014533142387660773, "loss": 0.6462, "step": 192 }, { "epoch": 0.8391304347826087, "grad_norm": 0.36094796657562256, "learning_rate": 0.00014465373364454001, "loss": 0.6259, "step": 193 }, { "epoch": 0.8434782608695652, "grad_norm": 0.503746747970581, "learning_rate": 0.00014397347210059057, "loss": 0.6565, "step": 194 }, { "epoch": 0.8478260869565217, "grad_norm": 0.501377522945404, "learning_rate": 0.00014329067841650274, "loss": 0.6358, "step": 195 }, { "epoch": 0.8521739130434782, "grad_norm": 0.40720251202583313, "learning_rate": 0.00014260539190982886, "loss": 0.636, "step": 196 }, { "epoch": 0.8565217391304348, "grad_norm": 0.3170947730541229, "learning_rate": 0.00014191765204166643, "loss": 0.6343, "step": 197 }, { "epoch": 0.8608695652173913, "grad_norm": 0.43554455041885376, "learning_rate": 0.00014122749841438575, "loss": 0.6319, "step": 198 }, { "epoch": 0.8652173913043478, "grad_norm": 0.5128415822982788, "learning_rate": 0.00014053497076934948, "loss": 0.6326, "step": 199 }, { "epoch": 0.8695652173913043, "grad_norm": 0.44992515444755554, "learning_rate": 0.00013984010898462416, "loss": 0.6343, "step": 200 }, { "epoch": 0.8739130434782608, "grad_norm": 0.506968080997467, "learning_rate": 0.00013914295307268396, "loss": 0.6472, "step": 201 }, { "epoch": 0.8782608695652174, "grad_norm": 0.6257392764091492, "learning_rate": 0.0001384435431781065, "loss": 0.6535, "step": 202 }, { "epoch": 0.8826086956521739, "grad_norm": 0.9480230808258057, "learning_rate": 0.00013774191957526143, "loss": 0.6628, "step": 203 }, { "epoch": 0.8869565217391304, "grad_norm": 1.2171893119812012, "learning_rate": 0.00013703812266599113, "loss": 0.6585, "step": 204 }, { "epoch": 0.8913043478260869, "grad_norm": 0.3134421110153198, "learning_rate": 0.00013633219297728416, "loss": 0.6629, "step": 205 }, { "epoch": 0.8956521739130435, "grad_norm": 1.003349781036377, "learning_rate": 0.00013562417115894172, "loss": 0.6516, "step": 206 }, { "epoch": 0.9, "grad_norm": 1.246419906616211, "learning_rate": 0.00013491409798123687, "loss": 0.6418, "step": 207 }, { "epoch": 0.9043478260869565, "grad_norm": 0.46948862075805664, "learning_rate": 0.00013420201433256689, "loss": 0.6441, "step": 208 }, { "epoch": 0.908695652173913, "grad_norm": 1.628340244293213, "learning_rate": 0.00013348796121709862, "loss": 0.6661, "step": 209 }, { "epoch": 0.9130434782608695, "grad_norm": 0.4027623236179352, "learning_rate": 0.0001327719797524075, "loss": 0.6342, "step": 210 }, { "epoch": 0.9173913043478261, "grad_norm": 1.3196384906768799, "learning_rate": 0.00013205411116710972, "loss": 0.6724, "step": 211 }, { "epoch": 0.9217391304347826, "grad_norm": 0.561631977558136, "learning_rate": 0.00013133439679848823, "loss": 0.6541, "step": 212 }, { "epoch": 0.9260869565217391, "grad_norm": 0.7715569734573364, "learning_rate": 0.00013061287809011242, "loss": 0.6419, "step": 213 }, { "epoch": 0.9304347826086956, "grad_norm": 0.8591257333755493, "learning_rate": 0.0001298895965894516, "loss": 0.6197, "step": 214 }, { "epoch": 0.9347826086956522, "grad_norm": 0.4229847192764282, "learning_rate": 0.0001291645939454825, "loss": 0.6472, "step": 215 }, { "epoch": 0.9391304347826087, "grad_norm": 0.7943733930587769, "learning_rate": 0.0001284379119062912, "loss": 0.6576, "step": 216 }, { "epoch": 0.9434782608695652, "grad_norm": 0.7454273104667664, "learning_rate": 0.0001277095923166689, "loss": 0.6245, "step": 217 }, { "epoch": 0.9478260869565217, "grad_norm": 0.4976602792739868, "learning_rate": 0.00012697967711570242, "loss": 0.644, "step": 218 }, { "epoch": 0.9521739130434783, "grad_norm": 0.6845293641090393, "learning_rate": 0.00012624820833435937, "loss": 0.6412, "step": 219 }, { "epoch": 0.9565217391304348, "grad_norm": 0.7265484929084778, "learning_rate": 0.0001255152280930676, "loss": 0.6438, "step": 220 }, { "epoch": 0.9608695652173913, "grad_norm": 0.4346272647380829, "learning_rate": 0.00012478077859929, "loss": 0.6116, "step": 221 }, { "epoch": 0.9652173913043478, "grad_norm": 0.5768253803253174, "learning_rate": 0.00012404490214509386, "loss": 0.6242, "step": 222 }, { "epoch": 0.9695652173913043, "grad_norm": 0.688556969165802, "learning_rate": 0.00012330764110471566, "loss": 0.6546, "step": 223 }, { "epoch": 0.9739130434782609, "grad_norm": 0.6147114634513855, "learning_rate": 0.00012256903793212107, "loss": 0.6286, "step": 224 }, { "epoch": 0.9782608695652174, "grad_norm": 0.6598117351531982, "learning_rate": 0.00012182913515856015, "loss": 0.65, "step": 225 }, { "epoch": 0.9826086956521739, "grad_norm": 0.6232290863990784, "learning_rate": 0.00012108797539011847, "loss": 0.6465, "step": 226 }, { "epoch": 0.9869565217391304, "grad_norm": 0.3764599561691284, "learning_rate": 0.0001203456013052634, "loss": 0.6397, "step": 227 }, { "epoch": 0.991304347826087, "grad_norm": 0.4177006781101227, "learning_rate": 0.00011960205565238684, "loss": 0.6324, "step": 228 }, { "epoch": 0.9956521739130435, "grad_norm": 0.6632861495018005, "learning_rate": 0.00011885738124734358, "loss": 0.6394, "step": 229 }, { "epoch": 1.0, "grad_norm": 0.8406037092208862, "learning_rate": 0.00011811162097098558, "loss": 0.6563, "step": 230 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.970864260913562e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }