{ "best_global_step": 1280, "best_metric": 0.1124713271856308, "best_model_checkpoint": "../data/CADLlava_shuffle_Qwen3_adapt_4B/checkpoint-1280", "epoch": 1.9984613444318327, "eval_steps": 128, "global_step": 2476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008071635767435994, "grad_norm": 6.537415981292725, "learning_rate": 0.0, "loss": 0.1762, "step": 1 }, { "epoch": 0.001614327153487199, "grad_norm": 7.005346298217773, "learning_rate": 4.032258064516129e-07, "loss": 0.1796, "step": 2 }, { "epoch": 0.0024214907302307982, "grad_norm": 9.528681755065918, "learning_rate": 8.064516129032258e-07, "loss": 0.1903, "step": 3 }, { "epoch": 0.003228654306974398, "grad_norm": 9.784961700439453, "learning_rate": 1.2096774193548388e-06, "loss": 0.1974, "step": 4 }, { "epoch": 0.004035817883717997, "grad_norm": 7.837964057922363, "learning_rate": 1.6129032258064516e-06, "loss": 0.1833, "step": 5 }, { "epoch": 0.0048429814604615965, "grad_norm": 6.994354248046875, "learning_rate": 2.0161290322580646e-06, "loss": 0.1685, "step": 6 }, { "epoch": 0.005650145037205196, "grad_norm": 7.814670562744141, "learning_rate": 2.4193548387096776e-06, "loss": 0.1652, "step": 7 }, { "epoch": 0.006457308613948796, "grad_norm": 5.98387336730957, "learning_rate": 2.82258064516129e-06, "loss": 0.1581, "step": 8 }, { "epoch": 0.007264472190692395, "grad_norm": 6.336279392242432, "learning_rate": 3.225806451612903e-06, "loss": 0.1604, "step": 9 }, { "epoch": 0.008071635767435994, "grad_norm": 5.878385066986084, "learning_rate": 3.6290322580645166e-06, "loss": 0.1469, "step": 10 }, { "epoch": 0.008878799344179594, "grad_norm": 5.314018249511719, "learning_rate": 4.032258064516129e-06, "loss": 0.1557, "step": 11 }, { "epoch": 0.009685962920923193, "grad_norm": 4.8463215827941895, "learning_rate": 4.435483870967742e-06, "loss": 0.151, "step": 12 }, { "epoch": 0.010493126497666793, "grad_norm": 3.255847692489624, "learning_rate": 4.838709677419355e-06, "loss": 0.1383, "step": 13 }, { "epoch": 0.011300290074410392, "grad_norm": 3.546886682510376, "learning_rate": 5.241935483870968e-06, "loss": 0.1459, "step": 14 }, { "epoch": 0.012107453651153992, "grad_norm": 2.180122137069702, "learning_rate": 5.64516129032258e-06, "loss": 0.1269, "step": 15 }, { "epoch": 0.012914617227897591, "grad_norm": 1.679854393005371, "learning_rate": 6.048387096774194e-06, "loss": 0.1182, "step": 16 }, { "epoch": 0.01372178080464119, "grad_norm": 1.390531301498413, "learning_rate": 6.451612903225806e-06, "loss": 0.1148, "step": 17 }, { "epoch": 0.01452894438138479, "grad_norm": 1.2529916763305664, "learning_rate": 6.854838709677419e-06, "loss": 0.1275, "step": 18 }, { "epoch": 0.015336107958128389, "grad_norm": 0.9661539196968079, "learning_rate": 7.258064516129033e-06, "loss": 0.1102, "step": 19 }, { "epoch": 0.016143271534871988, "grad_norm": 0.878141462802887, "learning_rate": 7.661290322580646e-06, "loss": 0.1306, "step": 20 }, { "epoch": 0.016950435111615588, "grad_norm": 0.577216625213623, "learning_rate": 8.064516129032258e-06, "loss": 0.1247, "step": 21 }, { "epoch": 0.01775759868835919, "grad_norm": 0.8132558465003967, "learning_rate": 8.46774193548387e-06, "loss": 0.1129, "step": 22 }, { "epoch": 0.01856476226510279, "grad_norm": 0.3227245509624481, "learning_rate": 8.870967741935484e-06, "loss": 0.1143, "step": 23 }, { "epoch": 0.019371925841846386, "grad_norm": 0.31732091307640076, "learning_rate": 9.274193548387097e-06, "loss": 0.125, "step": 24 }, { "epoch": 0.020179089418589986, "grad_norm": 0.30689844489097595, "learning_rate": 9.67741935483871e-06, "loss": 0.1229, "step": 25 }, { "epoch": 0.020986252995333587, "grad_norm": 0.288072407245636, "learning_rate": 1.0080645161290323e-05, "loss": 0.1109, "step": 26 }, { "epoch": 0.021793416572077184, "grad_norm": 0.3111433982849121, "learning_rate": 1.0483870967741936e-05, "loss": 0.1068, "step": 27 }, { "epoch": 0.022600580148820784, "grad_norm": 0.4033043384552002, "learning_rate": 1.0887096774193549e-05, "loss": 0.1059, "step": 28 }, { "epoch": 0.023407743725564385, "grad_norm": 0.18217837810516357, "learning_rate": 1.129032258064516e-05, "loss": 0.1144, "step": 29 }, { "epoch": 0.024214907302307985, "grad_norm": 0.15868546068668365, "learning_rate": 1.1693548387096775e-05, "loss": 0.1178, "step": 30 }, { "epoch": 0.025022070879051582, "grad_norm": 0.16039422154426575, "learning_rate": 1.2096774193548388e-05, "loss": 0.1055, "step": 31 }, { "epoch": 0.025829234455795182, "grad_norm": 0.16415774822235107, "learning_rate": 1.25e-05, "loss": 0.1114, "step": 32 }, { "epoch": 0.026636398032538783, "grad_norm": 0.183277890086174, "learning_rate": 1.2903225806451613e-05, "loss": 0.1093, "step": 33 }, { "epoch": 0.02744356160928238, "grad_norm": 0.13378675282001495, "learning_rate": 1.3306451612903225e-05, "loss": 0.1102, "step": 34 }, { "epoch": 0.02825072518602598, "grad_norm": 0.12459564954042435, "learning_rate": 1.3709677419354839e-05, "loss": 0.117, "step": 35 }, { "epoch": 0.02905788876276958, "grad_norm": 0.1288418173789978, "learning_rate": 1.4112903225806454e-05, "loss": 0.1137, "step": 36 }, { "epoch": 0.02986505233951318, "grad_norm": 0.14229263365268707, "learning_rate": 1.4516129032258066e-05, "loss": 0.1212, "step": 37 }, { "epoch": 0.030672215916256778, "grad_norm": 0.1231795996427536, "learning_rate": 1.4919354838709679e-05, "loss": 0.1156, "step": 38 }, { "epoch": 0.03147937949300038, "grad_norm": 0.12648315727710724, "learning_rate": 1.5322580645161292e-05, "loss": 0.1041, "step": 39 }, { "epoch": 0.032286543069743975, "grad_norm": 0.08965980261564255, "learning_rate": 1.5725806451612903e-05, "loss": 0.1167, "step": 40 }, { "epoch": 0.03309370664648758, "grad_norm": 0.09989285469055176, "learning_rate": 1.6129032258064517e-05, "loss": 0.1112, "step": 41 }, { "epoch": 0.033900870223231176, "grad_norm": 0.08563119918107986, "learning_rate": 1.653225806451613e-05, "loss": 0.115, "step": 42 }, { "epoch": 0.03470803379997477, "grad_norm": 0.08803705871105194, "learning_rate": 1.693548387096774e-05, "loss": 0.1057, "step": 43 }, { "epoch": 0.03551519737671838, "grad_norm": 0.0755741223692894, "learning_rate": 1.733870967741936e-05, "loss": 0.1101, "step": 44 }, { "epoch": 0.036322360953461974, "grad_norm": 0.08133774250745773, "learning_rate": 1.774193548387097e-05, "loss": 0.1184, "step": 45 }, { "epoch": 0.03712952453020558, "grad_norm": 0.07770796865224838, "learning_rate": 1.8145161290322583e-05, "loss": 0.0933, "step": 46 }, { "epoch": 0.037936688106949175, "grad_norm": 0.07862403243780136, "learning_rate": 1.8548387096774193e-05, "loss": 0.1145, "step": 47 }, { "epoch": 0.03874385168369277, "grad_norm": 0.07872931659221649, "learning_rate": 1.8951612903225807e-05, "loss": 0.105, "step": 48 }, { "epoch": 0.039551015260436376, "grad_norm": 0.14976580440998077, "learning_rate": 1.935483870967742e-05, "loss": 0.1083, "step": 49 }, { "epoch": 0.04035817883717997, "grad_norm": 0.0725972130894661, "learning_rate": 1.975806451612903e-05, "loss": 0.1013, "step": 50 }, { "epoch": 0.04116534241392357, "grad_norm": 0.06562651693820953, "learning_rate": 2.0161290322580645e-05, "loss": 0.1048, "step": 51 }, { "epoch": 0.041972505990667174, "grad_norm": 0.06627114862203598, "learning_rate": 2.056451612903226e-05, "loss": 0.1144, "step": 52 }, { "epoch": 0.04277966956741077, "grad_norm": 0.06689553707838058, "learning_rate": 2.0967741935483873e-05, "loss": 0.1142, "step": 53 }, { "epoch": 0.04358683314415437, "grad_norm": 0.062286876142024994, "learning_rate": 2.1370967741935487e-05, "loss": 0.0958, "step": 54 }, { "epoch": 0.04439399672089797, "grad_norm": 0.25462499260902405, "learning_rate": 2.1774193548387097e-05, "loss": 0.1223, "step": 55 }, { "epoch": 0.04520116029764157, "grad_norm": 0.06840191036462784, "learning_rate": 2.217741935483871e-05, "loss": 0.1057, "step": 56 }, { "epoch": 0.046008323874385165, "grad_norm": 0.06521051377058029, "learning_rate": 2.258064516129032e-05, "loss": 0.1147, "step": 57 }, { "epoch": 0.04681548745112877, "grad_norm": 0.057607006281614304, "learning_rate": 2.2983870967741935e-05, "loss": 0.1063, "step": 58 }, { "epoch": 0.047622651027872366, "grad_norm": 0.06961840391159058, "learning_rate": 2.338709677419355e-05, "loss": 0.1008, "step": 59 }, { "epoch": 0.04842981460461597, "grad_norm": 0.06263464689254761, "learning_rate": 2.3790322580645163e-05, "loss": 0.1082, "step": 60 }, { "epoch": 0.04923697818135957, "grad_norm": 0.06535794585943222, "learning_rate": 2.4193548387096777e-05, "loss": 0.1111, "step": 61 }, { "epoch": 0.050044141758103164, "grad_norm": 0.07065515220165253, "learning_rate": 2.4596774193548387e-05, "loss": 0.1099, "step": 62 }, { "epoch": 0.05085130533484677, "grad_norm": 0.05753614008426666, "learning_rate": 2.5e-05, "loss": 0.1102, "step": 63 }, { "epoch": 0.051658468911590365, "grad_norm": 0.060760337859392166, "learning_rate": 2.5403225806451615e-05, "loss": 0.1044, "step": 64 }, { "epoch": 0.05246563248833396, "grad_norm": 0.06035856530070305, "learning_rate": 2.5806451612903226e-05, "loss": 0.1183, "step": 65 }, { "epoch": 0.053272796065077566, "grad_norm": 0.06560114771127701, "learning_rate": 2.620967741935484e-05, "loss": 0.1042, "step": 66 }, { "epoch": 0.05407995964182116, "grad_norm": 0.06319893151521683, "learning_rate": 2.661290322580645e-05, "loss": 0.1093, "step": 67 }, { "epoch": 0.05488712321856476, "grad_norm": 0.05948679894208908, "learning_rate": 2.7016129032258064e-05, "loss": 0.115, "step": 68 }, { "epoch": 0.05569428679530836, "grad_norm": 0.06028081849217415, "learning_rate": 2.7419354838709678e-05, "loss": 0.1163, "step": 69 }, { "epoch": 0.05650145037205196, "grad_norm": 0.05303265154361725, "learning_rate": 2.7822580645161288e-05, "loss": 0.098, "step": 70 }, { "epoch": 0.05730861394879556, "grad_norm": 0.060923706740140915, "learning_rate": 2.822580645161291e-05, "loss": 0.112, "step": 71 }, { "epoch": 0.05811577752553916, "grad_norm": 0.061492979526519775, "learning_rate": 2.862903225806452e-05, "loss": 0.0928, "step": 72 }, { "epoch": 0.05892294110228276, "grad_norm": 0.05839088559150696, "learning_rate": 2.9032258064516133e-05, "loss": 0.1146, "step": 73 }, { "epoch": 0.05973010467902636, "grad_norm": 0.07647139579057693, "learning_rate": 2.9435483870967743e-05, "loss": 0.1135, "step": 74 }, { "epoch": 0.06053726825576996, "grad_norm": 0.05788235366344452, "learning_rate": 2.9838709677419357e-05, "loss": 0.1116, "step": 75 }, { "epoch": 0.061344431832513556, "grad_norm": 0.059873975813388824, "learning_rate": 3.024193548387097e-05, "loss": 0.1071, "step": 76 }, { "epoch": 0.06215159540925716, "grad_norm": 0.05932065099477768, "learning_rate": 3.0645161290322585e-05, "loss": 0.1109, "step": 77 }, { "epoch": 0.06295875898600076, "grad_norm": 0.059455059468746185, "learning_rate": 3.1048387096774195e-05, "loss": 0.0982, "step": 78 }, { "epoch": 0.06376592256274435, "grad_norm": 0.055153969675302505, "learning_rate": 3.1451612903225806e-05, "loss": 0.1082, "step": 79 }, { "epoch": 0.06457308613948795, "grad_norm": 0.059384312480688095, "learning_rate": 3.185483870967742e-05, "loss": 0.0971, "step": 80 }, { "epoch": 0.06538024971623156, "grad_norm": 0.06253433227539062, "learning_rate": 3.2258064516129034e-05, "loss": 0.1007, "step": 81 }, { "epoch": 0.06618741329297516, "grad_norm": 0.056873831897974014, "learning_rate": 3.2661290322580644e-05, "loss": 0.1056, "step": 82 }, { "epoch": 0.06699457686971876, "grad_norm": 0.05717776343226433, "learning_rate": 3.306451612903226e-05, "loss": 0.1084, "step": 83 }, { "epoch": 0.06780174044646235, "grad_norm": 0.059075336903333664, "learning_rate": 3.346774193548387e-05, "loss": 0.1102, "step": 84 }, { "epoch": 0.06860890402320595, "grad_norm": 0.05745232105255127, "learning_rate": 3.387096774193548e-05, "loss": 0.1099, "step": 85 }, { "epoch": 0.06941606759994955, "grad_norm": 0.06274908035993576, "learning_rate": 3.427419354838709e-05, "loss": 0.1137, "step": 86 }, { "epoch": 0.07022323117669316, "grad_norm": 0.060963571071624756, "learning_rate": 3.467741935483872e-05, "loss": 0.0991, "step": 87 }, { "epoch": 0.07103039475343675, "grad_norm": 0.05854825675487518, "learning_rate": 3.508064516129033e-05, "loss": 0.0968, "step": 88 }, { "epoch": 0.07183755833018035, "grad_norm": 0.05948904901742935, "learning_rate": 3.548387096774194e-05, "loss": 0.1019, "step": 89 }, { "epoch": 0.07264472190692395, "grad_norm": 0.05700518190860748, "learning_rate": 3.5887096774193555e-05, "loss": 0.1014, "step": 90 }, { "epoch": 0.07345188548366754, "grad_norm": 0.05793029069900513, "learning_rate": 3.6290322580645165e-05, "loss": 0.1118, "step": 91 }, { "epoch": 0.07425904906041116, "grad_norm": 0.05789302662014961, "learning_rate": 3.6693548387096776e-05, "loss": 0.1171, "step": 92 }, { "epoch": 0.07506621263715475, "grad_norm": 0.05164478346705437, "learning_rate": 3.7096774193548386e-05, "loss": 0.1146, "step": 93 }, { "epoch": 0.07587337621389835, "grad_norm": 0.05558072775602341, "learning_rate": 3.7500000000000003e-05, "loss": 0.1088, "step": 94 }, { "epoch": 0.07668053979064195, "grad_norm": 0.06337806582450867, "learning_rate": 3.7903225806451614e-05, "loss": 0.1052, "step": 95 }, { "epoch": 0.07748770336738554, "grad_norm": 0.06151774525642395, "learning_rate": 3.8306451612903224e-05, "loss": 0.1139, "step": 96 }, { "epoch": 0.07829486694412914, "grad_norm": 0.05710529908537865, "learning_rate": 3.870967741935484e-05, "loss": 0.1077, "step": 97 }, { "epoch": 0.07910203052087275, "grad_norm": 0.056564535945653915, "learning_rate": 3.911290322580645e-05, "loss": 0.1066, "step": 98 }, { "epoch": 0.07990919409761635, "grad_norm": 0.05551023408770561, "learning_rate": 3.951612903225806e-05, "loss": 0.1029, "step": 99 }, { "epoch": 0.08071635767435995, "grad_norm": 0.05938760191202164, "learning_rate": 3.991935483870968e-05, "loss": 0.1195, "step": 100 }, { "epoch": 0.08152352125110354, "grad_norm": 0.05322523042559624, "learning_rate": 4.032258064516129e-05, "loss": 0.1044, "step": 101 }, { "epoch": 0.08233068482784714, "grad_norm": 0.05394389480352402, "learning_rate": 4.072580645161291e-05, "loss": 0.1174, "step": 102 }, { "epoch": 0.08313784840459074, "grad_norm": 0.054755546152591705, "learning_rate": 4.112903225806452e-05, "loss": 0.0926, "step": 103 }, { "epoch": 0.08394501198133435, "grad_norm": 0.06272943317890167, "learning_rate": 4.1532258064516135e-05, "loss": 0.1144, "step": 104 }, { "epoch": 0.08475217555807794, "grad_norm": 0.05476183816790581, "learning_rate": 4.1935483870967746e-05, "loss": 0.1048, "step": 105 }, { "epoch": 0.08555933913482154, "grad_norm": 0.052727799862623215, "learning_rate": 4.2338709677419356e-05, "loss": 0.1141, "step": 106 }, { "epoch": 0.08636650271156514, "grad_norm": 0.0577688030898571, "learning_rate": 4.2741935483870973e-05, "loss": 0.1122, "step": 107 }, { "epoch": 0.08717366628830873, "grad_norm": 0.05016804113984108, "learning_rate": 4.3145161290322584e-05, "loss": 0.1034, "step": 108 }, { "epoch": 0.08798082986505235, "grad_norm": 0.05800708755850792, "learning_rate": 4.3548387096774194e-05, "loss": 0.1036, "step": 109 }, { "epoch": 0.08878799344179594, "grad_norm": 0.056348077952861786, "learning_rate": 4.395161290322581e-05, "loss": 0.104, "step": 110 }, { "epoch": 0.08959515701853954, "grad_norm": 0.054399192333221436, "learning_rate": 4.435483870967742e-05, "loss": 0.1101, "step": 111 }, { "epoch": 0.09040232059528314, "grad_norm": 0.05659837648272514, "learning_rate": 4.475806451612903e-05, "loss": 0.1098, "step": 112 }, { "epoch": 0.09120948417202673, "grad_norm": 0.05344666168093681, "learning_rate": 4.516129032258064e-05, "loss": 0.106, "step": 113 }, { "epoch": 0.09201664774877033, "grad_norm": 0.06315038353204727, "learning_rate": 4.556451612903226e-05, "loss": 0.1044, "step": 114 }, { "epoch": 0.09282381132551394, "grad_norm": 0.05642089620232582, "learning_rate": 4.596774193548387e-05, "loss": 0.1128, "step": 115 }, { "epoch": 0.09363097490225754, "grad_norm": 0.06222492456436157, "learning_rate": 4.637096774193548e-05, "loss": 0.1012, "step": 116 }, { "epoch": 0.09443813847900114, "grad_norm": 0.058098964393138885, "learning_rate": 4.67741935483871e-05, "loss": 0.1167, "step": 117 }, { "epoch": 0.09524530205574473, "grad_norm": 0.04947588965296745, "learning_rate": 4.7177419354838716e-05, "loss": 0.1072, "step": 118 }, { "epoch": 0.09605246563248833, "grad_norm": 0.06120481714606285, "learning_rate": 4.7580645161290326e-05, "loss": 0.1125, "step": 119 }, { "epoch": 0.09685962920923194, "grad_norm": 0.05734037607908249, "learning_rate": 4.7983870967741937e-05, "loss": 0.1102, "step": 120 }, { "epoch": 0.09766679278597554, "grad_norm": 0.05344703420996666, "learning_rate": 4.8387096774193554e-05, "loss": 0.1129, "step": 121 }, { "epoch": 0.09847395636271913, "grad_norm": 0.05551686882972717, "learning_rate": 4.8790322580645164e-05, "loss": 0.104, "step": 122 }, { "epoch": 0.09928111993946273, "grad_norm": 0.06121409684419632, "learning_rate": 4.9193548387096775e-05, "loss": 0.1143, "step": 123 }, { "epoch": 0.10008828351620633, "grad_norm": 0.04987439885735512, "learning_rate": 4.959677419354839e-05, "loss": 0.1114, "step": 124 }, { "epoch": 0.10089544709294992, "grad_norm": 0.06666464358568192, "learning_rate": 5e-05, "loss": 0.1127, "step": 125 }, { "epoch": 0.10170261066969354, "grad_norm": 0.04981999844312668, "learning_rate": 5.040322580645161e-05, "loss": 0.1018, "step": 126 }, { "epoch": 0.10250977424643713, "grad_norm": 0.05332005396485329, "learning_rate": 5.080645161290323e-05, "loss": 0.1171, "step": 127 }, { "epoch": 0.10331693782318073, "grad_norm": 0.05107470229268074, "learning_rate": 5.120967741935484e-05, "loss": 0.0975, "step": 128 }, { "epoch": 0.10331693782318073, "eval_loss": 0.1161612719297409, "eval_runtime": 4095.4836, "eval_samples_per_second": 2.174, "eval_steps_per_second": 2.174, "step": 128 }, { "epoch": 0.10412410139992433, "grad_norm": 0.05755999684333801, "learning_rate": 5.161290322580645e-05, "loss": 0.103, "step": 129 }, { "epoch": 0.10493126497666792, "grad_norm": 0.057126082479953766, "learning_rate": 5.201612903225807e-05, "loss": 0.0971, "step": 130 }, { "epoch": 0.10573842855341153, "grad_norm": 0.06009543314576149, "learning_rate": 5.241935483870968e-05, "loss": 0.1038, "step": 131 }, { "epoch": 0.10654559213015513, "grad_norm": 0.049857355654239655, "learning_rate": 5.282258064516129e-05, "loss": 0.0992, "step": 132 }, { "epoch": 0.10735275570689873, "grad_norm": 0.0541410967707634, "learning_rate": 5.32258064516129e-05, "loss": 0.0976, "step": 133 }, { "epoch": 0.10815991928364233, "grad_norm": 0.05288451537489891, "learning_rate": 5.362903225806452e-05, "loss": 0.1107, "step": 134 }, { "epoch": 0.10896708286038592, "grad_norm": 0.05952351540327072, "learning_rate": 5.403225806451613e-05, "loss": 0.114, "step": 135 }, { "epoch": 0.10977424643712952, "grad_norm": 0.0553399957716465, "learning_rate": 5.443548387096774e-05, "loss": 0.1029, "step": 136 }, { "epoch": 0.11058141001387313, "grad_norm": 0.06096586212515831, "learning_rate": 5.4838709677419355e-05, "loss": 0.1137, "step": 137 }, { "epoch": 0.11138857359061673, "grad_norm": 0.05294370651245117, "learning_rate": 5.5241935483870966e-05, "loss": 0.1046, "step": 138 }, { "epoch": 0.11219573716736032, "grad_norm": 0.055291298776865005, "learning_rate": 5.5645161290322576e-05, "loss": 0.1021, "step": 139 }, { "epoch": 0.11300290074410392, "grad_norm": 0.05802519991993904, "learning_rate": 5.604838709677419e-05, "loss": 0.106, "step": 140 }, { "epoch": 0.11381006432084752, "grad_norm": 0.05426187440752983, "learning_rate": 5.645161290322582e-05, "loss": 0.1074, "step": 141 }, { "epoch": 0.11461722789759111, "grad_norm": 0.05858275294303894, "learning_rate": 5.685483870967743e-05, "loss": 0.1133, "step": 142 }, { "epoch": 0.11542439147433473, "grad_norm": 0.05799126625061035, "learning_rate": 5.725806451612904e-05, "loss": 0.0986, "step": 143 }, { "epoch": 0.11623155505107832, "grad_norm": 0.05286507308483124, "learning_rate": 5.7661290322580655e-05, "loss": 0.1074, "step": 144 }, { "epoch": 0.11703871862782192, "grad_norm": 0.05947837606072426, "learning_rate": 5.8064516129032266e-05, "loss": 0.1171, "step": 145 }, { "epoch": 0.11784588220456552, "grad_norm": 0.06141556799411774, "learning_rate": 5.8467741935483876e-05, "loss": 0.1158, "step": 146 }, { "epoch": 0.11865304578130911, "grad_norm": 0.059260569512844086, "learning_rate": 5.887096774193549e-05, "loss": 0.1088, "step": 147 }, { "epoch": 0.11946020935805272, "grad_norm": 0.06074953451752663, "learning_rate": 5.9274193548387104e-05, "loss": 0.1091, "step": 148 }, { "epoch": 0.12026737293479632, "grad_norm": 0.057758111506700516, "learning_rate": 5.9677419354838715e-05, "loss": 0.1073, "step": 149 }, { "epoch": 0.12107453651153992, "grad_norm": 0.05568552389740944, "learning_rate": 6.0080645161290325e-05, "loss": 0.1144, "step": 150 }, { "epoch": 0.12188170008828351, "grad_norm": 0.05366987735033035, "learning_rate": 6.048387096774194e-05, "loss": 0.102, "step": 151 }, { "epoch": 0.12268886366502711, "grad_norm": 0.056751757860183716, "learning_rate": 6.088709677419355e-05, "loss": 0.098, "step": 152 }, { "epoch": 0.12349602724177071, "grad_norm": 0.05664277449250221, "learning_rate": 6.129032258064517e-05, "loss": 0.1132, "step": 153 }, { "epoch": 0.12430319081851432, "grad_norm": 0.05554311349987984, "learning_rate": 6.169354838709678e-05, "loss": 0.1068, "step": 154 }, { "epoch": 0.12511035439525792, "grad_norm": 0.06001408398151398, "learning_rate": 6.209677419354839e-05, "loss": 0.1048, "step": 155 }, { "epoch": 0.1259175179720015, "grad_norm": 0.06698593497276306, "learning_rate": 6.25e-05, "loss": 0.12, "step": 156 }, { "epoch": 0.1267246815487451, "grad_norm": 0.0560469888150692, "learning_rate": 6.290322580645161e-05, "loss": 0.1082, "step": 157 }, { "epoch": 0.1275318451254887, "grad_norm": 0.05426601320505142, "learning_rate": 6.330645161290322e-05, "loss": 0.0959, "step": 158 }, { "epoch": 0.1283390087022323, "grad_norm": 0.05859432369470596, "learning_rate": 6.370967741935485e-05, "loss": 0.1015, "step": 159 }, { "epoch": 0.1291461722789759, "grad_norm": 0.05580521374940872, "learning_rate": 6.411290322580646e-05, "loss": 0.115, "step": 160 }, { "epoch": 0.1299533358557195, "grad_norm": 0.05768033117055893, "learning_rate": 6.451612903225807e-05, "loss": 0.1074, "step": 161 }, { "epoch": 0.13076049943246312, "grad_norm": 0.053041938692331314, "learning_rate": 6.491935483870968e-05, "loss": 0.1169, "step": 162 }, { "epoch": 0.13156766300920672, "grad_norm": 0.05263933166861534, "learning_rate": 6.532258064516129e-05, "loss": 0.1057, "step": 163 }, { "epoch": 0.13237482658595032, "grad_norm": 0.06356436759233475, "learning_rate": 6.57258064516129e-05, "loss": 0.1112, "step": 164 }, { "epoch": 0.13318199016269391, "grad_norm": 0.05676979944109917, "learning_rate": 6.612903225806452e-05, "loss": 0.1044, "step": 165 }, { "epoch": 0.1339891537394375, "grad_norm": 0.052152328193187714, "learning_rate": 6.653225806451613e-05, "loss": 0.1144, "step": 166 }, { "epoch": 0.1347963173161811, "grad_norm": 0.05460150167346001, "learning_rate": 6.693548387096774e-05, "loss": 0.1087, "step": 167 }, { "epoch": 0.1356034808929247, "grad_norm": 0.05729132145643234, "learning_rate": 6.733870967741935e-05, "loss": 0.115, "step": 168 }, { "epoch": 0.1364106444696683, "grad_norm": 0.06310966610908508, "learning_rate": 6.774193548387096e-05, "loss": 0.1057, "step": 169 }, { "epoch": 0.1372178080464119, "grad_norm": 0.05967114120721817, "learning_rate": 6.814516129032257e-05, "loss": 0.1046, "step": 170 }, { "epoch": 0.1380249716231555, "grad_norm": 0.0621599406003952, "learning_rate": 6.854838709677419e-05, "loss": 0.1114, "step": 171 }, { "epoch": 0.1388321351998991, "grad_norm": 0.06002378463745117, "learning_rate": 6.895161290322581e-05, "loss": 0.1056, "step": 172 }, { "epoch": 0.13963929877664272, "grad_norm": 0.05943439155817032, "learning_rate": 6.935483870967743e-05, "loss": 0.1051, "step": 173 }, { "epoch": 0.14044646235338631, "grad_norm": 0.050571709871292114, "learning_rate": 6.975806451612904e-05, "loss": 0.1056, "step": 174 }, { "epoch": 0.1412536259301299, "grad_norm": 0.05529572814702988, "learning_rate": 7.016129032258065e-05, "loss": 0.1065, "step": 175 }, { "epoch": 0.1420607895068735, "grad_norm": 0.05796671658754349, "learning_rate": 7.056451612903226e-05, "loss": 0.1155, "step": 176 }, { "epoch": 0.1428679530836171, "grad_norm": 0.054809004068374634, "learning_rate": 7.096774193548388e-05, "loss": 0.1033, "step": 177 }, { "epoch": 0.1436751166603607, "grad_norm": 0.05852701887488365, "learning_rate": 7.137096774193549e-05, "loss": 0.1027, "step": 178 }, { "epoch": 0.1444822802371043, "grad_norm": 0.0524359792470932, "learning_rate": 7.177419354838711e-05, "loss": 0.1023, "step": 179 }, { "epoch": 0.1452894438138479, "grad_norm": 0.05656169727444649, "learning_rate": 7.217741935483872e-05, "loss": 0.1138, "step": 180 }, { "epoch": 0.1460966073905915, "grad_norm": 0.05562685430049896, "learning_rate": 7.258064516129033e-05, "loss": 0.1044, "step": 181 }, { "epoch": 0.1469037709673351, "grad_norm": 0.05728350579738617, "learning_rate": 7.298387096774194e-05, "loss": 0.1125, "step": 182 }, { "epoch": 0.1477109345440787, "grad_norm": 0.052636172622442245, "learning_rate": 7.338709677419355e-05, "loss": 0.0948, "step": 183 }, { "epoch": 0.1485180981208223, "grad_norm": 0.06120290234684944, "learning_rate": 7.379032258064516e-05, "loss": 0.1114, "step": 184 }, { "epoch": 0.1493252616975659, "grad_norm": 0.055645301938056946, "learning_rate": 7.419354838709677e-05, "loss": 0.099, "step": 185 }, { "epoch": 0.1501324252743095, "grad_norm": 0.05310719460248947, "learning_rate": 7.45967741935484e-05, "loss": 0.1063, "step": 186 }, { "epoch": 0.1509395888510531, "grad_norm": 0.05271172523498535, "learning_rate": 7.500000000000001e-05, "loss": 0.1061, "step": 187 }, { "epoch": 0.1517467524277967, "grad_norm": 0.05266357958316803, "learning_rate": 7.540322580645162e-05, "loss": 0.1102, "step": 188 }, { "epoch": 0.1525539160045403, "grad_norm": 0.0574917271733284, "learning_rate": 7.580645161290323e-05, "loss": 0.1021, "step": 189 }, { "epoch": 0.1533610795812839, "grad_norm": 0.057761140167713165, "learning_rate": 7.620967741935484e-05, "loss": 0.096, "step": 190 }, { "epoch": 0.1541682431580275, "grad_norm": 0.05890843644738197, "learning_rate": 7.661290322580645e-05, "loss": 0.1028, "step": 191 }, { "epoch": 0.1549754067347711, "grad_norm": 0.05141144245862961, "learning_rate": 7.701612903225807e-05, "loss": 0.1082, "step": 192 }, { "epoch": 0.15578257031151468, "grad_norm": 0.05284852534532547, "learning_rate": 7.741935483870968e-05, "loss": 0.1101, "step": 193 }, { "epoch": 0.15658973388825828, "grad_norm": 0.06894844025373459, "learning_rate": 7.78225806451613e-05, "loss": 0.0991, "step": 194 }, { "epoch": 0.15739689746500188, "grad_norm": 0.0619078055024147, "learning_rate": 7.82258064516129e-05, "loss": 0.1194, "step": 195 }, { "epoch": 0.1582040610417455, "grad_norm": 0.05480406433343887, "learning_rate": 7.862903225806451e-05, "loss": 0.1076, "step": 196 }, { "epoch": 0.1590112246184891, "grad_norm": 0.05481485277414322, "learning_rate": 7.903225806451613e-05, "loss": 0.1002, "step": 197 }, { "epoch": 0.1598183881952327, "grad_norm": 0.054309576749801636, "learning_rate": 7.943548387096774e-05, "loss": 0.1179, "step": 198 }, { "epoch": 0.1606255517719763, "grad_norm": 0.06254801154136658, "learning_rate": 7.983870967741936e-05, "loss": 0.1146, "step": 199 }, { "epoch": 0.1614327153487199, "grad_norm": 0.057748422026634216, "learning_rate": 8.024193548387097e-05, "loss": 0.1091, "step": 200 }, { "epoch": 0.1622398789254635, "grad_norm": 0.06503515690565109, "learning_rate": 8.064516129032258e-05, "loss": 0.112, "step": 201 }, { "epoch": 0.16304704250220708, "grad_norm": 0.05895281955599785, "learning_rate": 8.104838709677419e-05, "loss": 0.103, "step": 202 }, { "epoch": 0.16385420607895068, "grad_norm": 0.0623365081846714, "learning_rate": 8.145161290322582e-05, "loss": 0.1057, "step": 203 }, { "epoch": 0.16466136965569428, "grad_norm": 0.06177631765604019, "learning_rate": 8.185483870967743e-05, "loss": 0.1151, "step": 204 }, { "epoch": 0.16546853323243788, "grad_norm": 0.05293562263250351, "learning_rate": 8.225806451612904e-05, "loss": 0.1003, "step": 205 }, { "epoch": 0.16627569680918147, "grad_norm": 0.05533309653401375, "learning_rate": 8.266129032258066e-05, "loss": 0.1064, "step": 206 }, { "epoch": 0.1670828603859251, "grad_norm": 0.05776599422097206, "learning_rate": 8.306451612903227e-05, "loss": 0.0999, "step": 207 }, { "epoch": 0.1678900239626687, "grad_norm": 0.05928900092840195, "learning_rate": 8.346774193548388e-05, "loss": 0.1039, "step": 208 }, { "epoch": 0.1686971875394123, "grad_norm": 0.057382941246032715, "learning_rate": 8.387096774193549e-05, "loss": 0.1045, "step": 209 }, { "epoch": 0.1695043511161559, "grad_norm": 0.05921955406665802, "learning_rate": 8.42741935483871e-05, "loss": 0.0992, "step": 210 }, { "epoch": 0.17031151469289948, "grad_norm": 0.05668126046657562, "learning_rate": 8.467741935483871e-05, "loss": 0.1047, "step": 211 }, { "epoch": 0.17111867826964308, "grad_norm": 0.05774497985839844, "learning_rate": 8.508064516129032e-05, "loss": 0.1145, "step": 212 }, { "epoch": 0.17192584184638668, "grad_norm": 0.06269316375255585, "learning_rate": 8.548387096774195e-05, "loss": 0.1095, "step": 213 }, { "epoch": 0.17273300542313028, "grad_norm": 0.056003738194704056, "learning_rate": 8.588709677419356e-05, "loss": 0.1048, "step": 214 }, { "epoch": 0.17354016899987387, "grad_norm": 0.0613495372235775, "learning_rate": 8.629032258064517e-05, "loss": 0.0968, "step": 215 }, { "epoch": 0.17434733257661747, "grad_norm": 0.09114450961351395, "learning_rate": 8.669354838709678e-05, "loss": 0.1012, "step": 216 }, { "epoch": 0.17515449615336107, "grad_norm": 0.06504707038402557, "learning_rate": 8.709677419354839e-05, "loss": 0.1135, "step": 217 }, { "epoch": 0.1759616597301047, "grad_norm": 0.05313399061560631, "learning_rate": 8.75e-05, "loss": 0.0985, "step": 218 }, { "epoch": 0.1767688233068483, "grad_norm": 0.05742132291197777, "learning_rate": 8.790322580645162e-05, "loss": 0.1019, "step": 219 }, { "epoch": 0.17757598688359189, "grad_norm": 0.05729842185974121, "learning_rate": 8.830645161290323e-05, "loss": 0.1092, "step": 220 }, { "epoch": 0.17838315046033548, "grad_norm": 0.053899843245744705, "learning_rate": 8.870967741935484e-05, "loss": 0.1027, "step": 221 }, { "epoch": 0.17919031403707908, "grad_norm": 0.05387280136346817, "learning_rate": 8.911290322580645e-05, "loss": 0.1119, "step": 222 }, { "epoch": 0.17999747761382268, "grad_norm": 0.07458134740591049, "learning_rate": 8.951612903225806e-05, "loss": 0.1193, "step": 223 }, { "epoch": 0.18080464119056627, "grad_norm": 0.06176605075597763, "learning_rate": 8.991935483870968e-05, "loss": 0.0961, "step": 224 }, { "epoch": 0.18161180476730987, "grad_norm": 0.06017741560935974, "learning_rate": 9.032258064516129e-05, "loss": 0.0968, "step": 225 }, { "epoch": 0.18241896834405347, "grad_norm": 0.0889572873711586, "learning_rate": 9.072580645161291e-05, "loss": 0.1027, "step": 226 }, { "epoch": 0.18322613192079706, "grad_norm": 0.061678677797317505, "learning_rate": 9.112903225806452e-05, "loss": 0.1094, "step": 227 }, { "epoch": 0.18403329549754066, "grad_norm": 0.06008686125278473, "learning_rate": 9.153225806451613e-05, "loss": 0.104, "step": 228 }, { "epoch": 0.18484045907428429, "grad_norm": 0.06000223755836487, "learning_rate": 9.193548387096774e-05, "loss": 0.1151, "step": 229 }, { "epoch": 0.18564762265102788, "grad_norm": 0.058045681565999985, "learning_rate": 9.233870967741935e-05, "loss": 0.1007, "step": 230 }, { "epoch": 0.18645478622777148, "grad_norm": 0.06773912906646729, "learning_rate": 9.274193548387096e-05, "loss": 0.0994, "step": 231 }, { "epoch": 0.18726194980451508, "grad_norm": 0.0622003972530365, "learning_rate": 9.314516129032259e-05, "loss": 0.1113, "step": 232 }, { "epoch": 0.18806911338125867, "grad_norm": 0.05484921857714653, "learning_rate": 9.35483870967742e-05, "loss": 0.1027, "step": 233 }, { "epoch": 0.18887627695800227, "grad_norm": 0.057545896619558334, "learning_rate": 9.395161290322582e-05, "loss": 0.1068, "step": 234 }, { "epoch": 0.18968344053474587, "grad_norm": 0.059905070811510086, "learning_rate": 9.435483870967743e-05, "loss": 0.1111, "step": 235 }, { "epoch": 0.19049060411148946, "grad_norm": 0.057194534689188004, "learning_rate": 9.475806451612904e-05, "loss": 0.1, "step": 236 }, { "epoch": 0.19129776768823306, "grad_norm": 0.06968406587839127, "learning_rate": 9.516129032258065e-05, "loss": 0.1043, "step": 237 }, { "epoch": 0.19210493126497666, "grad_norm": 0.05680746212601662, "learning_rate": 9.556451612903226e-05, "loss": 0.1023, "step": 238 }, { "epoch": 0.19291209484172026, "grad_norm": 0.06075569614768028, "learning_rate": 9.596774193548387e-05, "loss": 0.1031, "step": 239 }, { "epoch": 0.19371925841846388, "grad_norm": 0.05578656867146492, "learning_rate": 9.63709677419355e-05, "loss": 0.1023, "step": 240 }, { "epoch": 0.19452642199520748, "grad_norm": 0.06038297712802887, "learning_rate": 9.677419354838711e-05, "loss": 0.1054, "step": 241 }, { "epoch": 0.19533358557195107, "grad_norm": 0.06007019057869911, "learning_rate": 9.717741935483872e-05, "loss": 0.1034, "step": 242 }, { "epoch": 0.19614074914869467, "grad_norm": 0.06147025525569916, "learning_rate": 9.758064516129033e-05, "loss": 0.099, "step": 243 }, { "epoch": 0.19694791272543827, "grad_norm": 0.0644707903265953, "learning_rate": 9.798387096774194e-05, "loss": 0.1078, "step": 244 }, { "epoch": 0.19775507630218186, "grad_norm": 0.06165936216711998, "learning_rate": 9.838709677419355e-05, "loss": 0.1143, "step": 245 }, { "epoch": 0.19856223987892546, "grad_norm": 0.05921250581741333, "learning_rate": 9.879032258064517e-05, "loss": 0.1136, "step": 246 }, { "epoch": 0.19936940345566906, "grad_norm": 0.05928260460495949, "learning_rate": 9.919354838709678e-05, "loss": 0.1051, "step": 247 }, { "epoch": 0.20017656703241266, "grad_norm": 0.05721810832619667, "learning_rate": 9.95967741935484e-05, "loss": 0.1157, "step": 248 }, { "epoch": 0.20098373060915625, "grad_norm": 0.05861903354525566, "learning_rate": 0.0001, "loss": 0.1011, "step": 249 }, { "epoch": 0.20179089418589985, "grad_norm": 0.06758609414100647, "learning_rate": 9.999995029394351e-05, "loss": 0.1017, "step": 250 }, { "epoch": 0.20259805776264347, "grad_norm": 0.06704200059175491, "learning_rate": 9.999980117587285e-05, "loss": 0.1049, "step": 251 }, { "epoch": 0.20340522133938707, "grad_norm": 0.0630844309926033, "learning_rate": 9.999955264608451e-05, "loss": 0.1096, "step": 252 }, { "epoch": 0.20421238491613067, "grad_norm": 0.06997450441122055, "learning_rate": 9.999920470507262e-05, "loss": 0.1119, "step": 253 }, { "epoch": 0.20501954849287427, "grad_norm": 0.061440762132406235, "learning_rate": 9.9998757353529e-05, "loss": 0.1017, "step": 254 }, { "epoch": 0.20582671206961786, "grad_norm": 0.059259746223688126, "learning_rate": 9.999821059234307e-05, "loss": 0.0929, "step": 255 }, { "epoch": 0.20663387564636146, "grad_norm": 0.055394191294908524, "learning_rate": 9.99975644226019e-05, "loss": 0.0979, "step": 256 }, { "epoch": 0.20663387564636146, "eval_loss": 0.11454448103904724, "eval_runtime": 4053.1519, "eval_samples_per_second": 2.196, "eval_steps_per_second": 2.196, "step": 256 }, { "epoch": 0.20744103922310506, "grad_norm": 0.05967559292912483, "learning_rate": 9.999681884559027e-05, "loss": 0.1152, "step": 257 }, { "epoch": 0.20824820279984865, "grad_norm": 0.06448899209499359, "learning_rate": 9.999597386279056e-05, "loss": 0.1143, "step": 258 }, { "epoch": 0.20905536637659225, "grad_norm": 0.06080959737300873, "learning_rate": 9.99950294758828e-05, "loss": 0.1007, "step": 259 }, { "epoch": 0.20986252995333585, "grad_norm": 0.06751599162817001, "learning_rate": 9.999398568674464e-05, "loss": 0.1055, "step": 260 }, { "epoch": 0.21066969353007944, "grad_norm": 0.05802805721759796, "learning_rate": 9.999284249745142e-05, "loss": 0.102, "step": 261 }, { "epoch": 0.21147685710682307, "grad_norm": 0.07094297558069229, "learning_rate": 9.999159991027605e-05, "loss": 0.1013, "step": 262 }, { "epoch": 0.21228402068356667, "grad_norm": 0.06312110275030136, "learning_rate": 9.999025792768909e-05, "loss": 0.1066, "step": 263 }, { "epoch": 0.21309118426031026, "grad_norm": 0.062266234308481216, "learning_rate": 9.998881655235876e-05, "loss": 0.1069, "step": 264 }, { "epoch": 0.21389834783705386, "grad_norm": 0.06362524628639221, "learning_rate": 9.998727578715082e-05, "loss": 0.1099, "step": 265 }, { "epoch": 0.21470551141379746, "grad_norm": 0.06389642506837845, "learning_rate": 9.998563563512873e-05, "loss": 0.1165, "step": 266 }, { "epoch": 0.21551267499054105, "grad_norm": 0.05605356767773628, "learning_rate": 9.998389609955348e-05, "loss": 0.1103, "step": 267 }, { "epoch": 0.21631983856728465, "grad_norm": 0.07204721868038177, "learning_rate": 9.998205718388369e-05, "loss": 0.1095, "step": 268 }, { "epoch": 0.21712700214402825, "grad_norm": 0.05972163751721382, "learning_rate": 9.998011889177556e-05, "loss": 0.1045, "step": 269 }, { "epoch": 0.21793416572077184, "grad_norm": 0.05439569056034088, "learning_rate": 9.997808122708292e-05, "loss": 0.1101, "step": 270 }, { "epoch": 0.21874132929751544, "grad_norm": 0.05748212710022926, "learning_rate": 9.997594419385712e-05, "loss": 0.1064, "step": 271 }, { "epoch": 0.21954849287425904, "grad_norm": 0.05812348052859306, "learning_rate": 9.99737077963471e-05, "loss": 0.1041, "step": 272 }, { "epoch": 0.22035565645100263, "grad_norm": 0.06308595091104507, "learning_rate": 9.997137203899936e-05, "loss": 0.1096, "step": 273 }, { "epoch": 0.22116282002774626, "grad_norm": 0.06374591588973999, "learning_rate": 9.996893692645794e-05, "loss": 0.1083, "step": 274 }, { "epoch": 0.22196998360448986, "grad_norm": 0.06280554085969925, "learning_rate": 9.996640246356445e-05, "loss": 0.1135, "step": 275 }, { "epoch": 0.22277714718123345, "grad_norm": 0.05631393566727638, "learning_rate": 9.996376865535801e-05, "loss": 0.1003, "step": 276 }, { "epoch": 0.22358431075797705, "grad_norm": 0.0565749816596508, "learning_rate": 9.996103550707527e-05, "loss": 0.1118, "step": 277 }, { "epoch": 0.22439147433472065, "grad_norm": 0.0627712681889534, "learning_rate": 9.99582030241504e-05, "loss": 0.107, "step": 278 }, { "epoch": 0.22519863791146424, "grad_norm": 0.06044355779886246, "learning_rate": 9.995527121221504e-05, "loss": 0.1023, "step": 279 }, { "epoch": 0.22600580148820784, "grad_norm": 0.05564684048295021, "learning_rate": 9.995224007709837e-05, "loss": 0.1018, "step": 280 }, { "epoch": 0.22681296506495144, "grad_norm": 0.051460545510053635, "learning_rate": 9.9949109624827e-05, "loss": 0.1028, "step": 281 }, { "epoch": 0.22762012864169504, "grad_norm": 0.05416421219706535, "learning_rate": 9.994587986162502e-05, "loss": 0.1045, "step": 282 }, { "epoch": 0.22842729221843863, "grad_norm": 0.06218072772026062, "learning_rate": 9.994255079391402e-05, "loss": 0.1036, "step": 283 }, { "epoch": 0.22923445579518223, "grad_norm": 0.0585336908698082, "learning_rate": 9.993912242831296e-05, "loss": 0.0994, "step": 284 }, { "epoch": 0.23004161937192585, "grad_norm": 0.05899813026189804, "learning_rate": 9.993559477163827e-05, "loss": 0.1199, "step": 285 }, { "epoch": 0.23084878294866945, "grad_norm": 0.06317445635795593, "learning_rate": 9.993196783090377e-05, "loss": 0.1059, "step": 286 }, { "epoch": 0.23165594652541305, "grad_norm": 0.06165239214897156, "learning_rate": 9.992824161332072e-05, "loss": 0.1114, "step": 287 }, { "epoch": 0.23246311010215664, "grad_norm": 0.06145606189966202, "learning_rate": 9.992441612629775e-05, "loss": 0.1038, "step": 288 }, { "epoch": 0.23327027367890024, "grad_norm": 0.05932976305484772, "learning_rate": 9.992049137744084e-05, "loss": 0.1099, "step": 289 }, { "epoch": 0.23407743725564384, "grad_norm": 0.057493988424539566, "learning_rate": 9.991646737455334e-05, "loss": 0.1047, "step": 290 }, { "epoch": 0.23488460083238744, "grad_norm": 0.06467270106077194, "learning_rate": 9.991234412563593e-05, "loss": 0.0993, "step": 291 }, { "epoch": 0.23569176440913103, "grad_norm": 0.059481848031282425, "learning_rate": 9.990812163888666e-05, "loss": 0.0974, "step": 292 }, { "epoch": 0.23649892798587463, "grad_norm": 0.056533560156822205, "learning_rate": 9.990379992270084e-05, "loss": 0.111, "step": 293 }, { "epoch": 0.23730609156261823, "grad_norm": 0.06554078310728073, "learning_rate": 9.989937898567108e-05, "loss": 0.1156, "step": 294 }, { "epoch": 0.23811325513936182, "grad_norm": 0.061008621007204056, "learning_rate": 9.989485883658729e-05, "loss": 0.0997, "step": 295 }, { "epoch": 0.23892041871610545, "grad_norm": 0.05975068360567093, "learning_rate": 9.989023948443662e-05, "loss": 0.1168, "step": 296 }, { "epoch": 0.23972758229284905, "grad_norm": 0.06001225858926773, "learning_rate": 9.988552093840344e-05, "loss": 0.1083, "step": 297 }, { "epoch": 0.24053474586959264, "grad_norm": 0.05878763645887375, "learning_rate": 9.988070320786938e-05, "loss": 0.1139, "step": 298 }, { "epoch": 0.24134190944633624, "grad_norm": 0.06265363097190857, "learning_rate": 9.987578630241325e-05, "loss": 0.0948, "step": 299 }, { "epoch": 0.24214907302307984, "grad_norm": 0.06659475713968277, "learning_rate": 9.987077023181106e-05, "loss": 0.1148, "step": 300 }, { "epoch": 0.24295623659982343, "grad_norm": 0.05710739642381668, "learning_rate": 9.986565500603598e-05, "loss": 0.1049, "step": 301 }, { "epoch": 0.24376340017656703, "grad_norm": 0.06117522343993187, "learning_rate": 9.986044063525828e-05, "loss": 0.1099, "step": 302 }, { "epoch": 0.24457056375331063, "grad_norm": 0.06268444657325745, "learning_rate": 9.985512712984541e-05, "loss": 0.1122, "step": 303 }, { "epoch": 0.24537772733005422, "grad_norm": 0.05912703648209572, "learning_rate": 9.984971450036194e-05, "loss": 0.0994, "step": 304 }, { "epoch": 0.24618489090679782, "grad_norm": 0.05969204008579254, "learning_rate": 9.984420275756944e-05, "loss": 0.0998, "step": 305 }, { "epoch": 0.24699205448354142, "grad_norm": 0.056127700954675674, "learning_rate": 9.983859191242661e-05, "loss": 0.1036, "step": 306 }, { "epoch": 0.24779921806028504, "grad_norm": 0.0559709332883358, "learning_rate": 9.983288197608918e-05, "loss": 0.1087, "step": 307 }, { "epoch": 0.24860638163702864, "grad_norm": 0.05808999016880989, "learning_rate": 9.982707295990987e-05, "loss": 0.0941, "step": 308 }, { "epoch": 0.24941354521377224, "grad_norm": 0.056650929152965546, "learning_rate": 9.982116487543843e-05, "loss": 0.1027, "step": 309 }, { "epoch": 0.25022070879051583, "grad_norm": 0.06523299217224121, "learning_rate": 9.981515773442155e-05, "loss": 0.1055, "step": 310 }, { "epoch": 0.25102787236725943, "grad_norm": 0.05767962709069252, "learning_rate": 9.980905154880288e-05, "loss": 0.1028, "step": 311 }, { "epoch": 0.251835035944003, "grad_norm": 0.06657809764146805, "learning_rate": 9.980284633072298e-05, "loss": 0.1083, "step": 312 }, { "epoch": 0.2526421995207466, "grad_norm": 0.055512599647045135, "learning_rate": 9.979654209251938e-05, "loss": 0.1052, "step": 313 }, { "epoch": 0.2534493630974902, "grad_norm": 0.05472143366932869, "learning_rate": 9.979013884672638e-05, "loss": 0.0973, "step": 314 }, { "epoch": 0.2542565266742338, "grad_norm": 0.058596353977918625, "learning_rate": 9.978363660607522e-05, "loss": 0.099, "step": 315 }, { "epoch": 0.2550636902509774, "grad_norm": 0.06164759397506714, "learning_rate": 9.97770353834939e-05, "loss": 0.1056, "step": 316 }, { "epoch": 0.255870853827721, "grad_norm": 0.05422014743089676, "learning_rate": 9.977033519210725e-05, "loss": 0.0985, "step": 317 }, { "epoch": 0.2566780174044646, "grad_norm": 0.06098928302526474, "learning_rate": 9.97635360452369e-05, "loss": 0.1054, "step": 318 }, { "epoch": 0.2574851809812082, "grad_norm": 0.06325767934322357, "learning_rate": 9.975663795640118e-05, "loss": 0.0999, "step": 319 }, { "epoch": 0.2582923445579518, "grad_norm": 0.06534324586391449, "learning_rate": 9.974964093931518e-05, "loss": 0.1082, "step": 320 }, { "epoch": 0.2590995081346954, "grad_norm": 0.05478568747639656, "learning_rate": 9.974254500789065e-05, "loss": 0.1089, "step": 321 }, { "epoch": 0.259906671711439, "grad_norm": 0.054606009274721146, "learning_rate": 9.973535017623602e-05, "loss": 0.1096, "step": 322 }, { "epoch": 0.2607138352881826, "grad_norm": 0.05855771526694298, "learning_rate": 9.972805645865637e-05, "loss": 0.1201, "step": 323 }, { "epoch": 0.26152099886492625, "grad_norm": 0.05184745788574219, "learning_rate": 9.972066386965336e-05, "loss": 0.1039, "step": 324 }, { "epoch": 0.26232816244166984, "grad_norm": 0.05539630725979805, "learning_rate": 9.971317242392526e-05, "loss": 0.1046, "step": 325 }, { "epoch": 0.26313532601841344, "grad_norm": 0.0644024908542633, "learning_rate": 9.97055821363669e-05, "loss": 0.1106, "step": 326 }, { "epoch": 0.26394248959515704, "grad_norm": 0.06462795287370682, "learning_rate": 9.969789302206956e-05, "loss": 0.1126, "step": 327 }, { "epoch": 0.26474965317190063, "grad_norm": 0.06185894086956978, "learning_rate": 9.969010509632111e-05, "loss": 0.1013, "step": 328 }, { "epoch": 0.26555681674864423, "grad_norm": 0.060405511409044266, "learning_rate": 9.968221837460579e-05, "loss": 0.107, "step": 329 }, { "epoch": 0.26636398032538783, "grad_norm": 0.06369788199663162, "learning_rate": 9.967423287260436e-05, "loss": 0.1032, "step": 330 }, { "epoch": 0.2671711439021314, "grad_norm": 0.061033330857753754, "learning_rate": 9.96661486061939e-05, "loss": 0.1051, "step": 331 }, { "epoch": 0.267978307478875, "grad_norm": 0.06178104877471924, "learning_rate": 9.965796559144789e-05, "loss": 0.1022, "step": 332 }, { "epoch": 0.2687854710556186, "grad_norm": 0.061380527913570404, "learning_rate": 9.964968384463616e-05, "loss": 0.0986, "step": 333 }, { "epoch": 0.2695926346323622, "grad_norm": 0.06231223791837692, "learning_rate": 9.964130338222482e-05, "loss": 0.1061, "step": 334 }, { "epoch": 0.2703997982091058, "grad_norm": 0.05808347463607788, "learning_rate": 9.963282422087628e-05, "loss": 0.1049, "step": 335 }, { "epoch": 0.2712069617858494, "grad_norm": 0.06290033459663391, "learning_rate": 9.962424637744914e-05, "loss": 0.1161, "step": 336 }, { "epoch": 0.272014125362593, "grad_norm": 0.05496848747134209, "learning_rate": 9.961556986899825e-05, "loss": 0.1005, "step": 337 }, { "epoch": 0.2728212889393366, "grad_norm": 0.06204630434513092, "learning_rate": 9.960679471277459e-05, "loss": 0.1054, "step": 338 }, { "epoch": 0.2736284525160802, "grad_norm": 0.06103508174419403, "learning_rate": 9.959792092622531e-05, "loss": 0.112, "step": 339 }, { "epoch": 0.2744356160928238, "grad_norm": 0.06381862610578537, "learning_rate": 9.958894852699364e-05, "loss": 0.1164, "step": 340 }, { "epoch": 0.2752427796695674, "grad_norm": 0.06371370702981949, "learning_rate": 9.957987753291889e-05, "loss": 0.1057, "step": 341 }, { "epoch": 0.276049943246311, "grad_norm": 0.06363417953252792, "learning_rate": 9.95707079620364e-05, "loss": 0.1109, "step": 342 }, { "epoch": 0.2768571068230546, "grad_norm": 0.06282646954059601, "learning_rate": 9.95614398325775e-05, "loss": 0.1066, "step": 343 }, { "epoch": 0.2776642703997982, "grad_norm": 0.05641169846057892, "learning_rate": 9.955207316296946e-05, "loss": 0.1088, "step": 344 }, { "epoch": 0.2784714339765418, "grad_norm": 0.06049324572086334, "learning_rate": 9.954260797183549e-05, "loss": 0.1109, "step": 345 }, { "epoch": 0.27927859755328543, "grad_norm": 0.06481636315584183, "learning_rate": 9.953304427799469e-05, "loss": 0.1112, "step": 346 }, { "epoch": 0.28008576113002903, "grad_norm": 0.0553961917757988, "learning_rate": 9.952338210046202e-05, "loss": 0.1054, "step": 347 }, { "epoch": 0.28089292470677263, "grad_norm": 0.05585910379886627, "learning_rate": 9.951362145844819e-05, "loss": 0.1108, "step": 348 }, { "epoch": 0.2817000882835162, "grad_norm": 0.05868499353528023, "learning_rate": 9.950376237135973e-05, "loss": 0.116, "step": 349 }, { "epoch": 0.2825072518602598, "grad_norm": 0.06385812908411026, "learning_rate": 9.949380485879892e-05, "loss": 0.1004, "step": 350 }, { "epoch": 0.2833144154370034, "grad_norm": 0.059557028114795685, "learning_rate": 9.948374894056368e-05, "loss": 0.1107, "step": 351 }, { "epoch": 0.284121579013747, "grad_norm": 0.06150607764720917, "learning_rate": 9.947359463664762e-05, "loss": 0.1094, "step": 352 }, { "epoch": 0.2849287425904906, "grad_norm": 0.05867352709174156, "learning_rate": 9.946334196723999e-05, "loss": 0.1026, "step": 353 }, { "epoch": 0.2857359061672342, "grad_norm": 0.08751073479652405, "learning_rate": 9.945299095272551e-05, "loss": 0.1031, "step": 354 }, { "epoch": 0.2865430697439778, "grad_norm": 0.05343090370297432, "learning_rate": 9.944254161368455e-05, "loss": 0.1035, "step": 355 }, { "epoch": 0.2873502333207214, "grad_norm": 0.06003037467598915, "learning_rate": 9.943199397089296e-05, "loss": 0.1021, "step": 356 }, { "epoch": 0.288157396897465, "grad_norm": 0.0622030608355999, "learning_rate": 9.942134804532193e-05, "loss": 0.1067, "step": 357 }, { "epoch": 0.2889645604742086, "grad_norm": 0.05934299901127815, "learning_rate": 9.941060385813819e-05, "loss": 0.1162, "step": 358 }, { "epoch": 0.2897717240509522, "grad_norm": 0.06125222146511078, "learning_rate": 9.939976143070377e-05, "loss": 0.0944, "step": 359 }, { "epoch": 0.2905788876276958, "grad_norm": 0.05986633151769638, "learning_rate": 9.938882078457607e-05, "loss": 0.101, "step": 360 }, { "epoch": 0.2913860512044394, "grad_norm": 0.05745348334312439, "learning_rate": 9.937778194150771e-05, "loss": 0.1006, "step": 361 }, { "epoch": 0.292193214781183, "grad_norm": 0.05899718776345253, "learning_rate": 9.93666449234466e-05, "loss": 0.102, "step": 362 }, { "epoch": 0.2930003783579266, "grad_norm": 0.06059076637029648, "learning_rate": 9.935540975253582e-05, "loss": 0.1156, "step": 363 }, { "epoch": 0.2938075419346702, "grad_norm": 0.065029077231884, "learning_rate": 9.934407645111363e-05, "loss": 0.1067, "step": 364 }, { "epoch": 0.2946147055114138, "grad_norm": 0.056683141738176346, "learning_rate": 9.933264504171337e-05, "loss": 0.1034, "step": 365 }, { "epoch": 0.2954218690881574, "grad_norm": 0.06458036601543427, "learning_rate": 9.932111554706345e-05, "loss": 0.1001, "step": 366 }, { "epoch": 0.29622903266490097, "grad_norm": 0.06666620075702667, "learning_rate": 9.930948799008728e-05, "loss": 0.1056, "step": 367 }, { "epoch": 0.2970361962416446, "grad_norm": 0.0633731558918953, "learning_rate": 9.929776239390329e-05, "loss": 0.1055, "step": 368 }, { "epoch": 0.2978433598183882, "grad_norm": 0.05547738075256348, "learning_rate": 9.928593878182479e-05, "loss": 0.0976, "step": 369 }, { "epoch": 0.2986505233951318, "grad_norm": 0.06152217462658882, "learning_rate": 9.927401717736e-05, "loss": 0.0957, "step": 370 }, { "epoch": 0.2994576869718754, "grad_norm": 0.06943576782941818, "learning_rate": 9.926199760421195e-05, "loss": 0.1075, "step": 371 }, { "epoch": 0.300264850548619, "grad_norm": 0.05713924765586853, "learning_rate": 9.924988008627846e-05, "loss": 0.0983, "step": 372 }, { "epoch": 0.3010720141253626, "grad_norm": 0.05523330345749855, "learning_rate": 9.923766464765207e-05, "loss": 0.1147, "step": 373 }, { "epoch": 0.3018791777021062, "grad_norm": 0.05684399604797363, "learning_rate": 9.922535131262008e-05, "loss": 0.1053, "step": 374 }, { "epoch": 0.3026863412788498, "grad_norm": 0.061428818851709366, "learning_rate": 9.921294010566435e-05, "loss": 0.1034, "step": 375 }, { "epoch": 0.3034935048555934, "grad_norm": 0.06132151558995247, "learning_rate": 9.920043105146136e-05, "loss": 0.1082, "step": 376 }, { "epoch": 0.304300668432337, "grad_norm": 0.06228143349289894, "learning_rate": 9.918782417488216e-05, "loss": 0.1088, "step": 377 }, { "epoch": 0.3051078320090806, "grad_norm": 0.06947155296802521, "learning_rate": 9.917511950099227e-05, "loss": 0.1072, "step": 378 }, { "epoch": 0.3059149955858242, "grad_norm": 0.05214812979102135, "learning_rate": 9.916231705505166e-05, "loss": 0.1064, "step": 379 }, { "epoch": 0.3067221591625678, "grad_norm": 0.05775217339396477, "learning_rate": 9.914941686251468e-05, "loss": 0.1039, "step": 380 }, { "epoch": 0.3075293227393114, "grad_norm": 0.0552043691277504, "learning_rate": 9.913641894903006e-05, "loss": 0.093, "step": 381 }, { "epoch": 0.308336486316055, "grad_norm": 0.057467177510261536, "learning_rate": 9.912332334044077e-05, "loss": 0.1046, "step": 382 }, { "epoch": 0.3091436498927986, "grad_norm": 0.06932041794061661, "learning_rate": 9.911013006278409e-05, "loss": 0.0964, "step": 383 }, { "epoch": 0.3099508134695422, "grad_norm": 0.06147124245762825, "learning_rate": 9.909683914229143e-05, "loss": 0.1177, "step": 384 }, { "epoch": 0.3099508134695422, "eval_loss": 0.11508329212665558, "eval_runtime": 4065.8039, "eval_samples_per_second": 2.189, "eval_steps_per_second": 2.189, "step": 384 }, { "epoch": 0.31075797704628577, "grad_norm": 0.059990234673023224, "learning_rate": 9.908345060538837e-05, "loss": 0.1004, "step": 385 }, { "epoch": 0.31156514062302937, "grad_norm": 0.060296058654785156, "learning_rate": 9.906996447869454e-05, "loss": 0.0996, "step": 386 }, { "epoch": 0.31237230419977297, "grad_norm": 0.06367689371109009, "learning_rate": 9.905638078902367e-05, "loss": 0.106, "step": 387 }, { "epoch": 0.31317946777651656, "grad_norm": 0.06004048138856888, "learning_rate": 9.90426995633834e-05, "loss": 0.1112, "step": 388 }, { "epoch": 0.31398663135326016, "grad_norm": 0.0589597225189209, "learning_rate": 9.90289208289753e-05, "loss": 0.105, "step": 389 }, { "epoch": 0.31479379493000376, "grad_norm": 0.05795726925134659, "learning_rate": 9.901504461319488e-05, "loss": 0.1101, "step": 390 }, { "epoch": 0.3156009585067474, "grad_norm": 0.059853412210941315, "learning_rate": 9.900107094363138e-05, "loss": 0.1047, "step": 391 }, { "epoch": 0.316408122083491, "grad_norm": 0.05801186338067055, "learning_rate": 9.898699984806786e-05, "loss": 0.1037, "step": 392 }, { "epoch": 0.3172152856602346, "grad_norm": 0.05655750632286072, "learning_rate": 9.897283135448105e-05, "loss": 0.0974, "step": 393 }, { "epoch": 0.3180224492369782, "grad_norm": 0.061430417001247406, "learning_rate": 9.895856549104136e-05, "loss": 0.1049, "step": 394 }, { "epoch": 0.3188296128137218, "grad_norm": 0.06355249136686325, "learning_rate": 9.894420228611278e-05, "loss": 0.1126, "step": 395 }, { "epoch": 0.3196367763904654, "grad_norm": 0.05532156303524971, "learning_rate": 9.892974176825286e-05, "loss": 0.1091, "step": 396 }, { "epoch": 0.320443939967209, "grad_norm": 0.06017182394862175, "learning_rate": 9.891518396621258e-05, "loss": 0.1073, "step": 397 }, { "epoch": 0.3212511035439526, "grad_norm": 0.05957906320691109, "learning_rate": 9.89005289089364e-05, "loss": 0.0996, "step": 398 }, { "epoch": 0.3220582671206962, "grad_norm": 0.06084957718849182, "learning_rate": 9.88857766255621e-05, "loss": 0.1033, "step": 399 }, { "epoch": 0.3228654306974398, "grad_norm": 0.05985845625400543, "learning_rate": 9.887092714542083e-05, "loss": 0.1051, "step": 400 }, { "epoch": 0.3236725942741834, "grad_norm": 0.05382286757230759, "learning_rate": 9.885598049803693e-05, "loss": 0.0997, "step": 401 }, { "epoch": 0.324479757850927, "grad_norm": 0.058221641927957535, "learning_rate": 9.884093671312796e-05, "loss": 0.1018, "step": 402 }, { "epoch": 0.3252869214276706, "grad_norm": 0.06604649871587753, "learning_rate": 9.882579582060458e-05, "loss": 0.1031, "step": 403 }, { "epoch": 0.32609408500441417, "grad_norm": 0.06164751574397087, "learning_rate": 9.881055785057061e-05, "loss": 0.1083, "step": 404 }, { "epoch": 0.32690124858115777, "grad_norm": 0.059241704642772675, "learning_rate": 9.879522283332279e-05, "loss": 0.1071, "step": 405 }, { "epoch": 0.32770841215790136, "grad_norm": 0.05559611693024635, "learning_rate": 9.877979079935086e-05, "loss": 0.0952, "step": 406 }, { "epoch": 0.32851557573464496, "grad_norm": 0.06323038786649704, "learning_rate": 9.876426177933743e-05, "loss": 0.1034, "step": 407 }, { "epoch": 0.32932273931138856, "grad_norm": 0.06014226749539375, "learning_rate": 9.874863580415796e-05, "loss": 0.1015, "step": 408 }, { "epoch": 0.33012990288813215, "grad_norm": 0.06594546139240265, "learning_rate": 9.873291290488068e-05, "loss": 0.1083, "step": 409 }, { "epoch": 0.33093706646487575, "grad_norm": 0.062910296022892, "learning_rate": 9.871709311276652e-05, "loss": 0.1074, "step": 410 }, { "epoch": 0.33174423004161935, "grad_norm": 0.05991430953145027, "learning_rate": 9.870117645926906e-05, "loss": 0.0999, "step": 411 }, { "epoch": 0.33255139361836294, "grad_norm": 0.06381808966398239, "learning_rate": 9.868516297603445e-05, "loss": 0.1099, "step": 412 }, { "epoch": 0.3333585571951066, "grad_norm": 0.062107909470796585, "learning_rate": 9.866905269490141e-05, "loss": 0.1095, "step": 413 }, { "epoch": 0.3341657207718502, "grad_norm": 0.0687824934720993, "learning_rate": 9.865284564790103e-05, "loss": 0.1032, "step": 414 }, { "epoch": 0.3349728843485938, "grad_norm": 0.06124284863471985, "learning_rate": 9.863654186725688e-05, "loss": 0.1122, "step": 415 }, { "epoch": 0.3357800479253374, "grad_norm": 0.05816042795777321, "learning_rate": 9.862014138538482e-05, "loss": 0.108, "step": 416 }, { "epoch": 0.336587211502081, "grad_norm": 0.05565424636006355, "learning_rate": 9.860364423489299e-05, "loss": 0.1101, "step": 417 }, { "epoch": 0.3373943750788246, "grad_norm": 0.06493842601776123, "learning_rate": 9.85870504485817e-05, "loss": 0.1087, "step": 418 }, { "epoch": 0.3382015386555682, "grad_norm": 0.065157450735569, "learning_rate": 9.857036005944343e-05, "loss": 0.095, "step": 419 }, { "epoch": 0.3390087022323118, "grad_norm": 0.05556097254157066, "learning_rate": 9.855357310066273e-05, "loss": 0.1007, "step": 420 }, { "epoch": 0.3398158658090554, "grad_norm": 0.06412414461374283, "learning_rate": 9.853668960561611e-05, "loss": 0.1032, "step": 421 }, { "epoch": 0.34062302938579897, "grad_norm": 0.06429965049028397, "learning_rate": 9.851970960787207e-05, "loss": 0.1019, "step": 422 }, { "epoch": 0.34143019296254257, "grad_norm": 0.06262392550706863, "learning_rate": 9.850263314119095e-05, "loss": 0.1039, "step": 423 }, { "epoch": 0.34223735653928616, "grad_norm": 0.05643380805850029, "learning_rate": 9.84854602395249e-05, "loss": 0.0921, "step": 424 }, { "epoch": 0.34304452011602976, "grad_norm": 0.05987818166613579, "learning_rate": 9.846819093701782e-05, "loss": 0.1036, "step": 425 }, { "epoch": 0.34385168369277336, "grad_norm": 0.06350398808717728, "learning_rate": 9.845082526800528e-05, "loss": 0.1004, "step": 426 }, { "epoch": 0.34465884726951695, "grad_norm": 0.06146422401070595, "learning_rate": 9.84333632670144e-05, "loss": 0.1004, "step": 427 }, { "epoch": 0.34546601084626055, "grad_norm": 0.06458184123039246, "learning_rate": 9.84158049687639e-05, "loss": 0.1043, "step": 428 }, { "epoch": 0.34627317442300415, "grad_norm": 0.05651666969060898, "learning_rate": 9.839815040816391e-05, "loss": 0.0983, "step": 429 }, { "epoch": 0.34708033799974775, "grad_norm": 0.06055047735571861, "learning_rate": 9.838039962031598e-05, "loss": 0.1084, "step": 430 }, { "epoch": 0.34788750157649134, "grad_norm": 0.07131805270910263, "learning_rate": 9.836255264051299e-05, "loss": 0.1118, "step": 431 }, { "epoch": 0.34869466515323494, "grad_norm": 0.05392352491617203, "learning_rate": 9.834460950423902e-05, "loss": 0.0956, "step": 432 }, { "epoch": 0.34950182872997854, "grad_norm": 0.05717315524816513, "learning_rate": 9.832657024716944e-05, "loss": 0.1073, "step": 433 }, { "epoch": 0.35030899230672213, "grad_norm": 0.05507933720946312, "learning_rate": 9.83084349051706e-05, "loss": 0.0905, "step": 434 }, { "epoch": 0.3511161558834658, "grad_norm": 0.057537056505680084, "learning_rate": 9.829020351429999e-05, "loss": 0.1046, "step": 435 }, { "epoch": 0.3519233194602094, "grad_norm": 0.059799034148454666, "learning_rate": 9.8271876110806e-05, "loss": 0.1126, "step": 436 }, { "epoch": 0.352730483036953, "grad_norm": 0.057366129010915756, "learning_rate": 9.825345273112796e-05, "loss": 0.0981, "step": 437 }, { "epoch": 0.3535376466136966, "grad_norm": 0.057681530714035034, "learning_rate": 9.823493341189603e-05, "loss": 0.1065, "step": 438 }, { "epoch": 0.3543448101904402, "grad_norm": 0.05149533227086067, "learning_rate": 9.82163181899311e-05, "loss": 0.1051, "step": 439 }, { "epoch": 0.35515197376718377, "grad_norm": 0.0646044909954071, "learning_rate": 9.819760710224473e-05, "loss": 0.1081, "step": 440 }, { "epoch": 0.35595913734392737, "grad_norm": 0.05751761794090271, "learning_rate": 9.817880018603909e-05, "loss": 0.1066, "step": 441 }, { "epoch": 0.35676630092067096, "grad_norm": 0.06479285657405853, "learning_rate": 9.815989747870689e-05, "loss": 0.1008, "step": 442 }, { "epoch": 0.35757346449741456, "grad_norm": 0.05851481854915619, "learning_rate": 9.81408990178313e-05, "loss": 0.1008, "step": 443 }, { "epoch": 0.35838062807415816, "grad_norm": 0.06185034662485123, "learning_rate": 9.812180484118586e-05, "loss": 0.1069, "step": 444 }, { "epoch": 0.35918779165090176, "grad_norm": 0.06002921611070633, "learning_rate": 9.81026149867344e-05, "loss": 0.1067, "step": 445 }, { "epoch": 0.35999495522764535, "grad_norm": 0.06176440045237541, "learning_rate": 9.808332949263103e-05, "loss": 0.098, "step": 446 }, { "epoch": 0.36080211880438895, "grad_norm": 0.057366177439689636, "learning_rate": 9.806394839721998e-05, "loss": 0.1068, "step": 447 }, { "epoch": 0.36160928238113255, "grad_norm": 0.05923178419470787, "learning_rate": 9.804447173903554e-05, "loss": 0.1047, "step": 448 }, { "epoch": 0.36241644595787614, "grad_norm": 0.059114400297403336, "learning_rate": 9.802489955680205e-05, "loss": 0.1032, "step": 449 }, { "epoch": 0.36322360953461974, "grad_norm": 0.06613054126501083, "learning_rate": 9.800523188943373e-05, "loss": 0.1036, "step": 450 }, { "epoch": 0.36403077311136334, "grad_norm": 0.06443540006875992, "learning_rate": 9.798546877603468e-05, "loss": 0.1061, "step": 451 }, { "epoch": 0.36483793668810693, "grad_norm": 0.05921705812215805, "learning_rate": 9.796561025589874e-05, "loss": 0.1192, "step": 452 }, { "epoch": 0.36564510026485053, "grad_norm": 0.05937834456562996, "learning_rate": 9.794565636850947e-05, "loss": 0.1052, "step": 453 }, { "epoch": 0.36645226384159413, "grad_norm": 0.05621366575360298, "learning_rate": 9.792560715354006e-05, "loss": 0.0981, "step": 454 }, { "epoch": 0.3672594274183377, "grad_norm": 0.05543336272239685, "learning_rate": 9.790546265085317e-05, "loss": 0.1002, "step": 455 }, { "epoch": 0.3680665909950813, "grad_norm": 0.05462726578116417, "learning_rate": 9.788522290050094e-05, "loss": 0.0967, "step": 456 }, { "epoch": 0.3688737545718249, "grad_norm": 0.058962322771549225, "learning_rate": 9.786488794272493e-05, "loss": 0.0951, "step": 457 }, { "epoch": 0.36968091814856857, "grad_norm": 0.061038944870233536, "learning_rate": 9.784445781795596e-05, "loss": 0.1012, "step": 458 }, { "epoch": 0.37048808172531217, "grad_norm": 0.0617917999625206, "learning_rate": 9.782393256681406e-05, "loss": 0.1019, "step": 459 }, { "epoch": 0.37129524530205577, "grad_norm": 0.057484470307826996, "learning_rate": 9.780331223010838e-05, "loss": 0.1027, "step": 460 }, { "epoch": 0.37210240887879936, "grad_norm": 0.05532921850681305, "learning_rate": 9.778259684883719e-05, "loss": 0.1018, "step": 461 }, { "epoch": 0.37290957245554296, "grad_norm": 0.05626295879483223, "learning_rate": 9.776178646418765e-05, "loss": 0.1077, "step": 462 }, { "epoch": 0.37371673603228656, "grad_norm": 0.06269638985395432, "learning_rate": 9.774088111753585e-05, "loss": 0.11, "step": 463 }, { "epoch": 0.37452389960903015, "grad_norm": 0.05651126429438591, "learning_rate": 9.77198808504467e-05, "loss": 0.0951, "step": 464 }, { "epoch": 0.37533106318577375, "grad_norm": 0.05716134235262871, "learning_rate": 9.76987857046738e-05, "loss": 0.0988, "step": 465 }, { "epoch": 0.37613822676251735, "grad_norm": 0.060417503118515015, "learning_rate": 9.767759572215945e-05, "loss": 0.1146, "step": 466 }, { "epoch": 0.37694539033926094, "grad_norm": 0.057473987340927124, "learning_rate": 9.765631094503441e-05, "loss": 0.1026, "step": 467 }, { "epoch": 0.37775255391600454, "grad_norm": 0.05464447662234306, "learning_rate": 9.763493141561801e-05, "loss": 0.1088, "step": 468 }, { "epoch": 0.37855971749274814, "grad_norm": 0.05866370350122452, "learning_rate": 9.761345717641793e-05, "loss": 0.1055, "step": 469 }, { "epoch": 0.37936688106949173, "grad_norm": 0.05870380625128746, "learning_rate": 9.759188827013016e-05, "loss": 0.0992, "step": 470 }, { "epoch": 0.38017404464623533, "grad_norm": 0.0654328390955925, "learning_rate": 9.75702247396389e-05, "loss": 0.1042, "step": 471 }, { "epoch": 0.38098120822297893, "grad_norm": 0.0614667572081089, "learning_rate": 9.754846662801651e-05, "loss": 0.1021, "step": 472 }, { "epoch": 0.3817883717997225, "grad_norm": 0.05376680940389633, "learning_rate": 9.752661397852338e-05, "loss": 0.0999, "step": 473 }, { "epoch": 0.3825955353764661, "grad_norm": 0.057742491364479065, "learning_rate": 9.750466683460786e-05, "loss": 0.1034, "step": 474 }, { "epoch": 0.3834026989532097, "grad_norm": 0.054894886910915375, "learning_rate": 9.748262523990621e-05, "loss": 0.1036, "step": 475 }, { "epoch": 0.3842098625299533, "grad_norm": 0.058366306126117706, "learning_rate": 9.746048923824245e-05, "loss": 0.1087, "step": 476 }, { "epoch": 0.3850170261066969, "grad_norm": 0.052579283714294434, "learning_rate": 9.743825887362832e-05, "loss": 0.095, "step": 477 }, { "epoch": 0.3858241896834405, "grad_norm": 0.056786615401506424, "learning_rate": 9.741593419026315e-05, "loss": 0.1024, "step": 478 }, { "epoch": 0.3866313532601841, "grad_norm": 0.05674027279019356, "learning_rate": 9.739351523253386e-05, "loss": 0.0956, "step": 479 }, { "epoch": 0.38743851683692776, "grad_norm": 0.057301007211208344, "learning_rate": 9.737100204501472e-05, "loss": 0.1072, "step": 480 }, { "epoch": 0.38824568041367136, "grad_norm": 0.05449144169688225, "learning_rate": 9.734839467246744e-05, "loss": 0.1076, "step": 481 }, { "epoch": 0.38905284399041495, "grad_norm": 0.05938861146569252, "learning_rate": 9.732569315984092e-05, "loss": 0.0991, "step": 482 }, { "epoch": 0.38986000756715855, "grad_norm": 0.06099748983979225, "learning_rate": 9.730289755227131e-05, "loss": 0.1007, "step": 483 }, { "epoch": 0.39066717114390215, "grad_norm": 0.05431505665183067, "learning_rate": 9.728000789508175e-05, "loss": 0.0974, "step": 484 }, { "epoch": 0.39147433472064574, "grad_norm": 0.06169505789875984, "learning_rate": 9.725702423378247e-05, "loss": 0.1017, "step": 485 }, { "epoch": 0.39228149829738934, "grad_norm": 0.06015540659427643, "learning_rate": 9.723394661407053e-05, "loss": 0.1038, "step": 486 }, { "epoch": 0.39308866187413294, "grad_norm": 0.050340402871370316, "learning_rate": 9.721077508182983e-05, "loss": 0.1037, "step": 487 }, { "epoch": 0.39389582545087654, "grad_norm": 0.06462303549051285, "learning_rate": 9.718750968313099e-05, "loss": 0.102, "step": 488 }, { "epoch": 0.39470298902762013, "grad_norm": 0.056420858949422836, "learning_rate": 9.716415046423126e-05, "loss": 0.0993, "step": 489 }, { "epoch": 0.39551015260436373, "grad_norm": 0.05413908511400223, "learning_rate": 9.714069747157444e-05, "loss": 0.1104, "step": 490 }, { "epoch": 0.3963173161811073, "grad_norm": 0.06213197857141495, "learning_rate": 9.711715075179076e-05, "loss": 0.0976, "step": 491 }, { "epoch": 0.3971244797578509, "grad_norm": 0.05402451381087303, "learning_rate": 9.709351035169678e-05, "loss": 0.0969, "step": 492 }, { "epoch": 0.3979316433345945, "grad_norm": 0.06179218739271164, "learning_rate": 9.706977631829535e-05, "loss": 0.1025, "step": 493 }, { "epoch": 0.3987388069113381, "grad_norm": 0.06106850132346153, "learning_rate": 9.704594869877548e-05, "loss": 0.0928, "step": 494 }, { "epoch": 0.3995459704880817, "grad_norm": 0.05836381018161774, "learning_rate": 9.702202754051228e-05, "loss": 0.1003, "step": 495 }, { "epoch": 0.4003531340648253, "grad_norm": 0.06013929098844528, "learning_rate": 9.699801289106676e-05, "loss": 0.1107, "step": 496 }, { "epoch": 0.4011602976415689, "grad_norm": 0.058785952627658844, "learning_rate": 9.697390479818589e-05, "loss": 0.1169, "step": 497 }, { "epoch": 0.4019674612183125, "grad_norm": 0.05733112245798111, "learning_rate": 9.694970330980239e-05, "loss": 0.0978, "step": 498 }, { "epoch": 0.4027746247950561, "grad_norm": 0.06685399264097214, "learning_rate": 9.692540847403468e-05, "loss": 0.0973, "step": 499 }, { "epoch": 0.4035817883717997, "grad_norm": 0.06673123687505722, "learning_rate": 9.690102033918678e-05, "loss": 0.096, "step": 500 }, { "epoch": 0.4043889519485433, "grad_norm": 0.06340870261192322, "learning_rate": 9.687653895374823e-05, "loss": 0.1023, "step": 501 }, { "epoch": 0.40519611552528695, "grad_norm": 0.05540281534194946, "learning_rate": 9.685196436639392e-05, "loss": 0.1045, "step": 502 }, { "epoch": 0.40600327910203055, "grad_norm": 0.05932539328932762, "learning_rate": 9.682729662598412e-05, "loss": 0.1044, "step": 503 }, { "epoch": 0.40681044267877414, "grad_norm": 0.055272724479436874, "learning_rate": 9.680253578156424e-05, "loss": 0.103, "step": 504 }, { "epoch": 0.40761760625551774, "grad_norm": 0.06060817465186119, "learning_rate": 9.677768188236486e-05, "loss": 0.0934, "step": 505 }, { "epoch": 0.40842476983226134, "grad_norm": 0.05533970892429352, "learning_rate": 9.675273497780155e-05, "loss": 0.1038, "step": 506 }, { "epoch": 0.40923193340900493, "grad_norm": 0.05950247123837471, "learning_rate": 9.67276951174748e-05, "loss": 0.103, "step": 507 }, { "epoch": 0.41003909698574853, "grad_norm": 0.060652147978544235, "learning_rate": 9.67025623511699e-05, "loss": 0.1134, "step": 508 }, { "epoch": 0.4108462605624921, "grad_norm": 0.05823478847742081, "learning_rate": 9.667733672885688e-05, "loss": 0.1083, "step": 509 }, { "epoch": 0.4116534241392357, "grad_norm": 0.06178595498204231, "learning_rate": 9.665201830069043e-05, "loss": 0.093, "step": 510 }, { "epoch": 0.4124605877159793, "grad_norm": 0.06450726836919785, "learning_rate": 9.662660711700967e-05, "loss": 0.102, "step": 511 }, { "epoch": 0.4132677512927229, "grad_norm": 0.0631895437836647, "learning_rate": 9.660110322833822e-05, "loss": 0.1036, "step": 512 }, { "epoch": 0.4132677512927229, "eval_loss": 0.11469773203134537, "eval_runtime": 4045.941, "eval_samples_per_second": 2.2, "eval_steps_per_second": 2.2, "step": 512 }, { "epoch": 0.4140749148694665, "grad_norm": 0.059871237725019455, "learning_rate": 9.657550668538396e-05, "loss": 0.1053, "step": 513 }, { "epoch": 0.4148820784462101, "grad_norm": 0.05471886694431305, "learning_rate": 9.654981753903906e-05, "loss": 0.0984, "step": 514 }, { "epoch": 0.4156892420229537, "grad_norm": 0.05665110424160957, "learning_rate": 9.652403584037972e-05, "loss": 0.1059, "step": 515 }, { "epoch": 0.4164964055996973, "grad_norm": 0.05893462151288986, "learning_rate": 9.649816164066623e-05, "loss": 0.0969, "step": 516 }, { "epoch": 0.4173035691764409, "grad_norm": 0.054175037890672684, "learning_rate": 9.647219499134277e-05, "loss": 0.0974, "step": 517 }, { "epoch": 0.4181107327531845, "grad_norm": 0.05906757712364197, "learning_rate": 9.644613594403734e-05, "loss": 0.1013, "step": 518 }, { "epoch": 0.4189178963299281, "grad_norm": 0.06198793277144432, "learning_rate": 9.641998455056159e-05, "loss": 0.1094, "step": 519 }, { "epoch": 0.4197250599066717, "grad_norm": 0.053826313465833664, "learning_rate": 9.639374086291087e-05, "loss": 0.0976, "step": 520 }, { "epoch": 0.4205322234834153, "grad_norm": 0.05674249678850174, "learning_rate": 9.636740493326397e-05, "loss": 0.1023, "step": 521 }, { "epoch": 0.4213393870601589, "grad_norm": 0.061340849846601486, "learning_rate": 9.634097681398311e-05, "loss": 0.1013, "step": 522 }, { "epoch": 0.4221465506369025, "grad_norm": 0.05626356974244118, "learning_rate": 9.631445655761378e-05, "loss": 0.1006, "step": 523 }, { "epoch": 0.42295371421364614, "grad_norm": 0.06247258186340332, "learning_rate": 9.628784421688468e-05, "loss": 0.1087, "step": 524 }, { "epoch": 0.42376087779038973, "grad_norm": 0.05976739525794983, "learning_rate": 9.626113984470761e-05, "loss": 0.1047, "step": 525 }, { "epoch": 0.42456804136713333, "grad_norm": 0.061096660792827606, "learning_rate": 9.623434349417729e-05, "loss": 0.1146, "step": 526 }, { "epoch": 0.42537520494387693, "grad_norm": 0.061707720160484314, "learning_rate": 9.62074552185714e-05, "loss": 0.1016, "step": 527 }, { "epoch": 0.4261823685206205, "grad_norm": 0.0637449100613594, "learning_rate": 9.618047507135032e-05, "loss": 0.0942, "step": 528 }, { "epoch": 0.4269895320973641, "grad_norm": 0.0657891109585762, "learning_rate": 9.615340310615712e-05, "loss": 0.1034, "step": 529 }, { "epoch": 0.4277966956741077, "grad_norm": 0.059370074421167374, "learning_rate": 9.612623937681743e-05, "loss": 0.0933, "step": 530 }, { "epoch": 0.4286038592508513, "grad_norm": 0.06274443119764328, "learning_rate": 9.609898393733933e-05, "loss": 0.1053, "step": 531 }, { "epoch": 0.4294110228275949, "grad_norm": 0.05938064679503441, "learning_rate": 9.607163684191322e-05, "loss": 0.0984, "step": 532 }, { "epoch": 0.4302181864043385, "grad_norm": 0.06397516280412674, "learning_rate": 9.604419814491179e-05, "loss": 0.1013, "step": 533 }, { "epoch": 0.4310253499810821, "grad_norm": 0.053470414131879807, "learning_rate": 9.601666790088977e-05, "loss": 0.0996, "step": 534 }, { "epoch": 0.4318325135578257, "grad_norm": 0.05898236483335495, "learning_rate": 9.598904616458397e-05, "loss": 0.0952, "step": 535 }, { "epoch": 0.4326396771345693, "grad_norm": 0.0700695738196373, "learning_rate": 9.59613329909131e-05, "loss": 0.11, "step": 536 }, { "epoch": 0.4334468407113129, "grad_norm": 0.060948751866817474, "learning_rate": 9.593352843497767e-05, "loss": 0.1089, "step": 537 }, { "epoch": 0.4342540042880565, "grad_norm": 0.06028398871421814, "learning_rate": 9.590563255205987e-05, "loss": 0.1105, "step": 538 }, { "epoch": 0.4350611678648001, "grad_norm": 0.05239766463637352, "learning_rate": 9.587764539762344e-05, "loss": 0.1027, "step": 539 }, { "epoch": 0.4358683314415437, "grad_norm": 0.058010172098875046, "learning_rate": 9.584956702731366e-05, "loss": 0.1031, "step": 540 }, { "epoch": 0.4366754950182873, "grad_norm": 0.06441504508256912, "learning_rate": 9.582139749695713e-05, "loss": 0.1022, "step": 541 }, { "epoch": 0.4374826585950309, "grad_norm": 0.05417362228035927, "learning_rate": 9.579313686256168e-05, "loss": 0.1002, "step": 542 }, { "epoch": 0.4382898221717745, "grad_norm": 0.0571327731013298, "learning_rate": 9.576478518031633e-05, "loss": 0.099, "step": 543 }, { "epoch": 0.4390969857485181, "grad_norm": 0.06254582107067108, "learning_rate": 9.573634250659106e-05, "loss": 0.1075, "step": 544 }, { "epoch": 0.4399041493252617, "grad_norm": 0.06605369597673416, "learning_rate": 9.57078088979368e-05, "loss": 0.0985, "step": 545 }, { "epoch": 0.44071131290200527, "grad_norm": 0.05933993309736252, "learning_rate": 9.56791844110853e-05, "loss": 0.0966, "step": 546 }, { "epoch": 0.4415184764787489, "grad_norm": 0.06004410982131958, "learning_rate": 9.565046910294896e-05, "loss": 0.1071, "step": 547 }, { "epoch": 0.4423256400554925, "grad_norm": 0.06342016905546188, "learning_rate": 9.562166303062076e-05, "loss": 0.1032, "step": 548 }, { "epoch": 0.4431328036322361, "grad_norm": 0.05832195654511452, "learning_rate": 9.559276625137416e-05, "loss": 0.0976, "step": 549 }, { "epoch": 0.4439399672089797, "grad_norm": 0.060449838638305664, "learning_rate": 9.556377882266297e-05, "loss": 0.1069, "step": 550 }, { "epoch": 0.4447471307857233, "grad_norm": 0.0566570870578289, "learning_rate": 9.553470080212122e-05, "loss": 0.1045, "step": 551 }, { "epoch": 0.4455542943624669, "grad_norm": 0.06132432073354721, "learning_rate": 9.550553224756303e-05, "loss": 0.1052, "step": 552 }, { "epoch": 0.4463614579392105, "grad_norm": 0.05979295074939728, "learning_rate": 9.547627321698256e-05, "loss": 0.0975, "step": 553 }, { "epoch": 0.4471686215159541, "grad_norm": 0.061621103435754776, "learning_rate": 9.544692376855386e-05, "loss": 0.1053, "step": 554 }, { "epoch": 0.4479757850926977, "grad_norm": 0.0648060292005539, "learning_rate": 9.541748396063076e-05, "loss": 0.1029, "step": 555 }, { "epoch": 0.4487829486694413, "grad_norm": 0.05910136178135872, "learning_rate": 9.538795385174672e-05, "loss": 0.1064, "step": 556 }, { "epoch": 0.4495901122461849, "grad_norm": 0.059514667838811874, "learning_rate": 9.535833350061473e-05, "loss": 0.0953, "step": 557 }, { "epoch": 0.4503972758229285, "grad_norm": 0.061229605227708817, "learning_rate": 9.532862296612724e-05, "loss": 0.1021, "step": 558 }, { "epoch": 0.4512044393996721, "grad_norm": 0.060088444501161575, "learning_rate": 9.5298822307356e-05, "loss": 0.1062, "step": 559 }, { "epoch": 0.4520116029764157, "grad_norm": 0.05872553959488869, "learning_rate": 9.526893158355193e-05, "loss": 0.098, "step": 560 }, { "epoch": 0.4528187665531593, "grad_norm": 0.059516701847314835, "learning_rate": 9.523895085414501e-05, "loss": 0.1117, "step": 561 }, { "epoch": 0.4536259301299029, "grad_norm": 0.06293745338916779, "learning_rate": 9.520888017874423e-05, "loss": 0.1, "step": 562 }, { "epoch": 0.4544330937066465, "grad_norm": 0.05440789833664894, "learning_rate": 9.517871961713735e-05, "loss": 0.0957, "step": 563 }, { "epoch": 0.45524025728339007, "grad_norm": 0.06674956530332565, "learning_rate": 9.51484692292909e-05, "loss": 0.107, "step": 564 }, { "epoch": 0.45604742086013367, "grad_norm": 0.06327424198389053, "learning_rate": 9.511812907534994e-05, "loss": 0.0883, "step": 565 }, { "epoch": 0.45685458443687726, "grad_norm": 0.05748824030160904, "learning_rate": 9.508769921563809e-05, "loss": 0.0899, "step": 566 }, { "epoch": 0.45766174801362086, "grad_norm": 0.05838910862803459, "learning_rate": 9.505717971065724e-05, "loss": 0.0997, "step": 567 }, { "epoch": 0.45846891159036446, "grad_norm": 0.05716516822576523, "learning_rate": 9.50265706210876e-05, "loss": 0.1066, "step": 568 }, { "epoch": 0.4592760751671081, "grad_norm": 0.057993315160274506, "learning_rate": 9.499587200778744e-05, "loss": 0.1028, "step": 569 }, { "epoch": 0.4600832387438517, "grad_norm": 0.049491968005895615, "learning_rate": 9.496508393179302e-05, "loss": 0.0999, "step": 570 }, { "epoch": 0.4608904023205953, "grad_norm": 0.05557968094944954, "learning_rate": 9.493420645431852e-05, "loss": 0.1042, "step": 571 }, { "epoch": 0.4616975658973389, "grad_norm": 0.057269830256700516, "learning_rate": 9.490323963675583e-05, "loss": 0.1078, "step": 572 }, { "epoch": 0.4625047294740825, "grad_norm": 0.05336315184831619, "learning_rate": 9.48721835406745e-05, "loss": 0.0917, "step": 573 }, { "epoch": 0.4633118930508261, "grad_norm": 0.0562533475458622, "learning_rate": 9.484103822782155e-05, "loss": 0.0983, "step": 574 }, { "epoch": 0.4641190566275697, "grad_norm": 0.05853231996297836, "learning_rate": 9.480980376012144e-05, "loss": 0.1043, "step": 575 }, { "epoch": 0.4649262202043133, "grad_norm": 0.059628695249557495, "learning_rate": 9.477848019967583e-05, "loss": 0.1081, "step": 576 }, { "epoch": 0.4657333837810569, "grad_norm": 0.05488279461860657, "learning_rate": 9.474706760876356e-05, "loss": 0.1036, "step": 577 }, { "epoch": 0.4665405473578005, "grad_norm": 0.06197020411491394, "learning_rate": 9.471556604984047e-05, "loss": 0.107, "step": 578 }, { "epoch": 0.4673477109345441, "grad_norm": 0.06235002353787422, "learning_rate": 9.468397558553928e-05, "loss": 0.0941, "step": 579 }, { "epoch": 0.4681548745112877, "grad_norm": 0.0592878982424736, "learning_rate": 9.46522962786695e-05, "loss": 0.1067, "step": 580 }, { "epoch": 0.4689620380880313, "grad_norm": 0.05823064595460892, "learning_rate": 9.462052819221726e-05, "loss": 0.0995, "step": 581 }, { "epoch": 0.46976920166477487, "grad_norm": 0.055422838777303696, "learning_rate": 9.458867138934521e-05, "loss": 0.0996, "step": 582 }, { "epoch": 0.47057636524151847, "grad_norm": 0.05832410976290703, "learning_rate": 9.45567259333924e-05, "loss": 0.0885, "step": 583 }, { "epoch": 0.47138352881826207, "grad_norm": 0.059541985392570496, "learning_rate": 9.452469188787413e-05, "loss": 0.0981, "step": 584 }, { "epoch": 0.47219069239500566, "grad_norm": 0.06076124310493469, "learning_rate": 9.449256931648185e-05, "loss": 0.0996, "step": 585 }, { "epoch": 0.47299785597174926, "grad_norm": 0.05566677451133728, "learning_rate": 9.4460358283083e-05, "loss": 0.1021, "step": 586 }, { "epoch": 0.47380501954849286, "grad_norm": 0.057820551097393036, "learning_rate": 9.442805885172093e-05, "loss": 0.1028, "step": 587 }, { "epoch": 0.47461218312523645, "grad_norm": 0.05452903360128403, "learning_rate": 9.439567108661471e-05, "loss": 0.0925, "step": 588 }, { "epoch": 0.47541934670198005, "grad_norm": 0.062080200761556625, "learning_rate": 9.436319505215911e-05, "loss": 0.1057, "step": 589 }, { "epoch": 0.47622651027872365, "grad_norm": 0.05838869512081146, "learning_rate": 9.43306308129243e-05, "loss": 0.112, "step": 590 }, { "epoch": 0.4770336738554673, "grad_norm": 0.055224865674972534, "learning_rate": 9.429797843365594e-05, "loss": 0.1109, "step": 591 }, { "epoch": 0.4778408374322109, "grad_norm": 0.06480769068002701, "learning_rate": 9.42652379792748e-05, "loss": 0.1079, "step": 592 }, { "epoch": 0.4786480010089545, "grad_norm": 0.05791940540075302, "learning_rate": 9.423240951487689e-05, "loss": 0.1001, "step": 593 }, { "epoch": 0.4794551645856981, "grad_norm": 0.05015169084072113, "learning_rate": 9.419949310573312e-05, "loss": 0.0981, "step": 594 }, { "epoch": 0.4802623281624417, "grad_norm": 0.056319478899240494, "learning_rate": 9.416648881728929e-05, "loss": 0.1011, "step": 595 }, { "epoch": 0.4810694917391853, "grad_norm": 0.06085798889398575, "learning_rate": 9.413339671516593e-05, "loss": 0.1071, "step": 596 }, { "epoch": 0.4818766553159289, "grad_norm": 0.05307924374938011, "learning_rate": 9.410021686515815e-05, "loss": 0.104, "step": 597 }, { "epoch": 0.4826838188926725, "grad_norm": 0.06125115230679512, "learning_rate": 9.406694933323555e-05, "loss": 0.1021, "step": 598 }, { "epoch": 0.4834909824694161, "grad_norm": 0.06635504961013794, "learning_rate": 9.403359418554201e-05, "loss": 0.1001, "step": 599 }, { "epoch": 0.48429814604615967, "grad_norm": 0.05414430424571037, "learning_rate": 9.400015148839565e-05, "loss": 0.1013, "step": 600 }, { "epoch": 0.48510530962290327, "grad_norm": 0.060244880616664886, "learning_rate": 9.396662130828869e-05, "loss": 0.1026, "step": 601 }, { "epoch": 0.48591247319964687, "grad_norm": 0.06309767067432404, "learning_rate": 9.393300371188719e-05, "loss": 0.1081, "step": 602 }, { "epoch": 0.48671963677639046, "grad_norm": 0.06908173114061356, "learning_rate": 9.389929876603112e-05, "loss": 0.1127, "step": 603 }, { "epoch": 0.48752680035313406, "grad_norm": 0.058037642389535904, "learning_rate": 9.386550653773408e-05, "loss": 0.096, "step": 604 }, { "epoch": 0.48833396392987766, "grad_norm": 0.06259340047836304, "learning_rate": 9.383162709418318e-05, "loss": 0.1049, "step": 605 }, { "epoch": 0.48914112750662125, "grad_norm": 0.05744437128305435, "learning_rate": 9.379766050273899e-05, "loss": 0.0964, "step": 606 }, { "epoch": 0.48994829108336485, "grad_norm": 0.05936441197991371, "learning_rate": 9.37636068309353e-05, "loss": 0.1189, "step": 607 }, { "epoch": 0.49075545466010845, "grad_norm": 0.055286724120378494, "learning_rate": 9.372946614647907e-05, "loss": 0.0952, "step": 608 }, { "epoch": 0.49156261823685204, "grad_norm": 0.0647420659661293, "learning_rate": 9.369523851725024e-05, "loss": 0.0991, "step": 609 }, { "epoch": 0.49236978181359564, "grad_norm": 0.05667410418391228, "learning_rate": 9.366092401130164e-05, "loss": 0.1112, "step": 610 }, { "epoch": 0.49317694539033924, "grad_norm": 0.0736803486943245, "learning_rate": 9.36265226968588e-05, "loss": 0.1037, "step": 611 }, { "epoch": 0.49398410896708284, "grad_norm": 0.05981089547276497, "learning_rate": 9.359203464231993e-05, "loss": 0.098, "step": 612 }, { "epoch": 0.49479127254382643, "grad_norm": 0.0615694560110569, "learning_rate": 9.355745991625556e-05, "loss": 0.1092, "step": 613 }, { "epoch": 0.4955984361205701, "grad_norm": 0.06376925855875015, "learning_rate": 9.352279858740866e-05, "loss": 0.1069, "step": 614 }, { "epoch": 0.4964055996973137, "grad_norm": 0.05940655246376991, "learning_rate": 9.348805072469435e-05, "loss": 0.1038, "step": 615 }, { "epoch": 0.4972127632740573, "grad_norm": 0.057887859642505646, "learning_rate": 9.345321639719979e-05, "loss": 0.109, "step": 616 }, { "epoch": 0.4980199268508009, "grad_norm": 0.06155795976519585, "learning_rate": 9.341829567418406e-05, "loss": 0.1052, "step": 617 }, { "epoch": 0.4988270904275445, "grad_norm": 0.060182493180036545, "learning_rate": 9.338328862507803e-05, "loss": 0.1069, "step": 618 }, { "epoch": 0.49963425400428807, "grad_norm": 0.05936333164572716, "learning_rate": 9.334819531948418e-05, "loss": 0.0911, "step": 619 }, { "epoch": 0.5004414175810317, "grad_norm": 0.06469666212797165, "learning_rate": 9.33130158271765e-05, "loss": 0.1053, "step": 620 }, { "epoch": 0.5012485811577753, "grad_norm": 0.058208584785461426, "learning_rate": 9.327775021810037e-05, "loss": 0.1024, "step": 621 }, { "epoch": 0.5020557447345189, "grad_norm": 0.061909642070531845, "learning_rate": 9.324239856237234e-05, "loss": 0.0935, "step": 622 }, { "epoch": 0.5028629083112625, "grad_norm": 0.0629388689994812, "learning_rate": 9.320696093028008e-05, "loss": 0.1025, "step": 623 }, { "epoch": 0.503670071888006, "grad_norm": 0.0534355454146862, "learning_rate": 9.317143739228216e-05, "loss": 0.1097, "step": 624 }, { "epoch": 0.5044772354647497, "grad_norm": 0.057096511125564575, "learning_rate": 9.313582801900802e-05, "loss": 0.0958, "step": 625 }, { "epoch": 0.5052843990414932, "grad_norm": 0.0584450401365757, "learning_rate": 9.31001328812577e-05, "loss": 0.1118, "step": 626 }, { "epoch": 0.5060915626182368, "grad_norm": 0.051737722009420395, "learning_rate": 9.306435205000177e-05, "loss": 0.1004, "step": 627 }, { "epoch": 0.5068987261949804, "grad_norm": 0.060018930584192276, "learning_rate": 9.302848559638121e-05, "loss": 0.1088, "step": 628 }, { "epoch": 0.507705889771724, "grad_norm": 0.05321282893419266, "learning_rate": 9.29925335917072e-05, "loss": 0.102, "step": 629 }, { "epoch": 0.5085130533484676, "grad_norm": 0.060890570282936096, "learning_rate": 9.295649610746107e-05, "loss": 0.098, "step": 630 }, { "epoch": 0.5093202169252112, "grad_norm": 0.06345129013061523, "learning_rate": 9.292037321529404e-05, "loss": 0.0985, "step": 631 }, { "epoch": 0.5101273805019548, "grad_norm": 0.05836442857980728, "learning_rate": 9.288416498702716e-05, "loss": 0.0994, "step": 632 }, { "epoch": 0.5109345440786984, "grad_norm": 0.05870697647333145, "learning_rate": 9.284787149465118e-05, "loss": 0.0971, "step": 633 }, { "epoch": 0.511741707655442, "grad_norm": 0.05738474428653717, "learning_rate": 9.281149281032635e-05, "loss": 0.1111, "step": 634 }, { "epoch": 0.5125488712321856, "grad_norm": 0.06503599137067795, "learning_rate": 9.277502900638232e-05, "loss": 0.1034, "step": 635 }, { "epoch": 0.5133560348089292, "grad_norm": 0.06152965500950813, "learning_rate": 9.273848015531795e-05, "loss": 0.109, "step": 636 }, { "epoch": 0.5141631983856728, "grad_norm": 0.06281008571386337, "learning_rate": 9.270184632980121e-05, "loss": 0.1082, "step": 637 }, { "epoch": 0.5149703619624164, "grad_norm": 0.05857095122337341, "learning_rate": 9.266512760266903e-05, "loss": 0.1035, "step": 638 }, { "epoch": 0.51577752553916, "grad_norm": 0.05782792344689369, "learning_rate": 9.262832404692714e-05, "loss": 0.104, "step": 639 }, { "epoch": 0.5165846891159036, "grad_norm": 0.07255726307630539, "learning_rate": 9.259143573574991e-05, "loss": 0.1123, "step": 640 }, { "epoch": 0.5165846891159036, "eval_loss": 0.11321243643760681, "eval_runtime": 4024.5367, "eval_samples_per_second": 2.212, "eval_steps_per_second": 2.212, "step": 640 }, { "epoch": 0.5173918526926472, "grad_norm": 0.05928253382444382, "learning_rate": 9.255446274248023e-05, "loss": 0.0978, "step": 641 }, { "epoch": 0.5181990162693908, "grad_norm": 0.06050507724285126, "learning_rate": 9.251740514062939e-05, "loss": 0.1005, "step": 642 }, { "epoch": 0.5190061798461344, "grad_norm": 0.061665698885917664, "learning_rate": 9.248026300387688e-05, "loss": 0.1117, "step": 643 }, { "epoch": 0.519813343422878, "grad_norm": 0.06009381264448166, "learning_rate": 9.244303640607025e-05, "loss": 0.107, "step": 644 }, { "epoch": 0.5206205069996216, "grad_norm": 0.06599263101816177, "learning_rate": 9.240572542122501e-05, "loss": 0.109, "step": 645 }, { "epoch": 0.5214276705763652, "grad_norm": 0.056473348289728165, "learning_rate": 9.236833012352442e-05, "loss": 0.1022, "step": 646 }, { "epoch": 0.5222348341531089, "grad_norm": 0.05093645304441452, "learning_rate": 9.23308505873194e-05, "loss": 0.0972, "step": 647 }, { "epoch": 0.5230419977298525, "grad_norm": 0.06672411412000656, "learning_rate": 9.229328688712834e-05, "loss": 0.1026, "step": 648 }, { "epoch": 0.5238491613065961, "grad_norm": 0.054747555404901505, "learning_rate": 9.225563909763701e-05, "loss": 0.1011, "step": 649 }, { "epoch": 0.5246563248833397, "grad_norm": 0.05588959529995918, "learning_rate": 9.22179072936983e-05, "loss": 0.0976, "step": 650 }, { "epoch": 0.5254634884600833, "grad_norm": 0.058736540377140045, "learning_rate": 9.218009155033218e-05, "loss": 0.1031, "step": 651 }, { "epoch": 0.5262706520368269, "grad_norm": 0.058324337005615234, "learning_rate": 9.214219194272553e-05, "loss": 0.1038, "step": 652 }, { "epoch": 0.5270778156135705, "grad_norm": 0.058941278606653214, "learning_rate": 9.210420854623192e-05, "loss": 0.0979, "step": 653 }, { "epoch": 0.5278849791903141, "grad_norm": 0.0579674169421196, "learning_rate": 9.206614143637158e-05, "loss": 0.1018, "step": 654 }, { "epoch": 0.5286921427670577, "grad_norm": 0.05206751450896263, "learning_rate": 9.202799068883112e-05, "loss": 0.0918, "step": 655 }, { "epoch": 0.5294993063438013, "grad_norm": 0.061680104583501816, "learning_rate": 9.198975637946347e-05, "loss": 0.1061, "step": 656 }, { "epoch": 0.5303064699205449, "grad_norm": 0.056848958134651184, "learning_rate": 9.195143858428773e-05, "loss": 0.0968, "step": 657 }, { "epoch": 0.5311136334972885, "grad_norm": 0.06657256186008453, "learning_rate": 9.191303737948893e-05, "loss": 0.1142, "step": 658 }, { "epoch": 0.5319207970740321, "grad_norm": 0.059337273240089417, "learning_rate": 9.187455284141797e-05, "loss": 0.1067, "step": 659 }, { "epoch": 0.5327279606507757, "grad_norm": 0.06266691535711288, "learning_rate": 9.183598504659143e-05, "loss": 0.0927, "step": 660 }, { "epoch": 0.5335351242275193, "grad_norm": 0.06318359076976776, "learning_rate": 9.179733407169145e-05, "loss": 0.1055, "step": 661 }, { "epoch": 0.5343422878042628, "grad_norm": 0.06583567708730698, "learning_rate": 9.175859999356553e-05, "loss": 0.1072, "step": 662 }, { "epoch": 0.5351494513810064, "grad_norm": 0.05304455757141113, "learning_rate": 9.171978288922638e-05, "loss": 0.0994, "step": 663 }, { "epoch": 0.53595661495775, "grad_norm": 0.06137736141681671, "learning_rate": 9.168088283585182e-05, "loss": 0.0948, "step": 664 }, { "epoch": 0.5367637785344936, "grad_norm": 0.06557868421077728, "learning_rate": 9.164189991078458e-05, "loss": 0.0982, "step": 665 }, { "epoch": 0.5375709421112372, "grad_norm": 0.06356193125247955, "learning_rate": 9.160283419153216e-05, "loss": 0.0974, "step": 666 }, { "epoch": 0.5383781056879808, "grad_norm": 0.05979546159505844, "learning_rate": 9.156368575576667e-05, "loss": 0.1022, "step": 667 }, { "epoch": 0.5391852692647244, "grad_norm": 0.05956738069653511, "learning_rate": 9.15244546813247e-05, "loss": 0.0975, "step": 668 }, { "epoch": 0.539992432841468, "grad_norm": 0.05506300553679466, "learning_rate": 9.14851410462071e-05, "loss": 0.0961, "step": 669 }, { "epoch": 0.5407995964182116, "grad_norm": 0.05451469123363495, "learning_rate": 9.144574492857892e-05, "loss": 0.1069, "step": 670 }, { "epoch": 0.5416067599949552, "grad_norm": 0.0583430640399456, "learning_rate": 9.140626640676919e-05, "loss": 0.096, "step": 671 }, { "epoch": 0.5424139235716988, "grad_norm": 0.056004684418439865, "learning_rate": 9.136670555927076e-05, "loss": 0.1012, "step": 672 }, { "epoch": 0.5432210871484424, "grad_norm": 0.06250564754009247, "learning_rate": 9.132706246474021e-05, "loss": 0.1037, "step": 673 }, { "epoch": 0.544028250725186, "grad_norm": 0.06183503940701485, "learning_rate": 9.128733720199758e-05, "loss": 0.0941, "step": 674 }, { "epoch": 0.5448354143019296, "grad_norm": 0.05493156239390373, "learning_rate": 9.124752985002631e-05, "loss": 0.1036, "step": 675 }, { "epoch": 0.5456425778786732, "grad_norm": 0.0529886856675148, "learning_rate": 9.12076404879731e-05, "loss": 0.1002, "step": 676 }, { "epoch": 0.5464497414554168, "grad_norm": 0.053665485233068466, "learning_rate": 9.116766919514765e-05, "loss": 0.0965, "step": 677 }, { "epoch": 0.5472569050321604, "grad_norm": 0.05857043340802193, "learning_rate": 9.112761605102256e-05, "loss": 0.1096, "step": 678 }, { "epoch": 0.548064068608904, "grad_norm": 0.058894120156764984, "learning_rate": 9.10874811352332e-05, "loss": 0.1034, "step": 679 }, { "epoch": 0.5488712321856476, "grad_norm": 0.051508355885744095, "learning_rate": 9.104726452757748e-05, "loss": 0.0904, "step": 680 }, { "epoch": 0.5496783957623912, "grad_norm": 0.05587790533900261, "learning_rate": 9.10069663080158e-05, "loss": 0.1007, "step": 681 }, { "epoch": 0.5504855593391348, "grad_norm": 0.05501774325966835, "learning_rate": 9.096658655667074e-05, "loss": 0.0992, "step": 682 }, { "epoch": 0.5512927229158784, "grad_norm": 0.061670105904340744, "learning_rate": 9.092612535382705e-05, "loss": 0.1018, "step": 683 }, { "epoch": 0.552099886492622, "grad_norm": 0.07154003530740738, "learning_rate": 9.088558277993142e-05, "loss": 0.0972, "step": 684 }, { "epoch": 0.5529070500693656, "grad_norm": 0.05748705938458443, "learning_rate": 9.084495891559226e-05, "loss": 0.1011, "step": 685 }, { "epoch": 0.5537142136461092, "grad_norm": 0.059228409081697464, "learning_rate": 9.080425384157971e-05, "loss": 0.101, "step": 686 }, { "epoch": 0.5545213772228528, "grad_norm": 0.059342123568058014, "learning_rate": 9.076346763882529e-05, "loss": 0.1078, "step": 687 }, { "epoch": 0.5553285407995964, "grad_norm": 0.054099153727293015, "learning_rate": 9.072260038842184e-05, "loss": 0.1033, "step": 688 }, { "epoch": 0.55613570437634, "grad_norm": 0.05002785101532936, "learning_rate": 9.068165217162337e-05, "loss": 0.1012, "step": 689 }, { "epoch": 0.5569428679530836, "grad_norm": 0.06535624712705612, "learning_rate": 9.064062306984485e-05, "loss": 0.0976, "step": 690 }, { "epoch": 0.5577500315298272, "grad_norm": 0.05861057713627815, "learning_rate": 9.059951316466208e-05, "loss": 0.1057, "step": 691 }, { "epoch": 0.5585571951065709, "grad_norm": 0.060868870466947556, "learning_rate": 9.055832253781152e-05, "loss": 0.0974, "step": 692 }, { "epoch": 0.5593643586833145, "grad_norm": 0.059375084936618805, "learning_rate": 9.051705127119011e-05, "loss": 0.1053, "step": 693 }, { "epoch": 0.5601715222600581, "grad_norm": 0.05162832885980606, "learning_rate": 9.04756994468551e-05, "loss": 0.0945, "step": 694 }, { "epoch": 0.5609786858368017, "grad_norm": 0.06435087323188782, "learning_rate": 9.043426714702397e-05, "loss": 0.1108, "step": 695 }, { "epoch": 0.5617858494135453, "grad_norm": 0.064969003200531, "learning_rate": 9.039275445407414e-05, "loss": 0.0994, "step": 696 }, { "epoch": 0.5625930129902889, "grad_norm": 0.061886534094810486, "learning_rate": 9.035116145054292e-05, "loss": 0.0983, "step": 697 }, { "epoch": 0.5634001765670325, "grad_norm": 0.07105398923158646, "learning_rate": 9.030948821912725e-05, "loss": 0.106, "step": 698 }, { "epoch": 0.564207340143776, "grad_norm": 0.05480493605136871, "learning_rate": 9.026773484268367e-05, "loss": 0.1069, "step": 699 }, { "epoch": 0.5650145037205196, "grad_norm": 0.05376574397087097, "learning_rate": 9.022590140422795e-05, "loss": 0.1053, "step": 700 }, { "epoch": 0.5658216672972632, "grad_norm": 0.06177007034420967, "learning_rate": 9.018398798693512e-05, "loss": 0.1035, "step": 701 }, { "epoch": 0.5666288308740068, "grad_norm": 0.059919267892837524, "learning_rate": 9.01419946741392e-05, "loss": 0.1017, "step": 702 }, { "epoch": 0.5674359944507504, "grad_norm": 0.06616278737783432, "learning_rate": 9.009992154933309e-05, "loss": 0.1063, "step": 703 }, { "epoch": 0.568243158027494, "grad_norm": 0.05873396620154381, "learning_rate": 9.005776869616833e-05, "loss": 0.0975, "step": 704 }, { "epoch": 0.5690503216042376, "grad_norm": 0.059044282883405685, "learning_rate": 9.001553619845502e-05, "loss": 0.0992, "step": 705 }, { "epoch": 0.5698574851809812, "grad_norm": 0.05689457803964615, "learning_rate": 8.997322414016158e-05, "loss": 0.1049, "step": 706 }, { "epoch": 0.5706646487577248, "grad_norm": 0.06355760246515274, "learning_rate": 8.993083260541465e-05, "loss": 0.0963, "step": 707 }, { "epoch": 0.5714718123344684, "grad_norm": 0.05782793462276459, "learning_rate": 8.988836167849888e-05, "loss": 0.1124, "step": 708 }, { "epoch": 0.572278975911212, "grad_norm": 0.061261456459760666, "learning_rate": 8.984581144385673e-05, "loss": 0.1032, "step": 709 }, { "epoch": 0.5730861394879556, "grad_norm": 0.06258756667375565, "learning_rate": 8.98031819860884e-05, "loss": 0.0993, "step": 710 }, { "epoch": 0.5738933030646992, "grad_norm": 0.05974671244621277, "learning_rate": 8.976047338995155e-05, "loss": 0.0963, "step": 711 }, { "epoch": 0.5747004666414428, "grad_norm": 0.053320422768592834, "learning_rate": 8.971768574036126e-05, "loss": 0.0965, "step": 712 }, { "epoch": 0.5755076302181864, "grad_norm": 0.054608117789030075, "learning_rate": 8.96748191223897e-05, "loss": 0.0927, "step": 713 }, { "epoch": 0.57631479379493, "grad_norm": 0.05651720240712166, "learning_rate": 8.963187362126613e-05, "loss": 0.1004, "step": 714 }, { "epoch": 0.5771219573716736, "grad_norm": 0.05803193897008896, "learning_rate": 8.958884932237657e-05, "loss": 0.1006, "step": 715 }, { "epoch": 0.5779291209484172, "grad_norm": 0.05038386955857277, "learning_rate": 8.954574631126378e-05, "loss": 0.1055, "step": 716 }, { "epoch": 0.5787362845251608, "grad_norm": 0.052869413048028946, "learning_rate": 8.950256467362699e-05, "loss": 0.1002, "step": 717 }, { "epoch": 0.5795434481019044, "grad_norm": 0.058921948075294495, "learning_rate": 8.945930449532171e-05, "loss": 0.1076, "step": 718 }, { "epoch": 0.580350611678648, "grad_norm": 0.05842002481222153, "learning_rate": 8.941596586235972e-05, "loss": 0.0929, "step": 719 }, { "epoch": 0.5811577752553916, "grad_norm": 0.058490000665187836, "learning_rate": 8.937254886090869e-05, "loss": 0.1014, "step": 720 }, { "epoch": 0.5819649388321352, "grad_norm": 0.05595666915178299, "learning_rate": 8.932905357729214e-05, "loss": 0.0904, "step": 721 }, { "epoch": 0.5827721024088788, "grad_norm": 0.05814690142869949, "learning_rate": 8.928548009798922e-05, "loss": 0.1017, "step": 722 }, { "epoch": 0.5835792659856224, "grad_norm": 0.0581844188272953, "learning_rate": 8.924182850963456e-05, "loss": 0.103, "step": 723 }, { "epoch": 0.584386429562366, "grad_norm": 0.056062109768390656, "learning_rate": 8.919809889901813e-05, "loss": 0.0953, "step": 724 }, { "epoch": 0.5851935931391096, "grad_norm": 0.0615321546792984, "learning_rate": 8.915429135308495e-05, "loss": 0.0968, "step": 725 }, { "epoch": 0.5860007567158532, "grad_norm": 0.06046036630868912, "learning_rate": 8.911040595893505e-05, "loss": 0.1027, "step": 726 }, { "epoch": 0.5868079202925968, "grad_norm": 0.05491003021597862, "learning_rate": 8.906644280382323e-05, "loss": 0.0994, "step": 727 }, { "epoch": 0.5876150838693404, "grad_norm": 0.05785449966788292, "learning_rate": 8.902240197515889e-05, "loss": 0.1034, "step": 728 }, { "epoch": 0.588422247446084, "grad_norm": 0.051979485899209976, "learning_rate": 8.897828356050586e-05, "loss": 0.0861, "step": 729 }, { "epoch": 0.5892294110228276, "grad_norm": 0.05865255370736122, "learning_rate": 8.893408764758223e-05, "loss": 0.1071, "step": 730 }, { "epoch": 0.5900365745995712, "grad_norm": 0.06333976984024048, "learning_rate": 8.88898143242602e-05, "loss": 0.1013, "step": 731 }, { "epoch": 0.5908437381763147, "grad_norm": 0.06349731981754303, "learning_rate": 8.884546367856586e-05, "loss": 0.0986, "step": 732 }, { "epoch": 0.5916509017530583, "grad_norm": 0.05617557093501091, "learning_rate": 8.880103579867902e-05, "loss": 0.1073, "step": 733 }, { "epoch": 0.5924580653298019, "grad_norm": 0.05159959942102432, "learning_rate": 8.87565307729331e-05, "loss": 0.0963, "step": 734 }, { "epoch": 0.5932652289065455, "grad_norm": 0.05731795355677605, "learning_rate": 8.871194868981483e-05, "loss": 0.1085, "step": 735 }, { "epoch": 0.5940723924832892, "grad_norm": 0.05207643657922745, "learning_rate": 8.866728963796423e-05, "loss": 0.0996, "step": 736 }, { "epoch": 0.5948795560600328, "grad_norm": 0.056599363684654236, "learning_rate": 8.862255370617429e-05, "loss": 0.0858, "step": 737 }, { "epoch": 0.5956867196367764, "grad_norm": 0.05684608966112137, "learning_rate": 8.857774098339089e-05, "loss": 0.1075, "step": 738 }, { "epoch": 0.59649388321352, "grad_norm": 0.05466196686029434, "learning_rate": 8.853285155871258e-05, "loss": 0.1122, "step": 739 }, { "epoch": 0.5973010467902636, "grad_norm": 0.0587836392223835, "learning_rate": 8.848788552139042e-05, "loss": 0.0943, "step": 740 }, { "epoch": 0.5981082103670072, "grad_norm": 0.061252061277627945, "learning_rate": 8.844284296082776e-05, "loss": 0.1044, "step": 741 }, { "epoch": 0.5989153739437508, "grad_norm": 0.05480275675654411, "learning_rate": 8.839772396658015e-05, "loss": 0.1041, "step": 742 }, { "epoch": 0.5997225375204944, "grad_norm": 0.05824766680598259, "learning_rate": 8.835252862835506e-05, "loss": 0.0977, "step": 743 }, { "epoch": 0.600529701097238, "grad_norm": 0.05674997717142105, "learning_rate": 8.83072570360118e-05, "loss": 0.0954, "step": 744 }, { "epoch": 0.6013368646739816, "grad_norm": 0.055385954678058624, "learning_rate": 8.826190927956123e-05, "loss": 0.0984, "step": 745 }, { "epoch": 0.6021440282507252, "grad_norm": 0.059228017926216125, "learning_rate": 8.821648544916567e-05, "loss": 0.1041, "step": 746 }, { "epoch": 0.6029511918274688, "grad_norm": 0.06590738892555237, "learning_rate": 8.817098563513875e-05, "loss": 0.1002, "step": 747 }, { "epoch": 0.6037583554042124, "grad_norm": 0.055088870227336884, "learning_rate": 8.812540992794508e-05, "loss": 0.0916, "step": 748 }, { "epoch": 0.604565518980956, "grad_norm": 0.05477510020136833, "learning_rate": 8.807975841820023e-05, "loss": 0.1152, "step": 749 }, { "epoch": 0.6053726825576996, "grad_norm": 0.06361153721809387, "learning_rate": 8.803403119667041e-05, "loss": 0.0975, "step": 750 }, { "epoch": 0.6061798461344432, "grad_norm": 0.057177018374204636, "learning_rate": 8.79882283542725e-05, "loss": 0.0993, "step": 751 }, { "epoch": 0.6069870097111868, "grad_norm": 0.06008373945951462, "learning_rate": 8.794234998207357e-05, "loss": 0.0985, "step": 752 }, { "epoch": 0.6077941732879304, "grad_norm": 0.05662092566490173, "learning_rate": 8.789639617129099e-05, "loss": 0.0957, "step": 753 }, { "epoch": 0.608601336864674, "grad_norm": 0.06325456500053406, "learning_rate": 8.785036701329204e-05, "loss": 0.0997, "step": 754 }, { "epoch": 0.6094085004414176, "grad_norm": 0.05846451222896576, "learning_rate": 8.780426259959385e-05, "loss": 0.099, "step": 755 }, { "epoch": 0.6102156640181612, "grad_norm": 0.05801314115524292, "learning_rate": 8.775808302186314e-05, "loss": 0.0996, "step": 756 }, { "epoch": 0.6110228275949048, "grad_norm": 0.05237556993961334, "learning_rate": 8.771182837191613e-05, "loss": 0.1007, "step": 757 }, { "epoch": 0.6118299911716484, "grad_norm": 0.0630403533577919, "learning_rate": 8.766549874171825e-05, "loss": 0.1122, "step": 758 }, { "epoch": 0.612637154748392, "grad_norm": 0.06096746772527695, "learning_rate": 8.761909422338405e-05, "loss": 0.0978, "step": 759 }, { "epoch": 0.6134443183251356, "grad_norm": 0.060567162930965424, "learning_rate": 8.757261490917692e-05, "loss": 0.0989, "step": 760 }, { "epoch": 0.6142514819018792, "grad_norm": 0.05741244554519653, "learning_rate": 8.752606089150903e-05, "loss": 0.1129, "step": 761 }, { "epoch": 0.6150586454786228, "grad_norm": 0.060277823358774185, "learning_rate": 8.747943226294102e-05, "loss": 0.1033, "step": 762 }, { "epoch": 0.6158658090553664, "grad_norm": 0.05726788565516472, "learning_rate": 8.743272911618193e-05, "loss": 0.0986, "step": 763 }, { "epoch": 0.61667297263211, "grad_norm": 0.05301850661635399, "learning_rate": 8.738595154408889e-05, "loss": 0.1028, "step": 764 }, { "epoch": 0.6174801362088536, "grad_norm": 0.05664315074682236, "learning_rate": 8.733909963966708e-05, "loss": 0.1059, "step": 765 }, { "epoch": 0.6182872997855972, "grad_norm": 0.06017472222447395, "learning_rate": 8.729217349606942e-05, "loss": 0.1084, "step": 766 }, { "epoch": 0.6190944633623408, "grad_norm": 0.055886559188365936, "learning_rate": 8.724517320659644e-05, "loss": 0.098, "step": 767 }, { "epoch": 0.6199016269390843, "grad_norm": 0.05797213315963745, "learning_rate": 8.719809886469615e-05, "loss": 0.0965, "step": 768 }, { "epoch": 0.6199016269390843, "eval_loss": 0.11381925642490387, "eval_runtime": 4026.3234, "eval_samples_per_second": 2.211, "eval_steps_per_second": 2.211, "step": 768 }, { "epoch": 0.620708790515828, "grad_norm": 0.05745558440685272, "learning_rate": 8.715095056396369e-05, "loss": 0.1056, "step": 769 }, { "epoch": 0.6215159540925715, "grad_norm": 0.05526769161224365, "learning_rate": 8.710372839814132e-05, "loss": 0.1063, "step": 770 }, { "epoch": 0.6223231176693151, "grad_norm": 0.05641748011112213, "learning_rate": 8.705643246111816e-05, "loss": 0.1016, "step": 771 }, { "epoch": 0.6231302812460587, "grad_norm": 0.057237640023231506, "learning_rate": 8.700906284692999e-05, "loss": 0.104, "step": 772 }, { "epoch": 0.6239374448228023, "grad_norm": 0.0587729774415493, "learning_rate": 8.696161964975906e-05, "loss": 0.0989, "step": 773 }, { "epoch": 0.6247446083995459, "grad_norm": 0.060912009328603745, "learning_rate": 8.691410296393396e-05, "loss": 0.1025, "step": 774 }, { "epoch": 0.6255517719762895, "grad_norm": 0.05620124936103821, "learning_rate": 8.686651288392937e-05, "loss": 0.1046, "step": 775 }, { "epoch": 0.6263589355530331, "grad_norm": 0.05949588492512703, "learning_rate": 8.681884950436587e-05, "loss": 0.0995, "step": 776 }, { "epoch": 0.6271660991297767, "grad_norm": 0.05683242529630661, "learning_rate": 8.677111292000985e-05, "loss": 0.1067, "step": 777 }, { "epoch": 0.6279732627065203, "grad_norm": 0.054360102862119675, "learning_rate": 8.672330322577317e-05, "loss": 0.1111, "step": 778 }, { "epoch": 0.6287804262832639, "grad_norm": 0.06651857495307922, "learning_rate": 8.66754205167131e-05, "loss": 0.1008, "step": 779 }, { "epoch": 0.6295875898600075, "grad_norm": 0.06210240349173546, "learning_rate": 8.662746488803206e-05, "loss": 0.0992, "step": 780 }, { "epoch": 0.6303947534367512, "grad_norm": 0.0577063113451004, "learning_rate": 8.657943643507746e-05, "loss": 0.0918, "step": 781 }, { "epoch": 0.6312019170134948, "grad_norm": 0.05668450519442558, "learning_rate": 8.65313352533415e-05, "loss": 0.0968, "step": 782 }, { "epoch": 0.6320090805902384, "grad_norm": 0.058506179600954056, "learning_rate": 8.6483161438461e-05, "loss": 0.0956, "step": 783 }, { "epoch": 0.632816244166982, "grad_norm": 0.06374339759349823, "learning_rate": 8.643491508621712e-05, "loss": 0.1078, "step": 784 }, { "epoch": 0.6336234077437256, "grad_norm": 0.05892103165388107, "learning_rate": 8.638659629253536e-05, "loss": 0.1103, "step": 785 }, { "epoch": 0.6344305713204692, "grad_norm": 0.054664529860019684, "learning_rate": 8.633820515348517e-05, "loss": 0.1022, "step": 786 }, { "epoch": 0.6352377348972128, "grad_norm": 0.053093813359737396, "learning_rate": 8.628974176527981e-05, "loss": 0.092, "step": 787 }, { "epoch": 0.6360448984739564, "grad_norm": 0.05747663602232933, "learning_rate": 8.624120622427632e-05, "loss": 0.1012, "step": 788 }, { "epoch": 0.6368520620507, "grad_norm": 0.0640353113412857, "learning_rate": 8.619259862697503e-05, "loss": 0.107, "step": 789 }, { "epoch": 0.6376592256274436, "grad_norm": 0.05780167132616043, "learning_rate": 8.614391907001968e-05, "loss": 0.1024, "step": 790 }, { "epoch": 0.6384663892041872, "grad_norm": 0.05896243453025818, "learning_rate": 8.609516765019698e-05, "loss": 0.0997, "step": 791 }, { "epoch": 0.6392735527809308, "grad_norm": 0.06390650570392609, "learning_rate": 8.60463444644366e-05, "loss": 0.1021, "step": 792 }, { "epoch": 0.6400807163576744, "grad_norm": 0.05848468467593193, "learning_rate": 8.599744960981085e-05, "loss": 0.1019, "step": 793 }, { "epoch": 0.640887879934418, "grad_norm": 0.05946502834558487, "learning_rate": 8.594848318353452e-05, "loss": 0.1027, "step": 794 }, { "epoch": 0.6416950435111616, "grad_norm": 0.06579360365867615, "learning_rate": 8.589944528296477e-05, "loss": 0.1039, "step": 795 }, { "epoch": 0.6425022070879052, "grad_norm": 0.05664371699094772, "learning_rate": 8.58503360056008e-05, "loss": 0.1027, "step": 796 }, { "epoch": 0.6433093706646488, "grad_norm": 0.05643333122134209, "learning_rate": 8.580115544908374e-05, "loss": 0.0943, "step": 797 }, { "epoch": 0.6441165342413924, "grad_norm": 0.06177568808197975, "learning_rate": 8.575190371119647e-05, "loss": 0.1044, "step": 798 }, { "epoch": 0.644923697818136, "grad_norm": 0.0588587149977684, "learning_rate": 8.570258088986338e-05, "loss": 0.0995, "step": 799 }, { "epoch": 0.6457308613948796, "grad_norm": 0.06245388090610504, "learning_rate": 8.565318708315017e-05, "loss": 0.1024, "step": 800 }, { "epoch": 0.6465380249716232, "grad_norm": 0.06081743165850639, "learning_rate": 8.560372238926372e-05, "loss": 0.0984, "step": 801 }, { "epoch": 0.6473451885483668, "grad_norm": 0.05319730564951897, "learning_rate": 8.55541869065518e-05, "loss": 0.1057, "step": 802 }, { "epoch": 0.6481523521251104, "grad_norm": 0.054601553827524185, "learning_rate": 8.550458073350295e-05, "loss": 0.0913, "step": 803 }, { "epoch": 0.648959515701854, "grad_norm": 0.053848639130592346, "learning_rate": 8.545490396874629e-05, "loss": 0.0943, "step": 804 }, { "epoch": 0.6497666792785975, "grad_norm": 0.066591776907444, "learning_rate": 8.540515671105123e-05, "loss": 0.1005, "step": 805 }, { "epoch": 0.6505738428553411, "grad_norm": 0.06365875154733658, "learning_rate": 8.535533905932738e-05, "loss": 0.1027, "step": 806 }, { "epoch": 0.6513810064320847, "grad_norm": 0.05890726298093796, "learning_rate": 8.530545111262432e-05, "loss": 0.1002, "step": 807 }, { "epoch": 0.6521881700088283, "grad_norm": 0.05517091602087021, "learning_rate": 8.525549297013133e-05, "loss": 0.1125, "step": 808 }, { "epoch": 0.6529953335855719, "grad_norm": 0.0570373609662056, "learning_rate": 8.520546473117734e-05, "loss": 0.0939, "step": 809 }, { "epoch": 0.6538024971623155, "grad_norm": 0.058857422322034836, "learning_rate": 8.515536649523059e-05, "loss": 0.1016, "step": 810 }, { "epoch": 0.6546096607390591, "grad_norm": 0.05612301453948021, "learning_rate": 8.510519836189851e-05, "loss": 0.1029, "step": 811 }, { "epoch": 0.6554168243158027, "grad_norm": 0.055842552334070206, "learning_rate": 8.505496043092753e-05, "loss": 0.1038, "step": 812 }, { "epoch": 0.6562239878925463, "grad_norm": 0.05799545347690582, "learning_rate": 8.500465280220277e-05, "loss": 0.0988, "step": 813 }, { "epoch": 0.6570311514692899, "grad_norm": 0.060561370104551315, "learning_rate": 8.495427557574804e-05, "loss": 0.1143, "step": 814 }, { "epoch": 0.6578383150460335, "grad_norm": 0.052691567689180374, "learning_rate": 8.490382885172545e-05, "loss": 0.1035, "step": 815 }, { "epoch": 0.6586454786227771, "grad_norm": 0.06112167611718178, "learning_rate": 8.485331273043531e-05, "loss": 0.1106, "step": 816 }, { "epoch": 0.6594526421995207, "grad_norm": 0.05821928754448891, "learning_rate": 8.480272731231592e-05, "loss": 0.0915, "step": 817 }, { "epoch": 0.6602598057762643, "grad_norm": 0.060982078313827515, "learning_rate": 8.475207269794329e-05, "loss": 0.1038, "step": 818 }, { "epoch": 0.6610669693530079, "grad_norm": 0.05511123687028885, "learning_rate": 8.470134898803114e-05, "loss": 0.1022, "step": 819 }, { "epoch": 0.6618741329297515, "grad_norm": 0.06120814010500908, "learning_rate": 8.465055628343045e-05, "loss": 0.0978, "step": 820 }, { "epoch": 0.6626812965064951, "grad_norm": 0.05916880443692207, "learning_rate": 8.459969468512943e-05, "loss": 0.0914, "step": 821 }, { "epoch": 0.6634884600832387, "grad_norm": 0.05558802932500839, "learning_rate": 8.454876429425324e-05, "loss": 0.1047, "step": 822 }, { "epoch": 0.6642956236599823, "grad_norm": 0.0636325553059578, "learning_rate": 8.44977652120639e-05, "loss": 0.0978, "step": 823 }, { "epoch": 0.6651027872367259, "grad_norm": 0.05692014470696449, "learning_rate": 8.444669753995986e-05, "loss": 0.0995, "step": 824 }, { "epoch": 0.6659099508134695, "grad_norm": 0.06272319704294205, "learning_rate": 8.439556137947608e-05, "loss": 0.1129, "step": 825 }, { "epoch": 0.6667171143902132, "grad_norm": 0.058388493955135345, "learning_rate": 8.43443568322836e-05, "loss": 0.1098, "step": 826 }, { "epoch": 0.6675242779669568, "grad_norm": 0.05326181650161743, "learning_rate": 8.42930840001895e-05, "loss": 0.0929, "step": 827 }, { "epoch": 0.6683314415437004, "grad_norm": 0.050782132893800735, "learning_rate": 8.424174298513654e-05, "loss": 0.1007, "step": 828 }, { "epoch": 0.669138605120444, "grad_norm": 0.05005459114909172, "learning_rate": 8.419033388920314e-05, "loss": 0.1045, "step": 829 }, { "epoch": 0.6699457686971876, "grad_norm": 0.05257490277290344, "learning_rate": 8.413885681460305e-05, "loss": 0.1012, "step": 830 }, { "epoch": 0.6707529322739312, "grad_norm": 0.05761224031448364, "learning_rate": 8.40873118636851e-05, "loss": 0.1053, "step": 831 }, { "epoch": 0.6715600958506748, "grad_norm": 0.05325397849082947, "learning_rate": 8.40356991389332e-05, "loss": 0.098, "step": 832 }, { "epoch": 0.6723672594274184, "grad_norm": 0.057813286781311035, "learning_rate": 8.398401874296595e-05, "loss": 0.106, "step": 833 }, { "epoch": 0.673174423004162, "grad_norm": 0.05737284943461418, "learning_rate": 8.393227077853644e-05, "loss": 0.1089, "step": 834 }, { "epoch": 0.6739815865809056, "grad_norm": 0.06171759217977524, "learning_rate": 8.388045534853221e-05, "loss": 0.098, "step": 835 }, { "epoch": 0.6747887501576492, "grad_norm": 0.06483704596757889, "learning_rate": 8.382857255597489e-05, "loss": 0.1016, "step": 836 }, { "epoch": 0.6755959137343928, "grad_norm": 0.055664464831352234, "learning_rate": 8.377662250402e-05, "loss": 0.1022, "step": 837 }, { "epoch": 0.6764030773111364, "grad_norm": 0.05665101110935211, "learning_rate": 8.372460529595688e-05, "loss": 0.1102, "step": 838 }, { "epoch": 0.67721024088788, "grad_norm": 0.05501328036189079, "learning_rate": 8.367252103520831e-05, "loss": 0.0976, "step": 839 }, { "epoch": 0.6780174044646236, "grad_norm": 0.0634823814034462, "learning_rate": 8.362036982533041e-05, "loss": 0.0986, "step": 840 }, { "epoch": 0.6788245680413671, "grad_norm": 0.06315368413925171, "learning_rate": 8.356815177001242e-05, "loss": 0.1022, "step": 841 }, { "epoch": 0.6796317316181107, "grad_norm": 0.05708141252398491, "learning_rate": 8.351586697307652e-05, "loss": 0.1053, "step": 842 }, { "epoch": 0.6804388951948543, "grad_norm": 0.05303526297211647, "learning_rate": 8.346351553847753e-05, "loss": 0.0956, "step": 843 }, { "epoch": 0.6812460587715979, "grad_norm": 0.05894466117024422, "learning_rate": 8.341109757030278e-05, "loss": 0.1008, "step": 844 }, { "epoch": 0.6820532223483415, "grad_norm": 0.058563150465488434, "learning_rate": 8.33586131727719e-05, "loss": 0.091, "step": 845 }, { "epoch": 0.6828603859250851, "grad_norm": 0.057141099125146866, "learning_rate": 8.330606245023657e-05, "loss": 0.0948, "step": 846 }, { "epoch": 0.6836675495018287, "grad_norm": 0.0587286651134491, "learning_rate": 8.325344550718037e-05, "loss": 0.0944, "step": 847 }, { "epoch": 0.6844747130785723, "grad_norm": 0.0620308443903923, "learning_rate": 8.320076244821852e-05, "loss": 0.108, "step": 848 }, { "epoch": 0.6852818766553159, "grad_norm": 0.05729060620069504, "learning_rate": 8.314801337809774e-05, "loss": 0.0974, "step": 849 }, { "epoch": 0.6860890402320595, "grad_norm": 0.057693108916282654, "learning_rate": 8.309519840169591e-05, "loss": 0.0987, "step": 850 }, { "epoch": 0.6868962038088031, "grad_norm": 0.059050656855106354, "learning_rate": 8.304231762402204e-05, "loss": 0.0974, "step": 851 }, { "epoch": 0.6877033673855467, "grad_norm": 0.053823817521333694, "learning_rate": 8.29893711502159e-05, "loss": 0.1014, "step": 852 }, { "epoch": 0.6885105309622903, "grad_norm": 0.05769362673163414, "learning_rate": 8.29363590855479e-05, "loss": 0.096, "step": 853 }, { "epoch": 0.6893176945390339, "grad_norm": 0.05778951197862625, "learning_rate": 8.288328153541889e-05, "loss": 0.0999, "step": 854 }, { "epoch": 0.6901248581157775, "grad_norm": 0.05824120342731476, "learning_rate": 8.28301386053599e-05, "loss": 0.0939, "step": 855 }, { "epoch": 0.6909320216925211, "grad_norm": 0.05829611420631409, "learning_rate": 8.277693040103192e-05, "loss": 0.0993, "step": 856 }, { "epoch": 0.6917391852692647, "grad_norm": 0.06089019402861595, "learning_rate": 8.272365702822577e-05, "loss": 0.1088, "step": 857 }, { "epoch": 0.6925463488460083, "grad_norm": 0.050972916185855865, "learning_rate": 8.267031859286186e-05, "loss": 0.0977, "step": 858 }, { "epoch": 0.6933535124227519, "grad_norm": 0.06186503544449806, "learning_rate": 8.261691520098985e-05, "loss": 0.0967, "step": 859 }, { "epoch": 0.6941606759994955, "grad_norm": 0.05329521372914314, "learning_rate": 8.256344695878865e-05, "loss": 0.0942, "step": 860 }, { "epoch": 0.6949678395762391, "grad_norm": 0.05865070968866348, "learning_rate": 8.250991397256609e-05, "loss": 0.0998, "step": 861 }, { "epoch": 0.6957750031529827, "grad_norm": 0.05860113352537155, "learning_rate": 8.24563163487587e-05, "loss": 0.0998, "step": 862 }, { "epoch": 0.6965821667297263, "grad_norm": 0.06336338818073273, "learning_rate": 8.240265419393156e-05, "loss": 0.1043, "step": 863 }, { "epoch": 0.6973893303064699, "grad_norm": 0.06041109561920166, "learning_rate": 8.234892761477802e-05, "loss": 0.0947, "step": 864 }, { "epoch": 0.6981964938832135, "grad_norm": 0.0603845939040184, "learning_rate": 8.229513671811953e-05, "loss": 0.1144, "step": 865 }, { "epoch": 0.6990036574599571, "grad_norm": 0.05735471844673157, "learning_rate": 8.224128161090543e-05, "loss": 0.1098, "step": 866 }, { "epoch": 0.6998108210367007, "grad_norm": 0.05763717368245125, "learning_rate": 8.218736240021272e-05, "loss": 0.0993, "step": 867 }, { "epoch": 0.7006179846134443, "grad_norm": 0.05468890070915222, "learning_rate": 8.213337919324586e-05, "loss": 0.1023, "step": 868 }, { "epoch": 0.7014251481901879, "grad_norm": 0.06123700365424156, "learning_rate": 8.207933209733654e-05, "loss": 0.1095, "step": 869 }, { "epoch": 0.7022323117669316, "grad_norm": 0.05638163164258003, "learning_rate": 8.202522121994347e-05, "loss": 0.1121, "step": 870 }, { "epoch": 0.7030394753436752, "grad_norm": 0.05836964398622513, "learning_rate": 8.197104666865218e-05, "loss": 0.0997, "step": 871 }, { "epoch": 0.7038466389204188, "grad_norm": 0.06184643134474754, "learning_rate": 8.191680855117483e-05, "loss": 0.1036, "step": 872 }, { "epoch": 0.7046538024971624, "grad_norm": 0.053514059633016586, "learning_rate": 8.186250697534992e-05, "loss": 0.1007, "step": 873 }, { "epoch": 0.705460966073906, "grad_norm": 0.059897251427173615, "learning_rate": 8.180814204914213e-05, "loss": 0.0983, "step": 874 }, { "epoch": 0.7062681296506496, "grad_norm": 0.05350957810878754, "learning_rate": 8.175371388064212e-05, "loss": 0.0956, "step": 875 }, { "epoch": 0.7070752932273932, "grad_norm": 0.0535610094666481, "learning_rate": 8.169922257806625e-05, "loss": 0.0934, "step": 876 }, { "epoch": 0.7078824568041368, "grad_norm": 0.06237298995256424, "learning_rate": 8.164466824975647e-05, "loss": 0.0897, "step": 877 }, { "epoch": 0.7086896203808803, "grad_norm": 0.05559561774134636, "learning_rate": 8.159005100417996e-05, "loss": 0.1003, "step": 878 }, { "epoch": 0.7094967839576239, "grad_norm": 0.06054169684648514, "learning_rate": 8.153537094992907e-05, "loss": 0.0963, "step": 879 }, { "epoch": 0.7103039475343675, "grad_norm": 0.056655194610357285, "learning_rate": 8.148062819572096e-05, "loss": 0.093, "step": 880 }, { "epoch": 0.7111111111111111, "grad_norm": 0.060454387217760086, "learning_rate": 8.142582285039752e-05, "loss": 0.0994, "step": 881 }, { "epoch": 0.7119182746878547, "grad_norm": 0.055307455360889435, "learning_rate": 8.137095502292504e-05, "loss": 0.0952, "step": 882 }, { "epoch": 0.7127254382645983, "grad_norm": 0.06706053018569946, "learning_rate": 8.131602482239404e-05, "loss": 0.1042, "step": 883 }, { "epoch": 0.7135326018413419, "grad_norm": 0.05511351302266121, "learning_rate": 8.126103235801909e-05, "loss": 0.1071, "step": 884 }, { "epoch": 0.7143397654180855, "grad_norm": 0.05557266250252724, "learning_rate": 8.120597773913852e-05, "loss": 0.1007, "step": 885 }, { "epoch": 0.7151469289948291, "grad_norm": 0.0671280100941658, "learning_rate": 8.115086107521424e-05, "loss": 0.0979, "step": 886 }, { "epoch": 0.7159540925715727, "grad_norm": 0.05973823368549347, "learning_rate": 8.109568247583155e-05, "loss": 0.0935, "step": 887 }, { "epoch": 0.7167612561483163, "grad_norm": 0.05929037928581238, "learning_rate": 8.104044205069886e-05, "loss": 0.0988, "step": 888 }, { "epoch": 0.7175684197250599, "grad_norm": 0.05249428749084473, "learning_rate": 8.098513990964753e-05, "loss": 0.1077, "step": 889 }, { "epoch": 0.7183755833018035, "grad_norm": 0.057377152144908905, "learning_rate": 8.09297761626316e-05, "loss": 0.0986, "step": 890 }, { "epoch": 0.7191827468785471, "grad_norm": 0.06096857413649559, "learning_rate": 8.087435091972761e-05, "loss": 0.0998, "step": 891 }, { "epoch": 0.7199899104552907, "grad_norm": 0.05895715579390526, "learning_rate": 8.081886429113439e-05, "loss": 0.112, "step": 892 }, { "epoch": 0.7207970740320343, "grad_norm": 0.06582362204790115, "learning_rate": 8.076331638717278e-05, "loss": 0.1076, "step": 893 }, { "epoch": 0.7216042376087779, "grad_norm": 0.058329466730356216, "learning_rate": 8.070770731828547e-05, "loss": 0.1018, "step": 894 }, { "epoch": 0.7224114011855215, "grad_norm": 0.06596577912569046, "learning_rate": 8.065203719503678e-05, "loss": 0.0974, "step": 895 }, { "epoch": 0.7232185647622651, "grad_norm": 0.0656326487660408, "learning_rate": 8.05963061281124e-05, "loss": 0.0982, "step": 896 }, { "epoch": 0.7232185647622651, "eval_loss": 0.1131877526640892, "eval_runtime": 4037.4257, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "step": 896 }, { "epoch": 0.7240257283390087, "grad_norm": 0.055193450301885605, "learning_rate": 8.054051422831916e-05, "loss": 0.0997, "step": 897 }, { "epoch": 0.7248328919157523, "grad_norm": 0.058959100395441055, "learning_rate": 8.04846616065849e-05, "loss": 0.1058, "step": 898 }, { "epoch": 0.7256400554924959, "grad_norm": 0.05718646198511124, "learning_rate": 8.042874837395815e-05, "loss": 0.0933, "step": 899 }, { "epoch": 0.7264472190692395, "grad_norm": 0.05599718168377876, "learning_rate": 8.037277464160799e-05, "loss": 0.0824, "step": 900 }, { "epoch": 0.7272543826459831, "grad_norm": 0.05513780564069748, "learning_rate": 8.031674052082372e-05, "loss": 0.102, "step": 901 }, { "epoch": 0.7280615462227267, "grad_norm": 0.05448281392455101, "learning_rate": 8.026064612301479e-05, "loss": 0.1021, "step": 902 }, { "epoch": 0.7288687097994703, "grad_norm": 0.06112551689147949, "learning_rate": 8.020449155971041e-05, "loss": 0.1095, "step": 903 }, { "epoch": 0.7296758733762139, "grad_norm": 0.06002431735396385, "learning_rate": 8.014827694255948e-05, "loss": 0.1077, "step": 904 }, { "epoch": 0.7304830369529575, "grad_norm": 0.05521925538778305, "learning_rate": 8.009200238333027e-05, "loss": 0.0966, "step": 905 }, { "epoch": 0.7312902005297011, "grad_norm": 0.05689017474651337, "learning_rate": 8.003566799391024e-05, "loss": 0.1002, "step": 906 }, { "epoch": 0.7320973641064447, "grad_norm": 0.06228077411651611, "learning_rate": 7.997927388630581e-05, "loss": 0.107, "step": 907 }, { "epoch": 0.7329045276831883, "grad_norm": 0.0579240508377552, "learning_rate": 7.992282017264211e-05, "loss": 0.1032, "step": 908 }, { "epoch": 0.7337116912599319, "grad_norm": 0.05044465512037277, "learning_rate": 7.986630696516281e-05, "loss": 0.1035, "step": 909 }, { "epoch": 0.7345188548366754, "grad_norm": 0.060334645211696625, "learning_rate": 7.980973437622987e-05, "loss": 0.0919, "step": 910 }, { "epoch": 0.735326018413419, "grad_norm": 0.05746825411915779, "learning_rate": 7.975310251832329e-05, "loss": 0.0955, "step": 911 }, { "epoch": 0.7361331819901626, "grad_norm": 0.05621271952986717, "learning_rate": 7.96964115040409e-05, "loss": 0.1103, "step": 912 }, { "epoch": 0.7369403455669062, "grad_norm": 0.05809158831834793, "learning_rate": 7.963966144609821e-05, "loss": 0.0966, "step": 913 }, { "epoch": 0.7377475091436498, "grad_norm": 0.05315982550382614, "learning_rate": 7.958285245732806e-05, "loss": 0.0959, "step": 914 }, { "epoch": 0.7385546727203935, "grad_norm": 0.05869751423597336, "learning_rate": 7.952598465068048e-05, "loss": 0.1048, "step": 915 }, { "epoch": 0.7393618362971371, "grad_norm": 0.052703652530908585, "learning_rate": 7.946905813922249e-05, "loss": 0.1001, "step": 916 }, { "epoch": 0.7401689998738807, "grad_norm": 0.04977256432175636, "learning_rate": 7.941207303613773e-05, "loss": 0.1, "step": 917 }, { "epoch": 0.7409761634506243, "grad_norm": 0.05939466133713722, "learning_rate": 7.935502945472639e-05, "loss": 0.0957, "step": 918 }, { "epoch": 0.7417833270273679, "grad_norm": 0.062491778284311295, "learning_rate": 7.9297927508405e-05, "loss": 0.105, "step": 919 }, { "epoch": 0.7425904906041115, "grad_norm": 0.05859870836138725, "learning_rate": 7.924076731070596e-05, "loss": 0.097, "step": 920 }, { "epoch": 0.7433976541808551, "grad_norm": 0.05443640425801277, "learning_rate": 7.918354897527766e-05, "loss": 0.107, "step": 921 }, { "epoch": 0.7442048177575987, "grad_norm": 0.06477627158164978, "learning_rate": 7.912627261588401e-05, "loss": 0.0957, "step": 922 }, { "epoch": 0.7450119813343423, "grad_norm": 0.05534570291638374, "learning_rate": 7.906893834640428e-05, "loss": 0.0959, "step": 923 }, { "epoch": 0.7458191449110859, "grad_norm": 0.053470078855752945, "learning_rate": 7.901154628083285e-05, "loss": 0.1017, "step": 924 }, { "epoch": 0.7466263084878295, "grad_norm": 0.05378441885113716, "learning_rate": 7.89540965332791e-05, "loss": 0.0948, "step": 925 }, { "epoch": 0.7474334720645731, "grad_norm": 0.050101395696401596, "learning_rate": 7.889658921796703e-05, "loss": 0.0941, "step": 926 }, { "epoch": 0.7482406356413167, "grad_norm": 0.05417713522911072, "learning_rate": 7.883902444923513e-05, "loss": 0.1022, "step": 927 }, { "epoch": 0.7490477992180603, "grad_norm": 0.052360933274030685, "learning_rate": 7.878140234153605e-05, "loss": 0.1103, "step": 928 }, { "epoch": 0.7498549627948039, "grad_norm": 0.05249250680208206, "learning_rate": 7.872372300943655e-05, "loss": 0.0997, "step": 929 }, { "epoch": 0.7506621263715475, "grad_norm": 0.05506344139575958, "learning_rate": 7.866598656761712e-05, "loss": 0.1026, "step": 930 }, { "epoch": 0.7514692899482911, "grad_norm": 0.057028740644454956, "learning_rate": 7.860819313087177e-05, "loss": 0.0961, "step": 931 }, { "epoch": 0.7522764535250347, "grad_norm": 0.05572771653532982, "learning_rate": 7.855034281410784e-05, "loss": 0.0974, "step": 932 }, { "epoch": 0.7530836171017783, "grad_norm": 0.060490161180496216, "learning_rate": 7.849243573234581e-05, "loss": 0.0992, "step": 933 }, { "epoch": 0.7538907806785219, "grad_norm": 0.055572833865880966, "learning_rate": 7.843447200071899e-05, "loss": 0.1028, "step": 934 }, { "epoch": 0.7546979442552655, "grad_norm": 0.05921762436628342, "learning_rate": 7.837645173447328e-05, "loss": 0.1024, "step": 935 }, { "epoch": 0.7555051078320091, "grad_norm": 0.05200856178998947, "learning_rate": 7.831837504896707e-05, "loss": 0.0866, "step": 936 }, { "epoch": 0.7563122714087527, "grad_norm": 0.053354594856500626, "learning_rate": 7.826024205967084e-05, "loss": 0.1025, "step": 937 }, { "epoch": 0.7571194349854963, "grad_norm": 0.05994477868080139, "learning_rate": 7.820205288216708e-05, "loss": 0.1088, "step": 938 }, { "epoch": 0.7579265985622399, "grad_norm": 0.055941954255104065, "learning_rate": 7.814380763214996e-05, "loss": 0.0932, "step": 939 }, { "epoch": 0.7587337621389835, "grad_norm": 0.061530500650405884, "learning_rate": 7.808550642542516e-05, "loss": 0.0974, "step": 940 }, { "epoch": 0.7595409257157271, "grad_norm": 0.05178806185722351, "learning_rate": 7.80271493779096e-05, "loss": 0.1004, "step": 941 }, { "epoch": 0.7603480892924707, "grad_norm": 0.05979490652680397, "learning_rate": 7.79687366056312e-05, "loss": 0.0987, "step": 942 }, { "epoch": 0.7611552528692143, "grad_norm": 0.061689525842666626, "learning_rate": 7.791026822472875e-05, "loss": 0.1067, "step": 943 }, { "epoch": 0.7619624164459579, "grad_norm": 0.0647348165512085, "learning_rate": 7.785174435145153e-05, "loss": 0.0993, "step": 944 }, { "epoch": 0.7627695800227015, "grad_norm": 0.051596611738204956, "learning_rate": 7.779316510215918e-05, "loss": 0.1152, "step": 945 }, { "epoch": 0.763576743599445, "grad_norm": 0.06055000051856041, "learning_rate": 7.773453059332145e-05, "loss": 0.1114, "step": 946 }, { "epoch": 0.7643839071761886, "grad_norm": 0.05911140516400337, "learning_rate": 7.767584094151792e-05, "loss": 0.0949, "step": 947 }, { "epoch": 0.7651910707529322, "grad_norm": 0.06231195852160454, "learning_rate": 7.761709626343787e-05, "loss": 0.1022, "step": 948 }, { "epoch": 0.7659982343296758, "grad_norm": 0.05465016886591911, "learning_rate": 7.755829667587993e-05, "loss": 0.0942, "step": 949 }, { "epoch": 0.7668053979064194, "grad_norm": 0.05926135554909706, "learning_rate": 7.749944229575193e-05, "loss": 0.102, "step": 950 }, { "epoch": 0.767612561483163, "grad_norm": 0.05865073576569557, "learning_rate": 7.744053324007063e-05, "loss": 0.1037, "step": 951 }, { "epoch": 0.7684197250599066, "grad_norm": 0.05857458710670471, "learning_rate": 7.738156962596152e-05, "loss": 0.1179, "step": 952 }, { "epoch": 0.7692268886366502, "grad_norm": 0.06278444826602936, "learning_rate": 7.732255157065855e-05, "loss": 0.1029, "step": 953 }, { "epoch": 0.7700340522133938, "grad_norm": 0.04742460697889328, "learning_rate": 7.726347919150387e-05, "loss": 0.0922, "step": 954 }, { "epoch": 0.7708412157901374, "grad_norm": 0.052453819662332535, "learning_rate": 7.720435260594774e-05, "loss": 0.0955, "step": 955 }, { "epoch": 0.771648379366881, "grad_norm": 0.056372128427028656, "learning_rate": 7.71451719315481e-05, "loss": 0.111, "step": 956 }, { "epoch": 0.7724555429436246, "grad_norm": 0.052572913467884064, "learning_rate": 7.708593728597046e-05, "loss": 0.0962, "step": 957 }, { "epoch": 0.7732627065203682, "grad_norm": 0.051868245005607605, "learning_rate": 7.702664878698768e-05, "loss": 0.0947, "step": 958 }, { "epoch": 0.7740698700971119, "grad_norm": 0.06293325126171112, "learning_rate": 7.696730655247963e-05, "loss": 0.1141, "step": 959 }, { "epoch": 0.7748770336738555, "grad_norm": 0.061019863933324814, "learning_rate": 7.690791070043308e-05, "loss": 0.1015, "step": 960 }, { "epoch": 0.7756841972505991, "grad_norm": 0.05845112353563309, "learning_rate": 7.684846134894133e-05, "loss": 0.0994, "step": 961 }, { "epoch": 0.7764913608273427, "grad_norm": 0.06359632313251495, "learning_rate": 7.678895861620413e-05, "loss": 0.1023, "step": 962 }, { "epoch": 0.7772985244040863, "grad_norm": 0.053893398493528366, "learning_rate": 7.672940262052731e-05, "loss": 0.0921, "step": 963 }, { "epoch": 0.7781056879808299, "grad_norm": 0.05272841826081276, "learning_rate": 7.666979348032259e-05, "loss": 0.1044, "step": 964 }, { "epoch": 0.7789128515575735, "grad_norm": 0.05654517188668251, "learning_rate": 7.661013131410744e-05, "loss": 0.1043, "step": 965 }, { "epoch": 0.7797200151343171, "grad_norm": 0.06009817495942116, "learning_rate": 7.655041624050467e-05, "loss": 0.0951, "step": 966 }, { "epoch": 0.7805271787110607, "grad_norm": 0.05532561615109444, "learning_rate": 7.649064837824231e-05, "loss": 0.0984, "step": 967 }, { "epoch": 0.7813343422878043, "grad_norm": 0.06071249023079872, "learning_rate": 7.643082784615338e-05, "loss": 0.1029, "step": 968 }, { "epoch": 0.7821415058645479, "grad_norm": 0.05780842900276184, "learning_rate": 7.637095476317553e-05, "loss": 0.0992, "step": 969 }, { "epoch": 0.7829486694412915, "grad_norm": 0.05489496886730194, "learning_rate": 7.631102924835101e-05, "loss": 0.1018, "step": 970 }, { "epoch": 0.7837558330180351, "grad_norm": 0.053943563252687454, "learning_rate": 7.625105142082623e-05, "loss": 0.0989, "step": 971 }, { "epoch": 0.7845629965947787, "grad_norm": 0.0624924935400486, "learning_rate": 7.619102139985165e-05, "loss": 0.0962, "step": 972 }, { "epoch": 0.7853701601715223, "grad_norm": 0.05467823147773743, "learning_rate": 7.613093930478148e-05, "loss": 0.1039, "step": 973 }, { "epoch": 0.7861773237482659, "grad_norm": 0.06129651144146919, "learning_rate": 7.607080525507353e-05, "loss": 0.1027, "step": 974 }, { "epoch": 0.7869844873250095, "grad_norm": 0.056814711540937424, "learning_rate": 7.601061937028881e-05, "loss": 0.0981, "step": 975 }, { "epoch": 0.7877916509017531, "grad_norm": 0.055171579122543335, "learning_rate": 7.595038177009144e-05, "loss": 0.1017, "step": 976 }, { "epoch": 0.7885988144784967, "grad_norm": 0.05617377161979675, "learning_rate": 7.589009257424839e-05, "loss": 0.0987, "step": 977 }, { "epoch": 0.7894059780552403, "grad_norm": 0.05934462696313858, "learning_rate": 7.582975190262917e-05, "loss": 0.0927, "step": 978 }, { "epoch": 0.7902131416319839, "grad_norm": 0.05964481830596924, "learning_rate": 7.576935987520566e-05, "loss": 0.1108, "step": 979 }, { "epoch": 0.7910203052087275, "grad_norm": 0.061287831515073776, "learning_rate": 7.570891661205185e-05, "loss": 0.112, "step": 980 }, { "epoch": 0.7918274687854711, "grad_norm": 0.052118029445409775, "learning_rate": 7.564842223334356e-05, "loss": 0.1019, "step": 981 }, { "epoch": 0.7926346323622147, "grad_norm": 0.05632944777607918, "learning_rate": 7.558787685935828e-05, "loss": 0.0973, "step": 982 }, { "epoch": 0.7934417959389582, "grad_norm": 0.0609171986579895, "learning_rate": 7.552728061047492e-05, "loss": 0.0899, "step": 983 }, { "epoch": 0.7942489595157018, "grad_norm": 0.05473325774073601, "learning_rate": 7.546663360717343e-05, "loss": 0.0924, "step": 984 }, { "epoch": 0.7950561230924454, "grad_norm": 0.062433451414108276, "learning_rate": 7.540593597003481e-05, "loss": 0.1021, "step": 985 }, { "epoch": 0.795863286669189, "grad_norm": 0.0527990348637104, "learning_rate": 7.534518781974065e-05, "loss": 0.1035, "step": 986 }, { "epoch": 0.7966704502459326, "grad_norm": 0.05704967677593231, "learning_rate": 7.528438927707297e-05, "loss": 0.0975, "step": 987 }, { "epoch": 0.7974776138226762, "grad_norm": 0.05643763765692711, "learning_rate": 7.522354046291403e-05, "loss": 0.0976, "step": 988 }, { "epoch": 0.7982847773994198, "grad_norm": 0.05358509719371796, "learning_rate": 7.5162641498246e-05, "loss": 0.0952, "step": 989 }, { "epoch": 0.7990919409761634, "grad_norm": 0.05236755311489105, "learning_rate": 7.510169250415078e-05, "loss": 0.0998, "step": 990 }, { "epoch": 0.799899104552907, "grad_norm": 0.05170868709683418, "learning_rate": 7.504069360180971e-05, "loss": 0.0957, "step": 991 }, { "epoch": 0.8007062681296506, "grad_norm": 0.06679459661245346, "learning_rate": 7.497964491250342e-05, "loss": 0.0912, "step": 992 }, { "epoch": 0.8015134317063942, "grad_norm": 0.0518556609749794, "learning_rate": 7.491854655761148e-05, "loss": 0.0927, "step": 993 }, { "epoch": 0.8023205952831378, "grad_norm": 0.05252683162689209, "learning_rate": 7.48573986586122e-05, "loss": 0.0971, "step": 994 }, { "epoch": 0.8031277588598814, "grad_norm": 0.0634356141090393, "learning_rate": 7.479620133708246e-05, "loss": 0.1023, "step": 995 }, { "epoch": 0.803934922436625, "grad_norm": 0.053769901394844055, "learning_rate": 7.473495471469733e-05, "loss": 0.0933, "step": 996 }, { "epoch": 0.8047420860133686, "grad_norm": 0.04914426431059837, "learning_rate": 7.467365891322995e-05, "loss": 0.1039, "step": 997 }, { "epoch": 0.8055492495901122, "grad_norm": 0.0579984150826931, "learning_rate": 7.461231405455121e-05, "loss": 0.1037, "step": 998 }, { "epoch": 0.8063564131668558, "grad_norm": 0.05620402470231056, "learning_rate": 7.455092026062955e-05, "loss": 0.1046, "step": 999 }, { "epoch": 0.8071635767435994, "grad_norm": 0.061891112476587296, "learning_rate": 7.448947765353071e-05, "loss": 0.1069, "step": 1000 }, { "epoch": 0.807970740320343, "grad_norm": 0.05484484136104584, "learning_rate": 7.442798635541749e-05, "loss": 0.1017, "step": 1001 }, { "epoch": 0.8087779038970866, "grad_norm": 0.06323721259832382, "learning_rate": 7.436644648854947e-05, "loss": 0.1059, "step": 1002 }, { "epoch": 0.8095850674738302, "grad_norm": 0.05597204342484474, "learning_rate": 7.430485817528282e-05, "loss": 0.1021, "step": 1003 }, { "epoch": 0.8103922310505739, "grad_norm": 0.058914024382829666, "learning_rate": 7.424322153807003e-05, "loss": 0.1033, "step": 1004 }, { "epoch": 0.8111993946273175, "grad_norm": 0.05599389970302582, "learning_rate": 7.418153669945967e-05, "loss": 0.093, "step": 1005 }, { "epoch": 0.8120065582040611, "grad_norm": 0.05525682866573334, "learning_rate": 7.411980378209611e-05, "loss": 0.093, "step": 1006 }, { "epoch": 0.8128137217808047, "grad_norm": 0.05795508623123169, "learning_rate": 7.40580229087194e-05, "loss": 0.105, "step": 1007 }, { "epoch": 0.8136208853575483, "grad_norm": 0.05571641027927399, "learning_rate": 7.399619420216485e-05, "loss": 0.0963, "step": 1008 }, { "epoch": 0.8144280489342919, "grad_norm": 0.053936295211315155, "learning_rate": 7.393431778536291e-05, "loss": 0.0977, "step": 1009 }, { "epoch": 0.8152352125110355, "grad_norm": 0.06003548577427864, "learning_rate": 7.387239378133888e-05, "loss": 0.1116, "step": 1010 }, { "epoch": 0.8160423760877791, "grad_norm": 0.056006383150815964, "learning_rate": 7.381042231321269e-05, "loss": 0.1022, "step": 1011 }, { "epoch": 0.8168495396645227, "grad_norm": 0.060056421905756, "learning_rate": 7.374840350419865e-05, "loss": 0.1005, "step": 1012 }, { "epoch": 0.8176567032412663, "grad_norm": 0.061885274946689606, "learning_rate": 7.368633747760515e-05, "loss": 0.1095, "step": 1013 }, { "epoch": 0.8184638668180099, "grad_norm": 0.060644589364528656, "learning_rate": 7.362422435683449e-05, "loss": 0.0994, "step": 1014 }, { "epoch": 0.8192710303947535, "grad_norm": 0.06402606517076492, "learning_rate": 7.356206426538262e-05, "loss": 0.1054, "step": 1015 }, { "epoch": 0.8200781939714971, "grad_norm": 0.05782578885555267, "learning_rate": 7.349985732683886e-05, "loss": 0.0999, "step": 1016 }, { "epoch": 0.8208853575482407, "grad_norm": 0.05403415858745575, "learning_rate": 7.343760366488564e-05, "loss": 0.086, "step": 1017 }, { "epoch": 0.8216925211249843, "grad_norm": 0.054603662341833115, "learning_rate": 7.337530340329834e-05, "loss": 0.1024, "step": 1018 }, { "epoch": 0.8224996847017279, "grad_norm": 0.05976370349526405, "learning_rate": 7.3312956665945e-05, "loss": 0.1039, "step": 1019 }, { "epoch": 0.8233068482784714, "grad_norm": 0.05510777607560158, "learning_rate": 7.325056357678602e-05, "loss": 0.1004, "step": 1020 }, { "epoch": 0.824114011855215, "grad_norm": 0.05488766357302666, "learning_rate": 7.318812425987395e-05, "loss": 0.0928, "step": 1021 }, { "epoch": 0.8249211754319586, "grad_norm": 0.05334113538265228, "learning_rate": 7.31256388393533e-05, "loss": 0.1048, "step": 1022 }, { "epoch": 0.8257283390087022, "grad_norm": 0.05814628303050995, "learning_rate": 7.306310743946024e-05, "loss": 0.1081, "step": 1023 }, { "epoch": 0.8265355025854458, "grad_norm": 0.06155453622341156, "learning_rate": 7.300053018452233e-05, "loss": 0.1006, "step": 1024 }, { "epoch": 0.8265355025854458, "eval_loss": 0.1141340509057045, "eval_runtime": 4041.8446, "eval_samples_per_second": 2.202, "eval_steps_per_second": 2.202, "step": 1024 }, { "epoch": 0.8273426661621894, "grad_norm": 0.05530587211251259, "learning_rate": 7.29379071989583e-05, "loss": 0.093, "step": 1025 }, { "epoch": 0.828149829738933, "grad_norm": 0.05504893139004707, "learning_rate": 7.287523860727781e-05, "loss": 0.1061, "step": 1026 }, { "epoch": 0.8289569933156766, "grad_norm": 0.0526910237967968, "learning_rate": 7.281252453408126e-05, "loss": 0.0962, "step": 1027 }, { "epoch": 0.8297641568924202, "grad_norm": 0.05282716825604439, "learning_rate": 7.274976510405934e-05, "loss": 0.1015, "step": 1028 }, { "epoch": 0.8305713204691638, "grad_norm": 0.049580447375774384, "learning_rate": 7.268696044199304e-05, "loss": 0.0946, "step": 1029 }, { "epoch": 0.8313784840459074, "grad_norm": 0.05146600678563118, "learning_rate": 7.262411067275326e-05, "loss": 0.1083, "step": 1030 }, { "epoch": 0.832185647622651, "grad_norm": 0.05770369991660118, "learning_rate": 7.256121592130055e-05, "loss": 0.0979, "step": 1031 }, { "epoch": 0.8329928111993946, "grad_norm": 0.05818641558289528, "learning_rate": 7.24982763126849e-05, "loss": 0.0977, "step": 1032 }, { "epoch": 0.8337999747761382, "grad_norm": 0.05590706691145897, "learning_rate": 7.243529197204552e-05, "loss": 0.0979, "step": 1033 }, { "epoch": 0.8346071383528818, "grad_norm": 0.05611065775156021, "learning_rate": 7.237226302461053e-05, "loss": 0.098, "step": 1034 }, { "epoch": 0.8354143019296254, "grad_norm": 0.0554087869822979, "learning_rate": 7.230918959569674e-05, "loss": 0.0942, "step": 1035 }, { "epoch": 0.836221465506369, "grad_norm": 0.06255902349948883, "learning_rate": 7.224607181070941e-05, "loss": 0.1016, "step": 1036 }, { "epoch": 0.8370286290831126, "grad_norm": 0.05779389664530754, "learning_rate": 7.218290979514202e-05, "loss": 0.0977, "step": 1037 }, { "epoch": 0.8378357926598562, "grad_norm": 0.055514752864837646, "learning_rate": 7.21197036745759e-05, "loss": 0.0998, "step": 1038 }, { "epoch": 0.8386429562365998, "grad_norm": 0.05722029134631157, "learning_rate": 7.205645357468016e-05, "loss": 0.096, "step": 1039 }, { "epoch": 0.8394501198133434, "grad_norm": 0.05724198371171951, "learning_rate": 7.199315962121134e-05, "loss": 0.0984, "step": 1040 }, { "epoch": 0.840257283390087, "grad_norm": 0.056293364614248276, "learning_rate": 7.192982194001312e-05, "loss": 0.0956, "step": 1041 }, { "epoch": 0.8410644469668306, "grad_norm": 0.05281605198979378, "learning_rate": 7.186644065701616e-05, "loss": 0.1008, "step": 1042 }, { "epoch": 0.8418716105435742, "grad_norm": 0.06182162091135979, "learning_rate": 7.180301589823784e-05, "loss": 0.1051, "step": 1043 }, { "epoch": 0.8426787741203178, "grad_norm": 0.05831001698970795, "learning_rate": 7.173954778978192e-05, "loss": 0.1045, "step": 1044 }, { "epoch": 0.8434859376970614, "grad_norm": 0.053949158638715744, "learning_rate": 7.167603645783834e-05, "loss": 0.0933, "step": 1045 }, { "epoch": 0.844293101273805, "grad_norm": 0.061913955956697464, "learning_rate": 7.161248202868309e-05, "loss": 0.0993, "step": 1046 }, { "epoch": 0.8451002648505486, "grad_norm": 0.056907977908849716, "learning_rate": 7.15488846286777e-05, "loss": 0.0958, "step": 1047 }, { "epoch": 0.8459074284272923, "grad_norm": 0.06350523978471756, "learning_rate": 7.148524438426926e-05, "loss": 0.0925, "step": 1048 }, { "epoch": 0.8467145920040359, "grad_norm": 0.056615736335515976, "learning_rate": 7.142156142198997e-05, "loss": 0.1049, "step": 1049 }, { "epoch": 0.8475217555807795, "grad_norm": 0.0669054463505745, "learning_rate": 7.135783586845698e-05, "loss": 0.1025, "step": 1050 }, { "epoch": 0.8483289191575231, "grad_norm": 0.052905209362506866, "learning_rate": 7.129406785037214e-05, "loss": 0.0947, "step": 1051 }, { "epoch": 0.8491360827342667, "grad_norm": 0.05181756615638733, "learning_rate": 7.123025749452172e-05, "loss": 0.0893, "step": 1052 }, { "epoch": 0.8499432463110103, "grad_norm": 0.05080896615982056, "learning_rate": 7.116640492777617e-05, "loss": 0.0936, "step": 1053 }, { "epoch": 0.8507504098877539, "grad_norm": 0.053838517516851425, "learning_rate": 7.110251027708984e-05, "loss": 0.0874, "step": 1054 }, { "epoch": 0.8515575734644975, "grad_norm": 0.04993472248315811, "learning_rate": 7.103857366950081e-05, "loss": 0.1064, "step": 1055 }, { "epoch": 0.852364737041241, "grad_norm": 0.05858161672949791, "learning_rate": 7.09745952321305e-05, "loss": 0.0972, "step": 1056 }, { "epoch": 0.8531719006179846, "grad_norm": 0.05472266674041748, "learning_rate": 7.091057509218357e-05, "loss": 0.0949, "step": 1057 }, { "epoch": 0.8539790641947282, "grad_norm": 0.05345141142606735, "learning_rate": 7.084651337694758e-05, "loss": 0.0932, "step": 1058 }, { "epoch": 0.8547862277714718, "grad_norm": 0.057547010481357574, "learning_rate": 7.078241021379272e-05, "loss": 0.1051, "step": 1059 }, { "epoch": 0.8555933913482154, "grad_norm": 0.0610077939927578, "learning_rate": 7.07182657301716e-05, "loss": 0.0944, "step": 1060 }, { "epoch": 0.856400554924959, "grad_norm": 0.0533037930727005, "learning_rate": 7.065408005361903e-05, "loss": 0.1002, "step": 1061 }, { "epoch": 0.8572077185017026, "grad_norm": 0.05663416162133217, "learning_rate": 7.058985331175163e-05, "loss": 0.1081, "step": 1062 }, { "epoch": 0.8580148820784462, "grad_norm": 0.06441085040569305, "learning_rate": 7.052558563226777e-05, "loss": 0.1021, "step": 1063 }, { "epoch": 0.8588220456551898, "grad_norm": 0.0529928095638752, "learning_rate": 7.046127714294714e-05, "loss": 0.1016, "step": 1064 }, { "epoch": 0.8596292092319334, "grad_norm": 0.05552412569522858, "learning_rate": 7.039692797165061e-05, "loss": 0.109, "step": 1065 }, { "epoch": 0.860436372808677, "grad_norm": 0.05477572977542877, "learning_rate": 7.033253824631991e-05, "loss": 0.1013, "step": 1066 }, { "epoch": 0.8612435363854206, "grad_norm": 0.060571879148483276, "learning_rate": 7.026810809497744e-05, "loss": 0.0994, "step": 1067 }, { "epoch": 0.8620506999621642, "grad_norm": 0.055716127157211304, "learning_rate": 7.020363764572591e-05, "loss": 0.0994, "step": 1068 }, { "epoch": 0.8628578635389078, "grad_norm": 0.053372226655483246, "learning_rate": 7.013912702674821e-05, "loss": 0.0976, "step": 1069 }, { "epoch": 0.8636650271156514, "grad_norm": 0.058248504996299744, "learning_rate": 7.007457636630709e-05, "loss": 0.0978, "step": 1070 }, { "epoch": 0.864472190692395, "grad_norm": 0.06754770129919052, "learning_rate": 7.000998579274487e-05, "loss": 0.1202, "step": 1071 }, { "epoch": 0.8652793542691386, "grad_norm": 0.056100763380527496, "learning_rate": 6.99453554344833e-05, "loss": 0.1005, "step": 1072 }, { "epoch": 0.8660865178458822, "grad_norm": 0.053685810416936874, "learning_rate": 6.988068542002316e-05, "loss": 0.1035, "step": 1073 }, { "epoch": 0.8668936814226258, "grad_norm": 0.056245673447847366, "learning_rate": 6.981597587794412e-05, "loss": 0.0998, "step": 1074 }, { "epoch": 0.8677008449993694, "grad_norm": 0.062497835606336594, "learning_rate": 6.975122693690441e-05, "loss": 0.0942, "step": 1075 }, { "epoch": 0.868508008576113, "grad_norm": 0.054415084421634674, "learning_rate": 6.968643872564064e-05, "loss": 0.1064, "step": 1076 }, { "epoch": 0.8693151721528566, "grad_norm": 0.05727611109614372, "learning_rate": 6.962161137296743e-05, "loss": 0.0952, "step": 1077 }, { "epoch": 0.8701223357296002, "grad_norm": 0.05783065780997276, "learning_rate": 6.95567450077773e-05, "loss": 0.102, "step": 1078 }, { "epoch": 0.8709294993063438, "grad_norm": 0.054401613771915436, "learning_rate": 6.949183975904026e-05, "loss": 0.1051, "step": 1079 }, { "epoch": 0.8717366628830874, "grad_norm": 0.05676226690411568, "learning_rate": 6.94268957558037e-05, "loss": 0.1096, "step": 1080 }, { "epoch": 0.872543826459831, "grad_norm": 0.05649667978286743, "learning_rate": 6.936191312719203e-05, "loss": 0.0947, "step": 1081 }, { "epoch": 0.8733509900365746, "grad_norm": 0.05979413539171219, "learning_rate": 6.929689200240645e-05, "loss": 0.1022, "step": 1082 }, { "epoch": 0.8741581536133182, "grad_norm": 0.06256957352161407, "learning_rate": 6.923183251072468e-05, "loss": 0.0959, "step": 1083 }, { "epoch": 0.8749653171900618, "grad_norm": 0.05755530297756195, "learning_rate": 6.91667347815008e-05, "loss": 0.1027, "step": 1084 }, { "epoch": 0.8757724807668054, "grad_norm": 0.05441396310925484, "learning_rate": 6.910159894416484e-05, "loss": 0.1105, "step": 1085 }, { "epoch": 0.876579644343549, "grad_norm": 0.060316700488328934, "learning_rate": 6.903642512822263e-05, "loss": 0.1082, "step": 1086 }, { "epoch": 0.8773868079202926, "grad_norm": 0.056368134915828705, "learning_rate": 6.897121346325551e-05, "loss": 0.0937, "step": 1087 }, { "epoch": 0.8781939714970362, "grad_norm": 0.05643955618143082, "learning_rate": 6.890596407892007e-05, "loss": 0.0857, "step": 1088 }, { "epoch": 0.8790011350737797, "grad_norm": 0.05727576091885567, "learning_rate": 6.884067710494789e-05, "loss": 0.0911, "step": 1089 }, { "epoch": 0.8798082986505233, "grad_norm": 0.05666645988821983, "learning_rate": 6.877535267114525e-05, "loss": 0.0841, "step": 1090 }, { "epoch": 0.8806154622272669, "grad_norm": 0.06445831060409546, "learning_rate": 6.870999090739301e-05, "loss": 0.0912, "step": 1091 }, { "epoch": 0.8814226258040105, "grad_norm": 0.05531910061836243, "learning_rate": 6.864459194364616e-05, "loss": 0.1058, "step": 1092 }, { "epoch": 0.8822297893807542, "grad_norm": 0.05735761672258377, "learning_rate": 6.85791559099337e-05, "loss": 0.1013, "step": 1093 }, { "epoch": 0.8830369529574978, "grad_norm": 0.054130133241415024, "learning_rate": 6.851368293635832e-05, "loss": 0.0922, "step": 1094 }, { "epoch": 0.8838441165342414, "grad_norm": 0.0595361553132534, "learning_rate": 6.84481731530961e-05, "loss": 0.1181, "step": 1095 }, { "epoch": 0.884651280110985, "grad_norm": 0.05821574851870537, "learning_rate": 6.838262669039643e-05, "loss": 0.097, "step": 1096 }, { "epoch": 0.8854584436877286, "grad_norm": 0.053733207285404205, "learning_rate": 6.831704367858153e-05, "loss": 0.0942, "step": 1097 }, { "epoch": 0.8862656072644722, "grad_norm": 0.057563357055187225, "learning_rate": 6.825142424804631e-05, "loss": 0.1014, "step": 1098 }, { "epoch": 0.8870727708412158, "grad_norm": 0.06536004692316055, "learning_rate": 6.818576852925808e-05, "loss": 0.0971, "step": 1099 }, { "epoch": 0.8878799344179594, "grad_norm": 0.054236963391304016, "learning_rate": 6.812007665275636e-05, "loss": 0.0882, "step": 1100 }, { "epoch": 0.888687097994703, "grad_norm": 0.057115159928798676, "learning_rate": 6.805434874915249e-05, "loss": 0.0948, "step": 1101 }, { "epoch": 0.8894942615714466, "grad_norm": 0.05786595493555069, "learning_rate": 6.798858494912943e-05, "loss": 0.1046, "step": 1102 }, { "epoch": 0.8903014251481902, "grad_norm": 0.06195760518312454, "learning_rate": 6.792278538344161e-05, "loss": 0.1082, "step": 1103 }, { "epoch": 0.8911085887249338, "grad_norm": 0.05500982701778412, "learning_rate": 6.785695018291447e-05, "loss": 0.1039, "step": 1104 }, { "epoch": 0.8919157523016774, "grad_norm": 0.057800572365522385, "learning_rate": 6.779107947844434e-05, "loss": 0.0988, "step": 1105 }, { "epoch": 0.892722915878421, "grad_norm": 0.05536225810647011, "learning_rate": 6.772517340099816e-05, "loss": 0.0998, "step": 1106 }, { "epoch": 0.8935300794551646, "grad_norm": 0.05474833771586418, "learning_rate": 6.765923208161313e-05, "loss": 0.1068, "step": 1107 }, { "epoch": 0.8943372430319082, "grad_norm": 0.052024926990270615, "learning_rate": 6.759325565139662e-05, "loss": 0.0923, "step": 1108 }, { "epoch": 0.8951444066086518, "grad_norm": 0.05528783053159714, "learning_rate": 6.752724424152575e-05, "loss": 0.104, "step": 1109 }, { "epoch": 0.8959515701853954, "grad_norm": 0.056337881833314896, "learning_rate": 6.746119798324714e-05, "loss": 0.0934, "step": 1110 }, { "epoch": 0.896758733762139, "grad_norm": 0.05840294808149338, "learning_rate": 6.739511700787683e-05, "loss": 0.1009, "step": 1111 }, { "epoch": 0.8975658973388826, "grad_norm": 0.0591946579515934, "learning_rate": 6.732900144679976e-05, "loss": 0.1043, "step": 1112 }, { "epoch": 0.8983730609156262, "grad_norm": 0.05303280055522919, "learning_rate": 6.726285143146969e-05, "loss": 0.0977, "step": 1113 }, { "epoch": 0.8991802244923698, "grad_norm": 0.06337162107229233, "learning_rate": 6.719666709340886e-05, "loss": 0.1059, "step": 1114 }, { "epoch": 0.8999873880691134, "grad_norm": 0.05814710259437561, "learning_rate": 6.713044856420782e-05, "loss": 0.0946, "step": 1115 }, { "epoch": 0.900794551645857, "grad_norm": 0.05835239216685295, "learning_rate": 6.7064195975525e-05, "loss": 0.108, "step": 1116 }, { "epoch": 0.9016017152226006, "grad_norm": 0.05788639187812805, "learning_rate": 6.699790945908662e-05, "loss": 0.0937, "step": 1117 }, { "epoch": 0.9024088787993442, "grad_norm": 0.06337899714708328, "learning_rate": 6.693158914668631e-05, "loss": 0.0979, "step": 1118 }, { "epoch": 0.9032160423760878, "grad_norm": 0.06202017888426781, "learning_rate": 6.686523517018494e-05, "loss": 0.1029, "step": 1119 }, { "epoch": 0.9040232059528314, "grad_norm": 0.05608386918902397, "learning_rate": 6.679884766151029e-05, "loss": 0.1016, "step": 1120 }, { "epoch": 0.904830369529575, "grad_norm": 0.054898157715797424, "learning_rate": 6.67324267526568e-05, "loss": 0.0992, "step": 1121 }, { "epoch": 0.9056375331063186, "grad_norm": 0.06011251360177994, "learning_rate": 6.666597257568532e-05, "loss": 0.1082, "step": 1122 }, { "epoch": 0.9064446966830622, "grad_norm": 0.05635428428649902, "learning_rate": 6.659948526272289e-05, "loss": 0.0998, "step": 1123 }, { "epoch": 0.9072518602598058, "grad_norm": 0.05368272587656975, "learning_rate": 6.653296494596235e-05, "loss": 0.1033, "step": 1124 }, { "epoch": 0.9080590238365494, "grad_norm": 0.05892596021294594, "learning_rate": 6.646641175766221e-05, "loss": 0.0948, "step": 1125 }, { "epoch": 0.908866187413293, "grad_norm": 0.06407414376735687, "learning_rate": 6.639982583014637e-05, "loss": 0.1098, "step": 1126 }, { "epoch": 0.9096733509900365, "grad_norm": 0.05852781981229782, "learning_rate": 6.633320729580376e-05, "loss": 0.1002, "step": 1127 }, { "epoch": 0.9104805145667801, "grad_norm": 0.0529257133603096, "learning_rate": 6.626655628708815e-05, "loss": 0.1007, "step": 1128 }, { "epoch": 0.9112876781435237, "grad_norm": 0.06380467116832733, "learning_rate": 6.619987293651792e-05, "loss": 0.1047, "step": 1129 }, { "epoch": 0.9120948417202673, "grad_norm": 0.05334516242146492, "learning_rate": 6.613315737667571e-05, "loss": 0.0925, "step": 1130 }, { "epoch": 0.9129020052970109, "grad_norm": 0.05241110548377037, "learning_rate": 6.606640974020823e-05, "loss": 0.0932, "step": 1131 }, { "epoch": 0.9137091688737545, "grad_norm": 0.054784201085567474, "learning_rate": 6.599963015982593e-05, "loss": 0.1035, "step": 1132 }, { "epoch": 0.9145163324504981, "grad_norm": 0.05780637636780739, "learning_rate": 6.593281876830281e-05, "loss": 0.1019, "step": 1133 }, { "epoch": 0.9153234960272417, "grad_norm": 0.05521411448717117, "learning_rate": 6.58659756984761e-05, "loss": 0.094, "step": 1134 }, { "epoch": 0.9161306596039853, "grad_norm": 0.05601583793759346, "learning_rate": 6.579910108324599e-05, "loss": 0.1045, "step": 1135 }, { "epoch": 0.9169378231807289, "grad_norm": 0.05271285027265549, "learning_rate": 6.573219505557548e-05, "loss": 0.1018, "step": 1136 }, { "epoch": 0.9177449867574725, "grad_norm": 0.05307653173804283, "learning_rate": 6.566525774848988e-05, "loss": 0.0883, "step": 1137 }, { "epoch": 0.9185521503342162, "grad_norm": 0.0644347295165062, "learning_rate": 6.559828929507684e-05, "loss": 0.1004, "step": 1138 }, { "epoch": 0.9193593139109598, "grad_norm": 0.05631678178906441, "learning_rate": 6.553128982848584e-05, "loss": 0.1073, "step": 1139 }, { "epoch": 0.9201664774877034, "grad_norm": 0.056450724601745605, "learning_rate": 6.546425948192803e-05, "loss": 0.101, "step": 1140 }, { "epoch": 0.920973641064447, "grad_norm": 0.056897569447755814, "learning_rate": 6.539719838867604e-05, "loss": 0.1035, "step": 1141 }, { "epoch": 0.9217808046411906, "grad_norm": 0.05581962689757347, "learning_rate": 6.533010668206349e-05, "loss": 0.0868, "step": 1142 }, { "epoch": 0.9225879682179342, "grad_norm": 0.05249864235520363, "learning_rate": 6.526298449548503e-05, "loss": 0.0927, "step": 1143 }, { "epoch": 0.9233951317946778, "grad_norm": 0.04778573289513588, "learning_rate": 6.519583196239575e-05, "loss": 0.092, "step": 1144 }, { "epoch": 0.9242022953714214, "grad_norm": 0.06273570656776428, "learning_rate": 6.512864921631121e-05, "loss": 0.1015, "step": 1145 }, { "epoch": 0.925009458948165, "grad_norm": 0.0540308952331543, "learning_rate": 6.506143639080695e-05, "loss": 0.1119, "step": 1146 }, { "epoch": 0.9258166225249086, "grad_norm": 0.060974445194005966, "learning_rate": 6.499419361951838e-05, "loss": 0.1168, "step": 1147 }, { "epoch": 0.9266237861016522, "grad_norm": 0.05831320211291313, "learning_rate": 6.492692103614039e-05, "loss": 0.101, "step": 1148 }, { "epoch": 0.9274309496783958, "grad_norm": 0.05697983130812645, "learning_rate": 6.485961877442719e-05, "loss": 0.1086, "step": 1149 }, { "epoch": 0.9282381132551394, "grad_norm": 0.0526130273938179, "learning_rate": 6.479228696819198e-05, "loss": 0.0961, "step": 1150 }, { "epoch": 0.929045276831883, "grad_norm": 0.060793887823820114, "learning_rate": 6.472492575130671e-05, "loss": 0.0949, "step": 1151 }, { "epoch": 0.9298524404086266, "grad_norm": 0.05258784070611, "learning_rate": 6.465753525770177e-05, "loss": 0.1033, "step": 1152 }, { "epoch": 0.9298524404086266, "eval_loss": 0.11301594972610474, "eval_runtime": 4031.6479, "eval_samples_per_second": 2.208, "eval_steps_per_second": 2.208, "step": 1152 }, { "epoch": 0.9306596039853702, "grad_norm": 0.056455276906490326, "learning_rate": 6.459011562136582e-05, "loss": 0.1019, "step": 1153 }, { "epoch": 0.9314667675621138, "grad_norm": 0.06445401161909103, "learning_rate": 6.452266697634541e-05, "loss": 0.1007, "step": 1154 }, { "epoch": 0.9322739311388574, "grad_norm": 0.0542818121612072, "learning_rate": 6.445518945674479e-05, "loss": 0.0934, "step": 1155 }, { "epoch": 0.933081094715601, "grad_norm": 0.05672699585556984, "learning_rate": 6.438768319672561e-05, "loss": 0.0916, "step": 1156 }, { "epoch": 0.9338882582923446, "grad_norm": 0.05359961465001106, "learning_rate": 6.43201483305067e-05, "loss": 0.0994, "step": 1157 }, { "epoch": 0.9346954218690882, "grad_norm": 0.05965611711144447, "learning_rate": 6.425258499236371e-05, "loss": 0.0971, "step": 1158 }, { "epoch": 0.9355025854458318, "grad_norm": 0.053329866379499435, "learning_rate": 6.418499331662891e-05, "loss": 0.1015, "step": 1159 }, { "epoch": 0.9363097490225754, "grad_norm": 0.053885359317064285, "learning_rate": 6.411737343769095e-05, "loss": 0.1015, "step": 1160 }, { "epoch": 0.937116912599319, "grad_norm": 0.053994424641132355, "learning_rate": 6.404972548999453e-05, "loss": 0.0949, "step": 1161 }, { "epoch": 0.9379240761760625, "grad_norm": 0.057753924280405045, "learning_rate": 6.398204960804015e-05, "loss": 0.1009, "step": 1162 }, { "epoch": 0.9387312397528061, "grad_norm": 0.060837335884571075, "learning_rate": 6.391434592638385e-05, "loss": 0.1035, "step": 1163 }, { "epoch": 0.9395384033295497, "grad_norm": 0.06096120551228523, "learning_rate": 6.384661457963698e-05, "loss": 0.1018, "step": 1164 }, { "epoch": 0.9403455669062933, "grad_norm": 0.0553770512342453, "learning_rate": 6.377885570246583e-05, "loss": 0.0937, "step": 1165 }, { "epoch": 0.9411527304830369, "grad_norm": 0.06224793940782547, "learning_rate": 6.37110694295915e-05, "loss": 0.103, "step": 1166 }, { "epoch": 0.9419598940597805, "grad_norm": 0.05381052568554878, "learning_rate": 6.364325589578948e-05, "loss": 0.1038, "step": 1167 }, { "epoch": 0.9427670576365241, "grad_norm": 0.0555836521089077, "learning_rate": 6.357541523588955e-05, "loss": 0.1009, "step": 1168 }, { "epoch": 0.9435742212132677, "grad_norm": 0.05457519739866257, "learning_rate": 6.350754758477533e-05, "loss": 0.0979, "step": 1169 }, { "epoch": 0.9443813847900113, "grad_norm": 0.062222544103860855, "learning_rate": 6.343965307738419e-05, "loss": 0.1082, "step": 1170 }, { "epoch": 0.9451885483667549, "grad_norm": 0.06411632150411606, "learning_rate": 6.337173184870683e-05, "loss": 0.1046, "step": 1171 }, { "epoch": 0.9459957119434985, "grad_norm": 0.05891282483935356, "learning_rate": 6.330378403378714e-05, "loss": 0.1038, "step": 1172 }, { "epoch": 0.9468028755202421, "grad_norm": 0.056079283356666565, "learning_rate": 6.32358097677218e-05, "loss": 0.104, "step": 1173 }, { "epoch": 0.9476100390969857, "grad_norm": 0.048946309834718704, "learning_rate": 6.316780918566016e-05, "loss": 0.0911, "step": 1174 }, { "epoch": 0.9484172026737293, "grad_norm": 0.05602562054991722, "learning_rate": 6.30997824228038e-05, "loss": 0.0897, "step": 1175 }, { "epoch": 0.9492243662504729, "grad_norm": 0.052927415817976, "learning_rate": 6.303172961440645e-05, "loss": 0.0984, "step": 1176 }, { "epoch": 0.9500315298272165, "grad_norm": 0.05620706453919411, "learning_rate": 6.296365089577356e-05, "loss": 0.1063, "step": 1177 }, { "epoch": 0.9508386934039601, "grad_norm": 0.05291198194026947, "learning_rate": 6.289554640226213e-05, "loss": 0.0899, "step": 1178 }, { "epoch": 0.9516458569807037, "grad_norm": 0.05793546512722969, "learning_rate": 6.282741626928036e-05, "loss": 0.0997, "step": 1179 }, { "epoch": 0.9524530205574473, "grad_norm": 0.05427946522831917, "learning_rate": 6.27592606322875e-05, "loss": 0.0953, "step": 1180 }, { "epoch": 0.9532601841341909, "grad_norm": 0.05838523432612419, "learning_rate": 6.269107962679344e-05, "loss": 0.1079, "step": 1181 }, { "epoch": 0.9540673477109346, "grad_norm": 0.05632670968770981, "learning_rate": 6.262287338835853e-05, "loss": 0.0908, "step": 1182 }, { "epoch": 0.9548745112876782, "grad_norm": 0.05441918596625328, "learning_rate": 6.255464205259331e-05, "loss": 0.093, "step": 1183 }, { "epoch": 0.9556816748644218, "grad_norm": 0.052412502467632294, "learning_rate": 6.248638575515822e-05, "loss": 0.0905, "step": 1184 }, { "epoch": 0.9564888384411654, "grad_norm": 0.05291422829031944, "learning_rate": 6.241810463176328e-05, "loss": 0.0969, "step": 1185 }, { "epoch": 0.957296002017909, "grad_norm": 0.05520175024867058, "learning_rate": 6.234979881816793e-05, "loss": 0.1037, "step": 1186 }, { "epoch": 0.9581031655946526, "grad_norm": 0.061113983392715454, "learning_rate": 6.228146845018067e-05, "loss": 0.0968, "step": 1187 }, { "epoch": 0.9589103291713962, "grad_norm": 0.054251041263341904, "learning_rate": 6.221311366365883e-05, "loss": 0.1017, "step": 1188 }, { "epoch": 0.9597174927481398, "grad_norm": 0.06087525561451912, "learning_rate": 6.214473459450828e-05, "loss": 0.1089, "step": 1189 }, { "epoch": 0.9605246563248834, "grad_norm": 0.056257132440805435, "learning_rate": 6.207633137868318e-05, "loss": 0.0966, "step": 1190 }, { "epoch": 0.961331819901627, "grad_norm": 0.05294414982199669, "learning_rate": 6.200790415218568e-05, "loss": 0.104, "step": 1191 }, { "epoch": 0.9621389834783706, "grad_norm": 0.06054136902093887, "learning_rate": 6.19394530510657e-05, "loss": 0.1031, "step": 1192 }, { "epoch": 0.9629461470551142, "grad_norm": 0.056556586176157, "learning_rate": 6.18709782114206e-05, "loss": 0.09, "step": 1193 }, { "epoch": 0.9637533106318578, "grad_norm": 0.058104198426008224, "learning_rate": 6.180247976939495e-05, "loss": 0.1033, "step": 1194 }, { "epoch": 0.9645604742086014, "grad_norm": 0.05910739302635193, "learning_rate": 6.173395786118025e-05, "loss": 0.1071, "step": 1195 }, { "epoch": 0.965367637785345, "grad_norm": 0.054539311677217484, "learning_rate": 6.166541262301468e-05, "loss": 0.1002, "step": 1196 }, { "epoch": 0.9661748013620886, "grad_norm": 0.05593803897500038, "learning_rate": 6.159684419118274e-05, "loss": 0.0977, "step": 1197 }, { "epoch": 0.9669819649388322, "grad_norm": 0.053134482353925705, "learning_rate": 6.152825270201509e-05, "loss": 0.0957, "step": 1198 }, { "epoch": 0.9677891285155757, "grad_norm": 0.05713789910078049, "learning_rate": 6.145963829188824e-05, "loss": 0.1035, "step": 1199 }, { "epoch": 0.9685962920923193, "grad_norm": 0.06766299903392792, "learning_rate": 6.139100109722426e-05, "loss": 0.101, "step": 1200 }, { "epoch": 0.9694034556690629, "grad_norm": 0.045931894332170486, "learning_rate": 6.13223412544905e-05, "loss": 0.0933, "step": 1201 }, { "epoch": 0.9702106192458065, "grad_norm": 0.05101197957992554, "learning_rate": 6.125365890019941e-05, "loss": 0.1064, "step": 1202 }, { "epoch": 0.9710177828225501, "grad_norm": 0.05502074956893921, "learning_rate": 6.11849541709081e-05, "loss": 0.1079, "step": 1203 }, { "epoch": 0.9718249463992937, "grad_norm": 0.05950180068612099, "learning_rate": 6.111622720321824e-05, "loss": 0.0978, "step": 1204 }, { "epoch": 0.9726321099760373, "grad_norm": 0.056991562247276306, "learning_rate": 6.104747813377567e-05, "loss": 0.0942, "step": 1205 }, { "epoch": 0.9734392735527809, "grad_norm": 0.04982936009764671, "learning_rate": 6.0978707099270214e-05, "loss": 0.0938, "step": 1206 }, { "epoch": 0.9742464371295245, "grad_norm": 0.05519028753042221, "learning_rate": 6.090991423643535e-05, "loss": 0.1023, "step": 1207 }, { "epoch": 0.9750536007062681, "grad_norm": 0.05369408428668976, "learning_rate": 6.0841099682047965e-05, "loss": 0.093, "step": 1208 }, { "epoch": 0.9758607642830117, "grad_norm": 0.055152177810668945, "learning_rate": 6.077226357292802e-05, "loss": 0.0918, "step": 1209 }, { "epoch": 0.9766679278597553, "grad_norm": 0.05874713510274887, "learning_rate": 6.070340604593843e-05, "loss": 0.107, "step": 1210 }, { "epoch": 0.9774750914364989, "grad_norm": 0.059401318430900574, "learning_rate": 6.0634527237984604e-05, "loss": 0.1032, "step": 1211 }, { "epoch": 0.9782822550132425, "grad_norm": 0.05600263550877571, "learning_rate": 6.0565627286014304e-05, "loss": 0.0945, "step": 1212 }, { "epoch": 0.9790894185899861, "grad_norm": 0.05861750617623329, "learning_rate": 6.049670632701735e-05, "loss": 0.1026, "step": 1213 }, { "epoch": 0.9798965821667297, "grad_norm": 0.056487105786800385, "learning_rate": 6.0427764498025265e-05, "loss": 0.1079, "step": 1214 }, { "epoch": 0.9807037457434733, "grad_norm": 0.05619568005204201, "learning_rate": 6.0358801936111145e-05, "loss": 0.1049, "step": 1215 }, { "epoch": 0.9815109093202169, "grad_norm": 0.05534539744257927, "learning_rate": 6.028981877838925e-05, "loss": 0.0952, "step": 1216 }, { "epoch": 0.9823180728969605, "grad_norm": 0.061013299971818924, "learning_rate": 6.022081516201482e-05, "loss": 0.0965, "step": 1217 }, { "epoch": 0.9831252364737041, "grad_norm": 0.05137497931718826, "learning_rate": 6.0151791224183754e-05, "loss": 0.0991, "step": 1218 }, { "epoch": 0.9839324000504477, "grad_norm": 0.054340627044439316, "learning_rate": 6.0082747102132355e-05, "loss": 0.0916, "step": 1219 }, { "epoch": 0.9847395636271913, "grad_norm": 0.05629637837409973, "learning_rate": 6.001368293313708e-05, "loss": 0.1002, "step": 1220 }, { "epoch": 0.9855467272039349, "grad_norm": 0.060919906944036484, "learning_rate": 5.994459885451422e-05, "loss": 0.1087, "step": 1221 }, { "epoch": 0.9863538907806785, "grad_norm": 0.06388608366250992, "learning_rate": 5.987549500361966e-05, "loss": 0.0996, "step": 1222 }, { "epoch": 0.9871610543574221, "grad_norm": 0.0576111376285553, "learning_rate": 5.98063715178486e-05, "loss": 0.1121, "step": 1223 }, { "epoch": 0.9879682179341657, "grad_norm": 0.05695372819900513, "learning_rate": 5.973722853463527e-05, "loss": 0.095, "step": 1224 }, { "epoch": 0.9887753815109093, "grad_norm": 0.058951981365680695, "learning_rate": 5.9668066191452674e-05, "loss": 0.095, "step": 1225 }, { "epoch": 0.9895825450876529, "grad_norm": 0.05621155723929405, "learning_rate": 5.9598884625812315e-05, "loss": 0.1006, "step": 1226 }, { "epoch": 0.9903897086643966, "grad_norm": 0.052822425961494446, "learning_rate": 5.952968397526387e-05, "loss": 0.0955, "step": 1227 }, { "epoch": 0.9911968722411402, "grad_norm": 0.054630957543849945, "learning_rate": 5.946046437739504e-05, "loss": 0.099, "step": 1228 }, { "epoch": 0.9920040358178838, "grad_norm": 0.053919412195682526, "learning_rate": 5.9391225969831145e-05, "loss": 0.098, "step": 1229 }, { "epoch": 0.9928111993946274, "grad_norm": 0.06197540834546089, "learning_rate": 5.932196889023488e-05, "loss": 0.102, "step": 1230 }, { "epoch": 0.993618362971371, "grad_norm": 0.050488997250795364, "learning_rate": 5.925269327630615e-05, "loss": 0.0979, "step": 1231 }, { "epoch": 0.9944255265481146, "grad_norm": 0.05818356201052666, "learning_rate": 5.918339926578162e-05, "loss": 0.0977, "step": 1232 }, { "epoch": 0.9952326901248582, "grad_norm": 0.05811886489391327, "learning_rate": 5.911408699643458e-05, "loss": 0.1022, "step": 1233 }, { "epoch": 0.9960398537016018, "grad_norm": 0.05571984872221947, "learning_rate": 5.9044756606074626e-05, "loss": 0.0895, "step": 1234 }, { "epoch": 0.9968470172783453, "grad_norm": 0.0545196458697319, "learning_rate": 5.8975408232547346e-05, "loss": 0.0961, "step": 1235 }, { "epoch": 0.997654180855089, "grad_norm": 0.05770910158753395, "learning_rate": 5.890604201373411e-05, "loss": 0.1, "step": 1236 }, { "epoch": 0.9984613444318325, "grad_norm": 0.057636577636003494, "learning_rate": 5.883665808755179e-05, "loss": 0.1103, "step": 1237 }, { "epoch": 0.9992685080085761, "grad_norm": 0.06401769816875458, "learning_rate": 5.8767256591952426e-05, "loss": 0.1052, "step": 1238 }, { "epoch": 1.0, "grad_norm": 0.06305667012929916, "learning_rate": 5.869783766492299e-05, "loss": 0.0918, "step": 1239 }, { "epoch": 1.0008071635767437, "grad_norm": 0.05863628536462784, "learning_rate": 5.862840144448516e-05, "loss": 0.095, "step": 1240 }, { "epoch": 1.0016143271534872, "grad_norm": 0.05933332070708275, "learning_rate": 5.855894806869493e-05, "loss": 0.1016, "step": 1241 }, { "epoch": 1.002421490730231, "grad_norm": 0.05714670568704605, "learning_rate": 5.8489477675642444e-05, "loss": 0.0977, "step": 1242 }, { "epoch": 1.0032286543069744, "grad_norm": 0.05560676380991936, "learning_rate": 5.841999040345167e-05, "loss": 0.103, "step": 1243 }, { "epoch": 1.004035817883718, "grad_norm": 0.05497704818844795, "learning_rate": 5.835048639028018e-05, "loss": 0.0852, "step": 1244 }, { "epoch": 1.0048429814604616, "grad_norm": 0.05491700768470764, "learning_rate": 5.8280965774318744e-05, "loss": 0.0949, "step": 1245 }, { "epoch": 1.0056501450372053, "grad_norm": 0.050615597516298294, "learning_rate": 5.82114286937912e-05, "loss": 0.1017, "step": 1246 }, { "epoch": 1.0064573086139488, "grad_norm": 0.05805670842528343, "learning_rate": 5.814187528695412e-05, "loss": 0.105, "step": 1247 }, { "epoch": 1.0072644721906925, "grad_norm": 0.05982751399278641, "learning_rate": 5.8072305692096516e-05, "loss": 0.0975, "step": 1248 }, { "epoch": 1.008071635767436, "grad_norm": 0.05408954620361328, "learning_rate": 5.80027200475396e-05, "loss": 0.0993, "step": 1249 }, { "epoch": 1.0088787993441797, "grad_norm": 0.05583207309246063, "learning_rate": 5.793311849163651e-05, "loss": 0.0976, "step": 1250 }, { "epoch": 1.0096859629209232, "grad_norm": 0.05667487159371376, "learning_rate": 5.786350116277195e-05, "loss": 0.0987, "step": 1251 }, { "epoch": 1.0104931264976669, "grad_norm": 0.0547030083835125, "learning_rate": 5.77938681993621e-05, "loss": 0.1068, "step": 1252 }, { "epoch": 1.0113002900744104, "grad_norm": 0.05787511169910431, "learning_rate": 5.772421973985411e-05, "loss": 0.0975, "step": 1253 }, { "epoch": 1.012107453651154, "grad_norm": 0.05351670831441879, "learning_rate": 5.7654555922726006e-05, "loss": 0.0865, "step": 1254 }, { "epoch": 1.0129146172278976, "grad_norm": 0.05402090772986412, "learning_rate": 5.758487688648635e-05, "loss": 0.1035, "step": 1255 }, { "epoch": 1.0137217808046413, "grad_norm": 0.04921947047114372, "learning_rate": 5.7515182769673915e-05, "loss": 0.1014, "step": 1256 }, { "epoch": 1.0145289443813847, "grad_norm": 0.04979803040623665, "learning_rate": 5.744547371085751e-05, "loss": 0.0944, "step": 1257 }, { "epoch": 1.0153361079581285, "grad_norm": 0.05395800247788429, "learning_rate": 5.737574984863565e-05, "loss": 0.1052, "step": 1258 }, { "epoch": 1.016143271534872, "grad_norm": 0.0567806102335453, "learning_rate": 5.730601132163623e-05, "loss": 0.1042, "step": 1259 }, { "epoch": 1.0169504351116156, "grad_norm": 0.05688454955816269, "learning_rate": 5.7236258268516354e-05, "loss": 0.095, "step": 1260 }, { "epoch": 1.0177575986883591, "grad_norm": 0.05572457239031792, "learning_rate": 5.716649082796198e-05, "loss": 0.0878, "step": 1261 }, { "epoch": 1.0185647622651028, "grad_norm": 0.05413046479225159, "learning_rate": 5.7096709138687696e-05, "loss": 0.0904, "step": 1262 }, { "epoch": 1.0193719258418463, "grad_norm": 0.058594584465026855, "learning_rate": 5.702691333943638e-05, "loss": 0.0945, "step": 1263 }, { "epoch": 1.02017908941859, "grad_norm": 0.05193213000893593, "learning_rate": 5.695710356897902e-05, "loss": 0.1009, "step": 1264 }, { "epoch": 1.0209862529953335, "grad_norm": 0.05166451632976532, "learning_rate": 5.688727996611434e-05, "loss": 0.0964, "step": 1265 }, { "epoch": 1.0217934165720772, "grad_norm": 0.05595332011580467, "learning_rate": 5.681744266966856e-05, "loss": 0.0937, "step": 1266 }, { "epoch": 1.0226005801488207, "grad_norm": 0.05249791219830513, "learning_rate": 5.674759181849518e-05, "loss": 0.0913, "step": 1267 }, { "epoch": 1.0234077437255644, "grad_norm": 0.05256170779466629, "learning_rate": 5.667772755147459e-05, "loss": 0.0908, "step": 1268 }, { "epoch": 1.024214907302308, "grad_norm": 0.05006462335586548, "learning_rate": 5.6607850007513874e-05, "loss": 0.0937, "step": 1269 }, { "epoch": 1.0250220708790516, "grad_norm": 0.05793534219264984, "learning_rate": 5.653795932554653e-05, "loss": 0.0933, "step": 1270 }, { "epoch": 1.025829234455795, "grad_norm": 0.055386222898960114, "learning_rate": 5.6468055644532156e-05, "loss": 0.097, "step": 1271 }, { "epoch": 1.0266363980325388, "grad_norm": 0.061862241476774216, "learning_rate": 5.6398139103456216e-05, "loss": 0.1068, "step": 1272 }, { "epoch": 1.0274435616092823, "grad_norm": 0.05301091820001602, "learning_rate": 5.6328209841329724e-05, "loss": 0.1007, "step": 1273 }, { "epoch": 1.028250725186026, "grad_norm": 0.0651644691824913, "learning_rate": 5.6258267997189005e-05, "loss": 0.1042, "step": 1274 }, { "epoch": 1.0290578887627695, "grad_norm": 0.05310661345720291, "learning_rate": 5.6188313710095375e-05, "loss": 0.1088, "step": 1275 }, { "epoch": 1.0298650523395132, "grad_norm": 0.054328594356775284, "learning_rate": 5.6118347119134916e-05, "loss": 0.1043, "step": 1276 }, { "epoch": 1.0306722159162567, "grad_norm": 0.05048896372318268, "learning_rate": 5.604836836341816e-05, "loss": 0.1082, "step": 1277 }, { "epoch": 1.0314793794930004, "grad_norm": 0.053414177149534225, "learning_rate": 5.59783775820798e-05, "loss": 0.1072, "step": 1278 }, { "epoch": 1.0322865430697439, "grad_norm": 0.05465978384017944, "learning_rate": 5.59083749142785e-05, "loss": 0.0902, "step": 1279 }, { "epoch": 1.0330937066464876, "grad_norm": 0.05367380380630493, "learning_rate": 5.5838360499196504e-05, "loss": 0.0945, "step": 1280 }, { "epoch": 1.0330937066464876, "eval_loss": 0.1124713271856308, "eval_runtime": 4003.1891, "eval_samples_per_second": 2.224, "eval_steps_per_second": 2.224, "step": 1280 }, { "epoch": 1.033900870223231, "grad_norm": 0.051545917987823486, "learning_rate": 5.576833447603943e-05, "loss": 0.0943, "step": 1281 }, { "epoch": 1.0347080337999748, "grad_norm": 0.05470716208219528, "learning_rate": 5.569829698403599e-05, "loss": 0.102, "step": 1282 }, { "epoch": 1.0355151973767183, "grad_norm": 0.05812381953001022, "learning_rate": 5.562824816243769e-05, "loss": 0.0963, "step": 1283 }, { "epoch": 1.036322360953462, "grad_norm": 0.05122838541865349, "learning_rate": 5.555818815051852e-05, "loss": 0.0867, "step": 1284 }, { "epoch": 1.0371295245302057, "grad_norm": 0.05727369338274002, "learning_rate": 5.5488117087574785e-05, "loss": 0.103, "step": 1285 }, { "epoch": 1.0379366881069492, "grad_norm": 0.06371577084064484, "learning_rate": 5.541803511292474e-05, "loss": 0.1048, "step": 1286 }, { "epoch": 1.0387438516836929, "grad_norm": 0.04786060005426407, "learning_rate": 5.5347942365908313e-05, "loss": 0.0877, "step": 1287 }, { "epoch": 1.0395510152604364, "grad_norm": 0.05701855197548866, "learning_rate": 5.5277838985886874e-05, "loss": 0.0919, "step": 1288 }, { "epoch": 1.04035817883718, "grad_norm": 0.053646862506866455, "learning_rate": 5.520772511224292e-05, "loss": 0.0989, "step": 1289 }, { "epoch": 1.0411653424139236, "grad_norm": 0.053217992186546326, "learning_rate": 5.513760088437983e-05, "loss": 0.0901, "step": 1290 }, { "epoch": 1.0419725059906673, "grad_norm": 0.057724494487047195, "learning_rate": 5.506746644172154e-05, "loss": 0.1095, "step": 1291 }, { "epoch": 1.0427796695674107, "grad_norm": 0.05722779408097267, "learning_rate": 5.499732192371232e-05, "loss": 0.1058, "step": 1292 }, { "epoch": 1.0435868331441545, "grad_norm": 0.05360390245914459, "learning_rate": 5.492716746981647e-05, "loss": 0.0909, "step": 1293 }, { "epoch": 1.044393996720898, "grad_norm": 0.05921318009495735, "learning_rate": 5.4857003219518036e-05, "loss": 0.0967, "step": 1294 }, { "epoch": 1.0452011602976417, "grad_norm": 0.05126386880874634, "learning_rate": 5.478682931232053e-05, "loss": 0.0953, "step": 1295 }, { "epoch": 1.0460083238743851, "grad_norm": 0.060095321387052536, "learning_rate": 5.471664588774671e-05, "loss": 0.1034, "step": 1296 }, { "epoch": 1.0468154874511288, "grad_norm": 0.05124751478433609, "learning_rate": 5.46464530853382e-05, "loss": 0.0943, "step": 1297 }, { "epoch": 1.0476226510278723, "grad_norm": 0.05389925464987755, "learning_rate": 5.457625104465533e-05, "loss": 0.0893, "step": 1298 }, { "epoch": 1.048429814604616, "grad_norm": 0.056433673948049545, "learning_rate": 5.45060399052767e-05, "loss": 0.0963, "step": 1299 }, { "epoch": 1.0492369781813595, "grad_norm": 0.0595649816095829, "learning_rate": 5.4435819806799136e-05, "loss": 0.1021, "step": 1300 }, { "epoch": 1.0500441417581032, "grad_norm": 0.05584030598402023, "learning_rate": 5.436559088883717e-05, "loss": 0.095, "step": 1301 }, { "epoch": 1.0508513053348467, "grad_norm": 0.054570771753787994, "learning_rate": 5.429535329102291e-05, "loss": 0.1007, "step": 1302 }, { "epoch": 1.0516584689115904, "grad_norm": 0.060201797634363174, "learning_rate": 5.422510715300572e-05, "loss": 0.1031, "step": 1303 }, { "epoch": 1.052465632488334, "grad_norm": 0.052695661783218384, "learning_rate": 5.415485261445193e-05, "loss": 0.0977, "step": 1304 }, { "epoch": 1.0532727960650776, "grad_norm": 0.05536072701215744, "learning_rate": 5.408458981504458e-05, "loss": 0.0913, "step": 1305 }, { "epoch": 1.054079959641821, "grad_norm": 0.052456263452768326, "learning_rate": 5.4014318894483175e-05, "loss": 0.0912, "step": 1306 }, { "epoch": 1.0548871232185648, "grad_norm": 0.05065172538161278, "learning_rate": 5.3944039992483274e-05, "loss": 0.0849, "step": 1307 }, { "epoch": 1.0556942867953083, "grad_norm": 0.054474327713251114, "learning_rate": 5.387375324877639e-05, "loss": 0.0909, "step": 1308 }, { "epoch": 1.056501450372052, "grad_norm": 0.05537593364715576, "learning_rate": 5.3803458803109606e-05, "loss": 0.0994, "step": 1309 }, { "epoch": 1.0573086139487955, "grad_norm": 0.05871691182255745, "learning_rate": 5.373315679524529e-05, "loss": 0.0966, "step": 1310 }, { "epoch": 1.0581157775255392, "grad_norm": 0.05529142543673515, "learning_rate": 5.3662847364960855e-05, "loss": 0.0892, "step": 1311 }, { "epoch": 1.0589229411022827, "grad_norm": 0.06075415760278702, "learning_rate": 5.359253065204851e-05, "loss": 0.1144, "step": 1312 }, { "epoch": 1.0597301046790264, "grad_norm": 0.056413717567920685, "learning_rate": 5.352220679631491e-05, "loss": 0.1035, "step": 1313 }, { "epoch": 1.0605372682557699, "grad_norm": 0.057084791362285614, "learning_rate": 5.3451875937580885e-05, "loss": 0.0864, "step": 1314 }, { "epoch": 1.0613444318325136, "grad_norm": 0.05835185945034027, "learning_rate": 5.338153821568127e-05, "loss": 0.0994, "step": 1315 }, { "epoch": 1.062151595409257, "grad_norm": 0.05809670686721802, "learning_rate": 5.331119377046446e-05, "loss": 0.0993, "step": 1316 }, { "epoch": 1.0629587589860008, "grad_norm": 0.046955656260252, "learning_rate": 5.324084274179228e-05, "loss": 0.0985, "step": 1317 }, { "epoch": 1.0637659225627443, "grad_norm": 0.05682864040136337, "learning_rate": 5.317048526953958e-05, "loss": 0.105, "step": 1318 }, { "epoch": 1.064573086139488, "grad_norm": 0.05557282269001007, "learning_rate": 5.310012149359411e-05, "loss": 0.1, "step": 1319 }, { "epoch": 1.0653802497162315, "grad_norm": 0.05755150690674782, "learning_rate": 5.302975155385606e-05, "loss": 0.0994, "step": 1320 }, { "epoch": 1.0661874132929752, "grad_norm": 0.05640566349029541, "learning_rate": 5.295937559023794e-05, "loss": 0.0969, "step": 1321 }, { "epoch": 1.0669945768697187, "grad_norm": 0.05607366934418678, "learning_rate": 5.2888993742664206e-05, "loss": 0.0961, "step": 1322 }, { "epoch": 1.0678017404464624, "grad_norm": 0.06296985596418381, "learning_rate": 5.2818606151071015e-05, "loss": 0.0908, "step": 1323 }, { "epoch": 1.0686089040232059, "grad_norm": 0.0514247752726078, "learning_rate": 5.274821295540597e-05, "loss": 0.1083, "step": 1324 }, { "epoch": 1.0694160675999496, "grad_norm": 0.05896959826350212, "learning_rate": 5.267781429562779e-05, "loss": 0.1068, "step": 1325 }, { "epoch": 1.0702232311766933, "grad_norm": 0.05480825901031494, "learning_rate": 5.260741031170605e-05, "loss": 0.099, "step": 1326 }, { "epoch": 1.0710303947534368, "grad_norm": 0.05655493587255478, "learning_rate": 5.253700114362096e-05, "loss": 0.0912, "step": 1327 }, { "epoch": 1.0718375583301802, "grad_norm": 0.0591539703309536, "learning_rate": 5.246658693136296e-05, "loss": 0.0972, "step": 1328 }, { "epoch": 1.072644721906924, "grad_norm": 0.05432627722620964, "learning_rate": 5.2396167814932595e-05, "loss": 0.0976, "step": 1329 }, { "epoch": 1.0734518854836677, "grad_norm": 0.05290958657860756, "learning_rate": 5.232574393434012e-05, "loss": 0.0994, "step": 1330 }, { "epoch": 1.0742590490604111, "grad_norm": 0.05922912806272507, "learning_rate": 5.225531542960528e-05, "loss": 0.0933, "step": 1331 }, { "epoch": 1.0750662126371548, "grad_norm": 0.0539131835103035, "learning_rate": 5.2184882440756975e-05, "loss": 0.0973, "step": 1332 }, { "epoch": 1.0758733762138983, "grad_norm": 0.05409527197480202, "learning_rate": 5.211444510783309e-05, "loss": 0.0954, "step": 1333 }, { "epoch": 1.076680539790642, "grad_norm": 0.05391722172498703, "learning_rate": 5.2044003570880074e-05, "loss": 0.0978, "step": 1334 }, { "epoch": 1.0774877033673855, "grad_norm": 0.05169634521007538, "learning_rate": 5.197355796995277e-05, "loss": 0.0982, "step": 1335 }, { "epoch": 1.0782948669441292, "grad_norm": 0.05485386773943901, "learning_rate": 5.190310844511412e-05, "loss": 0.0951, "step": 1336 }, { "epoch": 1.0791020305208727, "grad_norm": 0.0600421316921711, "learning_rate": 5.1832655136434835e-05, "loss": 0.1019, "step": 1337 }, { "epoch": 1.0799091940976164, "grad_norm": 0.05321107059717178, "learning_rate": 5.176219818399316e-05, "loss": 0.0898, "step": 1338 }, { "epoch": 1.08071635767436, "grad_norm": 0.055844005197286606, "learning_rate": 5.169173772787458e-05, "loss": 0.0994, "step": 1339 }, { "epoch": 1.0815235212511036, "grad_norm": 0.05290304124355316, "learning_rate": 5.162127390817156e-05, "loss": 0.0905, "step": 1340 }, { "epoch": 1.0823306848278471, "grad_norm": 0.0583515465259552, "learning_rate": 5.155080686498324e-05, "loss": 0.0904, "step": 1341 }, { "epoch": 1.0831378484045908, "grad_norm": 0.062015097588300705, "learning_rate": 5.148033673841517e-05, "loss": 0.1075, "step": 1342 }, { "epoch": 1.0839450119813343, "grad_norm": 0.05287199094891548, "learning_rate": 5.140986366857904e-05, "loss": 0.0987, "step": 1343 }, { "epoch": 1.084752175558078, "grad_norm": 0.05343778431415558, "learning_rate": 5.133938779559239e-05, "loss": 0.0904, "step": 1344 }, { "epoch": 1.0855593391348215, "grad_norm": 0.053152844309806824, "learning_rate": 5.126890925957831e-05, "loss": 0.0952, "step": 1345 }, { "epoch": 1.0863665027115652, "grad_norm": 0.05567016452550888, "learning_rate": 5.1198428200665227e-05, "loss": 0.0993, "step": 1346 }, { "epoch": 1.0871736662883087, "grad_norm": 0.05333978310227394, "learning_rate": 5.1127944758986545e-05, "loss": 0.0975, "step": 1347 }, { "epoch": 1.0879808298650524, "grad_norm": 0.05006587132811546, "learning_rate": 5.105745907468043e-05, "loss": 0.0978, "step": 1348 }, { "epoch": 1.0887879934417959, "grad_norm": 0.05252319201827049, "learning_rate": 5.098697128788951e-05, "loss": 0.0913, "step": 1349 }, { "epoch": 1.0895951570185396, "grad_norm": 0.05744358152151108, "learning_rate": 5.091648153876054e-05, "loss": 0.0857, "step": 1350 }, { "epoch": 1.090402320595283, "grad_norm": 0.06418123096227646, "learning_rate": 5.0845989967444255e-05, "loss": 0.1001, "step": 1351 }, { "epoch": 1.0912094841720268, "grad_norm": 0.05817200616002083, "learning_rate": 5.077549671409497e-05, "loss": 0.094, "step": 1352 }, { "epoch": 1.0920166477487703, "grad_norm": 0.05800556764006615, "learning_rate": 5.070500191887033e-05, "loss": 0.1053, "step": 1353 }, { "epoch": 1.092823811325514, "grad_norm": 0.05047079548239708, "learning_rate": 5.0634505721931105e-05, "loss": 0.0973, "step": 1354 }, { "epoch": 1.0936309749022575, "grad_norm": 0.056162212044000626, "learning_rate": 5.056400826344077e-05, "loss": 0.0973, "step": 1355 }, { "epoch": 1.0944381384790012, "grad_norm": 0.06032247841358185, "learning_rate": 5.0493509683565365e-05, "loss": 0.0959, "step": 1356 }, { "epoch": 1.0952453020557447, "grad_norm": 0.052834268659353256, "learning_rate": 5.042301012247317e-05, "loss": 0.1014, "step": 1357 }, { "epoch": 1.0960524656324884, "grad_norm": 0.05730166286230087, "learning_rate": 5.0352509720334376e-05, "loss": 0.1101, "step": 1358 }, { "epoch": 1.0968596292092319, "grad_norm": 0.05395761877298355, "learning_rate": 5.028200861732083e-05, "loss": 0.093, "step": 1359 }, { "epoch": 1.0976667927859756, "grad_norm": 0.05834250897169113, "learning_rate": 5.0211506953605855e-05, "loss": 0.0839, "step": 1360 }, { "epoch": 1.098473956362719, "grad_norm": 0.05798870697617531, "learning_rate": 5.014100486936383e-05, "loss": 0.1017, "step": 1361 }, { "epoch": 1.0992811199394628, "grad_norm": 0.05584033578634262, "learning_rate": 5.0070502504769945e-05, "loss": 0.1017, "step": 1362 }, { "epoch": 1.1000882835162062, "grad_norm": 0.058435454964637756, "learning_rate": 5e-05, "loss": 0.0957, "step": 1363 }, { "epoch": 1.10089544709295, "grad_norm": 0.05149945989251137, "learning_rate": 4.992949749523006e-05, "loss": 0.0997, "step": 1364 }, { "epoch": 1.1017026106696934, "grad_norm": 0.04980111122131348, "learning_rate": 4.985899513063618e-05, "loss": 0.1042, "step": 1365 }, { "epoch": 1.1025097742464371, "grad_norm": 0.05775781348347664, "learning_rate": 4.9788493046394136e-05, "loss": 0.1023, "step": 1366 }, { "epoch": 1.1033169378231806, "grad_norm": 0.057785581797361374, "learning_rate": 4.9717991382679175e-05, "loss": 0.0938, "step": 1367 }, { "epoch": 1.1041241013999243, "grad_norm": 0.05840395390987396, "learning_rate": 4.964749027966563e-05, "loss": 0.1002, "step": 1368 }, { "epoch": 1.1049312649766678, "grad_norm": 0.04966799542307854, "learning_rate": 4.9576989877526845e-05, "loss": 0.0931, "step": 1369 }, { "epoch": 1.1057384285534115, "grad_norm": 0.05498110502958298, "learning_rate": 4.950649031643463e-05, "loss": 0.0969, "step": 1370 }, { "epoch": 1.1065455921301552, "grad_norm": 0.053672220557928085, "learning_rate": 4.9435991736559245e-05, "loss": 0.1064, "step": 1371 }, { "epoch": 1.1073527557068987, "grad_norm": 0.05393766239285469, "learning_rate": 4.936549427806891e-05, "loss": 0.0915, "step": 1372 }, { "epoch": 1.1081599192836422, "grad_norm": 0.057955510914325714, "learning_rate": 4.929499808112969e-05, "loss": 0.1034, "step": 1373 }, { "epoch": 1.108967082860386, "grad_norm": 0.05071543529629707, "learning_rate": 4.9224503285905046e-05, "loss": 0.0906, "step": 1374 }, { "epoch": 1.1097742464371296, "grad_norm": 0.048853229731321335, "learning_rate": 4.915401003255577e-05, "loss": 0.0974, "step": 1375 }, { "epoch": 1.1105814100138731, "grad_norm": 0.05494729429483414, "learning_rate": 4.908351846123947e-05, "loss": 0.0989, "step": 1376 }, { "epoch": 1.1113885735906168, "grad_norm": 0.05539925768971443, "learning_rate": 4.901302871211052e-05, "loss": 0.0977, "step": 1377 }, { "epoch": 1.1121957371673603, "grad_norm": 0.05466871336102486, "learning_rate": 4.894254092531957e-05, "loss": 0.1027, "step": 1378 }, { "epoch": 1.113002900744104, "grad_norm": 0.06426863372325897, "learning_rate": 4.887205524101345e-05, "loss": 0.1014, "step": 1379 }, { "epoch": 1.1138100643208475, "grad_norm": 0.05359235033392906, "learning_rate": 4.880157179933478e-05, "loss": 0.098, "step": 1380 }, { "epoch": 1.1146172278975912, "grad_norm": 0.046127818524837494, "learning_rate": 4.8731090740421685e-05, "loss": 0.0996, "step": 1381 }, { "epoch": 1.1154243914743347, "grad_norm": 0.06031598150730133, "learning_rate": 4.866061220440763e-05, "loss": 0.1022, "step": 1382 }, { "epoch": 1.1162315550510784, "grad_norm": 0.05615110695362091, "learning_rate": 4.859013633142096e-05, "loss": 0.0918, "step": 1383 }, { "epoch": 1.117038718627822, "grad_norm": 0.049914490431547165, "learning_rate": 4.851966326158485e-05, "loss": 0.1001, "step": 1384 }, { "epoch": 1.1178458822045656, "grad_norm": 0.053959961980581284, "learning_rate": 4.844919313501677e-05, "loss": 0.1046, "step": 1385 }, { "epoch": 1.118653045781309, "grad_norm": 0.0659768357872963, "learning_rate": 4.837872609182846e-05, "loss": 0.0978, "step": 1386 }, { "epoch": 1.1194602093580528, "grad_norm": 0.05655699968338013, "learning_rate": 4.830826227212543e-05, "loss": 0.1052, "step": 1387 }, { "epoch": 1.1202673729347963, "grad_norm": 0.05162840336561203, "learning_rate": 4.823780181600685e-05, "loss": 0.0991, "step": 1388 }, { "epoch": 1.12107453651154, "grad_norm": 0.05492141470313072, "learning_rate": 4.816734486356517e-05, "loss": 0.1038, "step": 1389 }, { "epoch": 1.1218817000882835, "grad_norm": 0.06244286522269249, "learning_rate": 4.8096891554885896e-05, "loss": 0.1042, "step": 1390 }, { "epoch": 1.1226888636650272, "grad_norm": 0.05401325598359108, "learning_rate": 4.802644203004723e-05, "loss": 0.0947, "step": 1391 }, { "epoch": 1.1234960272417707, "grad_norm": 0.06098264455795288, "learning_rate": 4.795599642911994e-05, "loss": 0.0925, "step": 1392 }, { "epoch": 1.1243031908185144, "grad_norm": 0.051814790815114975, "learning_rate": 4.7885554892166924e-05, "loss": 0.1002, "step": 1393 }, { "epoch": 1.1251103543952579, "grad_norm": 0.06303991377353668, "learning_rate": 4.781511755924302e-05, "loss": 0.1017, "step": 1394 }, { "epoch": 1.1259175179720016, "grad_norm": 0.05456051975488663, "learning_rate": 4.774468457039473e-05, "loss": 0.0858, "step": 1395 }, { "epoch": 1.126724681548745, "grad_norm": 0.054906927049160004, "learning_rate": 4.767425606565987e-05, "loss": 0.1034, "step": 1396 }, { "epoch": 1.1275318451254888, "grad_norm": 0.05684875324368477, "learning_rate": 4.7603832185067416e-05, "loss": 0.1086, "step": 1397 }, { "epoch": 1.1283390087022322, "grad_norm": 0.051478926092386246, "learning_rate": 4.753341306863704e-05, "loss": 0.0963, "step": 1398 }, { "epoch": 1.129146172278976, "grad_norm": 0.05514628067612648, "learning_rate": 4.7462998856379065e-05, "loss": 0.0912, "step": 1399 }, { "epoch": 1.1299533358557194, "grad_norm": 0.059231579303741455, "learning_rate": 4.739258968829396e-05, "loss": 0.0905, "step": 1400 }, { "epoch": 1.1307604994324632, "grad_norm": 0.05544985458254814, "learning_rate": 4.7322185704372234e-05, "loss": 0.0943, "step": 1401 }, { "epoch": 1.1315676630092066, "grad_norm": 0.05559937283396721, "learning_rate": 4.725178704459404e-05, "loss": 0.0853, "step": 1402 }, { "epoch": 1.1323748265859503, "grad_norm": 0.057922158390283585, "learning_rate": 4.7181393848929e-05, "loss": 0.0887, "step": 1403 }, { "epoch": 1.1331819901626938, "grad_norm": 0.054624754935503006, "learning_rate": 4.711100625733581e-05, "loss": 0.1006, "step": 1404 }, { "epoch": 1.1339891537394375, "grad_norm": 0.05147130787372589, "learning_rate": 4.704062440976209e-05, "loss": 0.1019, "step": 1405 }, { "epoch": 1.134796317316181, "grad_norm": 0.05304765701293945, "learning_rate": 4.697024844614396e-05, "loss": 0.0964, "step": 1406 }, { "epoch": 1.1356034808929247, "grad_norm": 0.057767629623413086, "learning_rate": 4.6899878506405906e-05, "loss": 0.0923, "step": 1407 }, { "epoch": 1.1364106444696682, "grad_norm": 0.056161005049943924, "learning_rate": 4.6829514730460425e-05, "loss": 0.0998, "step": 1408 }, { "epoch": 1.1364106444696682, "eval_loss": 0.11367829889059067, "eval_runtime": 4007.655, "eval_samples_per_second": 2.221, "eval_steps_per_second": 2.221, "step": 1408 }, { "epoch": 1.137217808046412, "grad_norm": 0.05180082097649574, "learning_rate": 4.675915725820773e-05, "loss": 0.099, "step": 1409 }, { "epoch": 1.1380249716231554, "grad_norm": 0.06259894371032715, "learning_rate": 4.668880622953554e-05, "loss": 0.0957, "step": 1410 }, { "epoch": 1.1388321351998991, "grad_norm": 0.05633513256907463, "learning_rate": 4.661846178431873e-05, "loss": 0.0958, "step": 1411 }, { "epoch": 1.1396392987766428, "grad_norm": 0.05600317567586899, "learning_rate": 4.6548124062419126e-05, "loss": 0.1027, "step": 1412 }, { "epoch": 1.1404464623533863, "grad_norm": 0.05271922051906586, "learning_rate": 4.6477793203685096e-05, "loss": 0.0913, "step": 1413 }, { "epoch": 1.1412536259301298, "grad_norm": 0.05019792914390564, "learning_rate": 4.640746934795151e-05, "loss": 0.0953, "step": 1414 }, { "epoch": 1.1420607895068735, "grad_norm": 0.05359852313995361, "learning_rate": 4.633715263503916e-05, "loss": 0.0975, "step": 1415 }, { "epoch": 1.1428679530836172, "grad_norm": 0.06329607963562012, "learning_rate": 4.626684320475473e-05, "loss": 0.1055, "step": 1416 }, { "epoch": 1.1436751166603607, "grad_norm": 0.04810573533177376, "learning_rate": 4.6196541196890406e-05, "loss": 0.0821, "step": 1417 }, { "epoch": 1.1444822802371042, "grad_norm": 0.051786649972200394, "learning_rate": 4.612624675122362e-05, "loss": 0.0983, "step": 1418 }, { "epoch": 1.145289443813848, "grad_norm": 0.05178788676857948, "learning_rate": 4.605596000751673e-05, "loss": 0.0888, "step": 1419 }, { "epoch": 1.1460966073905916, "grad_norm": 0.05726385861635208, "learning_rate": 4.5985681105516857e-05, "loss": 0.1002, "step": 1420 }, { "epoch": 1.146903770967335, "grad_norm": 0.0544670931994915, "learning_rate": 4.591541018495542e-05, "loss": 0.1, "step": 1421 }, { "epoch": 1.1477109345440786, "grad_norm": 0.05139665678143501, "learning_rate": 4.584514738554807e-05, "loss": 0.0936, "step": 1422 }, { "epoch": 1.1485180981208223, "grad_norm": 0.054147884249687195, "learning_rate": 4.5774892846994295e-05, "loss": 0.0982, "step": 1423 }, { "epoch": 1.149325261697566, "grad_norm": 0.05432237312197685, "learning_rate": 4.5704646708977096e-05, "loss": 0.0903, "step": 1424 }, { "epoch": 1.1501324252743095, "grad_norm": 0.05467607080936432, "learning_rate": 4.563440911116284e-05, "loss": 0.0954, "step": 1425 }, { "epoch": 1.1509395888510532, "grad_norm": 0.05028024688363075, "learning_rate": 4.556418019320087e-05, "loss": 0.0874, "step": 1426 }, { "epoch": 1.1517467524277967, "grad_norm": 0.05326267331838608, "learning_rate": 4.549396009472331e-05, "loss": 0.1097, "step": 1427 }, { "epoch": 1.1525539160045404, "grad_norm": 0.051553092896938324, "learning_rate": 4.5423748955344685e-05, "loss": 0.1025, "step": 1428 }, { "epoch": 1.1533610795812839, "grad_norm": 0.050990067422389984, "learning_rate": 4.535354691466182e-05, "loss": 0.0982, "step": 1429 }, { "epoch": 1.1541682431580276, "grad_norm": 0.04973863810300827, "learning_rate": 4.528335411225331e-05, "loss": 0.0954, "step": 1430 }, { "epoch": 1.154975406734771, "grad_norm": 0.05767285078763962, "learning_rate": 4.521317068767949e-05, "loss": 0.0981, "step": 1431 }, { "epoch": 1.1557825703115148, "grad_norm": 0.05843889340758324, "learning_rate": 4.514299678048198e-05, "loss": 0.1091, "step": 1432 }, { "epoch": 1.1565897338882583, "grad_norm": 0.047380756586790085, "learning_rate": 4.507283253018355e-05, "loss": 0.0998, "step": 1433 }, { "epoch": 1.157396897465002, "grad_norm": 0.05887581408023834, "learning_rate": 4.5002678076287685e-05, "loss": 0.1091, "step": 1434 }, { "epoch": 1.1582040610417454, "grad_norm": 0.054402656853199005, "learning_rate": 4.493253355827846e-05, "loss": 0.1061, "step": 1435 }, { "epoch": 1.1590112246184892, "grad_norm": 0.05517078936100006, "learning_rate": 4.4862399115620184e-05, "loss": 0.0921, "step": 1436 }, { "epoch": 1.1598183881952326, "grad_norm": 0.05497118830680847, "learning_rate": 4.479227488775707e-05, "loss": 0.0977, "step": 1437 }, { "epoch": 1.1606255517719763, "grad_norm": 0.05431082472205162, "learning_rate": 4.472216101411313e-05, "loss": 0.0932, "step": 1438 }, { "epoch": 1.1614327153487198, "grad_norm": 0.05252033472061157, "learning_rate": 4.4652057634091685e-05, "loss": 0.0967, "step": 1439 }, { "epoch": 1.1622398789254635, "grad_norm": 0.05644332617521286, "learning_rate": 4.458196488707527e-05, "loss": 0.1076, "step": 1440 }, { "epoch": 1.163047042502207, "grad_norm": 0.05601223558187485, "learning_rate": 4.451188291242521e-05, "loss": 0.0941, "step": 1441 }, { "epoch": 1.1638542060789507, "grad_norm": 0.057686906307935715, "learning_rate": 4.4441811849481505e-05, "loss": 0.1047, "step": 1442 }, { "epoch": 1.1646613696556942, "grad_norm": 0.0583050511777401, "learning_rate": 4.4371751837562326e-05, "loss": 0.0988, "step": 1443 }, { "epoch": 1.165468533232438, "grad_norm": 0.054821230471134186, "learning_rate": 4.430170301596403e-05, "loss": 0.0998, "step": 1444 }, { "epoch": 1.1662756968091814, "grad_norm": 0.0576142892241478, "learning_rate": 4.423166552396058e-05, "loss": 0.0987, "step": 1445 }, { "epoch": 1.1670828603859251, "grad_norm": 0.05591549724340439, "learning_rate": 4.4161639500803515e-05, "loss": 0.1003, "step": 1446 }, { "epoch": 1.1678900239626686, "grad_norm": 0.055217571556568146, "learning_rate": 4.409162508572151e-05, "loss": 0.0966, "step": 1447 }, { "epoch": 1.1686971875394123, "grad_norm": 0.057856589555740356, "learning_rate": 4.4021622417920214e-05, "loss": 0.1092, "step": 1448 }, { "epoch": 1.1695043511161558, "grad_norm": 0.050127651542425156, "learning_rate": 4.395163163658186e-05, "loss": 0.0838, "step": 1449 }, { "epoch": 1.1703115146928995, "grad_norm": 0.06182342767715454, "learning_rate": 4.388165288086508e-05, "loss": 0.1025, "step": 1450 }, { "epoch": 1.171118678269643, "grad_norm": 0.05751679092645645, "learning_rate": 4.381168628990464e-05, "loss": 0.1082, "step": 1451 }, { "epoch": 1.1719258418463867, "grad_norm": 0.053951386362314224, "learning_rate": 4.3741732002810986e-05, "loss": 0.0879, "step": 1452 }, { "epoch": 1.1727330054231302, "grad_norm": 0.054338566958904266, "learning_rate": 4.367179015867028e-05, "loss": 0.1017, "step": 1453 }, { "epoch": 1.173540168999874, "grad_norm": 0.057407528162002563, "learning_rate": 4.360186089654379e-05, "loss": 0.097, "step": 1454 }, { "epoch": 1.1743473325766174, "grad_norm": 0.05695270746946335, "learning_rate": 4.3531944355467855e-05, "loss": 0.1046, "step": 1455 }, { "epoch": 1.175154496153361, "grad_norm": 0.05589112639427185, "learning_rate": 4.346204067445348e-05, "loss": 0.0983, "step": 1456 }, { "epoch": 1.1759616597301048, "grad_norm": 0.05427222698926926, "learning_rate": 4.3392149992486144e-05, "loss": 0.0992, "step": 1457 }, { "epoch": 1.1767688233068483, "grad_norm": 0.0536048486828804, "learning_rate": 4.332227244852543e-05, "loss": 0.0964, "step": 1458 }, { "epoch": 1.1775759868835918, "grad_norm": 0.05518953502178192, "learning_rate": 4.3252408181504844e-05, "loss": 0.1085, "step": 1459 }, { "epoch": 1.1783831504603355, "grad_norm": 0.06058667600154877, "learning_rate": 4.318255733033145e-05, "loss": 0.0934, "step": 1460 }, { "epoch": 1.1791903140370792, "grad_norm": 0.0479750782251358, "learning_rate": 4.311272003388568e-05, "loss": 0.0994, "step": 1461 }, { "epoch": 1.1799974776138227, "grad_norm": 0.05248510465025902, "learning_rate": 4.304289643102099e-05, "loss": 0.1049, "step": 1462 }, { "epoch": 1.1808046411905662, "grad_norm": 0.05330374091863632, "learning_rate": 4.297308666056362e-05, "loss": 0.0988, "step": 1463 }, { "epoch": 1.1816118047673099, "grad_norm": 0.05739835277199745, "learning_rate": 4.2903290861312316e-05, "loss": 0.104, "step": 1464 }, { "epoch": 1.1824189683440536, "grad_norm": 0.056255366653203964, "learning_rate": 4.283350917203802e-05, "loss": 0.0887, "step": 1465 }, { "epoch": 1.183226131920797, "grad_norm": 0.05646447092294693, "learning_rate": 4.2763741731483664e-05, "loss": 0.0917, "step": 1466 }, { "epoch": 1.1840332954975405, "grad_norm": 0.05628127604722977, "learning_rate": 4.2693988678363766e-05, "loss": 0.0988, "step": 1467 }, { "epoch": 1.1848404590742843, "grad_norm": 0.05189945921301842, "learning_rate": 4.262425015136436e-05, "loss": 0.0961, "step": 1468 }, { "epoch": 1.185647622651028, "grad_norm": 0.05240646004676819, "learning_rate": 4.255452628914248e-05, "loss": 0.0995, "step": 1469 }, { "epoch": 1.1864547862277715, "grad_norm": 0.055181413888931274, "learning_rate": 4.248481723032609e-05, "loss": 0.0958, "step": 1470 }, { "epoch": 1.1872619498045152, "grad_norm": 0.05319532752037048, "learning_rate": 4.241512311351366e-05, "loss": 0.1, "step": 1471 }, { "epoch": 1.1880691133812586, "grad_norm": 0.05409524217247963, "learning_rate": 4.2345444077274e-05, "loss": 0.0987, "step": 1472 }, { "epoch": 1.1888762769580024, "grad_norm": 0.05742429569363594, "learning_rate": 4.22757802601459e-05, "loss": 0.0984, "step": 1473 }, { "epoch": 1.1896834405347458, "grad_norm": 0.058722954243421555, "learning_rate": 4.2206131800637924e-05, "loss": 0.1073, "step": 1474 }, { "epoch": 1.1904906041114895, "grad_norm": 0.0513107106089592, "learning_rate": 4.213649883722805e-05, "loss": 0.0921, "step": 1475 }, { "epoch": 1.191297767688233, "grad_norm": 0.055246639996767044, "learning_rate": 4.2066881508363523e-05, "loss": 0.099, "step": 1476 }, { "epoch": 1.1921049312649767, "grad_norm": 0.04966241121292114, "learning_rate": 4.199727995246041e-05, "loss": 0.099, "step": 1477 }, { "epoch": 1.1929120948417202, "grad_norm": 0.05299234762787819, "learning_rate": 4.192769430790349e-05, "loss": 0.1072, "step": 1478 }, { "epoch": 1.193719258418464, "grad_norm": 0.05512527748942375, "learning_rate": 4.1858124713045885e-05, "loss": 0.0974, "step": 1479 }, { "epoch": 1.1945264219952074, "grad_norm": 0.0532386414706707, "learning_rate": 4.17885713062088e-05, "loss": 0.0846, "step": 1480 }, { "epoch": 1.1953335855719511, "grad_norm": 0.056560348719358444, "learning_rate": 4.1719034225681274e-05, "loss": 0.108, "step": 1481 }, { "epoch": 1.1961407491486946, "grad_norm": 0.056977029889822006, "learning_rate": 4.164951360971982e-05, "loss": 0.0934, "step": 1482 }, { "epoch": 1.1969479127254383, "grad_norm": 0.052842721343040466, "learning_rate": 4.158000959654833e-05, "loss": 0.0966, "step": 1483 }, { "epoch": 1.1977550763021818, "grad_norm": 0.05269904062151909, "learning_rate": 4.151052232435757e-05, "loss": 0.0988, "step": 1484 }, { "epoch": 1.1985622398789255, "grad_norm": 0.05394161120057106, "learning_rate": 4.1441051931305093e-05, "loss": 0.0906, "step": 1485 }, { "epoch": 1.199369403455669, "grad_norm": 0.04908270388841629, "learning_rate": 4.137159855551486e-05, "loss": 0.092, "step": 1486 }, { "epoch": 1.2001765670324127, "grad_norm": 0.050416216254234314, "learning_rate": 4.130216233507701e-05, "loss": 0.0944, "step": 1487 }, { "epoch": 1.2009837306091562, "grad_norm": 0.06348618865013123, "learning_rate": 4.123274340804758e-05, "loss": 0.0946, "step": 1488 }, { "epoch": 1.2017908941859, "grad_norm": 0.056067679077386856, "learning_rate": 4.116334191244823e-05, "loss": 0.099, "step": 1489 }, { "epoch": 1.2025980577626434, "grad_norm": 0.05565875396132469, "learning_rate": 4.10939579862659e-05, "loss": 0.1064, "step": 1490 }, { "epoch": 1.203405221339387, "grad_norm": 0.05398828163743019, "learning_rate": 4.102459176745267e-05, "loss": 0.1015, "step": 1491 }, { "epoch": 1.2042123849161306, "grad_norm": 0.05584413558244705, "learning_rate": 4.095524339392539e-05, "loss": 0.0876, "step": 1492 }, { "epoch": 1.2050195484928743, "grad_norm": 0.057046130299568176, "learning_rate": 4.088591300356542e-05, "loss": 0.0993, "step": 1493 }, { "epoch": 1.2058267120696178, "grad_norm": 0.06014026701450348, "learning_rate": 4.081660073421838e-05, "loss": 0.0907, "step": 1494 }, { "epoch": 1.2066338756463615, "grad_norm": 0.05617552623152733, "learning_rate": 4.074730672369385e-05, "loss": 0.094, "step": 1495 }, { "epoch": 1.207441039223105, "grad_norm": 0.055147264152765274, "learning_rate": 4.067803110976513e-05, "loss": 0.0968, "step": 1496 }, { "epoch": 1.2082482027998487, "grad_norm": 0.05785325914621353, "learning_rate": 4.060877403016886e-05, "loss": 0.1006, "step": 1497 }, { "epoch": 1.2090553663765922, "grad_norm": 0.05535483360290527, "learning_rate": 4.053953562260497e-05, "loss": 0.0978, "step": 1498 }, { "epoch": 1.2098625299533359, "grad_norm": 0.054639652371406555, "learning_rate": 4.0470316024736135e-05, "loss": 0.0971, "step": 1499 }, { "epoch": 1.2106696935300794, "grad_norm": 0.05322081968188286, "learning_rate": 4.040111537418771e-05, "loss": 0.0983, "step": 1500 }, { "epoch": 1.211476857106823, "grad_norm": 0.059996165335178375, "learning_rate": 4.033193380854734e-05, "loss": 0.1001, "step": 1501 }, { "epoch": 1.2122840206835668, "grad_norm": 0.05914293974637985, "learning_rate": 4.0262771465364744e-05, "loss": 0.0923, "step": 1502 }, { "epoch": 1.2130911842603103, "grad_norm": 0.0504484660923481, "learning_rate": 4.0193628482151405e-05, "loss": 0.1016, "step": 1503 }, { "epoch": 1.2138983478370537, "grad_norm": 0.05175400152802467, "learning_rate": 4.0124504996380356e-05, "loss": 0.0941, "step": 1504 }, { "epoch": 1.2147055114137975, "grad_norm": 0.05279899388551712, "learning_rate": 4.0055401145485795e-05, "loss": 0.0955, "step": 1505 }, { "epoch": 1.2155126749905412, "grad_norm": 0.05489225313067436, "learning_rate": 3.998631706686292e-05, "loss": 0.1056, "step": 1506 }, { "epoch": 1.2163198385672847, "grad_norm": 0.05567755177617073, "learning_rate": 3.991725289786765e-05, "loss": 0.0864, "step": 1507 }, { "epoch": 1.2171270021440281, "grad_norm": 0.05051262676715851, "learning_rate": 3.984820877581625e-05, "loss": 0.095, "step": 1508 }, { "epoch": 1.2179341657207718, "grad_norm": 0.053347863256931305, "learning_rate": 3.977918483798519e-05, "loss": 0.0899, "step": 1509 }, { "epoch": 1.2187413292975156, "grad_norm": 0.05010542646050453, "learning_rate": 3.971018122161075e-05, "loss": 0.0805, "step": 1510 }, { "epoch": 1.219548492874259, "grad_norm": 0.054851412773132324, "learning_rate": 3.964119806388887e-05, "loss": 0.0974, "step": 1511 }, { "epoch": 1.2203556564510025, "grad_norm": 0.052156005054712296, "learning_rate": 3.957223550197473e-05, "loss": 0.0957, "step": 1512 }, { "epoch": 1.2211628200277462, "grad_norm": 0.0597258135676384, "learning_rate": 3.950329367298268e-05, "loss": 0.1076, "step": 1513 }, { "epoch": 1.22196998360449, "grad_norm": 0.058639198541641235, "learning_rate": 3.943437271398571e-05, "loss": 0.0962, "step": 1514 }, { "epoch": 1.2227771471812334, "grad_norm": 0.055115483701229095, "learning_rate": 3.9365472762015415e-05, "loss": 0.0977, "step": 1515 }, { "epoch": 1.2235843107579771, "grad_norm": 0.05183776095509529, "learning_rate": 3.929659395406159e-05, "loss": 0.0978, "step": 1516 }, { "epoch": 1.2243914743347206, "grad_norm": 0.05798739194869995, "learning_rate": 3.9227736427071995e-05, "loss": 0.0898, "step": 1517 }, { "epoch": 1.2251986379114643, "grad_norm": 0.05078835040330887, "learning_rate": 3.9158900317952054e-05, "loss": 0.1008, "step": 1518 }, { "epoch": 1.2260058014882078, "grad_norm": 0.05702638626098633, "learning_rate": 3.9090085763564667e-05, "loss": 0.0971, "step": 1519 }, { "epoch": 1.2268129650649515, "grad_norm": 0.051120009273290634, "learning_rate": 3.90212929007298e-05, "loss": 0.0961, "step": 1520 }, { "epoch": 1.227620128641695, "grad_norm": 0.05171988904476166, "learning_rate": 3.895252186622433e-05, "loss": 0.1048, "step": 1521 }, { "epoch": 1.2284272922184387, "grad_norm": 0.05262458696961403, "learning_rate": 3.888377279678178e-05, "loss": 0.1016, "step": 1522 }, { "epoch": 1.2292344557951822, "grad_norm": 0.04878690093755722, "learning_rate": 3.881504582909191e-05, "loss": 0.0928, "step": 1523 }, { "epoch": 1.230041619371926, "grad_norm": 0.051622919738292694, "learning_rate": 3.8746341099800604e-05, "loss": 0.0919, "step": 1524 }, { "epoch": 1.2308487829486694, "grad_norm": 0.05369985103607178, "learning_rate": 3.867765874550949e-05, "loss": 0.1034, "step": 1525 }, { "epoch": 1.231655946525413, "grad_norm": 0.05775514617562294, "learning_rate": 3.860899890277575e-05, "loss": 0.0923, "step": 1526 }, { "epoch": 1.2324631101021566, "grad_norm": 0.05649261549115181, "learning_rate": 3.854036170811176e-05, "loss": 0.0931, "step": 1527 }, { "epoch": 1.2332702736789003, "grad_norm": 0.051947299391031265, "learning_rate": 3.8471747297984925e-05, "loss": 0.099, "step": 1528 }, { "epoch": 1.2340774372556438, "grad_norm": 0.054033875465393066, "learning_rate": 3.840315580881728e-05, "loss": 0.1062, "step": 1529 }, { "epoch": 1.2348846008323875, "grad_norm": 0.0523831769824028, "learning_rate": 3.8334587376985344e-05, "loss": 0.0889, "step": 1530 }, { "epoch": 1.235691764409131, "grad_norm": 0.05607297644019127, "learning_rate": 3.8266042138819746e-05, "loss": 0.102, "step": 1531 }, { "epoch": 1.2364989279858747, "grad_norm": 0.05678236111998558, "learning_rate": 3.819752023060506e-05, "loss": 0.0888, "step": 1532 }, { "epoch": 1.2373060915626182, "grad_norm": 0.05291256681084633, "learning_rate": 3.812902178857941e-05, "loss": 0.0948, "step": 1533 }, { "epoch": 1.2381132551393619, "grad_norm": 0.05564703419804573, "learning_rate": 3.806054694893432e-05, "loss": 0.1034, "step": 1534 }, { "epoch": 1.2389204187161054, "grad_norm": 0.05492163822054863, "learning_rate": 3.7992095847814337e-05, "loss": 0.094, "step": 1535 }, { "epoch": 1.239727582292849, "grad_norm": 0.059076059609651566, "learning_rate": 3.7923668621316824e-05, "loss": 0.0893, "step": 1536 }, { "epoch": 1.239727582292849, "eval_loss": 0.11290178447961807, "eval_runtime": 4022.4299, "eval_samples_per_second": 2.213, "eval_steps_per_second": 2.213, "step": 1536 }, { "epoch": 1.2405347458695926, "grad_norm": 0.05612736567854881, "learning_rate": 3.785526540549173e-05, "loss": 0.085, "step": 1537 }, { "epoch": 1.2413419094463363, "grad_norm": 0.058760516345500946, "learning_rate": 3.778688633634117e-05, "loss": 0.0987, "step": 1538 }, { "epoch": 1.2421490730230798, "grad_norm": 0.059618331491947174, "learning_rate": 3.771853154981934e-05, "loss": 0.0955, "step": 1539 }, { "epoch": 1.2429562365998235, "grad_norm": 0.052046000957489014, "learning_rate": 3.7650201181832065e-05, "loss": 0.0932, "step": 1540 }, { "epoch": 1.243763400176567, "grad_norm": 0.06632925570011139, "learning_rate": 3.758189536823673e-05, "loss": 0.1005, "step": 1541 }, { "epoch": 1.2445705637533107, "grad_norm": 0.05406505987048149, "learning_rate": 3.7513614244841796e-05, "loss": 0.0978, "step": 1542 }, { "epoch": 1.2453777273300541, "grad_norm": 0.059586476534605026, "learning_rate": 3.744535794740671e-05, "loss": 0.0995, "step": 1543 }, { "epoch": 1.2461848909067978, "grad_norm": 0.05669262260198593, "learning_rate": 3.737712661164149e-05, "loss": 0.0986, "step": 1544 }, { "epoch": 1.2469920544835413, "grad_norm": 0.0556720569729805, "learning_rate": 3.730892037320659e-05, "loss": 0.1023, "step": 1545 }, { "epoch": 1.247799218060285, "grad_norm": 0.05552751570940018, "learning_rate": 3.724073936771252e-05, "loss": 0.0926, "step": 1546 }, { "epoch": 1.2486063816370288, "grad_norm": 0.04934966564178467, "learning_rate": 3.717258373071965e-05, "loss": 0.0936, "step": 1547 }, { "epoch": 1.2494135452137722, "grad_norm": 0.060293421149253845, "learning_rate": 3.710445359773788e-05, "loss": 0.096, "step": 1548 }, { "epoch": 1.2502207087905157, "grad_norm": 0.05721204727888107, "learning_rate": 3.703634910422643e-05, "loss": 0.106, "step": 1549 }, { "epoch": 1.2510278723672594, "grad_norm": 0.05165942758321762, "learning_rate": 3.6968270385593556e-05, "loss": 0.096, "step": 1550 }, { "epoch": 1.2518350359440031, "grad_norm": 0.052315447479486465, "learning_rate": 3.6900217577196185e-05, "loss": 0.1023, "step": 1551 }, { "epoch": 1.2526421995207466, "grad_norm": 0.053604334592819214, "learning_rate": 3.683219081433986e-05, "loss": 0.1047, "step": 1552 }, { "epoch": 1.25344936309749, "grad_norm": 0.05139333754777908, "learning_rate": 3.6764190232278195e-05, "loss": 0.0953, "step": 1553 }, { "epoch": 1.2542565266742338, "grad_norm": 0.0529954731464386, "learning_rate": 3.669621596621288e-05, "loss": 0.0961, "step": 1554 }, { "epoch": 1.2550636902509775, "grad_norm": 0.05284703150391579, "learning_rate": 3.6628268151293165e-05, "loss": 0.0984, "step": 1555 }, { "epoch": 1.255870853827721, "grad_norm": 0.053427234292030334, "learning_rate": 3.656034692261582e-05, "loss": 0.1081, "step": 1556 }, { "epoch": 1.2566780174044645, "grad_norm": 0.052795860916376114, "learning_rate": 3.649245241522468e-05, "loss": 0.0919, "step": 1557 }, { "epoch": 1.2574851809812082, "grad_norm": 0.05850130319595337, "learning_rate": 3.642458476411048e-05, "loss": 0.0967, "step": 1558 }, { "epoch": 1.258292344557952, "grad_norm": 0.05718611553311348, "learning_rate": 3.635674410421053e-05, "loss": 0.0938, "step": 1559 }, { "epoch": 1.2590995081346954, "grad_norm": 0.05824499949812889, "learning_rate": 3.628893057040853e-05, "loss": 0.0912, "step": 1560 }, { "epoch": 1.2599066717114389, "grad_norm": 0.05663374438881874, "learning_rate": 3.622114429753418e-05, "loss": 0.0926, "step": 1561 }, { "epoch": 1.2607138352881826, "grad_norm": 0.053009457886219025, "learning_rate": 3.615338542036304e-05, "loss": 0.0883, "step": 1562 }, { "epoch": 1.2615209988649263, "grad_norm": 0.05274448171257973, "learning_rate": 3.608565407361615e-05, "loss": 0.0997, "step": 1563 }, { "epoch": 1.2623281624416698, "grad_norm": 0.055354684591293335, "learning_rate": 3.601795039195985e-05, "loss": 0.1, "step": 1564 }, { "epoch": 1.2631353260184135, "grad_norm": 0.05688100680708885, "learning_rate": 3.5950274510005486e-05, "loss": 0.1016, "step": 1565 }, { "epoch": 1.263942489595157, "grad_norm": 0.0497344546020031, "learning_rate": 3.588262656230904e-05, "loss": 0.1016, "step": 1566 }, { "epoch": 1.2647496531719007, "grad_norm": 0.048181965947151184, "learning_rate": 3.58150066833711e-05, "loss": 0.0918, "step": 1567 }, { "epoch": 1.2655568167486442, "grad_norm": 0.059475284069776535, "learning_rate": 3.5747415007636304e-05, "loss": 0.1096, "step": 1568 }, { "epoch": 1.2663639803253879, "grad_norm": 0.05641814321279526, "learning_rate": 3.5679851669493306e-05, "loss": 0.1023, "step": 1569 }, { "epoch": 1.2671711439021314, "grad_norm": 0.0580049529671669, "learning_rate": 3.561231680327438e-05, "loss": 0.0974, "step": 1570 }, { "epoch": 1.267978307478875, "grad_norm": 0.06340297311544418, "learning_rate": 3.554481054325522e-05, "loss": 0.107, "step": 1571 }, { "epoch": 1.2687854710556186, "grad_norm": 0.053959768265485764, "learning_rate": 3.54773330236546e-05, "loss": 0.1051, "step": 1572 }, { "epoch": 1.2695926346323623, "grad_norm": 0.05161672458052635, "learning_rate": 3.540988437863421e-05, "loss": 0.0915, "step": 1573 }, { "epoch": 1.2703997982091058, "grad_norm": 0.0530218668282032, "learning_rate": 3.534246474229824e-05, "loss": 0.0983, "step": 1574 }, { "epoch": 1.2712069617858495, "grad_norm": 0.05331966653466225, "learning_rate": 3.527507424869332e-05, "loss": 0.1027, "step": 1575 }, { "epoch": 1.272014125362593, "grad_norm": 0.05721476674079895, "learning_rate": 3.520771303180803e-05, "loss": 0.1004, "step": 1576 }, { "epoch": 1.2728212889393367, "grad_norm": 0.050080329179763794, "learning_rate": 3.514038122557283e-05, "loss": 0.093, "step": 1577 }, { "epoch": 1.2736284525160801, "grad_norm": 0.05331555753946304, "learning_rate": 3.5073078963859615e-05, "loss": 0.0951, "step": 1578 }, { "epoch": 1.2744356160928239, "grad_norm": 0.052781980484724045, "learning_rate": 3.500580638048163e-05, "loss": 0.096, "step": 1579 }, { "epoch": 1.2752427796695673, "grad_norm": 0.054603688418865204, "learning_rate": 3.493856360919305e-05, "loss": 0.0999, "step": 1580 }, { "epoch": 1.276049943246311, "grad_norm": 0.06206927075982094, "learning_rate": 3.48713507836888e-05, "loss": 0.1057, "step": 1581 }, { "epoch": 1.2768571068230545, "grad_norm": 0.06230514869093895, "learning_rate": 3.4804168037604265e-05, "loss": 0.0962, "step": 1582 }, { "epoch": 1.2776642703997982, "grad_norm": 0.053634416311979294, "learning_rate": 3.473701550451499e-05, "loss": 0.0933, "step": 1583 }, { "epoch": 1.2784714339765417, "grad_norm": 0.047685541212558746, "learning_rate": 3.4669893317936506e-05, "loss": 0.0911, "step": 1584 }, { "epoch": 1.2792785975532854, "grad_norm": 0.052652470767498016, "learning_rate": 3.4602801611323976e-05, "loss": 0.1032, "step": 1585 }, { "epoch": 1.280085761130029, "grad_norm": 0.0600561685860157, "learning_rate": 3.4535740518071966e-05, "loss": 0.105, "step": 1586 }, { "epoch": 1.2808929247067726, "grad_norm": 0.04975568503141403, "learning_rate": 3.4468710171514175e-05, "loss": 0.103, "step": 1587 }, { "epoch": 1.2817000882835163, "grad_norm": 0.057458680123090744, "learning_rate": 3.440171070492319e-05, "loss": 0.101, "step": 1588 }, { "epoch": 1.2825072518602598, "grad_norm": 0.05034846067428589, "learning_rate": 3.4334742251510127e-05, "loss": 0.1023, "step": 1589 }, { "epoch": 1.2833144154370033, "grad_norm": 0.05214514955878258, "learning_rate": 3.426780494442455e-05, "loss": 0.1003, "step": 1590 }, { "epoch": 1.284121579013747, "grad_norm": 0.060507941991090775, "learning_rate": 3.420089891675401e-05, "loss": 0.0977, "step": 1591 }, { "epoch": 1.2849287425904907, "grad_norm": 0.05341675505042076, "learning_rate": 3.4134024301523917e-05, "loss": 0.0939, "step": 1592 }, { "epoch": 1.2857359061672342, "grad_norm": 0.054945435374975204, "learning_rate": 3.4067181231697195e-05, "loss": 0.0915, "step": 1593 }, { "epoch": 1.2865430697439777, "grad_norm": 0.04722421243786812, "learning_rate": 3.400036984017407e-05, "loss": 0.0974, "step": 1594 }, { "epoch": 1.2873502333207214, "grad_norm": 0.0567021444439888, "learning_rate": 3.3933590259791784e-05, "loss": 0.1081, "step": 1595 }, { "epoch": 1.2881573968974651, "grad_norm": 0.051238927990198135, "learning_rate": 3.386684262332429e-05, "loss": 0.1009, "step": 1596 }, { "epoch": 1.2889645604742086, "grad_norm": 0.05792234092950821, "learning_rate": 3.380012706348209e-05, "loss": 0.0995, "step": 1597 }, { "epoch": 1.289771724050952, "grad_norm": 0.06169963628053665, "learning_rate": 3.373344371291186e-05, "loss": 0.1062, "step": 1598 }, { "epoch": 1.2905788876276958, "grad_norm": 0.04784747585654259, "learning_rate": 3.366679270419626e-05, "loss": 0.096, "step": 1599 }, { "epoch": 1.2913860512044395, "grad_norm": 0.04714877903461456, "learning_rate": 3.360017416985364e-05, "loss": 0.1054, "step": 1600 }, { "epoch": 1.292193214781183, "grad_norm": 0.05442623421549797, "learning_rate": 3.35335882423378e-05, "loss": 0.1012, "step": 1601 }, { "epoch": 1.2930003783579265, "grad_norm": 0.06258675456047058, "learning_rate": 3.3467035054037665e-05, "loss": 0.1103, "step": 1602 }, { "epoch": 1.2938075419346702, "grad_norm": 0.0527527816593647, "learning_rate": 3.340051473727715e-05, "loss": 0.0962, "step": 1603 }, { "epoch": 1.2946147055114139, "grad_norm": 0.055114783346652985, "learning_rate": 3.333402742431469e-05, "loss": 0.1109, "step": 1604 }, { "epoch": 1.2954218690881574, "grad_norm": 0.060824207961559296, "learning_rate": 3.326757324734322e-05, "loss": 0.0878, "step": 1605 }, { "epoch": 1.2962290326649009, "grad_norm": 0.05124923959374428, "learning_rate": 3.3201152338489726e-05, "loss": 0.0901, "step": 1606 }, { "epoch": 1.2970361962416446, "grad_norm": 0.05657273530960083, "learning_rate": 3.313476482981506e-05, "loss": 0.0967, "step": 1607 }, { "epoch": 1.2978433598183883, "grad_norm": 0.055235255509614944, "learning_rate": 3.3068410853313694e-05, "loss": 0.0962, "step": 1608 }, { "epoch": 1.2986505233951318, "grad_norm": 0.05091793090105057, "learning_rate": 3.300209054091339e-05, "loss": 0.0992, "step": 1609 }, { "epoch": 1.2994576869718755, "grad_norm": 0.05806726589798927, "learning_rate": 3.293580402447501e-05, "loss": 0.1038, "step": 1610 }, { "epoch": 1.300264850548619, "grad_norm": 0.05401713028550148, "learning_rate": 3.2869551435792184e-05, "loss": 0.0987, "step": 1611 }, { "epoch": 1.3010720141253627, "grad_norm": 0.050620146095752716, "learning_rate": 3.2803332906591146e-05, "loss": 0.1026, "step": 1612 }, { "epoch": 1.3018791777021061, "grad_norm": 0.05707493796944618, "learning_rate": 3.273714856853033e-05, "loss": 0.1021, "step": 1613 }, { "epoch": 1.3026863412788499, "grad_norm": 0.05424903333187103, "learning_rate": 3.267099855320026e-05, "loss": 0.1057, "step": 1614 }, { "epoch": 1.3034935048555933, "grad_norm": 0.0574818029999733, "learning_rate": 3.260488299212319e-05, "loss": 0.1029, "step": 1615 }, { "epoch": 1.304300668432337, "grad_norm": 0.061917245388031006, "learning_rate": 3.253880201675287e-05, "loss": 0.0975, "step": 1616 }, { "epoch": 1.3051078320090805, "grad_norm": 0.056983813643455505, "learning_rate": 3.247275575847427e-05, "loss": 0.097, "step": 1617 }, { "epoch": 1.3059149955858242, "grad_norm": 0.061101365834474564, "learning_rate": 3.24067443486034e-05, "loss": 0.0901, "step": 1618 }, { "epoch": 1.3067221591625677, "grad_norm": 0.05307383835315704, "learning_rate": 3.2340767918386884e-05, "loss": 0.0955, "step": 1619 }, { "epoch": 1.3075293227393114, "grad_norm": 0.05253106728196144, "learning_rate": 3.2274826599001876e-05, "loss": 0.092, "step": 1620 }, { "epoch": 1.308336486316055, "grad_norm": 0.052322931587696075, "learning_rate": 3.2208920521555676e-05, "loss": 0.095, "step": 1621 }, { "epoch": 1.3091436498927986, "grad_norm": 0.06094197928905487, "learning_rate": 3.2143049817085536e-05, "loss": 0.0964, "step": 1622 }, { "epoch": 1.3099508134695421, "grad_norm": 0.05321243032813072, "learning_rate": 3.2077214616558396e-05, "loss": 0.0985, "step": 1623 }, { "epoch": 1.3107579770462858, "grad_norm": 0.05316761136054993, "learning_rate": 3.201141505087056e-05, "loss": 0.0994, "step": 1624 }, { "epoch": 1.3115651406230293, "grad_norm": 0.04972616210579872, "learning_rate": 3.194565125084753e-05, "loss": 0.0941, "step": 1625 }, { "epoch": 1.312372304199773, "grad_norm": 0.0504201278090477, "learning_rate": 3.187992334724363e-05, "loss": 0.0917, "step": 1626 }, { "epoch": 1.3131794677765165, "grad_norm": 0.05232101306319237, "learning_rate": 3.181423147074192e-05, "loss": 0.1045, "step": 1627 }, { "epoch": 1.3139866313532602, "grad_norm": 0.055394839495420456, "learning_rate": 3.1748575751953703e-05, "loss": 0.1057, "step": 1628 }, { "epoch": 1.3147937949300037, "grad_norm": 0.052785638719797134, "learning_rate": 3.1682956321418484e-05, "loss": 0.0935, "step": 1629 }, { "epoch": 1.3156009585067474, "grad_norm": 0.062250085175037384, "learning_rate": 3.161737330960357e-05, "loss": 0.0971, "step": 1630 }, { "epoch": 1.316408122083491, "grad_norm": 0.05189654603600502, "learning_rate": 3.1551826846903896e-05, "loss": 0.1038, "step": 1631 }, { "epoch": 1.3172152856602346, "grad_norm": 0.054302625358104706, "learning_rate": 3.14863170636417e-05, "loss": 0.1048, "step": 1632 }, { "epoch": 1.3180224492369783, "grad_norm": 0.059888824820518494, "learning_rate": 3.142084409006632e-05, "loss": 0.0995, "step": 1633 }, { "epoch": 1.3188296128137218, "grad_norm": 0.05456269532442093, "learning_rate": 3.135540805635385e-05, "loss": 0.0984, "step": 1634 }, { "epoch": 1.3196367763904653, "grad_norm": 0.0575900562107563, "learning_rate": 3.1290009092606984e-05, "loss": 0.1054, "step": 1635 }, { "epoch": 1.320443939967209, "grad_norm": 0.06439562141895294, "learning_rate": 3.122464732885476e-05, "loss": 0.1055, "step": 1636 }, { "epoch": 1.3212511035439527, "grad_norm": 0.060913003981113434, "learning_rate": 3.115932289505213e-05, "loss": 0.1036, "step": 1637 }, { "epoch": 1.3220582671206962, "grad_norm": 0.055851638317108154, "learning_rate": 3.1094035921079944e-05, "loss": 0.1001, "step": 1638 }, { "epoch": 1.3228654306974397, "grad_norm": 0.05525706708431244, "learning_rate": 3.102878653674449e-05, "loss": 0.1043, "step": 1639 }, { "epoch": 1.3236725942741834, "grad_norm": 0.05342097580432892, "learning_rate": 3.0963574871777366e-05, "loss": 0.1115, "step": 1640 }, { "epoch": 1.324479757850927, "grad_norm": 0.05820820480585098, "learning_rate": 3.0898401055835156e-05, "loss": 0.0983, "step": 1641 }, { "epoch": 1.3252869214276706, "grad_norm": 0.06024933606386185, "learning_rate": 3.0833265218499216e-05, "loss": 0.112, "step": 1642 }, { "epoch": 1.326094085004414, "grad_norm": 0.06074465438723564, "learning_rate": 3.0768167489275325e-05, "loss": 0.1074, "step": 1643 }, { "epoch": 1.3269012485811578, "grad_norm": 0.05460326001048088, "learning_rate": 3.070310799759358e-05, "loss": 0.1045, "step": 1644 }, { "epoch": 1.3277084121579015, "grad_norm": 0.06157265976071358, "learning_rate": 3.0638086872807986e-05, "loss": 0.0974, "step": 1645 }, { "epoch": 1.328515575734645, "grad_norm": 0.05285029485821724, "learning_rate": 3.057310424419632e-05, "loss": 0.0958, "step": 1646 }, { "epoch": 1.3293227393113884, "grad_norm": 0.05376741290092468, "learning_rate": 3.050816024095975e-05, "loss": 0.0921, "step": 1647 }, { "epoch": 1.3301299028881322, "grad_norm": 0.05140222981572151, "learning_rate": 3.0443254992222737e-05, "loss": 0.1081, "step": 1648 }, { "epoch": 1.3309370664648759, "grad_norm": 0.04787709563970566, "learning_rate": 3.037838862703258e-05, "loss": 0.0905, "step": 1649 }, { "epoch": 1.3317442300416193, "grad_norm": 0.05303593724966049, "learning_rate": 3.031356127435937e-05, "loss": 0.1007, "step": 1650 }, { "epoch": 1.3325513936183628, "grad_norm": 0.060761451721191406, "learning_rate": 3.0248773063095603e-05, "loss": 0.1088, "step": 1651 }, { "epoch": 1.3333585571951065, "grad_norm": 0.058095987886190414, "learning_rate": 3.0184024122055886e-05, "loss": 0.1006, "step": 1652 }, { "epoch": 1.3341657207718502, "grad_norm": 0.05155572295188904, "learning_rate": 3.0119314579976855e-05, "loss": 0.1096, "step": 1653 }, { "epoch": 1.3349728843485937, "grad_norm": 0.05454425513744354, "learning_rate": 3.005464456551671e-05, "loss": 0.1001, "step": 1654 }, { "epoch": 1.3357800479253374, "grad_norm": 0.05280786380171776, "learning_rate": 2.9990014207255136e-05, "loss": 0.0993, "step": 1655 }, { "epoch": 1.336587211502081, "grad_norm": 0.05005814880132675, "learning_rate": 2.9925423633692923e-05, "loss": 0.0967, "step": 1656 }, { "epoch": 1.3373943750788246, "grad_norm": 0.05483141914010048, "learning_rate": 2.9860872973251814e-05, "loss": 0.0966, "step": 1657 }, { "epoch": 1.3382015386555681, "grad_norm": 0.052813570946455, "learning_rate": 2.979636235427411e-05, "loss": 0.0974, "step": 1658 }, { "epoch": 1.3390087022323118, "grad_norm": 0.0537811778485775, "learning_rate": 2.973189190502259e-05, "loss": 0.1009, "step": 1659 }, { "epoch": 1.3398158658090553, "grad_norm": 0.053487300872802734, "learning_rate": 2.9667461753680098e-05, "loss": 0.0919, "step": 1660 }, { "epoch": 1.340623029385799, "grad_norm": 0.055632639676332474, "learning_rate": 2.960307202834941e-05, "loss": 0.1056, "step": 1661 }, { "epoch": 1.3414301929625425, "grad_norm": 0.05244695395231247, "learning_rate": 2.953872285705287e-05, "loss": 0.0947, "step": 1662 }, { "epoch": 1.3422373565392862, "grad_norm": 0.054442182183265686, "learning_rate": 2.947441436773224e-05, "loss": 0.0919, "step": 1663 }, { "epoch": 1.3430445201160297, "grad_norm": 0.05397431552410126, "learning_rate": 2.9410146688248375e-05, "loss": 0.1033, "step": 1664 }, { "epoch": 1.3430445201160297, "eval_loss": 0.11258527636528015, "eval_runtime": 4000.985, "eval_samples_per_second": 2.225, "eval_steps_per_second": 2.225, "step": 1664 }, { "epoch": 1.3438516836927734, "grad_norm": 0.049582675099372864, "learning_rate": 2.9345919946380983e-05, "loss": 0.0966, "step": 1665 }, { "epoch": 1.344658847269517, "grad_norm": 0.05338102951645851, "learning_rate": 2.9281734269828408e-05, "loss": 0.09, "step": 1666 }, { "epoch": 1.3454660108462606, "grad_norm": 0.05314387008547783, "learning_rate": 2.9217589786207294e-05, "loss": 0.1069, "step": 1667 }, { "epoch": 1.346273174423004, "grad_norm": 0.053496479988098145, "learning_rate": 2.9153486623052438e-05, "loss": 0.1021, "step": 1668 }, { "epoch": 1.3470803379997478, "grad_norm": 0.050988148897886276, "learning_rate": 2.908942490781643e-05, "loss": 0.0985, "step": 1669 }, { "epoch": 1.3478875015764913, "grad_norm": 0.05206780135631561, "learning_rate": 2.9025404767869525e-05, "loss": 0.1012, "step": 1670 }, { "epoch": 1.348694665153235, "grad_norm": 0.055445197969675064, "learning_rate": 2.896142633049922e-05, "loss": 0.102, "step": 1671 }, { "epoch": 1.3495018287299785, "grad_norm": 0.05146980285644531, "learning_rate": 2.8897489722910165e-05, "loss": 0.092, "step": 1672 }, { "epoch": 1.3503089923067222, "grad_norm": 0.0521780289709568, "learning_rate": 2.883359507222384e-05, "loss": 0.1054, "step": 1673 }, { "epoch": 1.351116155883466, "grad_norm": 0.058990299701690674, "learning_rate": 2.8769742505478294e-05, "loss": 0.1026, "step": 1674 }, { "epoch": 1.3519233194602094, "grad_norm": 0.056281570345163345, "learning_rate": 2.870593214962787e-05, "loss": 0.1026, "step": 1675 }, { "epoch": 1.3527304830369529, "grad_norm": 0.056861743330955505, "learning_rate": 2.8642164131543048e-05, "loss": 0.1059, "step": 1676 }, { "epoch": 1.3535376466136966, "grad_norm": 0.05862145870923996, "learning_rate": 2.8578438578010053e-05, "loss": 0.0982, "step": 1677 }, { "epoch": 1.3543448101904403, "grad_norm": 0.05626539885997772, "learning_rate": 2.8514755615730754e-05, "loss": 0.1024, "step": 1678 }, { "epoch": 1.3551519737671838, "grad_norm": 0.054287947714328766, "learning_rate": 2.84511153713223e-05, "loss": 0.1104, "step": 1679 }, { "epoch": 1.3559591373439273, "grad_norm": 0.05797120928764343, "learning_rate": 2.8387517971316918e-05, "loss": 0.0973, "step": 1680 }, { "epoch": 1.356766300920671, "grad_norm": 0.05042315647006035, "learning_rate": 2.8323963542161663e-05, "loss": 0.0894, "step": 1681 }, { "epoch": 1.3575734644974147, "grad_norm": 0.05028513818979263, "learning_rate": 2.82604522102181e-05, "loss": 0.1063, "step": 1682 }, { "epoch": 1.3583806280741582, "grad_norm": 0.05265726149082184, "learning_rate": 2.819698410176218e-05, "loss": 0.0967, "step": 1683 }, { "epoch": 1.3591877916509016, "grad_norm": 0.06303221732378006, "learning_rate": 2.8133559342983822e-05, "loss": 0.1045, "step": 1684 }, { "epoch": 1.3599949552276454, "grad_norm": 0.058435652405023575, "learning_rate": 2.807017805998689e-05, "loss": 0.1097, "step": 1685 }, { "epoch": 1.360802118804389, "grad_norm": 0.05776131898164749, "learning_rate": 2.800684037878867e-05, "loss": 0.1011, "step": 1686 }, { "epoch": 1.3616092823811325, "grad_norm": 0.056916963309049606, "learning_rate": 2.7943546425319854e-05, "loss": 0.102, "step": 1687 }, { "epoch": 1.362416445957876, "grad_norm": 0.05490521714091301, "learning_rate": 2.7880296325424116e-05, "loss": 0.1007, "step": 1688 }, { "epoch": 1.3632236095346197, "grad_norm": 0.05438702553510666, "learning_rate": 2.7817090204858e-05, "loss": 0.0966, "step": 1689 }, { "epoch": 1.3640307731113634, "grad_norm": 0.05037625879049301, "learning_rate": 2.7753928189290585e-05, "loss": 0.0924, "step": 1690 }, { "epoch": 1.364837936688107, "grad_norm": 0.05397520959377289, "learning_rate": 2.7690810404303276e-05, "loss": 0.1006, "step": 1691 }, { "epoch": 1.3656451002648504, "grad_norm": 0.051877815276384354, "learning_rate": 2.7627736975389486e-05, "loss": 0.0994, "step": 1692 }, { "epoch": 1.3664522638415941, "grad_norm": 0.05059212073683739, "learning_rate": 2.756470802795449e-05, "loss": 0.1053, "step": 1693 }, { "epoch": 1.3672594274183378, "grad_norm": 0.05204560607671738, "learning_rate": 2.7501723687315118e-05, "loss": 0.1063, "step": 1694 }, { "epoch": 1.3680665909950813, "grad_norm": 0.05313366279006004, "learning_rate": 2.743878407869947e-05, "loss": 0.1051, "step": 1695 }, { "epoch": 1.3688737545718248, "grad_norm": 0.05289191007614136, "learning_rate": 2.7375889327246744e-05, "loss": 0.0923, "step": 1696 }, { "epoch": 1.3696809181485685, "grad_norm": 0.05902362987399101, "learning_rate": 2.7313039558006953e-05, "loss": 0.1091, "step": 1697 }, { "epoch": 1.3704880817253122, "grad_norm": 0.0705089122056961, "learning_rate": 2.725023489594068e-05, "loss": 0.1038, "step": 1698 }, { "epoch": 1.3712952453020557, "grad_norm": 0.0559990257024765, "learning_rate": 2.7187475465918765e-05, "loss": 0.0995, "step": 1699 }, { "epoch": 1.3721024088787994, "grad_norm": 0.04908592253923416, "learning_rate": 2.71247613927222e-05, "loss": 0.0995, "step": 1700 }, { "epoch": 1.372909572455543, "grad_norm": 0.053379930555820465, "learning_rate": 2.7062092801041716e-05, "loss": 0.1021, "step": 1701 }, { "epoch": 1.3737167360322866, "grad_norm": 0.053664952516555786, "learning_rate": 2.6999469815477683e-05, "loss": 0.0921, "step": 1702 }, { "epoch": 1.37452389960903, "grad_norm": 0.05141190066933632, "learning_rate": 2.693689256053976e-05, "loss": 0.1003, "step": 1703 }, { "epoch": 1.3753310631857738, "grad_norm": 0.05114316940307617, "learning_rate": 2.687436116064671e-05, "loss": 0.1039, "step": 1704 }, { "epoch": 1.3761382267625173, "grad_norm": 0.06711278855800629, "learning_rate": 2.6811875740126064e-05, "loss": 0.1028, "step": 1705 }, { "epoch": 1.376945390339261, "grad_norm": 0.05416456609964371, "learning_rate": 2.6749436423214e-05, "loss": 0.1012, "step": 1706 }, { "epoch": 1.3777525539160045, "grad_norm": 0.05701085925102234, "learning_rate": 2.6687043334055017e-05, "loss": 0.099, "step": 1707 }, { "epoch": 1.3785597174927482, "grad_norm": 0.05621578171849251, "learning_rate": 2.662469659670164e-05, "loss": 0.1083, "step": 1708 }, { "epoch": 1.3793668810694917, "grad_norm": 0.05745275691151619, "learning_rate": 2.656239633511437e-05, "loss": 0.1044, "step": 1709 }, { "epoch": 1.3801740446462354, "grad_norm": 0.056720633059740067, "learning_rate": 2.6500142673161155e-05, "loss": 0.105, "step": 1710 }, { "epoch": 1.3809812082229789, "grad_norm": 0.05769463628530502, "learning_rate": 2.6437935734617393e-05, "loss": 0.1074, "step": 1711 }, { "epoch": 1.3817883717997226, "grad_norm": 0.054312288761138916, "learning_rate": 2.637577564316551e-05, "loss": 0.1054, "step": 1712 }, { "epoch": 1.382595535376466, "grad_norm": 0.059184182435274124, "learning_rate": 2.6313662522394876e-05, "loss": 0.1023, "step": 1713 }, { "epoch": 1.3834026989532098, "grad_norm": 0.052861351519823074, "learning_rate": 2.6251596495801358e-05, "loss": 0.1119, "step": 1714 }, { "epoch": 1.3842098625299533, "grad_norm": 0.05458124354481697, "learning_rate": 2.6189577686787315e-05, "loss": 0.1012, "step": 1715 }, { "epoch": 1.385017026106697, "grad_norm": 0.052267804741859436, "learning_rate": 2.612760621866113e-05, "loss": 0.0989, "step": 1716 }, { "epoch": 1.3858241896834405, "grad_norm": 0.05563811957836151, "learning_rate": 2.6065682214637123e-05, "loss": 0.097, "step": 1717 }, { "epoch": 1.3866313532601842, "grad_norm": 0.05647456273436546, "learning_rate": 2.6003805797835173e-05, "loss": 0.1092, "step": 1718 }, { "epoch": 1.3874385168369279, "grad_norm": 0.05414953827857971, "learning_rate": 2.594197709128061e-05, "loss": 0.1046, "step": 1719 }, { "epoch": 1.3882456804136714, "grad_norm": 0.054344624280929565, "learning_rate": 2.5880196217903883e-05, "loss": 0.096, "step": 1720 }, { "epoch": 1.3890528439904148, "grad_norm": 0.057636577636003494, "learning_rate": 2.581846330054034e-05, "loss": 0.1011, "step": 1721 }, { "epoch": 1.3898600075671586, "grad_norm": 0.05502058193087578, "learning_rate": 2.5756778461929987e-05, "loss": 0.105, "step": 1722 }, { "epoch": 1.3906671711439023, "grad_norm": 0.0539773628115654, "learning_rate": 2.5695141824717183e-05, "loss": 0.0989, "step": 1723 }, { "epoch": 1.3914743347206457, "grad_norm": 0.054985322058200836, "learning_rate": 2.5633553511450548e-05, "loss": 0.111, "step": 1724 }, { "epoch": 1.3922814982973892, "grad_norm": 0.05923767015337944, "learning_rate": 2.5572013644582522e-05, "loss": 0.0946, "step": 1725 }, { "epoch": 1.393088661874133, "grad_norm": 0.05336596444249153, "learning_rate": 2.551052234646929e-05, "loss": 0.0935, "step": 1726 }, { "epoch": 1.3938958254508766, "grad_norm": 0.05548320710659027, "learning_rate": 2.544907973937045e-05, "loss": 0.0956, "step": 1727 }, { "epoch": 1.3947029890276201, "grad_norm": 0.055060893297195435, "learning_rate": 2.5387685945448807e-05, "loss": 0.1109, "step": 1728 }, { "epoch": 1.3955101526043636, "grad_norm": 0.0519677959382534, "learning_rate": 2.5326341086770062e-05, "loss": 0.0999, "step": 1729 }, { "epoch": 1.3963173161811073, "grad_norm": 0.05345570296049118, "learning_rate": 2.526504528530269e-05, "loss": 0.104, "step": 1730 }, { "epoch": 1.397124479757851, "grad_norm": 0.05952931195497513, "learning_rate": 2.5203798662917555e-05, "loss": 0.1164, "step": 1731 }, { "epoch": 1.3979316433345945, "grad_norm": 0.05161213502287865, "learning_rate": 2.5142601341387805e-05, "loss": 0.0966, "step": 1732 }, { "epoch": 1.398738806911338, "grad_norm": 0.053028494119644165, "learning_rate": 2.5081453442388535e-05, "loss": 0.0871, "step": 1733 }, { "epoch": 1.3995459704880817, "grad_norm": 0.05534541606903076, "learning_rate": 2.5020355087496605e-05, "loss": 0.0986, "step": 1734 }, { "epoch": 1.4003531340648254, "grad_norm": 0.05603024363517761, "learning_rate": 2.4959306398190303e-05, "loss": 0.0974, "step": 1735 }, { "epoch": 1.401160297641569, "grad_norm": 0.05230935662984848, "learning_rate": 2.489830749584924e-05, "loss": 0.1087, "step": 1736 }, { "epoch": 1.4019674612183124, "grad_norm": 0.051334280520677567, "learning_rate": 2.4837358501754022e-05, "loss": 0.1098, "step": 1737 }, { "epoch": 1.402774624795056, "grad_norm": 0.05653442442417145, "learning_rate": 2.477645953708596e-05, "loss": 0.1048, "step": 1738 }, { "epoch": 1.4035817883717998, "grad_norm": 0.057875242084264755, "learning_rate": 2.471561072292703e-05, "loss": 0.1112, "step": 1739 }, { "epoch": 1.4043889519485433, "grad_norm": 0.05275672301650047, "learning_rate": 2.465481218025935e-05, "loss": 0.0988, "step": 1740 }, { "epoch": 1.405196115525287, "grad_norm": 0.05356324091553688, "learning_rate": 2.4594064029965197e-05, "loss": 0.0942, "step": 1741 }, { "epoch": 1.4060032791020305, "grad_norm": 0.051271434873342514, "learning_rate": 2.4533366392826574e-05, "loss": 0.096, "step": 1742 }, { "epoch": 1.4068104426787742, "grad_norm": 0.059903211891651154, "learning_rate": 2.44727193895251e-05, "loss": 0.0971, "step": 1743 }, { "epoch": 1.4076176062555177, "grad_norm": 0.055205002427101135, "learning_rate": 2.441212314064172e-05, "loss": 0.1052, "step": 1744 }, { "epoch": 1.4084247698322614, "grad_norm": 0.056146785616874695, "learning_rate": 2.4351577766656462e-05, "loss": 0.1049, "step": 1745 }, { "epoch": 1.4092319334090049, "grad_norm": 0.049882203340530396, "learning_rate": 2.429108338794817e-05, "loss": 0.1048, "step": 1746 }, { "epoch": 1.4100390969857486, "grad_norm": 0.049041178077459335, "learning_rate": 2.423064012479436e-05, "loss": 0.0963, "step": 1747 }, { "epoch": 1.410846260562492, "grad_norm": 0.05437014624476433, "learning_rate": 2.417024809737084e-05, "loss": 0.0921, "step": 1748 }, { "epoch": 1.4116534241392358, "grad_norm": 0.05400558561086655, "learning_rate": 2.4109907425751614e-05, "loss": 0.106, "step": 1749 }, { "epoch": 1.4124605877159793, "grad_norm": 0.051685404032468796, "learning_rate": 2.404961822990856e-05, "loss": 0.0912, "step": 1750 }, { "epoch": 1.413267751292723, "grad_norm": 0.05169227346777916, "learning_rate": 2.3989380629711194e-05, "loss": 0.1011, "step": 1751 }, { "epoch": 1.4140749148694665, "grad_norm": 0.053023871034383774, "learning_rate": 2.3929194744926488e-05, "loss": 0.096, "step": 1752 }, { "epoch": 1.4148820784462102, "grad_norm": 0.05740707367658615, "learning_rate": 2.3869060695218516e-05, "loss": 0.1031, "step": 1753 }, { "epoch": 1.4156892420229537, "grad_norm": 0.05690925568342209, "learning_rate": 2.3808978600148374e-05, "loss": 0.0984, "step": 1754 }, { "epoch": 1.4164964055996974, "grad_norm": 0.05569424852728844, "learning_rate": 2.374894857917379e-05, "loss": 0.1056, "step": 1755 }, { "epoch": 1.4173035691764408, "grad_norm": 0.05926217883825302, "learning_rate": 2.3688970751649002e-05, "loss": 0.1015, "step": 1756 }, { "epoch": 1.4181107327531846, "grad_norm": 0.059993576258420944, "learning_rate": 2.362904523682447e-05, "loss": 0.0998, "step": 1757 }, { "epoch": 1.418917896329928, "grad_norm": 0.053482286632061005, "learning_rate": 2.3569172153846646e-05, "loss": 0.1014, "step": 1758 }, { "epoch": 1.4197250599066717, "grad_norm": 0.05349402129650116, "learning_rate": 2.3509351621757692e-05, "loss": 0.1096, "step": 1759 }, { "epoch": 1.4205322234834152, "grad_norm": 0.05483704060316086, "learning_rate": 2.3449583759495348e-05, "loss": 0.1025, "step": 1760 }, { "epoch": 1.421339387060159, "grad_norm": 0.05466461926698685, "learning_rate": 2.3389868685892573e-05, "loss": 0.1089, "step": 1761 }, { "epoch": 1.4221465506369024, "grad_norm": 0.05059335380792618, "learning_rate": 2.3330206519677412e-05, "loss": 0.0986, "step": 1762 }, { "epoch": 1.4229537142136461, "grad_norm": 0.05445451661944389, "learning_rate": 2.3270597379472714e-05, "loss": 0.1007, "step": 1763 }, { "epoch": 1.4237608777903898, "grad_norm": 0.053052790462970734, "learning_rate": 2.3211041383795884e-05, "loss": 0.0914, "step": 1764 }, { "epoch": 1.4245680413671333, "grad_norm": 0.056334786117076874, "learning_rate": 2.3151538651058686e-05, "loss": 0.1046, "step": 1765 }, { "epoch": 1.4253752049438768, "grad_norm": 0.054768700152635574, "learning_rate": 2.309208929956694e-05, "loss": 0.0972, "step": 1766 }, { "epoch": 1.4261823685206205, "grad_norm": 0.05421154201030731, "learning_rate": 2.3032693447520386e-05, "loss": 0.1036, "step": 1767 }, { "epoch": 1.4269895320973642, "grad_norm": 0.05574481189250946, "learning_rate": 2.297335121301232e-05, "loss": 0.1136, "step": 1768 }, { "epoch": 1.4277966956741077, "grad_norm": 0.060776546597480774, "learning_rate": 2.2914062714029544e-05, "loss": 0.0926, "step": 1769 }, { "epoch": 1.4286038592508512, "grad_norm": 0.051522959023714066, "learning_rate": 2.285482806845191e-05, "loss": 0.1021, "step": 1770 }, { "epoch": 1.429411022827595, "grad_norm": 0.056177109479904175, "learning_rate": 2.2795647394052284e-05, "loss": 0.1093, "step": 1771 }, { "epoch": 1.4302181864043386, "grad_norm": 0.056069549173116684, "learning_rate": 2.2736520808496136e-05, "loss": 0.1045, "step": 1772 }, { "epoch": 1.431025349981082, "grad_norm": 0.054305411875247955, "learning_rate": 2.267744842934147e-05, "loss": 0.1038, "step": 1773 }, { "epoch": 1.4318325135578256, "grad_norm": 0.05415286123752594, "learning_rate": 2.261843037403848e-05, "loss": 0.0963, "step": 1774 }, { "epoch": 1.4326396771345693, "grad_norm": 0.05577448010444641, "learning_rate": 2.255946675992938e-05, "loss": 0.0974, "step": 1775 }, { "epoch": 1.433446840711313, "grad_norm": 0.053556524217128754, "learning_rate": 2.2500557704248083e-05, "loss": 0.1089, "step": 1776 }, { "epoch": 1.4342540042880565, "grad_norm": 0.054036013782024384, "learning_rate": 2.2441703324120095e-05, "loss": 0.1011, "step": 1777 }, { "epoch": 1.4350611678648, "grad_norm": 0.05641414225101471, "learning_rate": 2.2382903736562145e-05, "loss": 0.1054, "step": 1778 }, { "epoch": 1.4358683314415437, "grad_norm": 0.05762660130858421, "learning_rate": 2.2324159058482085e-05, "loss": 0.1144, "step": 1779 }, { "epoch": 1.4366754950182874, "grad_norm": 0.05654531344771385, "learning_rate": 2.2265469406678557e-05, "loss": 0.1077, "step": 1780 }, { "epoch": 1.4374826585950309, "grad_norm": 0.05274472385644913, "learning_rate": 2.220683489784081e-05, "loss": 0.0976, "step": 1781 }, { "epoch": 1.4382898221717744, "grad_norm": 0.049314264208078384, "learning_rate": 2.214825564854848e-05, "loss": 0.0901, "step": 1782 }, { "epoch": 1.439096985748518, "grad_norm": 0.05207086354494095, "learning_rate": 2.208973177527125e-05, "loss": 0.0997, "step": 1783 }, { "epoch": 1.4399041493252618, "grad_norm": 0.05724550783634186, "learning_rate": 2.2031263394368812e-05, "loss": 0.1059, "step": 1784 }, { "epoch": 1.4407113129020053, "grad_norm": 0.05788930505514145, "learning_rate": 2.1972850622090423e-05, "loss": 0.101, "step": 1785 }, { "epoch": 1.441518476478749, "grad_norm": 0.052863020449876785, "learning_rate": 2.1914493574574858e-05, "loss": 0.102, "step": 1786 }, { "epoch": 1.4423256400554925, "grad_norm": 0.05530736222863197, "learning_rate": 2.185619236785005e-05, "loss": 0.0962, "step": 1787 }, { "epoch": 1.4431328036322362, "grad_norm": 0.05367308855056763, "learning_rate": 2.1797947117832944e-05, "loss": 0.0954, "step": 1788 }, { "epoch": 1.4439399672089797, "grad_norm": 0.04809948429465294, "learning_rate": 2.1739757940329175e-05, "loss": 0.1041, "step": 1789 }, { "epoch": 1.4447471307857234, "grad_norm": 0.053945332765579224, "learning_rate": 2.1681624951032965e-05, "loss": 0.1056, "step": 1790 }, { "epoch": 1.4455542943624669, "grad_norm": 0.05288871005177498, "learning_rate": 2.162354826552673e-05, "loss": 0.1056, "step": 1791 }, { "epoch": 1.4463614579392106, "grad_norm": 0.05683917924761772, "learning_rate": 2.1565527999281003e-05, "loss": 0.0936, "step": 1792 }, { "epoch": 1.4463614579392106, "eval_loss": 0.11289119720458984, "eval_runtime": 4083.9124, "eval_samples_per_second": 2.18, "eval_steps_per_second": 2.18, "step": 1792 }, { "epoch": 1.447168621515954, "grad_norm": 0.059716347604990005, "learning_rate": 2.1507564267654184e-05, "loss": 0.1046, "step": 1793 }, { "epoch": 1.4479757850926978, "grad_norm": 0.05664849653840065, "learning_rate": 2.1449657185892153e-05, "loss": 0.1055, "step": 1794 }, { "epoch": 1.4487829486694412, "grad_norm": 0.05474560335278511, "learning_rate": 2.139180686912825e-05, "loss": 0.0993, "step": 1795 }, { "epoch": 1.449590112246185, "grad_norm": 0.05671496316790581, "learning_rate": 2.1334013432382894e-05, "loss": 0.1073, "step": 1796 }, { "epoch": 1.4503972758229284, "grad_norm": 0.05500929430127144, "learning_rate": 2.127627699056345e-05, "loss": 0.1023, "step": 1797 }, { "epoch": 1.4512044393996721, "grad_norm": 0.059289563447237015, "learning_rate": 2.1218597658463947e-05, "loss": 0.1012, "step": 1798 }, { "epoch": 1.4520116029764156, "grad_norm": 0.05336535722017288, "learning_rate": 2.11609755507649e-05, "loss": 0.1, "step": 1799 }, { "epoch": 1.4528187665531593, "grad_norm": 0.05500759556889534, "learning_rate": 2.1103410782032973e-05, "loss": 0.1014, "step": 1800 }, { "epoch": 1.4536259301299028, "grad_norm": 0.054132163524627686, "learning_rate": 2.1045903466720913e-05, "loss": 0.1069, "step": 1801 }, { "epoch": 1.4544330937066465, "grad_norm": 0.057197920978069305, "learning_rate": 2.0988453719167156e-05, "loss": 0.1012, "step": 1802 }, { "epoch": 1.45524025728339, "grad_norm": 0.05759512633085251, "learning_rate": 2.0931061653595742e-05, "loss": 0.1111, "step": 1803 }, { "epoch": 1.4560474208601337, "grad_norm": 0.05895911902189255, "learning_rate": 2.0873727384115994e-05, "loss": 0.1214, "step": 1804 }, { "epoch": 1.4568545844368772, "grad_norm": 0.05513091757893562, "learning_rate": 2.0816451024722343e-05, "loss": 0.1026, "step": 1805 }, { "epoch": 1.457661748013621, "grad_norm": 0.05542128160595894, "learning_rate": 2.0759232689294044e-05, "loss": 0.1018, "step": 1806 }, { "epoch": 1.4584689115903644, "grad_norm": 0.05589337274432182, "learning_rate": 2.0702072491595022e-05, "loss": 0.1081, "step": 1807 }, { "epoch": 1.459276075167108, "grad_norm": 0.0492415688931942, "learning_rate": 2.064497054527362e-05, "loss": 0.1007, "step": 1808 }, { "epoch": 1.4600832387438518, "grad_norm": 0.055094800889492035, "learning_rate": 2.0587926963862285e-05, "loss": 0.1077, "step": 1809 }, { "epoch": 1.4608904023205953, "grad_norm": 0.058931972831487656, "learning_rate": 2.053094186077752e-05, "loss": 0.1052, "step": 1810 }, { "epoch": 1.4616975658973388, "grad_norm": 0.055273983627557755, "learning_rate": 2.0474015349319503e-05, "loss": 0.096, "step": 1811 }, { "epoch": 1.4625047294740825, "grad_norm": 0.055502042174339294, "learning_rate": 2.041714754267195e-05, "loss": 0.1024, "step": 1812 }, { "epoch": 1.4633118930508262, "grad_norm": 0.060050707310438156, "learning_rate": 2.0360338553901796e-05, "loss": 0.1026, "step": 1813 }, { "epoch": 1.4641190566275697, "grad_norm": 0.0563698448240757, "learning_rate": 2.0303588495959113e-05, "loss": 0.1124, "step": 1814 }, { "epoch": 1.4649262202043132, "grad_norm": 0.05678456276655197, "learning_rate": 2.0246897481676737e-05, "loss": 0.1011, "step": 1815 }, { "epoch": 1.4657333837810569, "grad_norm": 0.062189266085624695, "learning_rate": 2.0190265623770143e-05, "loss": 0.1007, "step": 1816 }, { "epoch": 1.4665405473578006, "grad_norm": 0.05120007321238518, "learning_rate": 2.0133693034837192e-05, "loss": 0.0971, "step": 1817 }, { "epoch": 1.467347710934544, "grad_norm": 0.05674745887517929, "learning_rate": 2.0077179827357907e-05, "loss": 0.1056, "step": 1818 }, { "epoch": 1.4681548745112876, "grad_norm": 0.056586798280477524, "learning_rate": 2.0020726113694204e-05, "loss": 0.0941, "step": 1819 }, { "epoch": 1.4689620380880313, "grad_norm": 0.056127533316612244, "learning_rate": 1.996433200608978e-05, "loss": 0.0976, "step": 1820 }, { "epoch": 1.469769201664775, "grad_norm": 0.05353543907403946, "learning_rate": 1.990799761666975e-05, "loss": 0.0976, "step": 1821 }, { "epoch": 1.4705763652415185, "grad_norm": 0.049592096358537674, "learning_rate": 1.9851723057440517e-05, "loss": 0.0908, "step": 1822 }, { "epoch": 1.471383528818262, "grad_norm": 0.05143113434314728, "learning_rate": 1.9795508440289602e-05, "loss": 0.0967, "step": 1823 }, { "epoch": 1.4721906923950057, "grad_norm": 0.05011619254946709, "learning_rate": 1.9739353876985222e-05, "loss": 0.1027, "step": 1824 }, { "epoch": 1.4729978559717494, "grad_norm": 0.056136928498744965, "learning_rate": 1.9683259479176292e-05, "loss": 0.0958, "step": 1825 }, { "epoch": 1.4738050195484929, "grad_norm": 0.05808064714074135, "learning_rate": 1.962722535839202e-05, "loss": 0.0879, "step": 1826 }, { "epoch": 1.4746121831252363, "grad_norm": 0.056719373911619186, "learning_rate": 1.9571251626041847e-05, "loss": 0.094, "step": 1827 }, { "epoch": 1.47541934670198, "grad_norm": 0.05653686448931694, "learning_rate": 1.95153383934151e-05, "loss": 0.0967, "step": 1828 }, { "epoch": 1.4762265102787238, "grad_norm": 0.05714252591133118, "learning_rate": 1.9459485771680857e-05, "loss": 0.0978, "step": 1829 }, { "epoch": 1.4770336738554672, "grad_norm": 0.05405423790216446, "learning_rate": 1.9403693871887617e-05, "loss": 0.1087, "step": 1830 }, { "epoch": 1.477840837432211, "grad_norm": 0.0466889813542366, "learning_rate": 1.9347962804963238e-05, "loss": 0.1013, "step": 1831 }, { "epoch": 1.4786480010089544, "grad_norm": 0.058556921780109406, "learning_rate": 1.9292292681714535e-05, "loss": 0.1009, "step": 1832 }, { "epoch": 1.4794551645856981, "grad_norm": 0.05409836024045944, "learning_rate": 1.9236683612827228e-05, "loss": 0.0927, "step": 1833 }, { "epoch": 1.4802623281624416, "grad_norm": 0.05637505650520325, "learning_rate": 1.918113570886561e-05, "loss": 0.0998, "step": 1834 }, { "epoch": 1.4810694917391853, "grad_norm": 0.056996509432792664, "learning_rate": 1.9125649080272383e-05, "loss": 0.0995, "step": 1835 }, { "epoch": 1.4818766553159288, "grad_norm": 0.06082282215356827, "learning_rate": 1.9070223837368412e-05, "loss": 0.1089, "step": 1836 }, { "epoch": 1.4826838188926725, "grad_norm": 0.05421457439661026, "learning_rate": 1.9014860090352476e-05, "loss": 0.0987, "step": 1837 }, { "epoch": 1.483490982469416, "grad_norm": 0.05922079458832741, "learning_rate": 1.895955794930115e-05, "loss": 0.1063, "step": 1838 }, { "epoch": 1.4842981460461597, "grad_norm": 0.055270712822675705, "learning_rate": 1.8904317524168458e-05, "loss": 0.0968, "step": 1839 }, { "epoch": 1.4851053096229032, "grad_norm": 0.05599242076277733, "learning_rate": 1.884913892478576e-05, "loss": 0.1063, "step": 1840 }, { "epoch": 1.485912473199647, "grad_norm": 0.04959672689437866, "learning_rate": 1.8794022260861482e-05, "loss": 0.0902, "step": 1841 }, { "epoch": 1.4867196367763904, "grad_norm": 0.05435161292552948, "learning_rate": 1.8738967641980925e-05, "loss": 0.1085, "step": 1842 }, { "epoch": 1.4875268003531341, "grad_norm": 0.0501483753323555, "learning_rate": 1.8683975177605968e-05, "loss": 0.1085, "step": 1843 }, { "epoch": 1.4883339639298776, "grad_norm": 0.05165189132094383, "learning_rate": 1.8629044977074983e-05, "loss": 0.1043, "step": 1844 }, { "epoch": 1.4891411275066213, "grad_norm": 0.05199553817510605, "learning_rate": 1.8574177149602495e-05, "loss": 0.0982, "step": 1845 }, { "epoch": 1.4899482910833648, "grad_norm": 0.05365710332989693, "learning_rate": 1.8519371804279046e-05, "loss": 0.0919, "step": 1846 }, { "epoch": 1.4907554546601085, "grad_norm": 0.06219770014286041, "learning_rate": 1.8464629050070942e-05, "loss": 0.1049, "step": 1847 }, { "epoch": 1.491562618236852, "grad_norm": 0.057056963443756104, "learning_rate": 1.8409948995820054e-05, "loss": 0.0887, "step": 1848 }, { "epoch": 1.4923697818135957, "grad_norm": 0.06005426496267319, "learning_rate": 1.8355331750243548e-05, "loss": 0.1055, "step": 1849 }, { "epoch": 1.4931769453903392, "grad_norm": 0.050560999661684036, "learning_rate": 1.830077742193375e-05, "loss": 0.1021, "step": 1850 }, { "epoch": 1.493984108967083, "grad_norm": 0.056629497557878494, "learning_rate": 1.8246286119357903e-05, "loss": 0.1043, "step": 1851 }, { "epoch": 1.4947912725438264, "grad_norm": 0.05496688187122345, "learning_rate": 1.8191857950857872e-05, "loss": 0.1003, "step": 1852 }, { "epoch": 1.49559843612057, "grad_norm": 0.05393729731440544, "learning_rate": 1.8137493024650093e-05, "loss": 0.1057, "step": 1853 }, { "epoch": 1.4964055996973138, "grad_norm": 0.05840306356549263, "learning_rate": 1.8083191448825176e-05, "loss": 0.1065, "step": 1854 }, { "epoch": 1.4972127632740573, "grad_norm": 0.05580420047044754, "learning_rate": 1.802895333134783e-05, "loss": 0.102, "step": 1855 }, { "epoch": 1.4980199268508008, "grad_norm": 0.05232969671487808, "learning_rate": 1.797477878005655e-05, "loss": 0.0948, "step": 1856 }, { "epoch": 1.4988270904275445, "grad_norm": 0.05730554088950157, "learning_rate": 1.792066790266348e-05, "loss": 0.0945, "step": 1857 }, { "epoch": 1.4996342540042882, "grad_norm": 0.056307774037122726, "learning_rate": 1.7866620806754146e-05, "loss": 0.101, "step": 1858 }, { "epoch": 1.5004414175810317, "grad_norm": 0.049549784511327744, "learning_rate": 1.7812637599787297e-05, "loss": 0.094, "step": 1859 }, { "epoch": 1.5012485811577752, "grad_norm": 0.05495047941803932, "learning_rate": 1.7758718389094582e-05, "loss": 0.0989, "step": 1860 }, { "epoch": 1.5020557447345189, "grad_norm": 0.058380722999572754, "learning_rate": 1.7704863281880496e-05, "loss": 0.1096, "step": 1861 }, { "epoch": 1.5028629083112626, "grad_norm": 0.05579933151602745, "learning_rate": 1.7651072385222e-05, "loss": 0.1113, "step": 1862 }, { "epoch": 1.503670071888006, "grad_norm": 0.0589706227183342, "learning_rate": 1.759734580606845e-05, "loss": 0.1014, "step": 1863 }, { "epoch": 1.5044772354647495, "grad_norm": 0.057490408420562744, "learning_rate": 1.7543683651241298e-05, "loss": 0.104, "step": 1864 }, { "epoch": 1.5052843990414932, "grad_norm": 0.0531342551112175, "learning_rate": 1.7490086027433912e-05, "loss": 0.1112, "step": 1865 }, { "epoch": 1.506091562618237, "grad_norm": 0.0518353171646595, "learning_rate": 1.743655304121136e-05, "loss": 0.091, "step": 1866 }, { "epoch": 1.5068987261949804, "grad_norm": 0.05573621019721031, "learning_rate": 1.7383084799010163e-05, "loss": 0.1108, "step": 1867 }, { "epoch": 1.507705889771724, "grad_norm": 0.05670027807354927, "learning_rate": 1.732968140713817e-05, "loss": 0.1002, "step": 1868 }, { "epoch": 1.5085130533484676, "grad_norm": 0.05580359324812889, "learning_rate": 1.7276342971774223e-05, "loss": 0.0981, "step": 1869 }, { "epoch": 1.5093202169252113, "grad_norm": 0.047418493777513504, "learning_rate": 1.7223069598968083e-05, "loss": 0.098, "step": 1870 }, { "epoch": 1.5101273805019548, "grad_norm": 0.054328709840774536, "learning_rate": 1.7169861394640107e-05, "loss": 0.1006, "step": 1871 }, { "epoch": 1.5109345440786983, "grad_norm": 0.057824574410915375, "learning_rate": 1.7116718464581123e-05, "loss": 0.0951, "step": 1872 }, { "epoch": 1.511741707655442, "grad_norm": 0.05723113194108009, "learning_rate": 1.706364091445211e-05, "loss": 0.1042, "step": 1873 }, { "epoch": 1.5125488712321857, "grad_norm": 0.05162542685866356, "learning_rate": 1.7010628849784133e-05, "loss": 0.0938, "step": 1874 }, { "epoch": 1.5133560348089292, "grad_norm": 0.05240887776017189, "learning_rate": 1.6957682375977985e-05, "loss": 0.0999, "step": 1875 }, { "epoch": 1.5141631983856727, "grad_norm": 0.056586526334285736, "learning_rate": 1.6904801598304095e-05, "loss": 0.0971, "step": 1876 }, { "epoch": 1.5149703619624164, "grad_norm": 0.05775398015975952, "learning_rate": 1.6851986621902265e-05, "loss": 0.1008, "step": 1877 }, { "epoch": 1.5157775255391601, "grad_norm": 0.055042628198862076, "learning_rate": 1.6799237551781465e-05, "loss": 0.1024, "step": 1878 }, { "epoch": 1.5165846891159036, "grad_norm": 0.05209149047732353, "learning_rate": 1.674655449281964e-05, "loss": 0.0992, "step": 1879 }, { "epoch": 1.517391852692647, "grad_norm": 0.05653234198689461, "learning_rate": 1.669393754976344e-05, "loss": 0.0983, "step": 1880 }, { "epoch": 1.5181990162693908, "grad_norm": 0.05031287670135498, "learning_rate": 1.6641386827228105e-05, "loss": 0.1046, "step": 1881 }, { "epoch": 1.5190061798461345, "grad_norm": 0.05126282200217247, "learning_rate": 1.6588902429697217e-05, "loss": 0.0973, "step": 1882 }, { "epoch": 1.519813343422878, "grad_norm": 0.05497574806213379, "learning_rate": 1.653648446152248e-05, "loss": 0.0908, "step": 1883 }, { "epoch": 1.5206205069996215, "grad_norm": 0.055155180394649506, "learning_rate": 1.6484133026923475e-05, "loss": 0.1036, "step": 1884 }, { "epoch": 1.5214276705763652, "grad_norm": 0.06486766785383224, "learning_rate": 1.6431848229987584e-05, "loss": 0.1082, "step": 1885 }, { "epoch": 1.522234834153109, "grad_norm": 0.052395597100257874, "learning_rate": 1.637963017466961e-05, "loss": 0.0843, "step": 1886 }, { "epoch": 1.5230419977298526, "grad_norm": 0.05243591219186783, "learning_rate": 1.6327478964791705e-05, "loss": 0.1083, "step": 1887 }, { "epoch": 1.523849161306596, "grad_norm": 0.055100247263908386, "learning_rate": 1.6275394704043124e-05, "loss": 0.0991, "step": 1888 }, { "epoch": 1.5246563248833396, "grad_norm": 0.0525592565536499, "learning_rate": 1.622337749598e-05, "loss": 0.1059, "step": 1889 }, { "epoch": 1.5254634884600833, "grad_norm": 0.052325937896966934, "learning_rate": 1.6171427444025116e-05, "loss": 0.1097, "step": 1890 }, { "epoch": 1.526270652036827, "grad_norm": 0.0521961972117424, "learning_rate": 1.61195446514678e-05, "loss": 0.0991, "step": 1891 }, { "epoch": 1.5270778156135705, "grad_norm": 0.05103736370801926, "learning_rate": 1.606772922146357e-05, "loss": 0.0973, "step": 1892 }, { "epoch": 1.527884979190314, "grad_norm": 0.05229256674647331, "learning_rate": 1.6015981257034067e-05, "loss": 0.0988, "step": 1893 }, { "epoch": 1.5286921427670577, "grad_norm": 0.053779397159814835, "learning_rate": 1.5964300861066795e-05, "loss": 0.105, "step": 1894 }, { "epoch": 1.5294993063438014, "grad_norm": 0.059099841862916946, "learning_rate": 1.5912688136314884e-05, "loss": 0.1032, "step": 1895 }, { "epoch": 1.5303064699205449, "grad_norm": 0.05991920083761215, "learning_rate": 1.586114318539697e-05, "loss": 0.1093, "step": 1896 }, { "epoch": 1.5311136334972884, "grad_norm": 0.058639418333768845, "learning_rate": 1.5809666110796855e-05, "loss": 0.1093, "step": 1897 }, { "epoch": 1.531920797074032, "grad_norm": 0.052382756024599075, "learning_rate": 1.575825701486347e-05, "loss": 0.1, "step": 1898 }, { "epoch": 1.5327279606507758, "grad_norm": 0.05266576632857323, "learning_rate": 1.570691599981053e-05, "loss": 0.0994, "step": 1899 }, { "epoch": 1.5335351242275193, "grad_norm": 0.053456973284482956, "learning_rate": 1.565564316771641e-05, "loss": 0.1002, "step": 1900 }, { "epoch": 1.5343422878042627, "grad_norm": 0.05390861630439758, "learning_rate": 1.560443862052393e-05, "loss": 0.1015, "step": 1901 }, { "epoch": 1.5351494513810064, "grad_norm": 0.05490504950284958, "learning_rate": 1.5553302460040153e-05, "loss": 0.101, "step": 1902 }, { "epoch": 1.5359566149577502, "grad_norm": 0.05852966010570526, "learning_rate": 1.550223478793612e-05, "loss": 0.1037, "step": 1903 }, { "epoch": 1.5367637785344936, "grad_norm": 0.05627080798149109, "learning_rate": 1.545123570574677e-05, "loss": 0.0974, "step": 1904 }, { "epoch": 1.5375709421112371, "grad_norm": 0.05497778579592705, "learning_rate": 1.5400305314870596e-05, "loss": 0.0964, "step": 1905 }, { "epoch": 1.5383781056879808, "grad_norm": 0.0551411509513855, "learning_rate": 1.534944371656955e-05, "loss": 0.1017, "step": 1906 }, { "epoch": 1.5391852692647245, "grad_norm": 0.047306887805461884, "learning_rate": 1.5298651011968866e-05, "loss": 0.1028, "step": 1907 }, { "epoch": 1.539992432841468, "grad_norm": 0.061401695013046265, "learning_rate": 1.5247927302056703e-05, "loss": 0.1095, "step": 1908 }, { "epoch": 1.5407995964182115, "grad_norm": 0.06022539734840393, "learning_rate": 1.5197272687684106e-05, "loss": 0.1061, "step": 1909 }, { "epoch": 1.5416067599949552, "grad_norm": 0.05995117127895355, "learning_rate": 1.5146687269564691e-05, "loss": 0.0994, "step": 1910 }, { "epoch": 1.542413923571699, "grad_norm": 0.06327615678310394, "learning_rate": 1.5096171148274546e-05, "loss": 0.1117, "step": 1911 }, { "epoch": 1.5432210871484424, "grad_norm": 0.05335117504000664, "learning_rate": 1.504572442425195e-05, "loss": 0.1054, "step": 1912 }, { "epoch": 1.544028250725186, "grad_norm": 0.05532221496105194, "learning_rate": 1.4995347197797227e-05, "loss": 0.1051, "step": 1913 }, { "epoch": 1.5448354143019296, "grad_norm": 0.0532710999250412, "learning_rate": 1.4945039569072484e-05, "loss": 0.1124, "step": 1914 }, { "epoch": 1.5456425778786733, "grad_norm": 0.052019793540239334, "learning_rate": 1.4894801638101503e-05, "loss": 0.0974, "step": 1915 }, { "epoch": 1.5464497414554168, "grad_norm": 0.05374704673886299, "learning_rate": 1.4844633504769422e-05, "loss": 0.1086, "step": 1916 }, { "epoch": 1.5472569050321603, "grad_norm": 0.0566440112888813, "learning_rate": 1.4794535268822673e-05, "loss": 0.1143, "step": 1917 }, { "epoch": 1.548064068608904, "grad_norm": 0.04957743361592293, "learning_rate": 1.4744507029868675e-05, "loss": 0.1012, "step": 1918 }, { "epoch": 1.5488712321856477, "grad_norm": 0.05408566817641258, "learning_rate": 1.4694548887375708e-05, "loss": 0.096, "step": 1919 }, { "epoch": 1.5496783957623912, "grad_norm": 0.05855069309473038, "learning_rate": 1.4644660940672627e-05, "loss": 0.1141, "step": 1920 }, { "epoch": 1.5496783957623912, "eval_loss": 0.11269907653331757, "eval_runtime": 4002.4638, "eval_samples_per_second": 2.224, "eval_steps_per_second": 2.224, "step": 1920 }, { "epoch": 1.5504855593391347, "grad_norm": 0.05562373995780945, "learning_rate": 1.4594843288948773e-05, "loss": 0.1079, "step": 1921 }, { "epoch": 1.5512927229158784, "grad_norm": 0.0556560717523098, "learning_rate": 1.454509603125373e-05, "loss": 0.0973, "step": 1922 }, { "epoch": 1.552099886492622, "grad_norm": 0.0528298057615757, "learning_rate": 1.4495419266497052e-05, "loss": 0.1004, "step": 1923 }, { "epoch": 1.5529070500693656, "grad_norm": 0.05220425873994827, "learning_rate": 1.4445813093448207e-05, "loss": 0.1017, "step": 1924 }, { "epoch": 1.553714213646109, "grad_norm": 0.055257648229599, "learning_rate": 1.4396277610736286e-05, "loss": 0.0997, "step": 1925 }, { "epoch": 1.5545213772228528, "grad_norm": 0.06071949377655983, "learning_rate": 1.4346812916849839e-05, "loss": 0.1067, "step": 1926 }, { "epoch": 1.5553285407995965, "grad_norm": 0.05044953525066376, "learning_rate": 1.4297419110136628e-05, "loss": 0.1037, "step": 1927 }, { "epoch": 1.55613570437634, "grad_norm": 0.05712079629302025, "learning_rate": 1.4248096288803548e-05, "loss": 0.1073, "step": 1928 }, { "epoch": 1.5569428679530835, "grad_norm": 0.05678488314151764, "learning_rate": 1.4198844550916279e-05, "loss": 0.1135, "step": 1929 }, { "epoch": 1.5577500315298272, "grad_norm": 0.053431954234838486, "learning_rate": 1.4149663994399221e-05, "loss": 0.1044, "step": 1930 }, { "epoch": 1.5585571951065709, "grad_norm": 0.057647522538900375, "learning_rate": 1.4100554717035241e-05, "loss": 0.1028, "step": 1931 }, { "epoch": 1.5593643586833146, "grad_norm": 0.05114518105983734, "learning_rate": 1.4051516816465488e-05, "loss": 0.1035, "step": 1932 }, { "epoch": 1.560171522260058, "grad_norm": 0.05414271354675293, "learning_rate": 1.4002550390189161e-05, "loss": 0.1181, "step": 1933 }, { "epoch": 1.5609786858368015, "grad_norm": 0.055508337914943695, "learning_rate": 1.3953655535563415e-05, "loss": 0.1105, "step": 1934 }, { "epoch": 1.5617858494135453, "grad_norm": 0.05464039370417595, "learning_rate": 1.390483234980301e-05, "loss": 0.1092, "step": 1935 }, { "epoch": 1.562593012990289, "grad_norm": 0.05162185803055763, "learning_rate": 1.385608092998032e-05, "loss": 0.1028, "step": 1936 }, { "epoch": 1.5634001765670325, "grad_norm": 0.054172419011592865, "learning_rate": 1.3807401373024969e-05, "loss": 0.1048, "step": 1937 }, { "epoch": 1.564207340143776, "grad_norm": 0.06221926584839821, "learning_rate": 1.3758793775723693e-05, "loss": 0.1115, "step": 1938 }, { "epoch": 1.5650145037205196, "grad_norm": 0.05670136958360672, "learning_rate": 1.3710258234720192e-05, "loss": 0.0975, "step": 1939 }, { "epoch": 1.5658216672972634, "grad_norm": 0.05433516204357147, "learning_rate": 1.3661794846514846e-05, "loss": 0.1021, "step": 1940 }, { "epoch": 1.5666288308740068, "grad_norm": 0.0568232461810112, "learning_rate": 1.3613403707464639e-05, "loss": 0.1076, "step": 1941 }, { "epoch": 1.5674359944507503, "grad_norm": 0.04918437823653221, "learning_rate": 1.3565084913782867e-05, "loss": 0.1083, "step": 1942 }, { "epoch": 1.568243158027494, "grad_norm": 0.057148393243551254, "learning_rate": 1.351683856153902e-05, "loss": 0.1155, "step": 1943 }, { "epoch": 1.5690503216042377, "grad_norm": 0.05117778852581978, "learning_rate": 1.34686647466585e-05, "loss": 0.097, "step": 1944 }, { "epoch": 1.5698574851809812, "grad_norm": 0.05000148341059685, "learning_rate": 1.342056356492255e-05, "loss": 0.1017, "step": 1945 }, { "epoch": 1.5706646487577247, "grad_norm": 0.056538600474596024, "learning_rate": 1.3372535111967949e-05, "loss": 0.1086, "step": 1946 }, { "epoch": 1.5714718123344684, "grad_norm": 0.05574408918619156, "learning_rate": 1.3324579483286908e-05, "loss": 0.1076, "step": 1947 }, { "epoch": 1.5722789759112121, "grad_norm": 0.05071093514561653, "learning_rate": 1.3276696774226832e-05, "loss": 0.1052, "step": 1948 }, { "epoch": 1.5730861394879556, "grad_norm": 0.05552903190255165, "learning_rate": 1.3228887079990153e-05, "loss": 0.104, "step": 1949 }, { "epoch": 1.573893303064699, "grad_norm": 0.059206344187259674, "learning_rate": 1.3181150495634138e-05, "loss": 0.1058, "step": 1950 }, { "epoch": 1.5747004666414428, "grad_norm": 0.06013191491365433, "learning_rate": 1.3133487116070643e-05, "loss": 0.1091, "step": 1951 }, { "epoch": 1.5755076302181865, "grad_norm": 0.05298176035284996, "learning_rate": 1.3085897036066058e-05, "loss": 0.0979, "step": 1952 }, { "epoch": 1.57631479379493, "grad_norm": 0.052545368671417236, "learning_rate": 1.3038380350240947e-05, "loss": 0.1074, "step": 1953 }, { "epoch": 1.5771219573716735, "grad_norm": 0.05258338898420334, "learning_rate": 1.299093715307002e-05, "loss": 0.1016, "step": 1954 }, { "epoch": 1.5779291209484172, "grad_norm": 0.0619804747402668, "learning_rate": 1.294356753888184e-05, "loss": 0.107, "step": 1955 }, { "epoch": 1.578736284525161, "grad_norm": 0.05426359176635742, "learning_rate": 1.289627160185869e-05, "loss": 0.108, "step": 1956 }, { "epoch": 1.5795434481019044, "grad_norm": 0.05034402757883072, "learning_rate": 1.2849049436036326e-05, "loss": 0.0956, "step": 1957 }, { "epoch": 1.5803506116786479, "grad_norm": 0.05863840505480766, "learning_rate": 1.2801901135303879e-05, "loss": 0.0968, "step": 1958 }, { "epoch": 1.5811577752553916, "grad_norm": 0.055555909872055054, "learning_rate": 1.2754826793403562e-05, "loss": 0.1049, "step": 1959 }, { "epoch": 1.5819649388321353, "grad_norm": 0.051523443311452866, "learning_rate": 1.2707826503930592e-05, "loss": 0.0996, "step": 1960 }, { "epoch": 1.5827721024088788, "grad_norm": 0.055236246436834335, "learning_rate": 1.2660900360332928e-05, "loss": 0.0929, "step": 1961 }, { "epoch": 1.5835792659856223, "grad_norm": 0.05559065565466881, "learning_rate": 1.2614048455911121e-05, "loss": 0.1081, "step": 1962 }, { "epoch": 1.584386429562366, "grad_norm": 0.05728769674897194, "learning_rate": 1.256727088381809e-05, "loss": 0.102, "step": 1963 }, { "epoch": 1.5851935931391097, "grad_norm": 0.055622417479753494, "learning_rate": 1.252056773705898e-05, "loss": 0.0963, "step": 1964 }, { "epoch": 1.5860007567158532, "grad_norm": 0.04982447624206543, "learning_rate": 1.2473939108490973e-05, "loss": 0.1107, "step": 1965 }, { "epoch": 1.5868079202925967, "grad_norm": 0.05720730870962143, "learning_rate": 1.2427385090823073e-05, "loss": 0.1094, "step": 1966 }, { "epoch": 1.5876150838693404, "grad_norm": 0.05976979807019234, "learning_rate": 1.2380905776615958e-05, "loss": 0.1053, "step": 1967 }, { "epoch": 1.588422247446084, "grad_norm": 0.053743358701467514, "learning_rate": 1.2334501258281745e-05, "loss": 0.0989, "step": 1968 }, { "epoch": 1.5892294110228276, "grad_norm": 0.05539080873131752, "learning_rate": 1.2288171628083883e-05, "loss": 0.1056, "step": 1969 }, { "epoch": 1.590036574599571, "grad_norm": 0.04971492290496826, "learning_rate": 1.2241916978136864e-05, "loss": 0.1039, "step": 1970 }, { "epoch": 1.5908437381763147, "grad_norm": 0.058353058993816376, "learning_rate": 1.2195737400406166e-05, "loss": 0.1072, "step": 1971 }, { "epoch": 1.5916509017530585, "grad_norm": 0.060463909059762955, "learning_rate": 1.2149632986707964e-05, "loss": 0.1023, "step": 1972 }, { "epoch": 1.592458065329802, "grad_norm": 0.05185023695230484, "learning_rate": 1.2103603828709021e-05, "loss": 0.1049, "step": 1973 }, { "epoch": 1.5932652289065454, "grad_norm": 0.057643067091703415, "learning_rate": 1.205765001792643e-05, "loss": 0.1083, "step": 1974 }, { "epoch": 1.5940723924832891, "grad_norm": 0.05921928212046623, "learning_rate": 1.201177164572752e-05, "loss": 0.1157, "step": 1975 }, { "epoch": 1.5948795560600328, "grad_norm": 0.05458223819732666, "learning_rate": 1.1965968803329585e-05, "loss": 0.1088, "step": 1976 }, { "epoch": 1.5956867196367766, "grad_norm": 0.056385304778814316, "learning_rate": 1.1920241581799791e-05, "loss": 0.1104, "step": 1977 }, { "epoch": 1.59649388321352, "grad_norm": 0.05384686216711998, "learning_rate": 1.1874590072054925e-05, "loss": 0.1065, "step": 1978 }, { "epoch": 1.5973010467902635, "grad_norm": 0.05801048502326012, "learning_rate": 1.1829014364861251e-05, "loss": 0.1093, "step": 1979 }, { "epoch": 1.5981082103670072, "grad_norm": 0.053353242576122284, "learning_rate": 1.178351455083433e-05, "loss": 0.1014, "step": 1980 }, { "epoch": 1.598915373943751, "grad_norm": 0.054095447063446045, "learning_rate": 1.1738090720438782e-05, "loss": 0.1051, "step": 1981 }, { "epoch": 1.5997225375204944, "grad_norm": 0.058450426906347275, "learning_rate": 1.1692742963988223e-05, "loss": 0.113, "step": 1982 }, { "epoch": 1.600529701097238, "grad_norm": 0.052430205047130585, "learning_rate": 1.164747137164494e-05, "loss": 0.1, "step": 1983 }, { "epoch": 1.6013368646739816, "grad_norm": 0.05620815232396126, "learning_rate": 1.1602276033419856e-05, "loss": 0.1067, "step": 1984 }, { "epoch": 1.6021440282507253, "grad_norm": 0.05659813806414604, "learning_rate": 1.1557157039172239e-05, "loss": 0.1073, "step": 1985 }, { "epoch": 1.6029511918274688, "grad_norm": 0.054069194942712784, "learning_rate": 1.1512114478609598e-05, "loss": 0.1048, "step": 1986 }, { "epoch": 1.6037583554042123, "grad_norm": 0.05347205698490143, "learning_rate": 1.1467148441287423e-05, "loss": 0.1, "step": 1987 }, { "epoch": 1.604565518980956, "grad_norm": 0.055569425225257874, "learning_rate": 1.1422259016609127e-05, "loss": 0.1024, "step": 1988 }, { "epoch": 1.6053726825576997, "grad_norm": 0.05994495749473572, "learning_rate": 1.1377446293825717e-05, "loss": 0.0995, "step": 1989 }, { "epoch": 1.6061798461344432, "grad_norm": 0.05607868358492851, "learning_rate": 1.1332710362035791e-05, "loss": 0.1058, "step": 1990 }, { "epoch": 1.6069870097111867, "grad_norm": 0.04957025125622749, "learning_rate": 1.1288051310185182e-05, "loss": 0.1101, "step": 1991 }, { "epoch": 1.6077941732879304, "grad_norm": 0.05893281474709511, "learning_rate": 1.1243469227066916e-05, "loss": 0.1017, "step": 1992 }, { "epoch": 1.608601336864674, "grad_norm": 0.05835820734500885, "learning_rate": 1.1198964201320994e-05, "loss": 0.101, "step": 1993 }, { "epoch": 1.6094085004414176, "grad_norm": 0.06377571821212769, "learning_rate": 1.1154536321434157e-05, "loss": 0.1106, "step": 1994 }, { "epoch": 1.610215664018161, "grad_norm": 0.058504946529865265, "learning_rate": 1.1110185675739803e-05, "loss": 0.0943, "step": 1995 }, { "epoch": 1.6110228275949048, "grad_norm": 0.05540391430258751, "learning_rate": 1.1065912352417768e-05, "loss": 0.1027, "step": 1996 }, { "epoch": 1.6118299911716485, "grad_norm": 0.05161216855049133, "learning_rate": 1.1021716439494156e-05, "loss": 0.0974, "step": 1997 }, { "epoch": 1.612637154748392, "grad_norm": 0.052085842937231064, "learning_rate": 1.0977598024841117e-05, "loss": 0.109, "step": 1998 }, { "epoch": 1.6134443183251355, "grad_norm": 0.055512141436338425, "learning_rate": 1.093355719617678e-05, "loss": 0.1023, "step": 1999 }, { "epoch": 1.6142514819018792, "grad_norm": 0.05392886698246002, "learning_rate": 1.0889594041064954e-05, "loss": 0.0955, "step": 2000 }, { "epoch": 1.6150586454786229, "grad_norm": 0.05657012015581131, "learning_rate": 1.0845708646915054e-05, "loss": 0.1031, "step": 2001 }, { "epoch": 1.6158658090553664, "grad_norm": 0.05190359055995941, "learning_rate": 1.0801901100981876e-05, "loss": 0.0995, "step": 2002 }, { "epoch": 1.6166729726321099, "grad_norm": 0.05486932396888733, "learning_rate": 1.0758171490365443e-05, "loss": 0.0986, "step": 2003 }, { "epoch": 1.6174801362088536, "grad_norm": 0.0508626364171505, "learning_rate": 1.0714519902010794e-05, "loss": 0.1026, "step": 2004 }, { "epoch": 1.6182872997855973, "grad_norm": 0.05402788892388344, "learning_rate": 1.0670946422707883e-05, "loss": 0.1022, "step": 2005 }, { "epoch": 1.6190944633623408, "grad_norm": 0.05172837898135185, "learning_rate": 1.0627451139091321e-05, "loss": 0.095, "step": 2006 }, { "epoch": 1.6199016269390842, "grad_norm": 0.061015430837869644, "learning_rate": 1.0584034137640281e-05, "loss": 0.1064, "step": 2007 }, { "epoch": 1.620708790515828, "grad_norm": 0.0507437027990818, "learning_rate": 1.0540695504678283e-05, "loss": 0.0994, "step": 2008 }, { "epoch": 1.6215159540925717, "grad_norm": 0.05702191963791847, "learning_rate": 1.0497435326373022e-05, "loss": 0.099, "step": 2009 }, { "epoch": 1.6223231176693151, "grad_norm": 0.05072639882564545, "learning_rate": 1.0454253688736226e-05, "loss": 0.1089, "step": 2010 }, { "epoch": 1.6231302812460586, "grad_norm": 0.05342007428407669, "learning_rate": 1.0411150677623438e-05, "loss": 0.1001, "step": 2011 }, { "epoch": 1.6239374448228023, "grad_norm": 0.05594373866915703, "learning_rate": 1.0368126378733895e-05, "loss": 0.1086, "step": 2012 }, { "epoch": 1.624744608399546, "grad_norm": 0.05941212549805641, "learning_rate": 1.0325180877610312e-05, "loss": 0.1136, "step": 2013 }, { "epoch": 1.6255517719762895, "grad_norm": 0.054361309856176376, "learning_rate": 1.0282314259638753e-05, "loss": 0.1145, "step": 2014 }, { "epoch": 1.626358935553033, "grad_norm": 0.06129881367087364, "learning_rate": 1.023952661004845e-05, "loss": 0.1058, "step": 2015 }, { "epoch": 1.6271660991297767, "grad_norm": 0.06263244152069092, "learning_rate": 1.0196818013911625e-05, "loss": 0.0968, "step": 2016 }, { "epoch": 1.6279732627065204, "grad_norm": 0.05637633427977562, "learning_rate": 1.0154188556143285e-05, "loss": 0.1039, "step": 2017 }, { "epoch": 1.628780426283264, "grad_norm": 0.061631835997104645, "learning_rate": 1.0111638321501149e-05, "loss": 0.108, "step": 2018 }, { "epoch": 1.6295875898600074, "grad_norm": 0.06122696027159691, "learning_rate": 1.006916739458535e-05, "loss": 0.0965, "step": 2019 }, { "epoch": 1.630394753436751, "grad_norm": 0.0557895265519619, "learning_rate": 1.0026775859838417e-05, "loss": 0.0943, "step": 2020 }, { "epoch": 1.6312019170134948, "grad_norm": 0.06163759157061577, "learning_rate": 9.98446380154499e-06, "loss": 0.1064, "step": 2021 }, { "epoch": 1.6320090805902385, "grad_norm": 0.057027336210012436, "learning_rate": 9.942231303831678e-06, "loss": 0.1039, "step": 2022 }, { "epoch": 1.632816244166982, "grad_norm": 0.05407130345702171, "learning_rate": 9.900078450666927e-06, "loss": 0.1003, "step": 2023 }, { "epoch": 1.6336234077437255, "grad_norm": 0.0556282214820385, "learning_rate": 9.858005325860808e-06, "loss": 0.1141, "step": 2024 }, { "epoch": 1.6344305713204692, "grad_norm": 0.052581753581762314, "learning_rate": 9.81601201306489e-06, "loss": 0.0922, "step": 2025 }, { "epoch": 1.635237734897213, "grad_norm": 0.050861701369285583, "learning_rate": 9.774098595772058e-06, "loss": 0.0883, "step": 2026 }, { "epoch": 1.6360448984739564, "grad_norm": 0.05524609610438347, "learning_rate": 9.732265157316345e-06, "loss": 0.0996, "step": 2027 }, { "epoch": 1.6368520620506999, "grad_norm": 0.057807937264442444, "learning_rate": 9.69051178087274e-06, "loss": 0.1059, "step": 2028 }, { "epoch": 1.6376592256274436, "grad_norm": 0.050353292375802994, "learning_rate": 9.6488385494571e-06, "loss": 0.0964, "step": 2029 }, { "epoch": 1.6384663892041873, "grad_norm": 0.050203051418066025, "learning_rate": 9.60724554592588e-06, "loss": 0.0977, "step": 2030 }, { "epoch": 1.6392735527809308, "grad_norm": 0.04976612329483032, "learning_rate": 9.56573285297605e-06, "loss": 0.1016, "step": 2031 }, { "epoch": 1.6400807163576743, "grad_norm": 0.05390538275241852, "learning_rate": 9.524300553144905e-06, "loss": 0.0991, "step": 2032 }, { "epoch": 1.640887879934418, "grad_norm": 0.05189698189496994, "learning_rate": 9.48294872880991e-06, "loss": 0.1036, "step": 2033 }, { "epoch": 1.6416950435111617, "grad_norm": 0.057177428156137466, "learning_rate": 9.441677462188486e-06, "loss": 0.1003, "step": 2034 }, { "epoch": 1.6425022070879052, "grad_norm": 0.061864301562309265, "learning_rate": 9.400486835337913e-06, "loss": 0.1125, "step": 2035 }, { "epoch": 1.6433093706646487, "grad_norm": 0.05714981257915497, "learning_rate": 9.359376930155157e-06, "loss": 0.1033, "step": 2036 }, { "epoch": 1.6441165342413924, "grad_norm": 0.05356486514210701, "learning_rate": 9.318347828376639e-06, "loss": 0.1012, "step": 2037 }, { "epoch": 1.644923697818136, "grad_norm": 0.061940163373947144, "learning_rate": 9.277399611578175e-06, "loss": 0.1105, "step": 2038 }, { "epoch": 1.6457308613948796, "grad_norm": 0.04700857773423195, "learning_rate": 9.236532361174726e-06, "loss": 0.0999, "step": 2039 }, { "epoch": 1.646538024971623, "grad_norm": 0.05263765901327133, "learning_rate": 9.195746158420304e-06, "loss": 0.1044, "step": 2040 }, { "epoch": 1.6473451885483668, "grad_norm": 0.05332957208156586, "learning_rate": 9.15504108440774e-06, "loss": 0.1058, "step": 2041 }, { "epoch": 1.6481523521251105, "grad_norm": 0.057344742119312286, "learning_rate": 9.114417220068604e-06, "loss": 0.1151, "step": 2042 }, { "epoch": 1.648959515701854, "grad_norm": 0.058973751962184906, "learning_rate": 9.073874646172958e-06, "loss": 0.1117, "step": 2043 }, { "epoch": 1.6497666792785974, "grad_norm": 0.058070022612810135, "learning_rate": 9.03341344332927e-06, "loss": 0.1028, "step": 2044 }, { "epoch": 1.6505738428553411, "grad_norm": 0.052241433411836624, "learning_rate": 8.993033691984215e-06, "loss": 0.1051, "step": 2045 }, { "epoch": 1.6513810064320849, "grad_norm": 0.05317528173327446, "learning_rate": 8.95273547242253e-06, "loss": 0.0959, "step": 2046 }, { "epoch": 1.6521881700088283, "grad_norm": 0.053493160754442215, "learning_rate": 8.912518864766816e-06, "loss": 0.1012, "step": 2047 }, { "epoch": 1.6529953335855718, "grad_norm": 0.05735216662287712, "learning_rate": 8.872383948977459e-06, "loss": 0.1102, "step": 2048 }, { "epoch": 1.6529953335855718, "eval_loss": 0.11343793570995331, "eval_runtime": 4035.456, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "step": 2048 }, { "epoch": 1.6538024971623155, "grad_norm": 0.05593269690871239, "learning_rate": 8.832330804852352e-06, "loss": 0.1025, "step": 2049 }, { "epoch": 1.6546096607390592, "grad_norm": 0.055891454219818115, "learning_rate": 8.792359512026894e-06, "loss": 0.1052, "step": 2050 }, { "epoch": 1.6554168243158027, "grad_norm": 0.04758215323090553, "learning_rate": 8.752470149973684e-06, "loss": 0.1037, "step": 2051 }, { "epoch": 1.6562239878925462, "grad_norm": 0.054334938526153564, "learning_rate": 8.71266279800243e-06, "loss": 0.1051, "step": 2052 }, { "epoch": 1.65703115146929, "grad_norm": 0.050931211560964584, "learning_rate": 8.672937535259812e-06, "loss": 0.0961, "step": 2053 }, { "epoch": 1.6578383150460336, "grad_norm": 0.05113285034894943, "learning_rate": 8.63329444072924e-06, "loss": 0.1069, "step": 2054 }, { "epoch": 1.6586454786227771, "grad_norm": 0.061175376176834106, "learning_rate": 8.593733593230813e-06, "loss": 0.1052, "step": 2055 }, { "epoch": 1.6594526421995206, "grad_norm": 0.053239572793245316, "learning_rate": 8.55425507142108e-06, "loss": 0.0962, "step": 2056 }, { "epoch": 1.6602598057762643, "grad_norm": 0.05916755646467209, "learning_rate": 8.51485895379291e-06, "loss": 0.103, "step": 2057 }, { "epoch": 1.661066969353008, "grad_norm": 0.05605355277657509, "learning_rate": 8.475545318675315e-06, "loss": 0.0938, "step": 2058 }, { "epoch": 1.6618741329297515, "grad_norm": 0.05565459281206131, "learning_rate": 8.43631424423334e-06, "loss": 0.1156, "step": 2059 }, { "epoch": 1.662681296506495, "grad_norm": 0.059311460703611374, "learning_rate": 8.39716580846785e-06, "loss": 0.101, "step": 2060 }, { "epoch": 1.6634884600832387, "grad_norm": 0.05521459877490997, "learning_rate": 8.358100089215426e-06, "loss": 0.1098, "step": 2061 }, { "epoch": 1.6642956236599824, "grad_norm": 0.05719214305281639, "learning_rate": 8.319117164148183e-06, "loss": 0.1115, "step": 2062 }, { "epoch": 1.665102787236726, "grad_norm": 0.05529273301362991, "learning_rate": 8.280217110773624e-06, "loss": 0.1014, "step": 2063 }, { "epoch": 1.6659099508134694, "grad_norm": 0.05324285849928856, "learning_rate": 8.241400006434486e-06, "loss": 0.0976, "step": 2064 }, { "epoch": 1.666717114390213, "grad_norm": 0.05101999267935753, "learning_rate": 8.20266592830855e-06, "loss": 0.0951, "step": 2065 }, { "epoch": 1.6675242779669568, "grad_norm": 0.054667405784130096, "learning_rate": 8.164014953408578e-06, "loss": 0.0976, "step": 2066 }, { "epoch": 1.6683314415437005, "grad_norm": 0.05465232580900192, "learning_rate": 8.125447158582044e-06, "loss": 0.0973, "step": 2067 }, { "epoch": 1.669138605120444, "grad_norm": 0.05029634013772011, "learning_rate": 8.086962620511079e-06, "loss": 0.0978, "step": 2068 }, { "epoch": 1.6699457686971875, "grad_norm": 0.053344808518886566, "learning_rate": 8.048561415712269e-06, "loss": 0.1088, "step": 2069 }, { "epoch": 1.6707529322739312, "grad_norm": 0.050939690321683884, "learning_rate": 8.010243620536528e-06, "loss": 0.1033, "step": 2070 }, { "epoch": 1.6715600958506749, "grad_norm": 0.05607762560248375, "learning_rate": 7.972009311168882e-06, "loss": 0.1042, "step": 2071 }, { "epoch": 1.6723672594274184, "grad_norm": 0.04818984493613243, "learning_rate": 7.933858563628438e-06, "loss": 0.0999, "step": 2072 }, { "epoch": 1.6731744230041619, "grad_norm": 0.05935560539364815, "learning_rate": 7.895791453768076e-06, "loss": 0.1078, "step": 2073 }, { "epoch": 1.6739815865809056, "grad_norm": 0.052476853132247925, "learning_rate": 7.857808057274486e-06, "loss": 0.0926, "step": 2074 }, { "epoch": 1.6747887501576493, "grad_norm": 0.0528566800057888, "learning_rate": 7.819908449667823e-06, "loss": 0.1001, "step": 2075 }, { "epoch": 1.6755959137343928, "grad_norm": 0.049353450536727905, "learning_rate": 7.782092706301719e-06, "loss": 0.0992, "step": 2076 }, { "epoch": 1.6764030773111362, "grad_norm": 0.05010814964771271, "learning_rate": 7.744360902363002e-06, "loss": 0.0967, "step": 2077 }, { "epoch": 1.67721024088788, "grad_norm": 0.05267290398478508, "learning_rate": 7.706713112871656e-06, "loss": 0.0953, "step": 2078 }, { "epoch": 1.6780174044646237, "grad_norm": 0.05748289078474045, "learning_rate": 7.669149412680605e-06, "loss": 0.0962, "step": 2079 }, { "epoch": 1.6788245680413671, "grad_norm": 0.053290631622076035, "learning_rate": 7.631669876475584e-06, "loss": 0.0981, "step": 2080 }, { "epoch": 1.6796317316181106, "grad_norm": 0.05639868229627609, "learning_rate": 7.5942745787750065e-06, "loss": 0.112, "step": 2081 }, { "epoch": 1.6804388951948543, "grad_norm": 0.052571941167116165, "learning_rate": 7.556963593929755e-06, "loss": 0.1045, "step": 2082 }, { "epoch": 1.681246058771598, "grad_norm": 0.057555168867111206, "learning_rate": 7.519736996123139e-06, "loss": 0.1038, "step": 2083 }, { "epoch": 1.6820532223483415, "grad_norm": 0.060305386781692505, "learning_rate": 7.482594859370618e-06, "loss": 0.1083, "step": 2084 }, { "epoch": 1.682860385925085, "grad_norm": 0.05349772423505783, "learning_rate": 7.445537257519774e-06, "loss": 0.1038, "step": 2085 }, { "epoch": 1.6836675495018287, "grad_norm": 0.047478146851062775, "learning_rate": 7.4085642642501005e-06, "loss": 0.0961, "step": 2086 }, { "epoch": 1.6844747130785724, "grad_norm": 0.04748820140957832, "learning_rate": 7.371675953072871e-06, "loss": 0.0913, "step": 2087 }, { "epoch": 1.685281876655316, "grad_norm": 0.051477715373039246, "learning_rate": 7.334872397330972e-06, "loss": 0.1023, "step": 2088 }, { "epoch": 1.6860890402320594, "grad_norm": 0.053738512098789215, "learning_rate": 7.298153670198798e-06, "loss": 0.1017, "step": 2089 }, { "epoch": 1.6868962038088031, "grad_norm": 0.05442385748028755, "learning_rate": 7.2615198446820574e-06, "loss": 0.098, "step": 2090 }, { "epoch": 1.6877033673855468, "grad_norm": 0.05031757801771164, "learning_rate": 7.224970993617686e-06, "loss": 0.1, "step": 2091 }, { "epoch": 1.6885105309622903, "grad_norm": 0.049417149275541306, "learning_rate": 7.188507189673649e-06, "loss": 0.1126, "step": 2092 }, { "epoch": 1.6893176945390338, "grad_norm": 0.0511915497481823, "learning_rate": 7.152128505348821e-06, "loss": 0.11, "step": 2093 }, { "epoch": 1.6901248581157775, "grad_norm": 0.05763173848390579, "learning_rate": 7.115835012972855e-06, "loss": 0.0993, "step": 2094 }, { "epoch": 1.6909320216925212, "grad_norm": 0.05264758691191673, "learning_rate": 7.079626784705978e-06, "loss": 0.1052, "step": 2095 }, { "epoch": 1.6917391852692647, "grad_norm": 0.05531957745552063, "learning_rate": 7.04350389253895e-06, "loss": 0.1082, "step": 2096 }, { "epoch": 1.6925463488460082, "grad_norm": 0.05216442048549652, "learning_rate": 7.007466408292801e-06, "loss": 0.1093, "step": 2097 }, { "epoch": 1.693353512422752, "grad_norm": 0.05685226246714592, "learning_rate": 6.971514403618801e-06, "loss": 0.0996, "step": 2098 }, { "epoch": 1.6941606759994956, "grad_norm": 0.054208770394325256, "learning_rate": 6.93564794999823e-06, "loss": 0.1203, "step": 2099 }, { "epoch": 1.694967839576239, "grad_norm": 0.05413041263818741, "learning_rate": 6.899867118742314e-06, "loss": 0.1009, "step": 2100 }, { "epoch": 1.6957750031529826, "grad_norm": 0.05898387357592583, "learning_rate": 6.864171980991985e-06, "loss": 0.0988, "step": 2101 }, { "epoch": 1.6965821667297263, "grad_norm": 0.05415396764874458, "learning_rate": 6.8285626077178474e-06, "loss": 0.1039, "step": 2102 }, { "epoch": 1.69738933030647, "grad_norm": 0.054988276213407516, "learning_rate": 6.793039069719926e-06, "loss": 0.1088, "step": 2103 }, { "epoch": 1.6981964938832135, "grad_norm": 0.05541759729385376, "learning_rate": 6.7576014376276645e-06, "loss": 0.105, "step": 2104 }, { "epoch": 1.699003657459957, "grad_norm": 0.05139584094285965, "learning_rate": 6.722249781899631e-06, "loss": 0.105, "step": 2105 }, { "epoch": 1.6998108210367007, "grad_norm": 0.05328596383333206, "learning_rate": 6.686984172823491e-06, "loss": 0.0935, "step": 2106 }, { "epoch": 1.7006179846134444, "grad_norm": 0.04749865084886551, "learning_rate": 6.6518046805158274e-06, "loss": 0.1068, "step": 2107 }, { "epoch": 1.7014251481901879, "grad_norm": 0.05210341513156891, "learning_rate": 6.616711374921975e-06, "loss": 0.1048, "step": 2108 }, { "epoch": 1.7022323117669316, "grad_norm": 0.05280543491244316, "learning_rate": 6.58170432581594e-06, "loss": 0.1022, "step": 2109 }, { "epoch": 1.703039475343675, "grad_norm": 0.058702826499938965, "learning_rate": 6.546783602800211e-06, "loss": 0.1033, "step": 2110 }, { "epoch": 1.7038466389204188, "grad_norm": 0.05437362939119339, "learning_rate": 6.5119492753056565e-06, "loss": 0.1001, "step": 2111 }, { "epoch": 1.7046538024971625, "grad_norm": 0.05018357187509537, "learning_rate": 6.477201412591338e-06, "loss": 0.0996, "step": 2112 }, { "epoch": 1.705460966073906, "grad_norm": 0.050614457577466965, "learning_rate": 6.4425400837444526e-06, "loss": 0.1083, "step": 2113 }, { "epoch": 1.7062681296506494, "grad_norm": 0.056503310799598694, "learning_rate": 6.407965357680084e-06, "loss": 0.1049, "step": 2114 }, { "epoch": 1.7070752932273932, "grad_norm": 0.05261369049549103, "learning_rate": 6.37347730314119e-06, "loss": 0.0983, "step": 2115 }, { "epoch": 1.7078824568041369, "grad_norm": 0.05572700873017311, "learning_rate": 6.339075988698367e-06, "loss": 0.1083, "step": 2116 }, { "epoch": 1.7086896203808803, "grad_norm": 0.05254945904016495, "learning_rate": 6.304761482749777e-06, "loss": 0.1138, "step": 2117 }, { "epoch": 1.7094967839576238, "grad_norm": 0.05212052911520004, "learning_rate": 6.270533853520949e-06, "loss": 0.102, "step": 2118 }, { "epoch": 1.7103039475343675, "grad_norm": 0.05497318133711815, "learning_rate": 6.2363931690647195e-06, "loss": 0.1024, "step": 2119 }, { "epoch": 1.7111111111111112, "grad_norm": 0.059619609266519547, "learning_rate": 6.202339497261028e-06, "loss": 0.109, "step": 2120 }, { "epoch": 1.7119182746878547, "grad_norm": 0.05584368854761124, "learning_rate": 6.168372905816821e-06, "loss": 0.116, "step": 2121 }, { "epoch": 1.7127254382645982, "grad_norm": 0.05195939168334007, "learning_rate": 6.134493462265928e-06, "loss": 0.1029, "step": 2122 }, { "epoch": 1.713532601841342, "grad_norm": 0.06104809790849686, "learning_rate": 6.100701233968875e-06, "loss": 0.1026, "step": 2123 }, { "epoch": 1.7143397654180856, "grad_norm": 0.05512300133705139, "learning_rate": 6.0669962881128195e-06, "loss": 0.1038, "step": 2124 }, { "epoch": 1.7151469289948291, "grad_norm": 0.05650010332465172, "learning_rate": 6.033378691711334e-06, "loss": 0.114, "step": 2125 }, { "epoch": 1.7159540925715726, "grad_norm": 0.056171122938394547, "learning_rate": 5.9998485116043614e-06, "loss": 0.0993, "step": 2126 }, { "epoch": 1.7167612561483163, "grad_norm": 0.055440809577703476, "learning_rate": 5.966405814457998e-06, "loss": 0.0956, "step": 2127 }, { "epoch": 1.71756841972506, "grad_norm": 0.05753964185714722, "learning_rate": 5.933050666764467e-06, "loss": 0.0978, "step": 2128 }, { "epoch": 1.7183755833018035, "grad_norm": 0.054824139922857285, "learning_rate": 5.899783134841846e-06, "loss": 0.0947, "step": 2129 }, { "epoch": 1.719182746878547, "grad_norm": 0.059018611907958984, "learning_rate": 5.866603284834077e-06, "loss": 0.1019, "step": 2130 }, { "epoch": 1.7199899104552907, "grad_norm": 0.05978579446673393, "learning_rate": 5.833511182710716e-06, "loss": 0.1019, "step": 2131 }, { "epoch": 1.7207970740320344, "grad_norm": 0.05853411555290222, "learning_rate": 5.8005068942669e-06, "loss": 0.0997, "step": 2132 }, { "epoch": 1.721604237608778, "grad_norm": 0.05306762829422951, "learning_rate": 5.76759048512312e-06, "loss": 0.1004, "step": 2133 }, { "epoch": 1.7224114011855214, "grad_norm": 0.05306650325655937, "learning_rate": 5.73476202072521e-06, "loss": 0.1062, "step": 2134 }, { "epoch": 1.723218564762265, "grad_norm": 0.05345844477415085, "learning_rate": 5.702021566344079e-06, "loss": 0.1036, "step": 2135 }, { "epoch": 1.7240257283390088, "grad_norm": 0.058258797973394394, "learning_rate": 5.6693691870756905e-06, "loss": 0.118, "step": 2136 }, { "epoch": 1.7248328919157523, "grad_norm": 0.05188046395778656, "learning_rate": 5.636804947840907e-06, "loss": 0.107, "step": 2137 }, { "epoch": 1.7256400554924958, "grad_norm": 0.049947090446949005, "learning_rate": 5.604328913385287e-06, "loss": 0.099, "step": 2138 }, { "epoch": 1.7264472190692395, "grad_norm": 0.05316640064120293, "learning_rate": 5.571941148279081e-06, "loss": 0.1084, "step": 2139 }, { "epoch": 1.7272543826459832, "grad_norm": 0.05948293209075928, "learning_rate": 5.539641716917004e-06, "loss": 0.0993, "step": 2140 }, { "epoch": 1.7280615462227267, "grad_norm": 0.052130237221717834, "learning_rate": 5.507430683518161e-06, "loss": 0.1105, "step": 2141 }, { "epoch": 1.7288687097994702, "grad_norm": 0.05353371798992157, "learning_rate": 5.475308112125871e-06, "loss": 0.1029, "step": 2142 }, { "epoch": 1.7296758733762139, "grad_norm": 0.053427476435899734, "learning_rate": 5.443274066607606e-06, "loss": 0.1006, "step": 2143 }, { "epoch": 1.7304830369529576, "grad_norm": 0.057806458324193954, "learning_rate": 5.4113286106547925e-06, "loss": 0.0931, "step": 2144 }, { "epoch": 1.731290200529701, "grad_norm": 0.056647028774023056, "learning_rate": 5.379471807782743e-06, "loss": 0.1098, "step": 2145 }, { "epoch": 1.7320973641064445, "grad_norm": 0.05371788889169693, "learning_rate": 5.3477037213304995e-06, "loss": 0.1008, "step": 2146 }, { "epoch": 1.7329045276831883, "grad_norm": 0.055092182010412216, "learning_rate": 5.3160244144607294e-06, "loss": 0.1028, "step": 2147 }, { "epoch": 1.733711691259932, "grad_norm": 0.05630061402916908, "learning_rate": 5.28443395015954e-06, "loss": 0.1016, "step": 2148 }, { "epoch": 1.7345188548366754, "grad_norm": 0.04999915510416031, "learning_rate": 5.252932391236443e-06, "loss": 0.1039, "step": 2149 }, { "epoch": 1.735326018413419, "grad_norm": 0.058542054146528244, "learning_rate": 5.221519800324181e-06, "loss": 0.1074, "step": 2150 }, { "epoch": 1.7361331819901626, "grad_norm": 0.05749601498246193, "learning_rate": 5.19019623987857e-06, "loss": 0.1061, "step": 2151 }, { "epoch": 1.7369403455669064, "grad_norm": 0.05236474797129631, "learning_rate": 5.15896177217845e-06, "loss": 0.0923, "step": 2152 }, { "epoch": 1.7377475091436498, "grad_norm": 0.058656200766563416, "learning_rate": 5.127816459325507e-06, "loss": 0.1078, "step": 2153 }, { "epoch": 1.7385546727203935, "grad_norm": 0.06022967770695686, "learning_rate": 5.09676036324418e-06, "loss": 0.116, "step": 2154 }, { "epoch": 1.739361836297137, "grad_norm": 0.05391302332282066, "learning_rate": 5.065793545681491e-06, "loss": 0.1057, "step": 2155 }, { "epoch": 1.7401689998738807, "grad_norm": 0.052378371357917786, "learning_rate": 5.034916068206996e-06, "loss": 0.1006, "step": 2156 }, { "epoch": 1.7409761634506244, "grad_norm": 0.061786018311977386, "learning_rate": 5.0041279922125705e-06, "loss": 0.1159, "step": 2157 }, { "epoch": 1.741783327027368, "grad_norm": 0.05633746087551117, "learning_rate": 4.973429378912409e-06, "loss": 0.1069, "step": 2158 }, { "epoch": 1.7425904906041114, "grad_norm": 0.048832159489393234, "learning_rate": 4.942820289342759e-06, "loss": 0.1093, "step": 2159 }, { "epoch": 1.7433976541808551, "grad_norm": 0.050932351499795914, "learning_rate": 4.912300784361923e-06, "loss": 0.0985, "step": 2160 }, { "epoch": 1.7442048177575988, "grad_norm": 0.05225188285112381, "learning_rate": 4.881870924650062e-06, "loss": 0.1018, "step": 2161 }, { "epoch": 1.7450119813343423, "grad_norm": 0.053992629051208496, "learning_rate": 4.851530770709112e-06, "loss": 0.1048, "step": 2162 }, { "epoch": 1.7458191449110858, "grad_norm": 0.051555927842855453, "learning_rate": 4.821280382862647e-06, "loss": 0.1011, "step": 2163 }, { "epoch": 1.7466263084878295, "grad_norm": 0.05575854331254959, "learning_rate": 4.791119821255769e-06, "loss": 0.1053, "step": 2164 }, { "epoch": 1.7474334720645732, "grad_norm": 0.055230334401130676, "learning_rate": 4.76104914585499e-06, "loss": 0.0827, "step": 2165 }, { "epoch": 1.7482406356413167, "grad_norm": 0.05285778269171715, "learning_rate": 4.731068416448081e-06, "loss": 0.1043, "step": 2166 }, { "epoch": 1.7490477992180602, "grad_norm": 0.05494954437017441, "learning_rate": 4.70117769264401e-06, "loss": 0.1095, "step": 2167 }, { "epoch": 1.749854962794804, "grad_norm": 0.05983591824769974, "learning_rate": 4.671377033872765e-06, "loss": 0.1096, "step": 2168 }, { "epoch": 1.7506621263715476, "grad_norm": 0.05807788670063019, "learning_rate": 4.641666499385278e-06, "loss": 0.1113, "step": 2169 }, { "epoch": 1.751469289948291, "grad_norm": 0.05357687547802925, "learning_rate": 4.612046148253291e-06, "loss": 0.1078, "step": 2170 }, { "epoch": 1.7522764535250346, "grad_norm": 0.060467056930065155, "learning_rate": 4.5825160393692445e-06, "loss": 0.1065, "step": 2171 }, { "epoch": 1.7530836171017783, "grad_norm": 0.05593674257397652, "learning_rate": 4.55307623144614e-06, "loss": 0.1055, "step": 2172 }, { "epoch": 1.753890780678522, "grad_norm": 0.055938106030225754, "learning_rate": 4.523726783017457e-06, "loss": 0.1018, "step": 2173 }, { "epoch": 1.7546979442552655, "grad_norm": 0.05240335315465927, "learning_rate": 4.494467752436993e-06, "loss": 0.1066, "step": 2174 }, { "epoch": 1.755505107832009, "grad_norm": 0.05795678868889809, "learning_rate": 4.465299197878797e-06, "loss": 0.1054, "step": 2175 }, { "epoch": 1.7563122714087527, "grad_norm": 0.05561627075076103, "learning_rate": 4.43622117733703e-06, "loss": 0.0973, "step": 2176 }, { "epoch": 1.7563122714087527, "eval_loss": 0.11313911527395248, "eval_runtime": 4032.6434, "eval_samples_per_second": 2.207, "eval_steps_per_second": 2.207, "step": 2176 }, { "epoch": 1.7571194349854964, "grad_norm": 0.05447455495595932, "learning_rate": 4.407233748625839e-06, "loss": 0.1044, "step": 2177 }, { "epoch": 1.7579265985622399, "grad_norm": 0.05230431631207466, "learning_rate": 4.378336969379243e-06, "loss": 0.1029, "step": 2178 }, { "epoch": 1.7587337621389834, "grad_norm": 0.05118807405233383, "learning_rate": 4.349530897051047e-06, "loss": 0.1013, "step": 2179 }, { "epoch": 1.759540925715727, "grad_norm": 0.05441461503505707, "learning_rate": 4.320815588914706e-06, "loss": 0.1049, "step": 2180 }, { "epoch": 1.7603480892924708, "grad_norm": 0.060378219932317734, "learning_rate": 4.292191102063192e-06, "loss": 0.11, "step": 2181 }, { "epoch": 1.7611552528692143, "grad_norm": 0.05949053913354874, "learning_rate": 4.263657493408951e-06, "loss": 0.1033, "step": 2182 }, { "epoch": 1.7619624164459577, "grad_norm": 0.052824269980192184, "learning_rate": 4.23521481968368e-06, "loss": 0.0949, "step": 2183 }, { "epoch": 1.7627695800227015, "grad_norm": 0.0537712462246418, "learning_rate": 4.206863137438327e-06, "loss": 0.111, "step": 2184 }, { "epoch": 1.7635767435994452, "grad_norm": 0.05734507739543915, "learning_rate": 4.178602503042878e-06, "loss": 0.1116, "step": 2185 }, { "epoch": 1.7643839071761886, "grad_norm": 0.06269229203462601, "learning_rate": 4.150432972686352e-06, "loss": 0.1162, "step": 2186 }, { "epoch": 1.7651910707529321, "grad_norm": 0.05198785290122032, "learning_rate": 4.12235460237656e-06, "loss": 0.1112, "step": 2187 }, { "epoch": 1.7659982343296758, "grad_norm": 0.052965350449085236, "learning_rate": 4.094367447940151e-06, "loss": 0.1052, "step": 2188 }, { "epoch": 1.7668053979064196, "grad_norm": 0.054847247898578644, "learning_rate": 4.066471565022334e-06, "loss": 0.1024, "step": 2189 }, { "epoch": 1.767612561483163, "grad_norm": 0.05498647317290306, "learning_rate": 4.038667009086905e-06, "loss": 0.0977, "step": 2190 }, { "epoch": 1.7684197250599065, "grad_norm": 0.054329607635736465, "learning_rate": 4.010953835416037e-06, "loss": 0.1027, "step": 2191 }, { "epoch": 1.7692268886366502, "grad_norm": 0.05932670831680298, "learning_rate": 3.983332099110237e-06, "loss": 0.1023, "step": 2192 }, { "epoch": 1.770034052213394, "grad_norm": 0.057906486093997955, "learning_rate": 3.95580185508822e-06, "loss": 0.0959, "step": 2193 }, { "epoch": 1.7708412157901374, "grad_norm": 0.05080775171518326, "learning_rate": 3.9283631580867674e-06, "loss": 0.1143, "step": 2194 }, { "epoch": 1.771648379366881, "grad_norm": 0.04834539815783501, "learning_rate": 3.901016062660673e-06, "loss": 0.0988, "step": 2195 }, { "epoch": 1.7724555429436246, "grad_norm": 0.05879495292901993, "learning_rate": 3.87376062318257e-06, "loss": 0.1037, "step": 2196 }, { "epoch": 1.7732627065203683, "grad_norm": 0.058465395122766495, "learning_rate": 3.846596893842891e-06, "loss": 0.1025, "step": 2197 }, { "epoch": 1.774069870097112, "grad_norm": 0.057317476719617844, "learning_rate": 3.819524928649692e-06, "loss": 0.1027, "step": 2198 }, { "epoch": 1.7748770336738555, "grad_norm": 0.05100231617689133, "learning_rate": 3.7925447814286087e-06, "loss": 0.0916, "step": 2199 }, { "epoch": 1.775684197250599, "grad_norm": 0.05537811294198036, "learning_rate": 3.765656505822707e-06, "loss": 0.099, "step": 2200 }, { "epoch": 1.7764913608273427, "grad_norm": 0.055858466774225235, "learning_rate": 3.7388601552924062e-06, "loss": 0.1019, "step": 2201 }, { "epoch": 1.7772985244040864, "grad_norm": 0.051511071622371674, "learning_rate": 3.712155783115323e-06, "loss": 0.1102, "step": 2202 }, { "epoch": 1.77810568798083, "grad_norm": 0.05437035113573074, "learning_rate": 3.6855434423862355e-06, "loss": 0.0975, "step": 2203 }, { "epoch": 1.7789128515575734, "grad_norm": 0.0597914382815361, "learning_rate": 3.6590231860169077e-06, "loss": 0.1032, "step": 2204 }, { "epoch": 1.779720015134317, "grad_norm": 0.05074330046772957, "learning_rate": 3.6325950667360444e-06, "loss": 0.1029, "step": 2205 }, { "epoch": 1.7805271787110608, "grad_norm": 0.053755007684230804, "learning_rate": 3.606259137089141e-06, "loss": 0.0987, "step": 2206 }, { "epoch": 1.7813343422878043, "grad_norm": 0.048825155943632126, "learning_rate": 3.5800154494384175e-06, "loss": 0.1043, "step": 2207 }, { "epoch": 1.7821415058645478, "grad_norm": 0.05263788625597954, "learning_rate": 3.5538640559626857e-06, "loss": 0.1056, "step": 2208 }, { "epoch": 1.7829486694412915, "grad_norm": 0.050162188708782196, "learning_rate": 3.5278050086572314e-06, "loss": 0.1026, "step": 2209 }, { "epoch": 1.7837558330180352, "grad_norm": 0.05450249835848808, "learning_rate": 3.5018383593337754e-06, "loss": 0.1115, "step": 2210 }, { "epoch": 1.7845629965947787, "grad_norm": 0.061941396445035934, "learning_rate": 3.4759641596202762e-06, "loss": 0.1077, "step": 2211 }, { "epoch": 1.7853701601715222, "grad_norm": 0.05394100770354271, "learning_rate": 3.4501824609609546e-06, "loss": 0.1051, "step": 2212 }, { "epoch": 1.7861773237482659, "grad_norm": 0.050046734511852264, "learning_rate": 3.4244933146160395e-06, "loss": 0.1061, "step": 2213 }, { "epoch": 1.7869844873250096, "grad_norm": 0.052463337779045105, "learning_rate": 3.398896771661797e-06, "loss": 0.1036, "step": 2214 }, { "epoch": 1.787791650901753, "grad_norm": 0.05225534364581108, "learning_rate": 3.3733928829903395e-06, "loss": 0.1059, "step": 2215 }, { "epoch": 1.7885988144784966, "grad_norm": 0.051552556455135345, "learning_rate": 3.347981699309588e-06, "loss": 0.1005, "step": 2216 }, { "epoch": 1.7894059780552403, "grad_norm": 0.0524485781788826, "learning_rate": 3.3226632711431115e-06, "loss": 0.0961, "step": 2217 }, { "epoch": 1.790213141631984, "grad_norm": 0.054720714688301086, "learning_rate": 3.297437648830115e-06, "loss": 0.0976, "step": 2218 }, { "epoch": 1.7910203052087275, "grad_norm": 0.056126561015844345, "learning_rate": 3.2723048825252177e-06, "loss": 0.1056, "step": 2219 }, { "epoch": 1.791827468785471, "grad_norm": 0.04973268508911133, "learning_rate": 3.2472650221984537e-06, "loss": 0.0957, "step": 2220 }, { "epoch": 1.7926346323622147, "grad_norm": 0.055015917867422104, "learning_rate": 3.2223181176351426e-06, "loss": 0.1037, "step": 2221 }, { "epoch": 1.7934417959389584, "grad_norm": 0.050197046250104904, "learning_rate": 3.197464218435764e-06, "loss": 0.1054, "step": 2222 }, { "epoch": 1.7942489595157018, "grad_norm": 0.050583165138959885, "learning_rate": 3.172703374015884e-06, "loss": 0.1098, "step": 2223 }, { "epoch": 1.7950561230924453, "grad_norm": 0.05217958241701126, "learning_rate": 3.148035633606072e-06, "loss": 0.0888, "step": 2224 }, { "epoch": 1.795863286669189, "grad_norm": 0.052840378135442734, "learning_rate": 3.12346104625178e-06, "loss": 0.107, "step": 2225 }, { "epoch": 1.7966704502459327, "grad_norm": 0.061937116086483, "learning_rate": 3.098979660813217e-06, "loss": 0.1109, "step": 2226 }, { "epoch": 1.7974776138226762, "grad_norm": 0.0525975376367569, "learning_rate": 3.074591525965331e-06, "loss": 0.0999, "step": 2227 }, { "epoch": 1.7982847773994197, "grad_norm": 0.05639638751745224, "learning_rate": 3.0502966901976237e-06, "loss": 0.0965, "step": 2228 }, { "epoch": 1.7990919409761634, "grad_norm": 0.05193613842129707, "learning_rate": 3.026095201814122e-06, "loss": 0.1067, "step": 2229 }, { "epoch": 1.7998991045529071, "grad_norm": 0.058059584349393845, "learning_rate": 3.001987108933246e-06, "loss": 0.1051, "step": 2230 }, { "epoch": 1.8007062681296506, "grad_norm": 0.04914858937263489, "learning_rate": 2.977972459487738e-06, "loss": 0.0986, "step": 2231 }, { "epoch": 1.801513431706394, "grad_norm": 0.05680011212825775, "learning_rate": 2.9540513012245197e-06, "loss": 0.1104, "step": 2232 }, { "epoch": 1.8023205952831378, "grad_norm": 0.05547124147415161, "learning_rate": 2.9302236817046634e-06, "loss": 0.1138, "step": 2233 }, { "epoch": 1.8031277588598815, "grad_norm": 0.053021933883428574, "learning_rate": 2.90648964830324e-06, "loss": 0.104, "step": 2234 }, { "epoch": 1.803934922436625, "grad_norm": 0.058564409613609314, "learning_rate": 2.8828492482092575e-06, "loss": 0.0946, "step": 2235 }, { "epoch": 1.8047420860133685, "grad_norm": 0.052088137716054916, "learning_rate": 2.8593025284255614e-06, "loss": 0.1092, "step": 2236 }, { "epoch": 1.8055492495901122, "grad_norm": 0.04589877650141716, "learning_rate": 2.8358495357687364e-06, "loss": 0.1012, "step": 2237 }, { "epoch": 1.806356413166856, "grad_norm": 0.056709449738264084, "learning_rate": 2.8124903168690153e-06, "loss": 0.1054, "step": 2238 }, { "epoch": 1.8071635767435994, "grad_norm": 0.057234060019254684, "learning_rate": 2.7892249181701802e-06, "loss": 0.0993, "step": 2239 }, { "epoch": 1.8079707403203429, "grad_norm": 0.05641083046793938, "learning_rate": 2.7660533859294847e-06, "loss": 0.1064, "step": 2240 }, { "epoch": 1.8087779038970866, "grad_norm": 0.058661896735429764, "learning_rate": 2.7429757662175314e-06, "loss": 0.1117, "step": 2241 }, { "epoch": 1.8095850674738303, "grad_norm": 0.05297205597162247, "learning_rate": 2.7199921049182455e-06, "loss": 0.0996, "step": 2242 }, { "epoch": 1.810392231050574, "grad_norm": 0.05201645940542221, "learning_rate": 2.6971024477287e-06, "loss": 0.1066, "step": 2243 }, { "epoch": 1.8111993946273175, "grad_norm": 0.05294370278716087, "learning_rate": 2.6743068401590798e-06, "loss": 0.1036, "step": 2244 }, { "epoch": 1.812006558204061, "grad_norm": 0.05200006067752838, "learning_rate": 2.651605327532569e-06, "loss": 0.1115, "step": 2245 }, { "epoch": 1.8128137217808047, "grad_norm": 0.049929820001125336, "learning_rate": 2.6289979549852795e-06, "loss": 0.0957, "step": 2246 }, { "epoch": 1.8136208853575484, "grad_norm": 0.053833093494176865, "learning_rate": 2.6064847674661496e-06, "loss": 0.1085, "step": 2247 }, { "epoch": 1.8144280489342919, "grad_norm": 0.05641954019665718, "learning_rate": 2.584065809736852e-06, "loss": 0.0969, "step": 2248 }, { "epoch": 1.8152352125110354, "grad_norm": 0.05503935366868973, "learning_rate": 2.561741126371692e-06, "loss": 0.1005, "step": 2249 }, { "epoch": 1.816042376087779, "grad_norm": 0.059185806661844254, "learning_rate": 2.539510761757552e-06, "loss": 0.1017, "step": 2250 }, { "epoch": 1.8168495396645228, "grad_norm": 0.04860822856426239, "learning_rate": 2.5173747600937993e-06, "loss": 0.1031, "step": 2251 }, { "epoch": 1.8176567032412663, "grad_norm": 0.0525892935693264, "learning_rate": 2.4953331653921496e-06, "loss": 0.1035, "step": 2252 }, { "epoch": 1.8184638668180098, "grad_norm": 0.05452253669500351, "learning_rate": 2.4733860214766313e-06, "loss": 0.1043, "step": 2253 }, { "epoch": 1.8192710303947535, "grad_norm": 0.050698548555374146, "learning_rate": 2.4515333719835e-06, "loss": 0.1019, "step": 2254 }, { "epoch": 1.8200781939714972, "grad_norm": 0.057535648345947266, "learning_rate": 2.429775260361106e-06, "loss": 0.1047, "step": 2255 }, { "epoch": 1.8208853575482407, "grad_norm": 0.05610840395092964, "learning_rate": 2.408111729869844e-06, "loss": 0.0928, "step": 2256 }, { "epoch": 1.8216925211249841, "grad_norm": 0.059024106711149216, "learning_rate": 2.3865428235820776e-06, "loss": 0.1033, "step": 2257 }, { "epoch": 1.8224996847017279, "grad_norm": 0.05342676863074303, "learning_rate": 2.3650685843819907e-06, "loss": 0.0978, "step": 2258 }, { "epoch": 1.8233068482784716, "grad_norm": 0.05745955929160118, "learning_rate": 2.343689054965592e-06, "loss": 0.104, "step": 2259 }, { "epoch": 1.824114011855215, "grad_norm": 0.04817991703748703, "learning_rate": 2.3224042778405563e-06, "loss": 0.1099, "step": 2260 }, { "epoch": 1.8249211754319585, "grad_norm": 0.05289851501584053, "learning_rate": 2.3012142953261928e-06, "loss": 0.0919, "step": 2261 }, { "epoch": 1.8257283390087022, "grad_norm": 0.04999127238988876, "learning_rate": 2.2801191495533004e-06, "loss": 0.105, "step": 2262 }, { "epoch": 1.826535502585446, "grad_norm": 0.052538540214300156, "learning_rate": 2.2591188824641505e-06, "loss": 0.0996, "step": 2263 }, { "epoch": 1.8273426661621894, "grad_norm": 0.049098581075668335, "learning_rate": 2.2382135358123614e-06, "loss": 0.0966, "step": 2264 }, { "epoch": 1.828149829738933, "grad_norm": 0.05998978018760681, "learning_rate": 2.217403151162817e-06, "loss": 0.0985, "step": 2265 }, { "epoch": 1.8289569933156766, "grad_norm": 0.05416387319564819, "learning_rate": 2.19668776989162e-06, "loss": 0.0839, "step": 2266 }, { "epoch": 1.8297641568924203, "grad_norm": 0.05751457065343857, "learning_rate": 2.1760674331859522e-06, "loss": 0.1149, "step": 2267 }, { "epoch": 1.8305713204691638, "grad_norm": 0.05687270313501358, "learning_rate": 2.155542182044046e-06, "loss": 0.1098, "step": 2268 }, { "epoch": 1.8313784840459073, "grad_norm": 0.05647604912519455, "learning_rate": 2.1351120572750736e-06, "loss": 0.1103, "step": 2269 }, { "epoch": 1.832185647622651, "grad_norm": 0.053684622049331665, "learning_rate": 2.114777099499071e-06, "loss": 0.1009, "step": 2270 }, { "epoch": 1.8329928111993947, "grad_norm": 0.05863429978489876, "learning_rate": 2.0945373491468466e-06, "loss": 0.1011, "step": 2271 }, { "epoch": 1.8337999747761382, "grad_norm": 0.05199775472283363, "learning_rate": 2.074392846459955e-06, "loss": 0.1103, "step": 2272 }, { "epoch": 1.8346071383528817, "grad_norm": 0.05165868252515793, "learning_rate": 2.054343631490524e-06, "loss": 0.1027, "step": 2273 }, { "epoch": 1.8354143019296254, "grad_norm": 0.05146782845258713, "learning_rate": 2.034389744101267e-06, "loss": 0.106, "step": 2274 }, { "epoch": 1.836221465506369, "grad_norm": 0.05743264779448509, "learning_rate": 2.0145312239653323e-06, "loss": 0.1032, "step": 2275 }, { "epoch": 1.8370286290831126, "grad_norm": 0.05055628716945648, "learning_rate": 1.9947681105662806e-06, "loss": 0.0997, "step": 2276 }, { "epoch": 1.837835792659856, "grad_norm": 0.052286725491285324, "learning_rate": 1.975100443197958e-06, "loss": 0.1041, "step": 2277 }, { "epoch": 1.8386429562365998, "grad_norm": 0.05882134288549423, "learning_rate": 1.9555282609644565e-06, "loss": 0.1135, "step": 2278 }, { "epoch": 1.8394501198133435, "grad_norm": 0.04997078329324722, "learning_rate": 1.936051602780026e-06, "loss": 0.106, "step": 2279 }, { "epoch": 1.840257283390087, "grad_norm": 0.055530156940221786, "learning_rate": 1.9166705073689617e-06, "loss": 0.105, "step": 2280 }, { "epoch": 1.8410644469668305, "grad_norm": 0.05959452688694, "learning_rate": 1.8973850132655956e-06, "loss": 0.0989, "step": 2281 }, { "epoch": 1.8418716105435742, "grad_norm": 0.062332622706890106, "learning_rate": 1.87819515881415e-06, "loss": 0.0991, "step": 2282 }, { "epoch": 1.8426787741203179, "grad_norm": 0.05939004570245743, "learning_rate": 1.8591009821687045e-06, "loss": 0.1114, "step": 2283 }, { "epoch": 1.8434859376970614, "grad_norm": 0.065061055123806, "learning_rate": 1.8401025212931133e-06, "loss": 0.1119, "step": 2284 }, { "epoch": 1.8442931012738049, "grad_norm": 0.051620181649923325, "learning_rate": 1.8211998139609221e-06, "loss": 0.1052, "step": 2285 }, { "epoch": 1.8451002648505486, "grad_norm": 0.0563787966966629, "learning_rate": 1.8023928977552839e-06, "loss": 0.1121, "step": 2286 }, { "epoch": 1.8459074284272923, "grad_norm": 0.05358082056045532, "learning_rate": 1.7836818100689102e-06, "loss": 0.1037, "step": 2287 }, { "epoch": 1.846714592004036, "grad_norm": 0.059966087341308594, "learning_rate": 1.7650665881039697e-06, "loss": 0.1076, "step": 2288 }, { "epoch": 1.8475217555807795, "grad_norm": 0.05631956458091736, "learning_rate": 1.7465472688720398e-06, "loss": 0.1141, "step": 2289 }, { "epoch": 1.848328919157523, "grad_norm": 0.056420013308525085, "learning_rate": 1.728123889194011e-06, "loss": 0.1023, "step": 2290 }, { "epoch": 1.8491360827342667, "grad_norm": 0.061246734112501144, "learning_rate": 1.7097964857000327e-06, "loss": 0.0966, "step": 2291 }, { "epoch": 1.8499432463110104, "grad_norm": 0.058128464967012405, "learning_rate": 1.6915650948294115e-06, "loss": 0.1084, "step": 2292 }, { "epoch": 1.8507504098877539, "grad_norm": 0.053485989570617676, "learning_rate": 1.6734297528305686e-06, "loss": 0.0983, "step": 2293 }, { "epoch": 1.8515575734644973, "grad_norm": 0.054993174970149994, "learning_rate": 1.6553904957609778e-06, "loss": 0.1031, "step": 2294 }, { "epoch": 1.852364737041241, "grad_norm": 0.05583370104432106, "learning_rate": 1.6374473594870155e-06, "loss": 0.0984, "step": 2295 }, { "epoch": 1.8531719006179848, "grad_norm": 0.05189277604222298, "learning_rate": 1.619600379684022e-06, "loss": 0.1056, "step": 2296 }, { "epoch": 1.8539790641947282, "grad_norm": 0.0509740449488163, "learning_rate": 1.6018495918360965e-06, "loss": 0.1184, "step": 2297 }, { "epoch": 1.8547862277714717, "grad_norm": 0.05554984509944916, "learning_rate": 1.584195031236113e-06, "loss": 0.1076, "step": 2298 }, { "epoch": 1.8555933913482154, "grad_norm": 0.0544416718184948, "learning_rate": 1.5666367329856046e-06, "loss": 0.0979, "step": 2299 }, { "epoch": 1.8564005549249591, "grad_norm": 0.0503256730735302, "learning_rate": 1.549174731994729e-06, "loss": 0.1066, "step": 2300 }, { "epoch": 1.8572077185017026, "grad_norm": 0.05609061196446419, "learning_rate": 1.5318090629821757e-06, "loss": 0.1046, "step": 2301 }, { "epoch": 1.8580148820784461, "grad_norm": 0.05201556533575058, "learning_rate": 1.5145397604751032e-06, "loss": 0.1033, "step": 2302 }, { "epoch": 1.8588220456551898, "grad_norm": 0.050383590161800385, "learning_rate": 1.4973668588090572e-06, "loss": 0.0975, "step": 2303 }, { "epoch": 1.8596292092319335, "grad_norm": 0.05271477997303009, "learning_rate": 1.4802903921279476e-06, "loss": 0.103, "step": 2304 }, { "epoch": 1.8596292092319335, "eval_loss": 0.1133606880903244, "eval_runtime": 4035.9581, "eval_samples_per_second": 2.206, "eval_steps_per_second": 2.206, "step": 2304 }, { "epoch": 1.860436372808677, "grad_norm": 0.05869864299893379, "learning_rate": 1.4633103943839044e-06, "loss": 0.0995, "step": 2305 }, { "epoch": 1.8612435363854205, "grad_norm": 0.05507369339466095, "learning_rate": 1.4464268993372831e-06, "loss": 0.1086, "step": 2306 }, { "epoch": 1.8620506999621642, "grad_norm": 0.0547940656542778, "learning_rate": 1.4296399405565708e-06, "loss": 0.1009, "step": 2307 }, { "epoch": 1.862857863538908, "grad_norm": 0.05511258915066719, "learning_rate": 1.4129495514183023e-06, "loss": 0.1042, "step": 2308 }, { "epoch": 1.8636650271156514, "grad_norm": 0.053386714309453964, "learning_rate": 1.3963557651070159e-06, "loss": 0.1112, "step": 2309 }, { "epoch": 1.864472190692395, "grad_norm": 0.05036446452140808, "learning_rate": 1.379858614615176e-06, "loss": 0.1021, "step": 2310 }, { "epoch": 1.8652793542691386, "grad_norm": 0.050247881561517715, "learning_rate": 1.3634581327431229e-06, "loss": 0.0896, "step": 2311 }, { "epoch": 1.8660865178458823, "grad_norm": 0.05350091680884361, "learning_rate": 1.3471543520989726e-06, "loss": 0.107, "step": 2312 }, { "epoch": 1.8668936814226258, "grad_norm": 0.053893182426691055, "learning_rate": 1.3309473050986065e-06, "loss": 0.1137, "step": 2313 }, { "epoch": 1.8677008449993693, "grad_norm": 0.060737404972314835, "learning_rate": 1.3148370239655484e-06, "loss": 0.107, "step": 2314 }, { "epoch": 1.868508008576113, "grad_norm": 0.05786401033401489, "learning_rate": 1.298823540730948e-06, "loss": 0.1051, "step": 2315 }, { "epoch": 1.8693151721528567, "grad_norm": 0.05506041273474693, "learning_rate": 1.282906887233487e-06, "loss": 0.0985, "step": 2316 }, { "epoch": 1.8701223357296002, "grad_norm": 0.05153512954711914, "learning_rate": 1.2670870951193292e-06, "loss": 0.0961, "step": 2317 }, { "epoch": 1.8709294993063437, "grad_norm": 0.05423891171813011, "learning_rate": 1.2513641958420476e-06, "loss": 0.1071, "step": 2318 }, { "epoch": 1.8717366628830874, "grad_norm": 0.05118107795715332, "learning_rate": 1.2357382206625801e-06, "loss": 0.1085, "step": 2319 }, { "epoch": 1.872543826459831, "grad_norm": 0.058040544390678406, "learning_rate": 1.2202092006491528e-06, "loss": 0.1153, "step": 2320 }, { "epoch": 1.8733509900365746, "grad_norm": 0.05272466689348221, "learning_rate": 1.2047771666772122e-06, "loss": 0.1082, "step": 2321 }, { "epoch": 1.874158153613318, "grad_norm": 0.05692138150334358, "learning_rate": 1.1894421494293984e-06, "loss": 0.1104, "step": 2322 }, { "epoch": 1.8749653171900618, "grad_norm": 0.055956706404685974, "learning_rate": 1.174204179395416e-06, "loss": 0.1074, "step": 2323 }, { "epoch": 1.8757724807668055, "grad_norm": 0.04813195765018463, "learning_rate": 1.1590632868720641e-06, "loss": 0.0972, "step": 2324 }, { "epoch": 1.876579644343549, "grad_norm": 0.051958899945020676, "learning_rate": 1.1440195019630785e-06, "loss": 0.1131, "step": 2325 }, { "epoch": 1.8773868079202924, "grad_norm": 0.04973503574728966, "learning_rate": 1.1290728545791773e-06, "loss": 0.0995, "step": 2326 }, { "epoch": 1.8781939714970362, "grad_norm": 0.057134222239255905, "learning_rate": 1.114223374437895e-06, "loss": 0.1214, "step": 2327 }, { "epoch": 1.8790011350737799, "grad_norm": 0.05511888116598129, "learning_rate": 1.0994710910636087e-06, "loss": 0.1122, "step": 2328 }, { "epoch": 1.8798082986505233, "grad_norm": 0.057254187762737274, "learning_rate": 1.0848160337874224e-06, "loss": 0.1077, "step": 2329 }, { "epoch": 1.8806154622272668, "grad_norm": 0.05191498249769211, "learning_rate": 1.0702582317471455e-06, "loss": 0.1055, "step": 2330 }, { "epoch": 1.8814226258040105, "grad_norm": 0.05174419283866882, "learning_rate": 1.0557977138872132e-06, "loss": 0.0987, "step": 2331 }, { "epoch": 1.8822297893807542, "grad_norm": 0.051735904067754745, "learning_rate": 1.041434508958644e-06, "loss": 0.1025, "step": 2332 }, { "epoch": 1.883036952957498, "grad_norm": 0.05299517512321472, "learning_rate": 1.0271686455189554e-06, "loss": 0.0995, "step": 2333 }, { "epoch": 1.8838441165342414, "grad_norm": 0.055690132081508636, "learning_rate": 1.0130001519321586e-06, "loss": 0.1087, "step": 2334 }, { "epoch": 1.884651280110985, "grad_norm": 0.05386724695563316, "learning_rate": 9.989290563686305e-07, "loss": 0.0972, "step": 2335 }, { "epoch": 1.8854584436877286, "grad_norm": 0.05822715535759926, "learning_rate": 9.849553868051364e-07, "loss": 0.1043, "step": 2336 }, { "epoch": 1.8862656072644723, "grad_norm": 0.052286822348833084, "learning_rate": 9.710791710247025e-07, "loss": 0.0998, "step": 2337 }, { "epoch": 1.8870727708412158, "grad_norm": 0.05924304947257042, "learning_rate": 9.57300436616615e-07, "loss": 0.1231, "step": 2338 }, { "epoch": 1.8878799344179593, "grad_norm": 0.05502745509147644, "learning_rate": 9.436192109763376e-07, "loss": 0.0992, "step": 2339 }, { "epoch": 1.888687097994703, "grad_norm": 0.06130314990878105, "learning_rate": 9.300355213054556e-07, "loss": 0.1087, "step": 2340 }, { "epoch": 1.8894942615714467, "grad_norm": 0.05439957231283188, "learning_rate": 9.165493946116432e-07, "loss": 0.1049, "step": 2341 }, { "epoch": 1.8903014251481902, "grad_norm": 0.05813838914036751, "learning_rate": 9.031608577085737e-07, "loss": 0.1043, "step": 2342 }, { "epoch": 1.8911085887249337, "grad_norm": 0.05892227590084076, "learning_rate": 8.898699372159147e-07, "loss": 0.1018, "step": 2343 }, { "epoch": 1.8919157523016774, "grad_norm": 0.0522124357521534, "learning_rate": 8.766766595592279e-07, "loss": 0.1043, "step": 2344 }, { "epoch": 1.8927229158784211, "grad_norm": 0.049786198884248734, "learning_rate": 8.635810509699582e-07, "loss": 0.0965, "step": 2345 }, { "epoch": 1.8935300794551646, "grad_norm": 0.05828925967216492, "learning_rate": 8.505831374853279e-07, "loss": 0.1129, "step": 2346 }, { "epoch": 1.894337243031908, "grad_norm": 0.05888107419013977, "learning_rate": 8.376829449483537e-07, "loss": 0.1071, "step": 2347 }, { "epoch": 1.8951444066086518, "grad_norm": 0.05682583153247833, "learning_rate": 8.248804990077407e-07, "loss": 0.1092, "step": 2348 }, { "epoch": 1.8959515701853955, "grad_norm": 0.05380990728735924, "learning_rate": 8.121758251178391e-07, "loss": 0.0993, "step": 2349 }, { "epoch": 1.896758733762139, "grad_norm": 0.05939152091741562, "learning_rate": 7.995689485386426e-07, "loss": 0.1123, "step": 2350 }, { "epoch": 1.8975658973388825, "grad_norm": 0.058953169733285904, "learning_rate": 7.870598943356622e-07, "loss": 0.1018, "step": 2351 }, { "epoch": 1.8983730609156262, "grad_norm": 0.05629204213619232, "learning_rate": 7.746486873799253e-07, "loss": 0.1051, "step": 2352 }, { "epoch": 1.89918022449237, "grad_norm": 0.056679412722587585, "learning_rate": 7.62335352347926e-07, "loss": 0.0988, "step": 2353 }, { "epoch": 1.8999873880691134, "grad_norm": 0.05464073643088341, "learning_rate": 7.501199137215475e-07, "loss": 0.0877, "step": 2354 }, { "epoch": 1.9007945516458569, "grad_norm": 0.054273270070552826, "learning_rate": 7.38002395788051e-07, "loss": 0.0955, "step": 2355 }, { "epoch": 1.9016017152226006, "grad_norm": 0.052456121891736984, "learning_rate": 7.259828226399978e-07, "loss": 0.1048, "step": 2356 }, { "epoch": 1.9024088787993443, "grad_norm": 0.05419051647186279, "learning_rate": 7.140612181752048e-07, "loss": 0.0911, "step": 2357 }, { "epoch": 1.9032160423760878, "grad_norm": 0.051975369453430176, "learning_rate": 7.022376060967118e-07, "loss": 0.1108, "step": 2358 }, { "epoch": 1.9040232059528313, "grad_norm": 0.05294034257531166, "learning_rate": 6.90512009912725e-07, "loss": 0.1055, "step": 2359 }, { "epoch": 1.904830369529575, "grad_norm": 0.05298817530274391, "learning_rate": 6.788844529365623e-07, "loss": 0.1042, "step": 2360 }, { "epoch": 1.9056375331063187, "grad_norm": 0.055983733385801315, "learning_rate": 6.673549582866367e-07, "loss": 0.0997, "step": 2361 }, { "epoch": 1.9064446966830622, "grad_norm": 0.052642546594142914, "learning_rate": 6.55923548886378e-07, "loss": 0.1129, "step": 2362 }, { "epoch": 1.9072518602598056, "grad_norm": 0.054758548736572266, "learning_rate": 6.44590247464183e-07, "loss": 0.1117, "step": 2363 }, { "epoch": 1.9080590238365494, "grad_norm": 0.05365023761987686, "learning_rate": 6.333550765534102e-07, "loss": 0.1059, "step": 2364 }, { "epoch": 1.908866187413293, "grad_norm": 0.05435223504900932, "learning_rate": 6.222180584923021e-07, "loss": 0.1085, "step": 2365 }, { "epoch": 1.9096733509900365, "grad_norm": 0.056373197585344315, "learning_rate": 6.111792154239404e-07, "loss": 0.1024, "step": 2366 }, { "epoch": 1.91048051456678, "grad_norm": 0.0549466498196125, "learning_rate": 6.002385692962243e-07, "loss": 0.0993, "step": 2367 }, { "epoch": 1.9112876781435237, "grad_norm": 0.057139717042446136, "learning_rate": 5.893961418618143e-07, "loss": 0.1031, "step": 2368 }, { "epoch": 1.9120948417202674, "grad_norm": 0.050729408860206604, "learning_rate": 5.786519546780778e-07, "loss": 0.1028, "step": 2369 }, { "epoch": 1.912902005297011, "grad_norm": 0.052295979112386703, "learning_rate": 5.680060291070599e-07, "loss": 0.091, "step": 2370 }, { "epoch": 1.9137091688737544, "grad_norm": 0.05668051168322563, "learning_rate": 5.574583863154403e-07, "loss": 0.1083, "step": 2371 }, { "epoch": 1.9145163324504981, "grad_norm": 0.05355283245444298, "learning_rate": 5.470090472744937e-07, "loss": 0.1103, "step": 2372 }, { "epoch": 1.9153234960272418, "grad_norm": 0.05786500871181488, "learning_rate": 5.36658032760029e-07, "loss": 0.1139, "step": 2373 }, { "epoch": 1.9161306596039853, "grad_norm": 0.05370327830314636, "learning_rate": 5.264053633523724e-07, "loss": 0.1034, "step": 2374 }, { "epoch": 1.9169378231807288, "grad_norm": 0.061825767159461975, "learning_rate": 5.162510594363235e-07, "loss": 0.1188, "step": 2375 }, { "epoch": 1.9177449867574725, "grad_norm": 0.057944025844335556, "learning_rate": 5.061951412010879e-07, "loss": 0.1092, "step": 2376 }, { "epoch": 1.9185521503342162, "grad_norm": 0.05096985399723053, "learning_rate": 4.962376286402782e-07, "loss": 0.0986, "step": 2377 }, { "epoch": 1.91935931391096, "grad_norm": 0.052559446543455124, "learning_rate": 4.863785415518296e-07, "loss": 0.0988, "step": 2378 }, { "epoch": 1.9201664774877034, "grad_norm": 0.05140148103237152, "learning_rate": 4.766178995379955e-07, "loss": 0.1062, "step": 2379 }, { "epoch": 1.920973641064447, "grad_norm": 0.05909739062190056, "learning_rate": 4.6695572200531337e-07, "loss": 0.0997, "step": 2380 }, { "epoch": 1.9217808046411906, "grad_norm": 0.05428718030452728, "learning_rate": 4.573920281645161e-07, "loss": 0.1035, "step": 2381 }, { "epoch": 1.9225879682179343, "grad_norm": 0.05871422961354256, "learning_rate": 4.4792683703054896e-07, "loss": 0.1052, "step": 2382 }, { "epoch": 1.9233951317946778, "grad_norm": 0.05472823232412338, "learning_rate": 4.3856016742250814e-07, "loss": 0.1005, "step": 2383 }, { "epoch": 1.9242022953714213, "grad_norm": 0.051089297980070114, "learning_rate": 4.2929203796359673e-07, "loss": 0.1036, "step": 2384 }, { "epoch": 1.925009458948165, "grad_norm": 0.05744870379567146, "learning_rate": 4.201224670811077e-07, "loss": 0.0993, "step": 2385 }, { "epoch": 1.9258166225249087, "grad_norm": 0.05773751065135002, "learning_rate": 4.1105147300636303e-07, "loss": 0.0995, "step": 2386 }, { "epoch": 1.9266237861016522, "grad_norm": 0.05376359820365906, "learning_rate": 4.020790737746971e-07, "loss": 0.1064, "step": 2387 }, { "epoch": 1.9274309496783957, "grad_norm": 0.05906595289707184, "learning_rate": 3.932052872254233e-07, "loss": 0.113, "step": 2388 }, { "epoch": 1.9282381132551394, "grad_norm": 0.0536089725792408, "learning_rate": 3.844301310017673e-07, "loss": 0.1029, "step": 2389 }, { "epoch": 1.929045276831883, "grad_norm": 0.05490358546376228, "learning_rate": 3.757536225508673e-07, "loss": 0.105, "step": 2390 }, { "epoch": 1.9298524404086266, "grad_norm": 0.05410759896039963, "learning_rate": 3.67175779123724e-07, "loss": 0.0985, "step": 2391 }, { "epoch": 1.93065960398537, "grad_norm": 0.04968889802694321, "learning_rate": 3.586966177751727e-07, "loss": 0.1026, "step": 2392 }, { "epoch": 1.9314667675621138, "grad_norm": 0.052902400493621826, "learning_rate": 3.5031615536384454e-07, "loss": 0.1058, "step": 2393 }, { "epoch": 1.9322739311388575, "grad_norm": 0.05356273427605629, "learning_rate": 3.4203440855211656e-07, "loss": 0.1053, "step": 2394 }, { "epoch": 1.933081094715601, "grad_norm": 0.048135895282030106, "learning_rate": 3.338513938061172e-07, "loss": 0.106, "step": 2395 }, { "epoch": 1.9338882582923445, "grad_norm": 0.055870212614536285, "learning_rate": 3.2576712739565416e-07, "loss": 0.1039, "step": 2396 }, { "epoch": 1.9346954218690882, "grad_norm": 0.05684385076165199, "learning_rate": 3.1778162539421453e-07, "loss": 0.1091, "step": 2397 }, { "epoch": 1.9355025854458319, "grad_norm": 0.05628974735736847, "learning_rate": 3.098949036789034e-07, "loss": 0.1003, "step": 2398 }, { "epoch": 1.9363097490225754, "grad_norm": 0.05405397713184357, "learning_rate": 3.0210697793044975e-07, "loss": 0.1025, "step": 2399 }, { "epoch": 1.9371169125993188, "grad_norm": 0.05219145491719246, "learning_rate": 2.944178636331174e-07, "loss": 0.1092, "step": 2400 }, { "epoch": 1.9379240761760625, "grad_norm": 0.04952447488903999, "learning_rate": 2.8682757607474407e-07, "loss": 0.0987, "step": 2401 }, { "epoch": 1.9387312397528063, "grad_norm": 0.05636390671133995, "learning_rate": 2.7933613034664686e-07, "loss": 0.1119, "step": 2402 }, { "epoch": 1.9395384033295497, "grad_norm": 0.0466810017824173, "learning_rate": 2.7194354134363885e-07, "loss": 0.105, "step": 2403 }, { "epoch": 1.9403455669062932, "grad_norm": 0.05365828797221184, "learning_rate": 2.6464982376398494e-07, "loss": 0.1021, "step": 2404 }, { "epoch": 1.941152730483037, "grad_norm": 0.05150432139635086, "learning_rate": 2.574549921093627e-07, "loss": 0.1039, "step": 2405 }, { "epoch": 1.9419598940597806, "grad_norm": 0.053471650928258896, "learning_rate": 2.5035906068482937e-07, "loss": 0.099, "step": 2406 }, { "epoch": 1.9427670576365241, "grad_norm": 0.04858791083097458, "learning_rate": 2.433620435988215e-07, "loss": 0.1001, "step": 2407 }, { "epoch": 1.9435742212132676, "grad_norm": 0.05246763676404953, "learning_rate": 2.3646395476310533e-07, "loss": 0.1011, "step": 2408 }, { "epoch": 1.9443813847900113, "grad_norm": 0.05339714139699936, "learning_rate": 2.2966480789275436e-07, "loss": 0.1038, "step": 2409 }, { "epoch": 1.945188548366755, "grad_norm": 0.05488397181034088, "learning_rate": 2.229646165061161e-07, "loss": 0.1095, "step": 2410 }, { "epoch": 1.9459957119434985, "grad_norm": 0.05164272338151932, "learning_rate": 2.1636339392479553e-07, "loss": 0.1008, "step": 2411 }, { "epoch": 1.946802875520242, "grad_norm": 0.05595352500677109, "learning_rate": 2.0986115327362166e-07, "loss": 0.0982, "step": 2412 }, { "epoch": 1.9476100390969857, "grad_norm": 0.054809074848890305, "learning_rate": 2.0345790748062532e-07, "loss": 0.0989, "step": 2413 }, { "epoch": 1.9484172026737294, "grad_norm": 0.0583934411406517, "learning_rate": 1.9715366927701152e-07, "loss": 0.1032, "step": 2414 }, { "epoch": 1.949224366250473, "grad_norm": 0.05368480086326599, "learning_rate": 1.90948451197126e-07, "loss": 0.1048, "step": 2415 }, { "epoch": 1.9500315298272164, "grad_norm": 0.05365470051765442, "learning_rate": 1.848422655784554e-07, "loss": 0.1059, "step": 2416 }, { "epoch": 1.95083869340396, "grad_norm": 0.05489076301455498, "learning_rate": 1.788351245615716e-07, "loss": 0.1081, "step": 2417 }, { "epoch": 1.9516458569807038, "grad_norm": 0.05638003349304199, "learning_rate": 1.7292704009012617e-07, "loss": 0.0953, "step": 2418 }, { "epoch": 1.9524530205574473, "grad_norm": 0.05091138556599617, "learning_rate": 1.671180239108172e-07, "loss": 0.1031, "step": 2419 }, { "epoch": 1.9532601841341908, "grad_norm": 0.04912186414003372, "learning_rate": 1.6140808757338922e-07, "loss": 0.1116, "step": 2420 }, { "epoch": 1.9540673477109345, "grad_norm": 0.055606454610824585, "learning_rate": 1.557972424305665e-07, "loss": 0.0988, "step": 2421 }, { "epoch": 1.9548745112876782, "grad_norm": 0.05227714031934738, "learning_rate": 1.5028549963806982e-07, "loss": 0.1027, "step": 2422 }, { "epoch": 1.955681674864422, "grad_norm": 0.05962540954351425, "learning_rate": 1.4487287015458874e-07, "loss": 0.11, "step": 2423 }, { "epoch": 1.9564888384411654, "grad_norm": 0.0601976104080677, "learning_rate": 1.395593647417315e-07, "loss": 0.106, "step": 2424 }, { "epoch": 1.9572960020179089, "grad_norm": 0.059958428144454956, "learning_rate": 1.3434499396404176e-07, "loss": 0.1147, "step": 2425 }, { "epoch": 1.9581031655946526, "grad_norm": 0.05322643741965294, "learning_rate": 1.2922976818894317e-07, "loss": 0.1069, "step": 2426 }, { "epoch": 1.9589103291713963, "grad_norm": 0.056679584085941315, "learning_rate": 1.2421369758675027e-07, "loss": 0.105, "step": 2427 }, { "epoch": 1.9597174927481398, "grad_norm": 0.05454505980014801, "learning_rate": 1.1929679213062429e-07, "loss": 0.1055, "step": 2428 }, { "epoch": 1.9605246563248833, "grad_norm": 0.053598515689373016, "learning_rate": 1.1447906159656741e-07, "loss": 0.1032, "step": 2429 }, { "epoch": 1.961331819901627, "grad_norm": 0.05258147045969963, "learning_rate": 1.0976051556339518e-07, "loss": 0.0993, "step": 2430 }, { "epoch": 1.9621389834783707, "grad_norm": 0.05337973311543465, "learning_rate": 1.0514116341271418e-07, "loss": 0.1132, "step": 2431 }, { "epoch": 1.9629461470551142, "grad_norm": 0.05588565394282341, "learning_rate": 1.0062101432892212e-07, "loss": 0.0979, "step": 2432 }, { "epoch": 1.9629461470551142, "eval_loss": 0.11350435763597488, "eval_runtime": 4038.0344, "eval_samples_per_second": 2.205, "eval_steps_per_second": 2.205, "step": 2432 }, { "epoch": 1.9637533106318577, "grad_norm": 0.05099209025502205, "learning_rate": 9.620007729916336e-08, "loss": 0.1062, "step": 2433 }, { "epoch": 1.9645604742086014, "grad_norm": 0.05412807688117027, "learning_rate": 9.187836111334003e-08, "loss": 0.1159, "step": 2434 }, { "epoch": 1.965367637785345, "grad_norm": 0.04912622645497322, "learning_rate": 8.765587436406763e-08, "loss": 0.1065, "step": 2435 }, { "epoch": 1.9661748013620886, "grad_norm": 0.05380428954958916, "learning_rate": 8.353262544666951e-08, "loss": 0.1046, "step": 2436 }, { "epoch": 1.966981964938832, "grad_norm": 0.057944659143686295, "learning_rate": 7.95086225591657e-08, "loss": 0.1066, "step": 2437 }, { "epoch": 1.9677891285155757, "grad_norm": 0.05862908437848091, "learning_rate": 7.558387370225073e-08, "loss": 0.1047, "step": 2438 }, { "epoch": 1.9685962920923195, "grad_norm": 0.054054997861385345, "learning_rate": 7.175838667927148e-08, "loss": 0.1012, "step": 2439 }, { "epoch": 1.969403455669063, "grad_norm": 0.060232579708099365, "learning_rate": 6.803216909623267e-08, "loss": 0.1058, "step": 2440 }, { "epoch": 1.9702106192458064, "grad_norm": 0.05727238208055496, "learning_rate": 6.440522836174135e-08, "loss": 0.1038, "step": 2441 }, { "epoch": 1.9710177828225501, "grad_norm": 0.06608133763074875, "learning_rate": 6.087757168705132e-08, "loss": 0.1073, "step": 2442 }, { "epoch": 1.9718249463992938, "grad_norm": 0.04964543506503105, "learning_rate": 5.744920608598547e-08, "loss": 0.0909, "step": 2443 }, { "epoch": 1.9726321099760373, "grad_norm": 0.056635163724422455, "learning_rate": 5.412013837497454e-08, "loss": 0.0981, "step": 2444 }, { "epoch": 1.9734392735527808, "grad_norm": 0.05173860117793083, "learning_rate": 5.089037517300721e-08, "loss": 0.1065, "step": 2445 }, { "epoch": 1.9742464371295245, "grad_norm": 0.051431745290756226, "learning_rate": 4.775992290163567e-08, "loss": 0.112, "step": 2446 }, { "epoch": 1.9750536007062682, "grad_norm": 0.052781738340854645, "learning_rate": 4.472878778495892e-08, "loss": 0.112, "step": 2447 }, { "epoch": 1.9758607642830117, "grad_norm": 0.047435224056243896, "learning_rate": 4.1796975849606176e-08, "loss": 0.102, "step": 2448 }, { "epoch": 1.9766679278597552, "grad_norm": 0.05535315349698067, "learning_rate": 3.8964492924731255e-08, "loss": 0.119, "step": 2449 }, { "epoch": 1.977475091436499, "grad_norm": 0.05024454742670059, "learning_rate": 3.6231344641990404e-08, "loss": 0.0996, "step": 2450 }, { "epoch": 1.9782822550132426, "grad_norm": 0.05654919892549515, "learning_rate": 3.359753643555341e-08, "loss": 0.101, "step": 2451 }, { "epoch": 1.979089418589986, "grad_norm": 0.06143656000494957, "learning_rate": 3.106307354206472e-08, "loss": 0.1076, "step": 2452 }, { "epoch": 1.9798965821667296, "grad_norm": 0.05115377902984619, "learning_rate": 2.862796100065457e-08, "loss": 0.0974, "step": 2453 }, { "epoch": 1.9807037457434733, "grad_norm": 0.056528013199567795, "learning_rate": 2.6292203652905635e-08, "loss": 0.102, "step": 2454 }, { "epoch": 1.981510909320217, "grad_norm": 0.049975406378507614, "learning_rate": 2.405580614288083e-08, "loss": 0.0938, "step": 2455 }, { "epoch": 1.9823180728969605, "grad_norm": 0.050315845757722855, "learning_rate": 2.191877291707889e-08, "loss": 0.0932, "step": 2456 }, { "epoch": 1.983125236473704, "grad_norm": 0.05693768337368965, "learning_rate": 1.9881108224434342e-08, "loss": 0.1017, "step": 2457 }, { "epoch": 1.9839324000504477, "grad_norm": 0.04970702528953552, "learning_rate": 1.794281611631754e-08, "loss": 0.1009, "step": 2458 }, { "epoch": 1.9847395636271914, "grad_norm": 0.0602552592754364, "learning_rate": 1.6103900446534648e-08, "loss": 0.1041, "step": 2459 }, { "epoch": 1.9855467272039349, "grad_norm": 0.0588485486805439, "learning_rate": 1.436436487127768e-08, "loss": 0.1081, "step": 2460 }, { "epoch": 1.9863538907806784, "grad_norm": 0.05388660728931427, "learning_rate": 1.2724212849180017e-08, "loss": 0.1037, "step": 2461 }, { "epoch": 1.987161054357422, "grad_norm": 0.05426393449306488, "learning_rate": 1.1183447641249789e-08, "loss": 0.1043, "step": 2462 }, { "epoch": 1.9879682179341658, "grad_norm": 0.05377504229545593, "learning_rate": 9.742072310908734e-09, "loss": 0.1071, "step": 2463 }, { "epoch": 1.9887753815109093, "grad_norm": 0.05249738693237305, "learning_rate": 8.400089723964444e-09, "loss": 0.1096, "step": 2464 }, { "epoch": 1.9895825450876528, "grad_norm": 0.05633946508169174, "learning_rate": 7.157502548588157e-09, "loss": 0.1001, "step": 2465 }, { "epoch": 1.9903897086643965, "grad_norm": 0.056451957672834396, "learning_rate": 6.0143132553591716e-09, "loss": 0.1142, "step": 2466 }, { "epoch": 1.9911968722411402, "grad_norm": 0.057267967611551285, "learning_rate": 4.97052411720933e-09, "loss": 0.1028, "step": 2467 }, { "epoch": 1.9920040358178839, "grad_norm": 0.05662749335169792, "learning_rate": 4.026137209439673e-09, "loss": 0.1005, "step": 2468 }, { "epoch": 1.9928111993946274, "grad_norm": 0.06071709468960762, "learning_rate": 3.1811544097259947e-09, "loss": 0.0991, "step": 2469 }, { "epoch": 1.9936183629713708, "grad_norm": 0.057182807475328445, "learning_rate": 2.4355773981021846e-09, "loss": 0.1012, "step": 2470 }, { "epoch": 1.9944255265481146, "grad_norm": 0.05665000155568123, "learning_rate": 1.7894076569435757e-09, "loss": 0.1104, "step": 2471 }, { "epoch": 1.9952326901248583, "grad_norm": 0.05336397513747215, "learning_rate": 1.2426464710058039e-09, "loss": 0.1141, "step": 2472 }, { "epoch": 1.9960398537016018, "grad_norm": 0.058378174901008606, "learning_rate": 7.952949273748455e-10, "loss": 0.1051, "step": 2473 }, { "epoch": 1.9968470172783452, "grad_norm": 0.059408750385046005, "learning_rate": 4.473539154892237e-10, "loss": 0.1139, "step": 2474 }, { "epoch": 1.997654180855089, "grad_norm": 0.05929576978087425, "learning_rate": 1.9882412715110933e-10, "loss": 0.1064, "step": 2475 }, { "epoch": 1.9984613444318327, "grad_norm": 0.05685753747820854, "learning_rate": 4.970605649301519e-11, "loss": 0.1038, "step": 2476 }, { "epoch": 1.9984613444318327, "step": 2476, "total_flos": 7.982224121073802e+18, "train_loss": 0.10261009003431423, "train_runtime": 248124.4211, "train_samples_per_second": 1.278, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 2476, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 128, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.982224121073802e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }