{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 97, "global_step": 772, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025906735751295338, "grad_norm": 758.2562349755826, "learning_rate": 0.0, "loss": 1.3719, "step": 1 }, { "epoch": 0.0025906735751295338, "eval_loss": 1.3159157037734985, "eval_runtime": 36.907, "eval_samples_per_second": 20.159, "eval_steps_per_second": 1.273, "step": 1 }, { "epoch": 0.0051813471502590676, "grad_norm": 666.308184823038, "learning_rate": 1.0000000000000002e-06, "loss": 1.36, "step": 2 }, { "epoch": 0.007772020725388601, "grad_norm": 211.0771195353068, "learning_rate": 2.0000000000000003e-06, "loss": 1.3746, "step": 3 }, { "epoch": 0.010362694300518135, "grad_norm": 431.5114709683218, "learning_rate": 3e-06, "loss": 1.3412, "step": 4 }, { "epoch": 0.012953367875647668, "grad_norm": 230.87468433791625, "learning_rate": 4.000000000000001e-06, "loss": 1.3837, "step": 5 }, { "epoch": 0.015544041450777202, "grad_norm": 635.1636587738542, "learning_rate": 5e-06, "loss": 1.3761, "step": 6 }, { "epoch": 0.018134715025906734, "grad_norm": 791.5536958334704, "learning_rate": 6e-06, "loss": 1.2855, "step": 7 }, { "epoch": 0.02072538860103627, "grad_norm": 667.7197994216477, "learning_rate": 7e-06, "loss": 1.3267, "step": 8 }, { "epoch": 0.023316062176165803, "grad_norm": 254.3855973692125, "learning_rate": 8.000000000000001e-06, "loss": 1.2977, "step": 9 }, { "epoch": 0.025906735751295335, "grad_norm": 162.29347257682093, "learning_rate": 9e-06, "loss": 1.3522, "step": 10 }, { "epoch": 0.02849740932642487, "grad_norm": 352.6352930651456, "learning_rate": 1e-05, "loss": 1.2688, "step": 11 }, { "epoch": 0.031088082901554404, "grad_norm": 148.2629265526552, "learning_rate": 1.1000000000000001e-05, "loss": 1.3342, "step": 12 }, { "epoch": 0.03367875647668394, "grad_norm": 249.88753789723657, "learning_rate": 1.2e-05, "loss": 1.2983, "step": 13 }, { "epoch": 0.03626943005181347, "grad_norm": 184.03358422636597, "learning_rate": 1.3000000000000001e-05, "loss": 1.3291, "step": 14 }, { "epoch": 0.038860103626943004, "grad_norm": 198.4491469860763, "learning_rate": 1.4e-05, "loss": 1.4014, "step": 15 }, { "epoch": 0.04145077720207254, "grad_norm": 680.9537058769038, "learning_rate": 1.5000000000000002e-05, "loss": 1.3775, "step": 16 }, { "epoch": 0.04404145077720207, "grad_norm": 563.0247638614801, "learning_rate": 1.6000000000000003e-05, "loss": 1.3228, "step": 17 }, { "epoch": 0.046632124352331605, "grad_norm": 271.985463813746, "learning_rate": 1.7e-05, "loss": 1.3695, "step": 18 }, { "epoch": 0.04922279792746114, "grad_norm": 399.51218452223316, "learning_rate": 1.8e-05, "loss": 1.2556, "step": 19 }, { "epoch": 0.05181347150259067, "grad_norm": 160.70697055826656, "learning_rate": 1.9e-05, "loss": 1.2982, "step": 20 }, { "epoch": 0.054404145077720206, "grad_norm": 227.8927504687491, "learning_rate": 2e-05, "loss": 1.3532, "step": 21 }, { "epoch": 0.05699481865284974, "grad_norm": 550.1538868076032, "learning_rate": 2.1000000000000002e-05, "loss": 1.2603, "step": 22 }, { "epoch": 0.05958549222797927, "grad_norm": 291.8994359919024, "learning_rate": 2.2000000000000003e-05, "loss": 1.3663, "step": 23 }, { "epoch": 0.06217616580310881, "grad_norm": 120.60677833129643, "learning_rate": 2.3e-05, "loss": 1.3129, "step": 24 }, { "epoch": 0.06476683937823834, "grad_norm": 414.4006662101242, "learning_rate": 2.4e-05, "loss": 1.3037, "step": 25 }, { "epoch": 0.06735751295336788, "grad_norm": 141.48324465317884, "learning_rate": 2.5e-05, "loss": 1.3095, "step": 26 }, { "epoch": 0.06994818652849741, "grad_norm": 147.86066819937994, "learning_rate": 2.6000000000000002e-05, "loss": 1.2372, "step": 27 }, { "epoch": 0.07253886010362694, "grad_norm": 214.47337614964576, "learning_rate": 2.7000000000000002e-05, "loss": 1.3384, "step": 28 }, { "epoch": 0.07512953367875648, "grad_norm": 898.4324889241673, "learning_rate": 2.8e-05, "loss": 1.2003, "step": 29 }, { "epoch": 0.07772020725388601, "grad_norm": 128.83026557596128, "learning_rate": 2.9e-05, "loss": 1.2172, "step": 30 }, { "epoch": 0.08031088082901554, "grad_norm": 183.0777862405529, "learning_rate": 3.0000000000000004e-05, "loss": 1.2674, "step": 31 }, { "epoch": 0.08290155440414508, "grad_norm": 119.01841833358732, "learning_rate": 3.1e-05, "loss": 1.2554, "step": 32 }, { "epoch": 0.08549222797927461, "grad_norm": 117.65980267542858, "learning_rate": 3.2000000000000005e-05, "loss": 1.2716, "step": 33 }, { "epoch": 0.08808290155440414, "grad_norm": 82.40151099433953, "learning_rate": 3.3e-05, "loss": 1.2019, "step": 34 }, { "epoch": 0.09067357512953368, "grad_norm": 82.61816783653785, "learning_rate": 3.4e-05, "loss": 1.2424, "step": 35 }, { "epoch": 0.09326424870466321, "grad_norm": 136.42743433868276, "learning_rate": 3.5000000000000004e-05, "loss": 1.2066, "step": 36 }, { "epoch": 0.09585492227979274, "grad_norm": 36.775911657584444, "learning_rate": 3.6e-05, "loss": 1.2485, "step": 37 }, { "epoch": 0.09844559585492228, "grad_norm": 56.55022603284064, "learning_rate": 3.7000000000000005e-05, "loss": 1.2112, "step": 38 }, { "epoch": 0.10103626943005181, "grad_norm": 50.09896932886107, "learning_rate": 3.8e-05, "loss": 1.2027, "step": 39 }, { "epoch": 0.10362694300518134, "grad_norm": 54.2661481198025, "learning_rate": 3.9e-05, "loss": 1.2673, "step": 40 }, { "epoch": 0.10621761658031088, "grad_norm": 60.04145981731815, "learning_rate": 4e-05, "loss": 1.1648, "step": 41 }, { "epoch": 0.10880829015544041, "grad_norm": 169.47741055545822, "learning_rate": 3.999981580539036e-05, "loss": 1.2393, "step": 42 }, { "epoch": 0.11139896373056994, "grad_norm": 43.64716987307323, "learning_rate": 3.9999263224954204e-05, "loss": 1.2906, "step": 43 }, { "epoch": 0.11398963730569948, "grad_norm": 51.3206609767585, "learning_rate": 3.999834226886976e-05, "loss": 1.1807, "step": 44 }, { "epoch": 0.11658031088082901, "grad_norm": 38.95055887413869, "learning_rate": 3.999705295410054e-05, "loss": 1.1825, "step": 45 }, { "epoch": 0.11917098445595854, "grad_norm": 40.59968974426338, "learning_rate": 3.999539530439504e-05, "loss": 1.193, "step": 46 }, { "epoch": 0.12176165803108809, "grad_norm": 34.5796571445333, "learning_rate": 3.9993369350286265e-05, "loss": 1.2127, "step": 47 }, { "epoch": 0.12435233160621761, "grad_norm": 37.97693356149241, "learning_rate": 3.99909751290912e-05, "loss": 1.1543, "step": 48 }, { "epoch": 0.12694300518134716, "grad_norm": 82.9217015858092, "learning_rate": 3.9988212684910107e-05, "loss": 1.2329, "step": 49 }, { "epoch": 0.12953367875647667, "grad_norm": 49.256542144400214, "learning_rate": 3.9985082068625724e-05, "loss": 1.212, "step": 50 }, { "epoch": 0.13212435233160622, "grad_norm": 45.025980435259484, "learning_rate": 3.998158333790231e-05, "loss": 1.2129, "step": 51 }, { "epoch": 0.13471502590673576, "grad_norm": 45.98465689592428, "learning_rate": 3.99777165571846e-05, "loss": 1.1709, "step": 52 }, { "epoch": 0.13730569948186527, "grad_norm": 43.481241408477906, "learning_rate": 3.997348179769661e-05, "loss": 1.1614, "step": 53 }, { "epoch": 0.13989637305699482, "grad_norm": 82.17633750834132, "learning_rate": 3.996887913744033e-05, "loss": 1.2205, "step": 54 }, { "epoch": 0.14248704663212436, "grad_norm": 53.0176514970764, "learning_rate": 3.9963908661194285e-05, "loss": 1.1204, "step": 55 }, { "epoch": 0.14507772020725387, "grad_norm": 67.86382426995611, "learning_rate": 3.995857046051196e-05, "loss": 1.1839, "step": 56 }, { "epoch": 0.14766839378238342, "grad_norm": 31.282407703790597, "learning_rate": 3.995286463372013e-05, "loss": 1.2126, "step": 57 }, { "epoch": 0.15025906735751296, "grad_norm": 52.200764429265604, "learning_rate": 3.994679128591706e-05, "loss": 1.2036, "step": 58 }, { "epoch": 0.15284974093264247, "grad_norm": 60.706608653531895, "learning_rate": 3.9940350528970535e-05, "loss": 1.1848, "step": 59 }, { "epoch": 0.15544041450777202, "grad_norm": 47.31754062899529, "learning_rate": 3.993354248151583e-05, "loss": 1.0869, "step": 60 }, { "epoch": 0.15803108808290156, "grad_norm": 49.42450836392811, "learning_rate": 3.9926367268953514e-05, "loss": 1.2651, "step": 61 }, { "epoch": 0.16062176165803108, "grad_norm": 38.791167030088886, "learning_rate": 3.991882502344712e-05, "loss": 1.1881, "step": 62 }, { "epoch": 0.16321243523316062, "grad_norm": 56.16339499737216, "learning_rate": 3.991091588392077e-05, "loss": 1.1518, "step": 63 }, { "epoch": 0.16580310880829016, "grad_norm": 861.8559063020828, "learning_rate": 3.990263999605652e-05, "loss": 1.1614, "step": 64 }, { "epoch": 0.16839378238341968, "grad_norm": 50.92822786500888, "learning_rate": 3.989399751229179e-05, "loss": 1.1998, "step": 65 }, { "epoch": 0.17098445595854922, "grad_norm": 31.04121324055666, "learning_rate": 3.988498859181645e-05, "loss": 1.1795, "step": 66 }, { "epoch": 0.17357512953367876, "grad_norm": 50.33061983380845, "learning_rate": 3.9875613400569975e-05, "loss": 1.1742, "step": 67 }, { "epoch": 0.17616580310880828, "grad_norm": 75.20462514003519, "learning_rate": 3.986587211123833e-05, "loss": 1.1856, "step": 68 }, { "epoch": 0.17875647668393782, "grad_norm": 38.82139317052205, "learning_rate": 3.98557649032508e-05, "loss": 1.1529, "step": 69 }, { "epoch": 0.18134715025906736, "grad_norm": 36.55988806615175, "learning_rate": 3.984529196277674e-05, "loss": 1.1884, "step": 70 }, { "epoch": 0.18393782383419688, "grad_norm": 104.8931793971097, "learning_rate": 3.983445348272203e-05, "loss": 1.2182, "step": 71 }, { "epoch": 0.18652849740932642, "grad_norm": 36.50395409234617, "learning_rate": 3.982324966272566e-05, "loss": 1.1609, "step": 72 }, { "epoch": 0.18911917098445596, "grad_norm": 35.019191693448626, "learning_rate": 3.981168070915594e-05, "loss": 1.173, "step": 73 }, { "epoch": 0.19170984455958548, "grad_norm": 33.378390048053596, "learning_rate": 3.979974683510677e-05, "loss": 1.173, "step": 74 }, { "epoch": 0.19430051813471502, "grad_norm": 43.356840136984154, "learning_rate": 3.978744826039366e-05, "loss": 1.2032, "step": 75 }, { "epoch": 0.19689119170984457, "grad_norm": 31.285725922510768, "learning_rate": 3.977478521154974e-05, "loss": 1.1569, "step": 76 }, { "epoch": 0.19948186528497408, "grad_norm": 35.19264482867074, "learning_rate": 3.9761757921821544e-05, "loss": 1.1365, "step": 77 }, { "epoch": 0.20207253886010362, "grad_norm": 44.66037256551279, "learning_rate": 3.974836663116472e-05, "loss": 1.164, "step": 78 }, { "epoch": 0.20466321243523317, "grad_norm": 68.91101457952654, "learning_rate": 3.973461158623963e-05, "loss": 1.2256, "step": 79 }, { "epoch": 0.20725388601036268, "grad_norm": 45.866521854583, "learning_rate": 3.9720493040406786e-05, "loss": 1.1697, "step": 80 }, { "epoch": 0.20984455958549222, "grad_norm": 59.63095169617338, "learning_rate": 3.970601125372218e-05, "loss": 1.2094, "step": 81 }, { "epoch": 0.21243523316062177, "grad_norm": 39.085597271064216, "learning_rate": 3.9691166492932535e-05, "loss": 1.1048, "step": 82 }, { "epoch": 0.21502590673575128, "grad_norm": 36.40256073477861, "learning_rate": 3.9675959031470336e-05, "loss": 1.248, "step": 83 }, { "epoch": 0.21761658031088082, "grad_norm": 29.846921716586085, "learning_rate": 3.966038914944881e-05, "loss": 1.1718, "step": 84 }, { "epoch": 0.22020725388601037, "grad_norm": 50.87052190327881, "learning_rate": 3.964445713365682e-05, "loss": 1.1529, "step": 85 }, { "epoch": 0.22279792746113988, "grad_norm": 35.32915760431302, "learning_rate": 3.9628163277553486e-05, "loss": 1.1767, "step": 86 }, { "epoch": 0.22538860103626943, "grad_norm": 157.5587514654703, "learning_rate": 3.961150788126286e-05, "loss": 1.2194, "step": 87 }, { "epoch": 0.22797927461139897, "grad_norm": 25.03485489120971, "learning_rate": 3.9594491251568376e-05, "loss": 1.1392, "step": 88 }, { "epoch": 0.23056994818652848, "grad_norm": 80.55933867045263, "learning_rate": 3.957711370190716e-05, "loss": 1.1819, "step": 89 }, { "epoch": 0.23316062176165803, "grad_norm": 272.22874004071406, "learning_rate": 3.9559375552364325e-05, "loss": 1.0998, "step": 90 }, { "epoch": 0.23575129533678757, "grad_norm": 91.94671663482514, "learning_rate": 3.954127712966702e-05, "loss": 1.2494, "step": 91 }, { "epoch": 0.23834196891191708, "grad_norm": 54.31533598131098, "learning_rate": 3.952281876717843e-05, "loss": 1.1385, "step": 92 }, { "epoch": 0.24093264248704663, "grad_norm": 103.20789745908105, "learning_rate": 3.950400080489165e-05, "loss": 1.1398, "step": 93 }, { "epoch": 0.24352331606217617, "grad_norm": 45.14746362545893, "learning_rate": 3.94848235894234e-05, "loss": 1.2697, "step": 94 }, { "epoch": 0.24611398963730569, "grad_norm": 21.271923336142002, "learning_rate": 3.9465287474007654e-05, "loss": 1.1397, "step": 95 }, { "epoch": 0.24870466321243523, "grad_norm": 93.89786795431422, "learning_rate": 3.944539281848912e-05, "loss": 1.1542, "step": 96 }, { "epoch": 0.25129533678756477, "grad_norm": 32.38768349342839, "learning_rate": 3.942513998931663e-05, "loss": 1.1693, "step": 97 }, { "epoch": 0.25129533678756477, "eval_loss": 1.1344976425170898, "eval_runtime": 37.8807, "eval_samples_per_second": 19.641, "eval_steps_per_second": 1.241, "step": 97 }, { "epoch": 0.2538860103626943, "grad_norm": 91.41293468177638, "learning_rate": 3.940452935953639e-05, "loss": 1.1724, "step": 98 }, { "epoch": 0.25647668393782386, "grad_norm": 39.20645478419229, "learning_rate": 3.9383561308785075e-05, "loss": 1.1583, "step": 99 }, { "epoch": 0.25906735751295334, "grad_norm": 35.32804513153546, "learning_rate": 3.9362236223282885e-05, "loss": 1.158, "step": 100 }, { "epoch": 0.2616580310880829, "grad_norm": 35.24783762804842, "learning_rate": 3.934055449582641e-05, "loss": 1.1552, "step": 101 }, { "epoch": 0.26424870466321243, "grad_norm": 33.743808031979775, "learning_rate": 3.931851652578137e-05, "loss": 1.264, "step": 102 }, { "epoch": 0.266839378238342, "grad_norm": 113.49798793226394, "learning_rate": 3.92961227190753e-05, "loss": 1.2361, "step": 103 }, { "epoch": 0.2694300518134715, "grad_norm": 31.813807349410364, "learning_rate": 3.9273373488190036e-05, "loss": 1.1246, "step": 104 }, { "epoch": 0.27202072538860106, "grad_norm": 29.391695486306187, "learning_rate": 3.925026925215417e-05, "loss": 1.1142, "step": 105 }, { "epoch": 0.27461139896373055, "grad_norm": 33.79933331839905, "learning_rate": 3.922681043653526e-05, "loss": 1.1401, "step": 106 }, { "epoch": 0.2772020725388601, "grad_norm": 39.09509012730907, "learning_rate": 3.920299747343204e-05, "loss": 1.1822, "step": 107 }, { "epoch": 0.27979274611398963, "grad_norm": 37.81471938433609, "learning_rate": 3.9178830801466465e-05, "loss": 1.1592, "step": 108 }, { "epoch": 0.2823834196891192, "grad_norm": 69.07753778460207, "learning_rate": 3.915431086577561e-05, "loss": 1.1683, "step": 109 }, { "epoch": 0.2849740932642487, "grad_norm": 28.864787246081605, "learning_rate": 3.912943811800347e-05, "loss": 1.1179, "step": 110 }, { "epoch": 0.28756476683937826, "grad_norm": 28.842042951717836, "learning_rate": 3.910421301629264e-05, "loss": 1.1317, "step": 111 }, { "epoch": 0.29015544041450775, "grad_norm": 51.475482074695506, "learning_rate": 3.9078636025275904e-05, "loss": 1.1451, "step": 112 }, { "epoch": 0.2927461139896373, "grad_norm": 33.48279556713943, "learning_rate": 3.9052707616067654e-05, "loss": 1.1554, "step": 113 }, { "epoch": 0.29533678756476683, "grad_norm": 21.279603575929844, "learning_rate": 3.9026428266255205e-05, "loss": 1.1636, "step": 114 }, { "epoch": 0.2979274611398964, "grad_norm": 36.226178034876675, "learning_rate": 3.899979845989003e-05, "loss": 1.1966, "step": 115 }, { "epoch": 0.3005181347150259, "grad_norm": 29.90506353145981, "learning_rate": 3.897281868747878e-05, "loss": 1.1888, "step": 116 }, { "epoch": 0.30310880829015546, "grad_norm": 36.04602777809767, "learning_rate": 3.894548944597434e-05, "loss": 1.2066, "step": 117 }, { "epoch": 0.30569948186528495, "grad_norm": 36.42793844948301, "learning_rate": 3.8917811238766606e-05, "loss": 1.1712, "step": 118 }, { "epoch": 0.3082901554404145, "grad_norm": 58.788967662325696, "learning_rate": 3.888978457567323e-05, "loss": 1.1225, "step": 119 }, { "epoch": 0.31088082901554404, "grad_norm": 29.357299816022326, "learning_rate": 3.886140997293024e-05, "loss": 1.1315, "step": 120 }, { "epoch": 0.3134715025906736, "grad_norm": 95.08345317107502, "learning_rate": 3.883268795318252e-05, "loss": 1.1852, "step": 121 }, { "epoch": 0.3160621761658031, "grad_norm": 33.6623824593179, "learning_rate": 3.88036190454742e-05, "loss": 1.16, "step": 122 }, { "epoch": 0.31865284974093266, "grad_norm": 42.587546987131105, "learning_rate": 3.8774203785238886e-05, "loss": 1.1374, "step": 123 }, { "epoch": 0.32124352331606215, "grad_norm": 33.360649853064245, "learning_rate": 3.8744442714289816e-05, "loss": 1.1757, "step": 124 }, { "epoch": 0.3238341968911917, "grad_norm": 49.09256643961471, "learning_rate": 3.8714336380809874e-05, "loss": 1.1782, "step": 125 }, { "epoch": 0.32642487046632124, "grad_norm": 31.505007051172793, "learning_rate": 3.86838853393415e-05, "loss": 1.195, "step": 126 }, { "epoch": 0.3290155440414508, "grad_norm": 34.36735417254799, "learning_rate": 3.865309015077645e-05, "loss": 1.1078, "step": 127 }, { "epoch": 0.3316062176165803, "grad_norm": 36.63220606142181, "learning_rate": 3.862195138234551e-05, "loss": 1.1319, "step": 128 }, { "epoch": 0.33419689119170987, "grad_norm": 53.324986862513676, "learning_rate": 3.859046960760801e-05, "loss": 1.2301, "step": 129 }, { "epoch": 0.33678756476683935, "grad_norm": 47.41445409144979, "learning_rate": 3.855864540644126e-05, "loss": 1.2366, "step": 130 }, { "epoch": 0.3393782383419689, "grad_norm": 32.57355122427366, "learning_rate": 3.8526479365029906e-05, "loss": 1.142, "step": 131 }, { "epoch": 0.34196891191709844, "grad_norm": 28.445824333644715, "learning_rate": 3.849397207585508e-05, "loss": 1.0847, "step": 132 }, { "epoch": 0.344559585492228, "grad_norm": 49.23062726715889, "learning_rate": 3.846112413768353e-05, "loss": 1.2241, "step": 133 }, { "epoch": 0.3471502590673575, "grad_norm": 53.424206543788074, "learning_rate": 3.842793615555657e-05, "loss": 1.2392, "step": 134 }, { "epoch": 0.34974093264248707, "grad_norm": 38.19316140175426, "learning_rate": 3.8394408740778934e-05, "loss": 1.1208, "step": 135 }, { "epoch": 0.35233160621761656, "grad_norm": 32.35931252369273, "learning_rate": 3.836054251090755e-05, "loss": 1.1604, "step": 136 }, { "epoch": 0.3549222797927461, "grad_norm": 37.90085344799495, "learning_rate": 3.83263380897401e-05, "loss": 1.1134, "step": 137 }, { "epoch": 0.35751295336787564, "grad_norm": 44.49191588319939, "learning_rate": 3.829179610730359e-05, "loss": 1.1281, "step": 138 }, { "epoch": 0.3601036269430052, "grad_norm": 141.98524430756757, "learning_rate": 3.8256917199842715e-05, "loss": 1.0928, "step": 139 }, { "epoch": 0.3626943005181347, "grad_norm": 30.887093976524472, "learning_rate": 3.822170200980815e-05, "loss": 1.0936, "step": 140 }, { "epoch": 0.36528497409326427, "grad_norm": 21.980521878837745, "learning_rate": 3.818615118584472e-05, "loss": 1.1368, "step": 141 }, { "epoch": 0.36787564766839376, "grad_norm": 538.6650762618656, "learning_rate": 3.815026538277943e-05, "loss": 1.0918, "step": 142 }, { "epoch": 0.3704663212435233, "grad_norm": 40.842881572203, "learning_rate": 3.811404526160943e-05, "loss": 1.1705, "step": 143 }, { "epoch": 0.37305699481865284, "grad_norm": 26.891553492377298, "learning_rate": 3.8077491489489835e-05, "loss": 1.1468, "step": 144 }, { "epoch": 0.3756476683937824, "grad_norm": 45.138483181178074, "learning_rate": 3.8040604739721415e-05, "loss": 1.1679, "step": 145 }, { "epoch": 0.37823834196891193, "grad_norm": 35.133763086168244, "learning_rate": 3.8003385691738227e-05, "loss": 1.1029, "step": 146 }, { "epoch": 0.38082901554404147, "grad_norm": 36.941250802707344, "learning_rate": 3.7965835031095065e-05, "loss": 1.1491, "step": 147 }, { "epoch": 0.38341968911917096, "grad_norm": 90.1080256703095, "learning_rate": 3.792795344945485e-05, "loss": 1.1212, "step": 148 }, { "epoch": 0.3860103626943005, "grad_norm": 39.70360899750413, "learning_rate": 3.7889741644575914e-05, "loss": 1.15, "step": 149 }, { "epoch": 0.38860103626943004, "grad_norm": 28.229369877304094, "learning_rate": 3.78512003202991e-05, "loss": 1.1111, "step": 150 }, { "epoch": 0.3911917098445596, "grad_norm": 31.611752191925987, "learning_rate": 3.7812330186534815e-05, "loss": 1.1366, "step": 151 }, { "epoch": 0.39378238341968913, "grad_norm": 38.196015586772425, "learning_rate": 3.777313195924998e-05, "loss": 1.1433, "step": 152 }, { "epoch": 0.3963730569948187, "grad_norm": 22.732638044547453, "learning_rate": 3.773360636045481e-05, "loss": 1.1125, "step": 153 }, { "epoch": 0.39896373056994816, "grad_norm": 90.19158665385014, "learning_rate": 3.7693754118189525e-05, "loss": 1.1242, "step": 154 }, { "epoch": 0.4015544041450777, "grad_norm": 42.43479974993017, "learning_rate": 3.765357596651095e-05, "loss": 1.1191, "step": 155 }, { "epoch": 0.40414507772020725, "grad_norm": 88.0076735720364, "learning_rate": 3.761307264547899e-05, "loss": 1.1718, "step": 156 }, { "epoch": 0.4067357512953368, "grad_norm": 30.782507703935767, "learning_rate": 3.757224490114297e-05, "loss": 1.109, "step": 157 }, { "epoch": 0.40932642487046633, "grad_norm": 69.89871106113397, "learning_rate": 3.7531093485527943e-05, "loss": 1.1018, "step": 158 }, { "epoch": 0.4119170984455959, "grad_norm": 37.339006645717305, "learning_rate": 3.7489619156620796e-05, "loss": 1.1358, "step": 159 }, { "epoch": 0.41450777202072536, "grad_norm": 28.06388054378899, "learning_rate": 3.744782267835632e-05, "loss": 1.0847, "step": 160 }, { "epoch": 0.4170984455958549, "grad_norm": 54.05874281297702, "learning_rate": 3.740570482060311e-05, "loss": 1.1682, "step": 161 }, { "epoch": 0.41968911917098445, "grad_norm": 32.299093265328835, "learning_rate": 3.73632663591494e-05, "loss": 1.1413, "step": 162 }, { "epoch": 0.422279792746114, "grad_norm": 31.213652090157694, "learning_rate": 3.732050807568878e-05, "loss": 1.1313, "step": 163 }, { "epoch": 0.42487046632124353, "grad_norm": 40.01090035937505, "learning_rate": 3.727743075780578e-05, "loss": 1.1513, "step": 164 }, { "epoch": 0.4274611398963731, "grad_norm": 47.11352577964853, "learning_rate": 3.723403519896136e-05, "loss": 1.2192, "step": 165 }, { "epoch": 0.43005181347150256, "grad_norm": 28.645086506093037, "learning_rate": 3.7190322198478355e-05, "loss": 1.1097, "step": 166 }, { "epoch": 0.4326424870466321, "grad_norm": 35.28541113925116, "learning_rate": 3.7146292561526654e-05, "loss": 1.1557, "step": 167 }, { "epoch": 0.43523316062176165, "grad_norm": 58.30281063037669, "learning_rate": 3.7101947099108425e-05, "loss": 1.1829, "step": 168 }, { "epoch": 0.4378238341968912, "grad_norm": 26.33563548968379, "learning_rate": 3.70572866280432e-05, "loss": 1.147, "step": 169 }, { "epoch": 0.44041450777202074, "grad_norm": 57.00052875402651, "learning_rate": 3.701231197095277e-05, "loss": 1.1212, "step": 170 }, { "epoch": 0.4430051813471503, "grad_norm": 23.672828037237174, "learning_rate": 3.696702395624608e-05, "loss": 1.1152, "step": 171 }, { "epoch": 0.44559585492227977, "grad_norm": 41.1264174112964, "learning_rate": 3.692142341810395e-05, "loss": 1.1154, "step": 172 }, { "epoch": 0.4481865284974093, "grad_norm": 26.72177706144361, "learning_rate": 3.6875511196463715e-05, "loss": 1.1725, "step": 173 }, { "epoch": 0.45077720207253885, "grad_norm": 95.4088800585977, "learning_rate": 3.682928813700375e-05, "loss": 1.1339, "step": 174 }, { "epoch": 0.4533678756476684, "grad_norm": 34.33666578349465, "learning_rate": 3.678275509112788e-05, "loss": 1.1867, "step": 175 }, { "epoch": 0.45595854922279794, "grad_norm": 31.032304531003014, "learning_rate": 3.6735912915949745e-05, "loss": 1.1386, "step": 176 }, { "epoch": 0.4585492227979275, "grad_norm": 55.22043313188224, "learning_rate": 3.6688762474276945e-05, "loss": 1.1102, "step": 177 }, { "epoch": 0.46113989637305697, "grad_norm": 29.82713377876857, "learning_rate": 3.6641304634595216e-05, "loss": 1.1564, "step": 178 }, { "epoch": 0.4637305699481865, "grad_norm": 35.71025459541737, "learning_rate": 3.659354027105238e-05, "loss": 1.0939, "step": 179 }, { "epoch": 0.46632124352331605, "grad_norm": 52.41175655642653, "learning_rate": 3.6545470263442265e-05, "loss": 1.1578, "step": 180 }, { "epoch": 0.4689119170984456, "grad_norm": 27.682485766528306, "learning_rate": 3.649709549718849e-05, "loss": 1.1875, "step": 181 }, { "epoch": 0.47150259067357514, "grad_norm": 36.53293663303487, "learning_rate": 3.6448416863328186e-05, "loss": 1.1111, "step": 182 }, { "epoch": 0.4740932642487047, "grad_norm": 31.45177998538027, "learning_rate": 3.639943525849555e-05, "loss": 1.113, "step": 183 }, { "epoch": 0.47668393782383417, "grad_norm": 28.323097072885673, "learning_rate": 3.635015158490533e-05, "loss": 1.1159, "step": 184 }, { "epoch": 0.4792746113989637, "grad_norm": 47.75573754341213, "learning_rate": 3.6300566750336225e-05, "loss": 1.1305, "step": 185 }, { "epoch": 0.48186528497409326, "grad_norm": 21.384095061494357, "learning_rate": 3.625068166811418e-05, "loss": 1.1369, "step": 186 }, { "epoch": 0.4844559585492228, "grad_norm": 30.714645036809546, "learning_rate": 3.6200497257095504e-05, "loss": 1.1858, "step": 187 }, { "epoch": 0.48704663212435234, "grad_norm": 35.12161426399798, "learning_rate": 3.615001444165001e-05, "loss": 1.1293, "step": 188 }, { "epoch": 0.4896373056994819, "grad_norm": 116.83443661381396, "learning_rate": 3.6099234151643924e-05, "loss": 1.1515, "step": 189 }, { "epoch": 0.49222797927461137, "grad_norm": 55.47885243409044, "learning_rate": 3.604815732242283e-05, "loss": 1.112, "step": 190 }, { "epoch": 0.4948186528497409, "grad_norm": 32.332747429034285, "learning_rate": 3.5996784894794394e-05, "loss": 1.1661, "step": 191 }, { "epoch": 0.49740932642487046, "grad_norm": 33.039210183180046, "learning_rate": 3.594511781501103e-05, "loss": 1.1244, "step": 192 }, { "epoch": 0.5, "grad_norm": 21.325687337182504, "learning_rate": 3.58931570347525e-05, "loss": 1.1634, "step": 193 }, { "epoch": 0.5025906735751295, "grad_norm": 51.37599478469561, "learning_rate": 3.584090351110838e-05, "loss": 1.2106, "step": 194 }, { "epoch": 0.5025906735751295, "eval_loss": 1.1119717359542847, "eval_runtime": 49.6027, "eval_samples_per_second": 14.999, "eval_steps_per_second": 0.948, "step": 194 }, { "epoch": 0.5051813471502591, "grad_norm": 42.105169991612456, "learning_rate": 3.57883582065604e-05, "loss": 1.1303, "step": 195 }, { "epoch": 0.5077720207253886, "grad_norm": 37.14457014578168, "learning_rate": 3.573552208896474e-05, "loss": 1.1483, "step": 196 }, { "epoch": 0.5103626943005182, "grad_norm": 28.56241612018119, "learning_rate": 3.568239613153421e-05, "loss": 1.0843, "step": 197 }, { "epoch": 0.5129533678756477, "grad_norm": 35.399304035761865, "learning_rate": 3.5628981312820315e-05, "loss": 1.1177, "step": 198 }, { "epoch": 0.5155440414507773, "grad_norm": 25.91156850470446, "learning_rate": 3.557527861669522e-05, "loss": 1.1215, "step": 199 }, { "epoch": 0.5181347150259067, "grad_norm": 43.509516777992324, "learning_rate": 3.552128903233363e-05, "loss": 1.1532, "step": 200 }, { "epoch": 0.5207253886010362, "grad_norm": 38.18164449834795, "learning_rate": 3.54670135541946e-05, "loss": 1.1142, "step": 201 }, { "epoch": 0.5233160621761658, "grad_norm": 48.576743289054534, "learning_rate": 3.541245318200318e-05, "loss": 1.1152, "step": 202 }, { "epoch": 0.5259067357512953, "grad_norm": 38.65411737007163, "learning_rate": 3.5357608920732e-05, "loss": 1.1607, "step": 203 }, { "epoch": 0.5284974093264249, "grad_norm": 35.663493907396834, "learning_rate": 3.530248178058282e-05, "loss": 1.1273, "step": 204 }, { "epoch": 0.5310880829015544, "grad_norm": 26.829817821665976, "learning_rate": 3.5247072776967805e-05, "loss": 1.1174, "step": 205 }, { "epoch": 0.533678756476684, "grad_norm": 39.79604912152638, "learning_rate": 3.519138293049097e-05, "loss": 1.1811, "step": 206 }, { "epoch": 0.5362694300518135, "grad_norm": 32.26179097390416, "learning_rate": 3.513541326692925e-05, "loss": 1.1346, "step": 207 }, { "epoch": 0.538860103626943, "grad_norm": 24.35769329902787, "learning_rate": 3.5079164817213684e-05, "loss": 1.1061, "step": 208 }, { "epoch": 0.5414507772020726, "grad_norm": 26.645546258363844, "learning_rate": 3.5022638617410396e-05, "loss": 1.0514, "step": 209 }, { "epoch": 0.5440414507772021, "grad_norm": 105.19676603444857, "learning_rate": 3.496583570870152e-05, "loss": 1.1474, "step": 210 }, { "epoch": 0.5466321243523317, "grad_norm": 61.600623030405885, "learning_rate": 3.4908757137366006e-05, "loss": 1.104, "step": 211 }, { "epoch": 0.5492227979274611, "grad_norm": 31.65460129853052, "learning_rate": 3.485140395476038e-05, "loss": 1.0737, "step": 212 }, { "epoch": 0.5518134715025906, "grad_norm": 26.860379117211497, "learning_rate": 3.4793777217299346e-05, "loss": 1.1119, "step": 213 }, { "epoch": 0.5544041450777202, "grad_norm": 39.89324262309783, "learning_rate": 3.473587798643633e-05, "loss": 1.1626, "step": 214 }, { "epoch": 0.5569948186528497, "grad_norm": 39.77638257731599, "learning_rate": 3.467770732864399e-05, "loss": 1.1545, "step": 215 }, { "epoch": 0.5595854922279793, "grad_norm": 30.994657564291458, "learning_rate": 3.461926631539445e-05, "loss": 1.1646, "step": 216 }, { "epoch": 0.5621761658031088, "grad_norm": 51.99674092516571, "learning_rate": 3.4560556023139695e-05, "loss": 1.1638, "step": 217 }, { "epoch": 0.5647668393782384, "grad_norm": 58.5132713002146, "learning_rate": 3.450157753329166e-05, "loss": 1.1461, "step": 218 }, { "epoch": 0.5673575129533679, "grad_norm": 30.712469030418482, "learning_rate": 3.4442331932202326e-05, "loss": 1.1583, "step": 219 }, { "epoch": 0.5699481865284974, "grad_norm": 47.00217426642832, "learning_rate": 3.438282031114374e-05, "loss": 1.1154, "step": 220 }, { "epoch": 0.572538860103627, "grad_norm": 37.33927961163222, "learning_rate": 3.432304376628787e-05, "loss": 1.1372, "step": 221 }, { "epoch": 0.5751295336787565, "grad_norm": 28.858636933974392, "learning_rate": 3.4263003398686464e-05, "loss": 1.0488, "step": 222 }, { "epoch": 0.5777202072538861, "grad_norm": 37.842230890171486, "learning_rate": 3.420270031425072e-05, "loss": 1.1892, "step": 223 }, { "epoch": 0.5803108808290155, "grad_norm": 32.65394945357516, "learning_rate": 3.4142135623730954e-05, "loss": 1.1218, "step": 224 }, { "epoch": 0.582901554404145, "grad_norm": 115.22040829465772, "learning_rate": 3.4081310442696114e-05, "loss": 1.1546, "step": 225 }, { "epoch": 0.5854922279792746, "grad_norm": 31.20514468446119, "learning_rate": 3.402022589151325e-05, "loss": 1.0969, "step": 226 }, { "epoch": 0.5880829015544041, "grad_norm": 52.8397361926395, "learning_rate": 3.395888309532687e-05, "loss": 1.1218, "step": 227 }, { "epoch": 0.5906735751295337, "grad_norm": 51.7991692917308, "learning_rate": 3.3897283184038215e-05, "loss": 1.1395, "step": 228 }, { "epoch": 0.5932642487046632, "grad_norm": 33.56775233970504, "learning_rate": 3.3835427292284445e-05, "loss": 1.1107, "step": 229 }, { "epoch": 0.5958549222797928, "grad_norm": 46.081120788214314, "learning_rate": 3.3773316559417734e-05, "loss": 1.1472, "step": 230 }, { "epoch": 0.5984455958549223, "grad_norm": 41.72558170492288, "learning_rate": 3.371095212948431e-05, "loss": 1.1871, "step": 231 }, { "epoch": 0.6010362694300518, "grad_norm": 34.27957927587091, "learning_rate": 3.364833515120336e-05, "loss": 1.1376, "step": 232 }, { "epoch": 0.6036269430051814, "grad_norm": 36.58452602010953, "learning_rate": 3.358546677794586e-05, "loss": 1.1885, "step": 233 }, { "epoch": 0.6062176165803109, "grad_norm": 28.010809914189192, "learning_rate": 3.352234816771337e-05, "loss": 1.102, "step": 234 }, { "epoch": 0.6088082901554405, "grad_norm": 24.78419558611963, "learning_rate": 3.3458980483116664e-05, "loss": 1.0818, "step": 235 }, { "epoch": 0.6113989637305699, "grad_norm": 28.12830040081226, "learning_rate": 3.3395364891354316e-05, "loss": 1.1862, "step": 236 }, { "epoch": 0.6139896373056994, "grad_norm": 37.94181651161551, "learning_rate": 3.333150256419127e-05, "loss": 1.147, "step": 237 }, { "epoch": 0.616580310880829, "grad_norm": 21.809518482701854, "learning_rate": 3.3267394677937134e-05, "loss": 1.0994, "step": 238 }, { "epoch": 0.6191709844559585, "grad_norm": 32.12135773753589, "learning_rate": 3.320304241342464e-05, "loss": 1.1531, "step": 239 }, { "epoch": 0.6217616580310881, "grad_norm": 51.959731073524054, "learning_rate": 3.31384469559878e-05, "loss": 1.1717, "step": 240 }, { "epoch": 0.6243523316062176, "grad_norm": 28.045815836372345, "learning_rate": 3.307360949544012e-05, "loss": 1.1814, "step": 241 }, { "epoch": 0.6269430051813472, "grad_norm": 39.55208384578746, "learning_rate": 3.300853122605268e-05, "loss": 1.1483, "step": 242 }, { "epoch": 0.6295336787564767, "grad_norm": 29.799974205160808, "learning_rate": 3.294321334653213e-05, "loss": 1.1838, "step": 243 }, { "epoch": 0.6321243523316062, "grad_norm": 124.31035254102245, "learning_rate": 3.2877657059998584e-05, "loss": 1.0698, "step": 244 }, { "epoch": 0.6347150259067358, "grad_norm": 37.989925180187655, "learning_rate": 3.281186357396351e-05, "loss": 1.0984, "step": 245 }, { "epoch": 0.6373056994818653, "grad_norm": 55.72599333657572, "learning_rate": 3.274583410030745e-05, "loss": 1.2333, "step": 246 }, { "epoch": 0.6398963730569949, "grad_norm": 46.77079456439719, "learning_rate": 3.267956985525774e-05, "loss": 1.2157, "step": 247 }, { "epoch": 0.6424870466321243, "grad_norm": 33.62329915252562, "learning_rate": 3.261307205936603e-05, "loss": 1.1752, "step": 248 }, { "epoch": 0.6450777202072538, "grad_norm": 34.11794183225494, "learning_rate": 3.2546341937485884e-05, "loss": 1.1265, "step": 249 }, { "epoch": 0.6476683937823834, "grad_norm": 36.027636323913896, "learning_rate": 3.247938071875017e-05, "loss": 1.103, "step": 250 }, { "epoch": 0.6502590673575129, "grad_norm": 35.393219337329946, "learning_rate": 3.2412189636548456e-05, "loss": 1.1148, "step": 251 }, { "epoch": 0.6528497409326425, "grad_norm": 31.578919022569924, "learning_rate": 3.234476992850425e-05, "loss": 1.1149, "step": 252 }, { "epoch": 0.655440414507772, "grad_norm": 28.93717647736964, "learning_rate": 3.227712283645224e-05, "loss": 1.1425, "step": 253 }, { "epoch": 0.6580310880829016, "grad_norm": 34.170026750703684, "learning_rate": 3.2209249606415394e-05, "loss": 1.1591, "step": 254 }, { "epoch": 0.6606217616580311, "grad_norm": 27.52194954061608, "learning_rate": 3.214115148858201e-05, "loss": 1.1704, "step": 255 }, { "epoch": 0.6632124352331606, "grad_norm": 81.65404753769732, "learning_rate": 3.207282973728273e-05, "loss": 1.161, "step": 256 }, { "epoch": 0.6658031088082902, "grad_norm": 57.45351536522683, "learning_rate": 3.200428561096737e-05, "loss": 1.116, "step": 257 }, { "epoch": 0.6683937823834197, "grad_norm": 30.968529074463714, "learning_rate": 3.193552037218179e-05, "loss": 1.1265, "step": 258 }, { "epoch": 0.6709844559585493, "grad_norm": 37.8817748068655, "learning_rate": 3.186653528754464e-05, "loss": 1.1287, "step": 259 }, { "epoch": 0.6735751295336787, "grad_norm": 29.197031189172545, "learning_rate": 3.179733162772398e-05, "loss": 1.1045, "step": 260 }, { "epoch": 0.6761658031088082, "grad_norm": 36.56253841299107, "learning_rate": 3.172791066741392e-05, "loss": 1.1539, "step": 261 }, { "epoch": 0.6787564766839378, "grad_norm": 25.799921116950998, "learning_rate": 3.165827368531113e-05, "loss": 1.0796, "step": 262 }, { "epoch": 0.6813471502590673, "grad_norm": 82.81825216532526, "learning_rate": 3.1588421964091276e-05, "loss": 1.142, "step": 263 }, { "epoch": 0.6839378238341969, "grad_norm": 31.100074747569124, "learning_rate": 3.151835679038542e-05, "loss": 1.0908, "step": 264 }, { "epoch": 0.6865284974093264, "grad_norm": 25.57297200703221, "learning_rate": 3.14480794547563e-05, "loss": 1.1436, "step": 265 }, { "epoch": 0.689119170984456, "grad_norm": 23.92492773149328, "learning_rate": 3.137759125167455e-05, "loss": 1.1202, "step": 266 }, { "epoch": 0.6917098445595855, "grad_norm": 22.14274360766396, "learning_rate": 3.130689347949486e-05, "loss": 1.1113, "step": 267 }, { "epoch": 0.694300518134715, "grad_norm": 26.68725288649902, "learning_rate": 3.123598744043211e-05, "loss": 1.1517, "step": 268 }, { "epoch": 0.6968911917098446, "grad_norm": 25.559817524659362, "learning_rate": 3.1164874440537295e-05, "loss": 1.0976, "step": 269 }, { "epoch": 0.6994818652849741, "grad_norm": 28.89996834100355, "learning_rate": 3.109355578967356e-05, "loss": 1.1932, "step": 270 }, { "epoch": 0.7020725388601037, "grad_norm": 32.09658045195569, "learning_rate": 3.1022032801492e-05, "loss": 1.1161, "step": 271 }, { "epoch": 0.7046632124352331, "grad_norm": 30.623705646213768, "learning_rate": 3.095030679340751e-05, "loss": 1.1993, "step": 272 }, { "epoch": 0.7072538860103627, "grad_norm": 41.71263710932429, "learning_rate": 3.0878379086574494e-05, "loss": 1.1624, "step": 273 }, { "epoch": 0.7098445595854922, "grad_norm": 34.68352639470226, "learning_rate": 3.0806251005862535e-05, "loss": 1.1156, "step": 274 }, { "epoch": 0.7124352331606217, "grad_norm": 23.52580702428812, "learning_rate": 3.073392387983202e-05, "loss": 1.0963, "step": 275 }, { "epoch": 0.7150259067357513, "grad_norm": 28.10687988214902, "learning_rate": 3.0661399040709584e-05, "loss": 1.1095, "step": 276 }, { "epoch": 0.7176165803108808, "grad_norm": 66.72288729975841, "learning_rate": 3.05886778243637e-05, "loss": 1.0865, "step": 277 }, { "epoch": 0.7202072538860104, "grad_norm": 25.775217430321934, "learning_rate": 3.051576157027998e-05, "loss": 1.1058, "step": 278 }, { "epoch": 0.7227979274611399, "grad_norm": 36.82942099016794, "learning_rate": 3.0442651621536502e-05, "loss": 1.1211, "step": 279 }, { "epoch": 0.7253886010362695, "grad_norm": 27.878820856521013, "learning_rate": 3.0369349324779115e-05, "loss": 1.1471, "step": 280 }, { "epoch": 0.727979274611399, "grad_norm": 31.293156717285573, "learning_rate": 3.0295856030196618e-05, "loss": 1.0748, "step": 281 }, { "epoch": 0.7305699481865285, "grad_norm": 39.315952115194435, "learning_rate": 3.022217309149588e-05, "loss": 1.0993, "step": 282 }, { "epoch": 0.7331606217616581, "grad_norm": 36.79954071435495, "learning_rate": 3.0148301865876913e-05, "loss": 1.1045, "step": 283 }, { "epoch": 0.7357512953367875, "grad_norm": 26.127389502147167, "learning_rate": 3.0074243714007875e-05, "loss": 1.1424, "step": 284 }, { "epoch": 0.7383419689119171, "grad_norm": 25.608778060317068, "learning_rate": 3.0000000000000004e-05, "loss": 1.1055, "step": 285 }, { "epoch": 0.7409326424870466, "grad_norm": 36.22629669671894, "learning_rate": 2.992557209138249e-05, "loss": 1.0845, "step": 286 }, { "epoch": 0.7435233160621761, "grad_norm": 35.30642111132886, "learning_rate": 2.9850961359077293e-05, "loss": 1.204, "step": 287 }, { "epoch": 0.7461139896373057, "grad_norm": 29.765894622087952, "learning_rate": 2.977616917737388e-05, "loss": 1.168, "step": 288 }, { "epoch": 0.7487046632124352, "grad_norm": 27.194683587397567, "learning_rate": 2.9701196923903927e-05, "loss": 1.1236, "step": 289 }, { "epoch": 0.7512953367875648, "grad_norm": 63.09779240191165, "learning_rate": 2.9626045979615928e-05, "loss": 1.1395, "step": 290 }, { "epoch": 0.7538860103626943, "grad_norm": 25.014233377763066, "learning_rate": 2.9550717728749768e-05, "loss": 1.1054, "step": 291 }, { "epoch": 0.7538860103626943, "eval_loss": 1.0996382236480713, "eval_runtime": 37.9545, "eval_samples_per_second": 19.602, "eval_steps_per_second": 1.238, "step": 291 }, { "epoch": 0.7564766839378239, "grad_norm": 27.481891737318097, "learning_rate": 2.947521355881122e-05, "loss": 1.1252, "step": 292 }, { "epoch": 0.7590673575129534, "grad_norm": 67.57807413949878, "learning_rate": 2.9399534860546404e-05, "loss": 1.1761, "step": 293 }, { "epoch": 0.7616580310880829, "grad_norm": 65.66834495909988, "learning_rate": 2.932368302791614e-05, "loss": 1.0551, "step": 294 }, { "epoch": 0.7642487046632125, "grad_norm": 30.051210942517116, "learning_rate": 2.92476594580703e-05, "loss": 1.138, "step": 295 }, { "epoch": 0.7668393782383419, "grad_norm": 22.693089678510507, "learning_rate": 2.917146555132206e-05, "loss": 1.1495, "step": 296 }, { "epoch": 0.7694300518134715, "grad_norm": 53.84166280540606, "learning_rate": 2.909510271112212e-05, "loss": 1.1409, "step": 297 }, { "epoch": 0.772020725388601, "grad_norm": 32.69106061524578, "learning_rate": 2.9018572344032823e-05, "loss": 1.1709, "step": 298 }, { "epoch": 0.7746113989637305, "grad_norm": 39.44484991312582, "learning_rate": 2.8941875859702283e-05, "loss": 1.1138, "step": 299 }, { "epoch": 0.7772020725388601, "grad_norm": 31.51857596969122, "learning_rate": 2.88650146708384e-05, "loss": 1.1931, "step": 300 }, { "epoch": 0.7797927461139896, "grad_norm": 70.51218412614058, "learning_rate": 2.878799019318283e-05, "loss": 1.155, "step": 301 }, { "epoch": 0.7823834196891192, "grad_norm": 80.27969224752457, "learning_rate": 2.8710803845484955e-05, "loss": 1.1425, "step": 302 }, { "epoch": 0.7849740932642487, "grad_norm": 28.16560857981767, "learning_rate": 2.8633457049475678e-05, "loss": 1.1072, "step": 303 }, { "epoch": 0.7875647668393783, "grad_norm": 41.15138307552231, "learning_rate": 2.855595122984129e-05, "loss": 1.1492, "step": 304 }, { "epoch": 0.7901554404145078, "grad_norm": 23.894217282116276, "learning_rate": 2.847828781419722e-05, "loss": 1.1136, "step": 305 }, { "epoch": 0.7927461139896373, "grad_norm": 25.005501120810248, "learning_rate": 2.8400468233061708e-05, "loss": 1.0921, "step": 306 }, { "epoch": 0.7953367875647669, "grad_norm": 30.91791938195468, "learning_rate": 2.832249391982949e-05, "loss": 1.1098, "step": 307 }, { "epoch": 0.7979274611398963, "grad_norm": 44.776563922922726, "learning_rate": 2.8244366310745398e-05, "loss": 1.1845, "step": 308 }, { "epoch": 0.8005181347150259, "grad_norm": 19.059329544784376, "learning_rate": 2.816608684487787e-05, "loss": 1.169, "step": 309 }, { "epoch": 0.8031088082901554, "grad_norm": 63.97334641962602, "learning_rate": 2.8087656964092472e-05, "loss": 1.124, "step": 310 }, { "epoch": 0.805699481865285, "grad_norm": 30.878848859015882, "learning_rate": 2.8009078113025335e-05, "loss": 1.2087, "step": 311 }, { "epoch": 0.8082901554404145, "grad_norm": 34.63835471543836, "learning_rate": 2.7930351739056533e-05, "loss": 1.1338, "step": 312 }, { "epoch": 0.810880829015544, "grad_norm": 30.03178182445718, "learning_rate": 2.7851479292283442e-05, "loss": 1.1321, "step": 313 }, { "epoch": 0.8134715025906736, "grad_norm": 38.42236523356876, "learning_rate": 2.7772462225494013e-05, "loss": 1.1557, "step": 314 }, { "epoch": 0.8160621761658031, "grad_norm": 39.179683790956744, "learning_rate": 2.7693301994140026e-05, "loss": 1.1201, "step": 315 }, { "epoch": 0.8186528497409327, "grad_norm": 38.32243159447327, "learning_rate": 2.761400005631028e-05, "loss": 1.1105, "step": 316 }, { "epoch": 0.8212435233160622, "grad_norm": 39.913808227411835, "learning_rate": 2.7534557872703705e-05, "loss": 1.1598, "step": 317 }, { "epoch": 0.8238341968911918, "grad_norm": 69.73521867812421, "learning_rate": 2.7454976906602513e-05, "loss": 1.1145, "step": 318 }, { "epoch": 0.8264248704663213, "grad_norm": 65.55887588207746, "learning_rate": 2.7375258623845207e-05, "loss": 1.1255, "step": 319 }, { "epoch": 0.8290155440414507, "grad_norm": 30.980111545641563, "learning_rate": 2.7295404492799575e-05, "loss": 1.122, "step": 320 }, { "epoch": 0.8316062176165803, "grad_norm": 30.12179911444832, "learning_rate": 2.721541598433567e-05, "loss": 1.113, "step": 321 }, { "epoch": 0.8341968911917098, "grad_norm": 28.329434659508582, "learning_rate": 2.7135294571798706e-05, "loss": 1.0498, "step": 322 }, { "epoch": 0.8367875647668394, "grad_norm": 25.114787597049578, "learning_rate": 2.70550417309819e-05, "loss": 1.0633, "step": 323 }, { "epoch": 0.8393782383419689, "grad_norm": 27.754037709590385, "learning_rate": 2.6974658940099337e-05, "loss": 1.1585, "step": 324 }, { "epoch": 0.8419689119170984, "grad_norm": 29.489888159179444, "learning_rate": 2.6894147679758678e-05, "loss": 1.1259, "step": 325 }, { "epoch": 0.844559585492228, "grad_norm": 24.426102194202898, "learning_rate": 2.6813509432933957e-05, "loss": 1.1515, "step": 326 }, { "epoch": 0.8471502590673575, "grad_norm": 24.75197483331429, "learning_rate": 2.673274568493821e-05, "loss": 1.15, "step": 327 }, { "epoch": 0.8497409326424871, "grad_norm": 40.604864626683366, "learning_rate": 2.6651857923396132e-05, "loss": 1.1219, "step": 328 }, { "epoch": 0.8523316062176166, "grad_norm": 34.694568404196026, "learning_rate": 2.6570847638216698e-05, "loss": 1.103, "step": 329 }, { "epoch": 0.8549222797927462, "grad_norm": 48.715136403425035, "learning_rate": 2.648971632156569e-05, "loss": 1.1675, "step": 330 }, { "epoch": 0.8575129533678757, "grad_norm": 97.77526410121799, "learning_rate": 2.6408465467838225e-05, "loss": 1.1502, "step": 331 }, { "epoch": 0.8601036269430051, "grad_norm": 54.697215318949276, "learning_rate": 2.632709657363124e-05, "loss": 1.1446, "step": 332 }, { "epoch": 0.8626943005181347, "grad_norm": 38.09192002041798, "learning_rate": 2.6245611137715897e-05, "loss": 1.1333, "step": 333 }, { "epoch": 0.8652849740932642, "grad_norm": 46.713623556984956, "learning_rate": 2.6164010661010007e-05, "loss": 1.1252, "step": 334 }, { "epoch": 0.8678756476683938, "grad_norm": 46.40552686286593, "learning_rate": 2.6082296646550364e-05, "loss": 1.121, "step": 335 }, { "epoch": 0.8704663212435233, "grad_norm": 37.57424454065957, "learning_rate": 2.6000470599465065e-05, "loss": 1.1671, "step": 336 }, { "epoch": 0.8730569948186528, "grad_norm": 38.580777053099204, "learning_rate": 2.5918534026945787e-05, "loss": 1.0849, "step": 337 }, { "epoch": 0.8756476683937824, "grad_norm": 154.3106712010981, "learning_rate": 2.5836488438220044e-05, "loss": 1.0663, "step": 338 }, { "epoch": 0.8782383419689119, "grad_norm": 34.21394067951015, "learning_rate": 2.575433534452334e-05, "loss": 1.0895, "step": 339 }, { "epoch": 0.8808290155440415, "grad_norm": 36.291611242733886, "learning_rate": 2.5672076259071385e-05, "loss": 1.1242, "step": 340 }, { "epoch": 0.883419689119171, "grad_norm": 29.411623389655112, "learning_rate": 2.558971269703219e-05, "loss": 1.1005, "step": 341 }, { "epoch": 0.8860103626943006, "grad_norm": 30.24903086761753, "learning_rate": 2.5507246175498174e-05, "loss": 1.1134, "step": 342 }, { "epoch": 0.8886010362694301, "grad_norm": 22.032293114161938, "learning_rate": 2.5424678213458202e-05, "loss": 1.1121, "step": 343 }, { "epoch": 0.8911917098445595, "grad_norm": 34.997361528376956, "learning_rate": 2.5342010331769635e-05, "loss": 1.1341, "step": 344 }, { "epoch": 0.8937823834196891, "grad_norm": 28.212824875732352, "learning_rate": 2.5259244053130295e-05, "loss": 1.0748, "step": 345 }, { "epoch": 0.8963730569948186, "grad_norm": 23.870011592985897, "learning_rate": 2.5176380902050418e-05, "loss": 1.0643, "step": 346 }, { "epoch": 0.8989637305699482, "grad_norm": 26.10018699309748, "learning_rate": 2.5093422404824574e-05, "loss": 1.1662, "step": 347 }, { "epoch": 0.9015544041450777, "grad_norm": 30.191468778559166, "learning_rate": 2.5010370089503578e-05, "loss": 1.1023, "step": 348 }, { "epoch": 0.9041450777202072, "grad_norm": 55.799581973427415, "learning_rate": 2.4927225485866297e-05, "loss": 1.1538, "step": 349 }, { "epoch": 0.9067357512953368, "grad_norm": 35.7030284720465, "learning_rate": 2.4843990125391516e-05, "loss": 1.1, "step": 350 }, { "epoch": 0.9093264248704663, "grad_norm": 28.61763302791738, "learning_rate": 2.4760665541229712e-05, "loss": 1.0914, "step": 351 }, { "epoch": 0.9119170984455959, "grad_norm": 33.34233685155311, "learning_rate": 2.467725326817481e-05, "loss": 1.0862, "step": 352 }, { "epoch": 0.9145077720207254, "grad_norm": 25.441052078480084, "learning_rate": 2.4593754842635917e-05, "loss": 1.1422, "step": 353 }, { "epoch": 0.917098445595855, "grad_norm": 24.217974454985058, "learning_rate": 2.451017180260902e-05, "loss": 1.132, "step": 354 }, { "epoch": 0.9196891191709845, "grad_norm": 57.986011465793155, "learning_rate": 2.4426505687648653e-05, "loss": 1.2082, "step": 355 }, { "epoch": 0.9222797927461139, "grad_norm": 34.058264716876195, "learning_rate": 2.4342758038839573e-05, "loss": 1.1679, "step": 356 }, { "epoch": 0.9248704663212435, "grad_norm": 28.621514922275253, "learning_rate": 2.4258930398768317e-05, "loss": 1.1319, "step": 357 }, { "epoch": 0.927461139896373, "grad_norm": 35.33355417283227, "learning_rate": 2.4175024311494835e-05, "loss": 1.0705, "step": 358 }, { "epoch": 0.9300518134715026, "grad_norm": 46.579572933583265, "learning_rate": 2.4091041322524023e-05, "loss": 1.0842, "step": 359 }, { "epoch": 0.9326424870466321, "grad_norm": 35.494740787672974, "learning_rate": 2.4006982978777263e-05, "loss": 1.1072, "step": 360 }, { "epoch": 0.9352331606217616, "grad_norm": 44.56606839509262, "learning_rate": 2.392285082856394e-05, "loss": 1.1125, "step": 361 }, { "epoch": 0.9378238341968912, "grad_norm": 46.26363869084929, "learning_rate": 2.3838646421552917e-05, "loss": 1.1268, "step": 362 }, { "epoch": 0.9404145077720207, "grad_norm": 89.17676267680146, "learning_rate": 2.3754371308743975e-05, "loss": 1.0893, "step": 363 }, { "epoch": 0.9430051813471503, "grad_norm": 34.87700187494181, "learning_rate": 2.367002704243927e-05, "loss": 1.1203, "step": 364 }, { "epoch": 0.9455958549222798, "grad_norm": 32.92806939217504, "learning_rate": 2.3585615176214716e-05, "loss": 1.1488, "step": 365 }, { "epoch": 0.9481865284974094, "grad_norm": 27.27458755248548, "learning_rate": 2.3501137264891396e-05, "loss": 1.0874, "step": 366 }, { "epoch": 0.9507772020725389, "grad_norm": 24.959123789739834, "learning_rate": 2.3416594864506887e-05, "loss": 1.1783, "step": 367 }, { "epoch": 0.9533678756476683, "grad_norm": 31.838670988369724, "learning_rate": 2.333198953228664e-05, "loss": 1.0759, "step": 368 }, { "epoch": 0.9559585492227979, "grad_norm": 28.112870222863155, "learning_rate": 2.3247322826615276e-05, "loss": 1.1481, "step": 369 }, { "epoch": 0.9585492227979274, "grad_norm": 35.08461098450067, "learning_rate": 2.316259630700787e-05, "loss": 1.0953, "step": 370 }, { "epoch": 0.961139896373057, "grad_norm": 37.80899503618479, "learning_rate": 2.307781153408124e-05, "loss": 1.1224, "step": 371 }, { "epoch": 0.9637305699481865, "grad_norm": 31.644978122007387, "learning_rate": 2.2992970069525202e-05, "loss": 1.1608, "step": 372 }, { "epoch": 0.966321243523316, "grad_norm": 23.51029318210938, "learning_rate": 2.29080734760738e-05, "loss": 1.0914, "step": 373 }, { "epoch": 0.9689119170984456, "grad_norm": 28.97240481418573, "learning_rate": 2.2823123317476522e-05, "loss": 1.1117, "step": 374 }, { "epoch": 0.9715025906735751, "grad_norm": 36.613893678320395, "learning_rate": 2.273812115846951e-05, "loss": 1.1118, "step": 375 }, { "epoch": 0.9740932642487047, "grad_norm": 26.402979304578093, "learning_rate": 2.2653068564746692e-05, "loss": 1.13, "step": 376 }, { "epoch": 0.9766839378238342, "grad_norm": 114.3000444613392, "learning_rate": 2.2567967102931025e-05, "loss": 1.1539, "step": 377 }, { "epoch": 0.9792746113989638, "grad_norm": 26.861359932396834, "learning_rate": 2.2482818340545534e-05, "loss": 1.0566, "step": 378 }, { "epoch": 0.9818652849740933, "grad_norm": 32.75509374223994, "learning_rate": 2.2397623845984548e-05, "loss": 1.1746, "step": 379 }, { "epoch": 0.9844559585492227, "grad_norm": 34.11964206838379, "learning_rate": 2.2312385188484718e-05, "loss": 1.0834, "step": 380 }, { "epoch": 0.9870466321243523, "grad_norm": 38.019564122226434, "learning_rate": 2.2227103938096176e-05, "loss": 1.1074, "step": 381 }, { "epoch": 0.9896373056994818, "grad_norm": 39.5073811375391, "learning_rate": 2.2141781665653584e-05, "loss": 1.1082, "step": 382 }, { "epoch": 0.9922279792746114, "grad_norm": 298.4258332795163, "learning_rate": 2.205641994274721e-05, "loss": 1.125, "step": 383 }, { "epoch": 0.9948186528497409, "grad_norm": 36.444415670935506, "learning_rate": 2.1971020341693973e-05, "loss": 1.0935, "step": 384 }, { "epoch": 0.9974093264248705, "grad_norm": 28.96533429210575, "learning_rate": 2.188558443550849e-05, "loss": 1.0957, "step": 385 }, { "epoch": 1.0, "grad_norm": 66.41241684127401, "learning_rate": 2.180011379787411e-05, "loss": 1.1335, "step": 386 }, { "epoch": 1.0025906735751295, "grad_norm": 28.75549619538953, "learning_rate": 2.1714610003113887e-05, "loss": 1.1316, "step": 387 }, { "epoch": 1.005181347150259, "grad_norm": 26.911837500852275, "learning_rate": 2.1629074626161647e-05, "loss": 1.1026, "step": 388 }, { "epoch": 1.005181347150259, "eval_loss": 1.0908173322677612, "eval_runtime": 37.7642, "eval_samples_per_second": 19.701, "eval_steps_per_second": 1.245, "step": 388 }, { "epoch": 1.0077720207253886, "grad_norm": 34.28722746775385, "learning_rate": 2.1543509242532932e-05, "loss": 1.1104, "step": 389 }, { "epoch": 1.0103626943005182, "grad_norm": 37.97709310694863, "learning_rate": 2.145791542829597e-05, "loss": 1.0663, "step": 390 }, { "epoch": 1.0129533678756477, "grad_norm": 39.379668162327384, "learning_rate": 2.1372294760042686e-05, "loss": 1.1405, "step": 391 }, { "epoch": 1.0155440414507773, "grad_norm": 27.136201219298698, "learning_rate": 2.1286648814859636e-05, "loss": 1.0963, "step": 392 }, { "epoch": 1.0181347150259068, "grad_norm": 39.34261641469313, "learning_rate": 2.120097917029897e-05, "loss": 1.1276, "step": 393 }, { "epoch": 1.0207253886010363, "grad_norm": 46.77583801285328, "learning_rate": 2.1115287404349357e-05, "loss": 1.1171, "step": 394 }, { "epoch": 1.0233160621761659, "grad_norm": 55.10335066695868, "learning_rate": 2.1029575095406933e-05, "loss": 1.0831, "step": 395 }, { "epoch": 1.0259067357512954, "grad_norm": 76.88533851789373, "learning_rate": 2.0943843822246234e-05, "loss": 1.0925, "step": 396 }, { "epoch": 1.028497409326425, "grad_norm": 29.604569209708462, "learning_rate": 2.0858095163991094e-05, "loss": 1.1259, "step": 397 }, { "epoch": 1.0310880829015545, "grad_norm": 37.71348366628868, "learning_rate": 2.077233070008557e-05, "loss": 1.0792, "step": 398 }, { "epoch": 1.0336787564766838, "grad_norm": 26.866133194031644, "learning_rate": 2.0686552010264872e-05, "loss": 1.1649, "step": 399 }, { "epoch": 1.0362694300518134, "grad_norm": 35.739274800620635, "learning_rate": 2.060076067452622e-05, "loss": 1.0837, "step": 400 }, { "epoch": 1.038860103626943, "grad_norm": 24.479129391259896, "learning_rate": 2.0514958273099778e-05, "loss": 1.073, "step": 401 }, { "epoch": 1.0414507772020725, "grad_norm": 50.49963650108008, "learning_rate": 2.042914638641952e-05, "loss": 1.0912, "step": 402 }, { "epoch": 1.044041450777202, "grad_norm": 35.6875451072032, "learning_rate": 2.0343326595094154e-05, "loss": 1.0936, "step": 403 }, { "epoch": 1.0466321243523315, "grad_norm": 30.212298193414487, "learning_rate": 2.0257500479877965e-05, "loss": 1.089, "step": 404 }, { "epoch": 1.049222797927461, "grad_norm": 28.65828720015124, "learning_rate": 2.0171669621641743e-05, "loss": 1.1727, "step": 405 }, { "epoch": 1.0518134715025906, "grad_norm": 39.2199058392425, "learning_rate": 2.0085835601343627e-05, "loss": 1.1493, "step": 406 }, { "epoch": 1.0544041450777202, "grad_norm": 110.01204177059546, "learning_rate": 2e-05, "loss": 1.1245, "step": 407 }, { "epoch": 1.0569948186528497, "grad_norm": 43.427381349600374, "learning_rate": 1.9914164398656383e-05, "loss": 1.1183, "step": 408 }, { "epoch": 1.0595854922279793, "grad_norm": 64.78768909817894, "learning_rate": 1.9828330378358264e-05, "loss": 1.1528, "step": 409 }, { "epoch": 1.0621761658031088, "grad_norm": 26.50257915912425, "learning_rate": 1.974249952012204e-05, "loss": 1.1568, "step": 410 }, { "epoch": 1.0647668393782384, "grad_norm": 27.63159204178893, "learning_rate": 1.9656673404905852e-05, "loss": 1.1071, "step": 411 }, { "epoch": 1.067357512953368, "grad_norm": 27.0795355533723, "learning_rate": 1.957085361358049e-05, "loss": 1.0809, "step": 412 }, { "epoch": 1.0699481865284974, "grad_norm": 41.84795332660821, "learning_rate": 1.9485041726900232e-05, "loss": 1.0744, "step": 413 }, { "epoch": 1.072538860103627, "grad_norm": 143.2109134427192, "learning_rate": 1.939923932547379e-05, "loss": 1.0905, "step": 414 }, { "epoch": 1.0751295336787565, "grad_norm": 89.55384065946154, "learning_rate": 1.931344798973513e-05, "loss": 1.1012, "step": 415 }, { "epoch": 1.077720207253886, "grad_norm": 31.072074793068015, "learning_rate": 1.922766929991443e-05, "loss": 1.1141, "step": 416 }, { "epoch": 1.0803108808290156, "grad_norm": 29.82683189045969, "learning_rate": 1.914190483600891e-05, "loss": 1.0842, "step": 417 }, { "epoch": 1.0829015544041452, "grad_norm": 30.09708662586305, "learning_rate": 1.9056156177753776e-05, "loss": 1.1088, "step": 418 }, { "epoch": 1.0854922279792747, "grad_norm": 27.637437518920503, "learning_rate": 1.897042490459307e-05, "loss": 1.058, "step": 419 }, { "epoch": 1.0880829015544042, "grad_norm": 69.34285700381683, "learning_rate": 1.8884712595650653e-05, "loss": 1.0314, "step": 420 }, { "epoch": 1.0906735751295338, "grad_norm": 25.644927284592956, "learning_rate": 1.8799020829701036e-05, "loss": 1.0916, "step": 421 }, { "epoch": 1.093264248704663, "grad_norm": 30.3898986852319, "learning_rate": 1.871335118514037e-05, "loss": 1.0797, "step": 422 }, { "epoch": 1.0958549222797926, "grad_norm": 22.271334693423444, "learning_rate": 1.862770523995732e-05, "loss": 1.1134, "step": 423 }, { "epoch": 1.0984455958549222, "grad_norm": 35.85874616678876, "learning_rate": 1.854208457170404e-05, "loss": 1.0927, "step": 424 }, { "epoch": 1.1010362694300517, "grad_norm": 43.06832041948097, "learning_rate": 1.8456490757467075e-05, "loss": 1.093, "step": 425 }, { "epoch": 1.1036269430051813, "grad_norm": 37.83777637993467, "learning_rate": 1.8370925373838356e-05, "loss": 1.1268, "step": 426 }, { "epoch": 1.1062176165803108, "grad_norm": 23.798059023605177, "learning_rate": 1.8285389996886113e-05, "loss": 1.0989, "step": 427 }, { "epoch": 1.1088082901554404, "grad_norm": 25.443104465500795, "learning_rate": 1.8199886202125897e-05, "loss": 1.0581, "step": 428 }, { "epoch": 1.11139896373057, "grad_norm": 23.76241444847441, "learning_rate": 1.8114415564491513e-05, "loss": 1.0908, "step": 429 }, { "epoch": 1.1139896373056994, "grad_norm": 26.5600693044426, "learning_rate": 1.8028979658306033e-05, "loss": 1.1321, "step": 430 }, { "epoch": 1.116580310880829, "grad_norm": 44.854375199828986, "learning_rate": 1.794358005725279e-05, "loss": 1.0762, "step": 431 }, { "epoch": 1.1191709844559585, "grad_norm": 28.05797777410846, "learning_rate": 1.785821833434642e-05, "loss": 1.0698, "step": 432 }, { "epoch": 1.121761658031088, "grad_norm": 26.488479630212364, "learning_rate": 1.7772896061903824e-05, "loss": 1.1223, "step": 433 }, { "epoch": 1.1243523316062176, "grad_norm": 32.77084542157883, "learning_rate": 1.768761481151529e-05, "loss": 1.0984, "step": 434 }, { "epoch": 1.1269430051813472, "grad_norm": 39.13198413130026, "learning_rate": 1.7602376154015456e-05, "loss": 1.1551, "step": 435 }, { "epoch": 1.1295336787564767, "grad_norm": 23.878966995283953, "learning_rate": 1.751718165945447e-05, "loss": 1.1133, "step": 436 }, { "epoch": 1.1321243523316062, "grad_norm": 33.90472985566232, "learning_rate": 1.743203289706898e-05, "loss": 1.1219, "step": 437 }, { "epoch": 1.1347150259067358, "grad_norm": 23.340369938533712, "learning_rate": 1.734693143525331e-05, "loss": 1.1244, "step": 438 }, { "epoch": 1.1373056994818653, "grad_norm": 105.6885206147852, "learning_rate": 1.7261878841530494e-05, "loss": 1.0788, "step": 439 }, { "epoch": 1.1398963730569949, "grad_norm": 28.453526076458317, "learning_rate": 1.717687668252348e-05, "loss": 1.1576, "step": 440 }, { "epoch": 1.1424870466321244, "grad_norm": 36.1473991485961, "learning_rate": 1.7091926523926205e-05, "loss": 1.0859, "step": 441 }, { "epoch": 1.145077720207254, "grad_norm": 27.043461146902448, "learning_rate": 1.7007029930474804e-05, "loss": 1.1072, "step": 442 }, { "epoch": 1.1476683937823835, "grad_norm": 28.066170619981435, "learning_rate": 1.6922188465918763e-05, "loss": 1.1279, "step": 443 }, { "epoch": 1.150259067357513, "grad_norm": 38.62445822837212, "learning_rate": 1.6837403692992136e-05, "loss": 1.1275, "step": 444 }, { "epoch": 1.1528497409326426, "grad_norm": 28.077258963587767, "learning_rate": 1.6752677173384734e-05, "loss": 1.1004, "step": 445 }, { "epoch": 1.1554404145077721, "grad_norm": 42.1405744301338, "learning_rate": 1.6668010467713363e-05, "loss": 1.1141, "step": 446 }, { "epoch": 1.1580310880829017, "grad_norm": 26.827291684301034, "learning_rate": 1.658340513549312e-05, "loss": 1.1216, "step": 447 }, { "epoch": 1.160621761658031, "grad_norm": 30.863489441619983, "learning_rate": 1.649886273510861e-05, "loss": 1.1898, "step": 448 }, { "epoch": 1.1632124352331605, "grad_norm": 27.73579733476068, "learning_rate": 1.641438482378529e-05, "loss": 1.0971, "step": 449 }, { "epoch": 1.16580310880829, "grad_norm": 32.84347174567353, "learning_rate": 1.6329972957560736e-05, "loss": 1.0579, "step": 450 }, { "epoch": 1.1683937823834196, "grad_norm": 30.06456192962641, "learning_rate": 1.6245628691256032e-05, "loss": 1.1057, "step": 451 }, { "epoch": 1.1709844559585492, "grad_norm": 36.554506394377846, "learning_rate": 1.616135357844709e-05, "loss": 1.1008, "step": 452 }, { "epoch": 1.1735751295336787, "grad_norm": 27.358643056184114, "learning_rate": 1.6077149171436063e-05, "loss": 1.101, "step": 453 }, { "epoch": 1.1761658031088082, "grad_norm": 111.13373813893604, "learning_rate": 1.599301702122274e-05, "loss": 1.0688, "step": 454 }, { "epoch": 1.1787564766839378, "grad_norm": 33.94168250727336, "learning_rate": 1.590895867747599e-05, "loss": 1.0721, "step": 455 }, { "epoch": 1.1813471502590673, "grad_norm": 53.93978395349692, "learning_rate": 1.582497568850517e-05, "loss": 1.0584, "step": 456 }, { "epoch": 1.1839378238341969, "grad_norm": 29.19245794937285, "learning_rate": 1.574106960123169e-05, "loss": 1.067, "step": 457 }, { "epoch": 1.1865284974093264, "grad_norm": 28.06897801999048, "learning_rate": 1.5657241961160434e-05, "loss": 1.0899, "step": 458 }, { "epoch": 1.189119170984456, "grad_norm": 52.31256652964293, "learning_rate": 1.557349431235135e-05, "loss": 1.0925, "step": 459 }, { "epoch": 1.1917098445595855, "grad_norm": 65.39771110845307, "learning_rate": 1.5489828197390988e-05, "loss": 1.1448, "step": 460 }, { "epoch": 1.194300518134715, "grad_norm": 27.062780348557254, "learning_rate": 1.5406245157364093e-05, "loss": 1.0871, "step": 461 }, { "epoch": 1.1968911917098446, "grad_norm": 41.667025056250424, "learning_rate": 1.5322746731825195e-05, "loss": 1.048, "step": 462 }, { "epoch": 1.1994818652849741, "grad_norm": 24.936669803360665, "learning_rate": 1.5239334458770291e-05, "loss": 1.1243, "step": 463 }, { "epoch": 1.2020725388601037, "grad_norm": 26.65392149600558, "learning_rate": 1.5156009874608484e-05, "loss": 1.0919, "step": 464 }, { "epoch": 1.2046632124352332, "grad_norm": 48.57730651937978, "learning_rate": 1.5072774514133708e-05, "loss": 1.1259, "step": 465 }, { "epoch": 1.2072538860103628, "grad_norm": 31.34891257114439, "learning_rate": 1.4989629910496424e-05, "loss": 1.0733, "step": 466 }, { "epoch": 1.2098445595854923, "grad_norm": 24.541559850584985, "learning_rate": 1.4906577595175428e-05, "loss": 1.1166, "step": 467 }, { "epoch": 1.2124352331606219, "grad_norm": 20.4345832961354, "learning_rate": 1.4823619097949584e-05, "loss": 1.0916, "step": 468 }, { "epoch": 1.2150259067357512, "grad_norm": 28.860712194727487, "learning_rate": 1.4740755946869708e-05, "loss": 1.1043, "step": 469 }, { "epoch": 1.2176165803108807, "grad_norm": 25.71820242946282, "learning_rate": 1.4657989668230363e-05, "loss": 1.0949, "step": 470 }, { "epoch": 1.2202072538860103, "grad_norm": 51.16994773097077, "learning_rate": 1.4575321786541801e-05, "loss": 1.141, "step": 471 }, { "epoch": 1.2227979274611398, "grad_norm": 32.70442309640389, "learning_rate": 1.4492753824501833e-05, "loss": 1.1127, "step": 472 }, { "epoch": 1.2253886010362693, "grad_norm": 21.913285172411495, "learning_rate": 1.4410287302967813e-05, "loss": 1.084, "step": 473 }, { "epoch": 1.2279792746113989, "grad_norm": 34.45727214001296, "learning_rate": 1.4327923740928613e-05, "loss": 1.0836, "step": 474 }, { "epoch": 1.2305699481865284, "grad_norm": 26.768013926034776, "learning_rate": 1.4245664655476663e-05, "loss": 1.1264, "step": 475 }, { "epoch": 1.233160621761658, "grad_norm": 28.401965255935572, "learning_rate": 1.4163511561779956e-05, "loss": 1.0805, "step": 476 }, { "epoch": 1.2357512953367875, "grad_norm": 29.19935757288793, "learning_rate": 1.4081465973054216e-05, "loss": 1.0825, "step": 477 }, { "epoch": 1.238341968911917, "grad_norm": 24.55918541541201, "learning_rate": 1.3999529400534941e-05, "loss": 1.1164, "step": 478 }, { "epoch": 1.2409326424870466, "grad_norm": 25.35635406268312, "learning_rate": 1.3917703353449646e-05, "loss": 1.1334, "step": 479 }, { "epoch": 1.2435233160621761, "grad_norm": 45.453901005004184, "learning_rate": 1.3835989338989996e-05, "loss": 1.1387, "step": 480 }, { "epoch": 1.2461139896373057, "grad_norm": 21.67852694202104, "learning_rate": 1.375438886228411e-05, "loss": 1.0846, "step": 481 }, { "epoch": 1.2487046632124352, "grad_norm": 171.2474074894732, "learning_rate": 1.3672903426368773e-05, "loss": 1.1388, "step": 482 }, { "epoch": 1.2512953367875648, "grad_norm": 43.18223835070906, "learning_rate": 1.3591534532161781e-05, "loss": 1.1483, "step": 483 }, { "epoch": 1.2538860103626943, "grad_norm": 29.447332565856644, "learning_rate": 1.3510283678434317e-05, "loss": 1.07, "step": 484 }, { "epoch": 1.2564766839378239, "grad_norm": 28.600251051615228, "learning_rate": 1.3429152361783307e-05, "loss": 1.0798, "step": 485 }, { "epoch": 1.2564766839378239, "eval_loss": 1.085669755935669, "eval_runtime": 38.1134, "eval_samples_per_second": 19.521, "eval_steps_per_second": 1.233, "step": 485 }, { "epoch": 1.2590673575129534, "grad_norm": 47.124643074410464, "learning_rate": 1.3348142076603876e-05, "loss": 1.0875, "step": 486 }, { "epoch": 1.261658031088083, "grad_norm": 42.06019726307143, "learning_rate": 1.3267254315061797e-05, "loss": 1.1429, "step": 487 }, { "epoch": 1.2642487046632125, "grad_norm": 18.950734630756962, "learning_rate": 1.318649056706605e-05, "loss": 1.0747, "step": 488 }, { "epoch": 1.266839378238342, "grad_norm": 31.903949502516806, "learning_rate": 1.3105852320241326e-05, "loss": 1.1041, "step": 489 }, { "epoch": 1.2694300518134716, "grad_norm": 22.957473008085927, "learning_rate": 1.3025341059900675e-05, "loss": 1.1046, "step": 490 }, { "epoch": 1.2720207253886011, "grad_norm": 22.325983256563678, "learning_rate": 1.2944958269018103e-05, "loss": 1.0643, "step": 491 }, { "epoch": 1.2746113989637307, "grad_norm": 29.689383331974955, "learning_rate": 1.2864705428201307e-05, "loss": 1.0949, "step": 492 }, { "epoch": 1.2772020725388602, "grad_norm": 25.338298442945575, "learning_rate": 1.2784584015664337e-05, "loss": 1.0725, "step": 493 }, { "epoch": 1.2797927461139897, "grad_norm": 31.591732488078588, "learning_rate": 1.2704595507200435e-05, "loss": 1.0347, "step": 494 }, { "epoch": 1.2823834196891193, "grad_norm": 42.96243570696118, "learning_rate": 1.26247413761548e-05, "loss": 1.1196, "step": 495 }, { "epoch": 1.2849740932642488, "grad_norm": 26.559546676266024, "learning_rate": 1.254502309339749e-05, "loss": 1.0187, "step": 496 }, { "epoch": 1.2875647668393784, "grad_norm": 27.58444017584016, "learning_rate": 1.2465442127296297e-05, "loss": 1.0985, "step": 497 }, { "epoch": 1.2901554404145077, "grad_norm": 36.53028730423797, "learning_rate": 1.2385999943689732e-05, "loss": 1.068, "step": 498 }, { "epoch": 1.2927461139896372, "grad_norm": 38.94837307599113, "learning_rate": 1.2306698005859975e-05, "loss": 1.0736, "step": 499 }, { "epoch": 1.2953367875647668, "grad_norm": 36.67208266195125, "learning_rate": 1.2227537774505996e-05, "loss": 1.119, "step": 500 }, { "epoch": 1.2979274611398963, "grad_norm": 31.086410648635283, "learning_rate": 1.2148520707716567e-05, "loss": 1.1094, "step": 501 }, { "epoch": 1.3005181347150259, "grad_norm": 27.96977481605826, "learning_rate": 1.2069648260943473e-05, "loss": 1.1345, "step": 502 }, { "epoch": 1.3031088082901554, "grad_norm": 22.89450502840197, "learning_rate": 1.1990921886974669e-05, "loss": 1.12, "step": 503 }, { "epoch": 1.305699481865285, "grad_norm": 18.54206032224653, "learning_rate": 1.1912343035907535e-05, "loss": 1.0929, "step": 504 }, { "epoch": 1.3082901554404145, "grad_norm": 38.9386007237313, "learning_rate": 1.1833913155122132e-05, "loss": 1.1381, "step": 505 }, { "epoch": 1.310880829015544, "grad_norm": 37.05899458809635, "learning_rate": 1.1755633689254609e-05, "loss": 1.0535, "step": 506 }, { "epoch": 1.3134715025906736, "grad_norm": 27.716372794195156, "learning_rate": 1.1677506080170512e-05, "loss": 1.1342, "step": 507 }, { "epoch": 1.3160621761658031, "grad_norm": 40.42306246079416, "learning_rate": 1.1599531766938306e-05, "loss": 1.0887, "step": 508 }, { "epoch": 1.3186528497409327, "grad_norm": 98.56681767405578, "learning_rate": 1.1521712185802789e-05, "loss": 1.0954, "step": 509 }, { "epoch": 1.3212435233160622, "grad_norm": 34.42816933350743, "learning_rate": 1.1444048770158718e-05, "loss": 1.0512, "step": 510 }, { "epoch": 1.3238341968911918, "grad_norm": 52.457523653614096, "learning_rate": 1.136654295052433e-05, "loss": 1.1599, "step": 511 }, { "epoch": 1.3264248704663213, "grad_norm": 26.832339531661276, "learning_rate": 1.1289196154515048e-05, "loss": 1.0602, "step": 512 }, { "epoch": 1.3290155440414508, "grad_norm": 32.746047673769816, "learning_rate": 1.1212009806817163e-05, "loss": 1.1544, "step": 513 }, { "epoch": 1.3316062176165804, "grad_norm": 37.44483451702055, "learning_rate": 1.1134985329161608e-05, "loss": 1.1421, "step": 514 }, { "epoch": 1.33419689119171, "grad_norm": 28.625976525737606, "learning_rate": 1.1058124140297718e-05, "loss": 1.0858, "step": 515 }, { "epoch": 1.3367875647668392, "grad_norm": 38.64141195246213, "learning_rate": 1.0981427655967183e-05, "loss": 1.0983, "step": 516 }, { "epoch": 1.3393782383419688, "grad_norm": 29.989753893533425, "learning_rate": 1.0904897288877891e-05, "loss": 1.1269, "step": 517 }, { "epoch": 1.3419689119170983, "grad_norm": 48.63990665515511, "learning_rate": 1.0828534448677942e-05, "loss": 1.0844, "step": 518 }, { "epoch": 1.3445595854922279, "grad_norm": 25.477227318250847, "learning_rate": 1.0752340541929711e-05, "loss": 1.0742, "step": 519 }, { "epoch": 1.3471502590673574, "grad_norm": 26.363588814537763, "learning_rate": 1.0676316972083867e-05, "loss": 1.0533, "step": 520 }, { "epoch": 1.349740932642487, "grad_norm": 34.59968737708606, "learning_rate": 1.060046513945361e-05, "loss": 1.0983, "step": 521 }, { "epoch": 1.3523316062176165, "grad_norm": 52.51652561846762, "learning_rate": 1.0524786441188786e-05, "loss": 1.1319, "step": 522 }, { "epoch": 1.354922279792746, "grad_norm": 21.360221214301127, "learning_rate": 1.0449282271250239e-05, "loss": 1.0627, "step": 523 }, { "epoch": 1.3575129533678756, "grad_norm": 37.00053933682603, "learning_rate": 1.0373954020384073e-05, "loss": 1.096, "step": 524 }, { "epoch": 1.3601036269430051, "grad_norm": 39.212240822687484, "learning_rate": 1.029880307609608e-05, "loss": 1.0512, "step": 525 }, { "epoch": 1.3626943005181347, "grad_norm": 24.89842378385804, "learning_rate": 1.0223830822626124e-05, "loss": 1.0538, "step": 526 }, { "epoch": 1.3652849740932642, "grad_norm": 29.14416894424653, "learning_rate": 1.0149038640922715e-05, "loss": 1.1538, "step": 527 }, { "epoch": 1.3678756476683938, "grad_norm": 31.688722122648855, "learning_rate": 1.0074427908617515e-05, "loss": 1.171, "step": 528 }, { "epoch": 1.3704663212435233, "grad_norm": 41.918909004413734, "learning_rate": 1.0000000000000006e-05, "loss": 1.1203, "step": 529 }, { "epoch": 1.3730569948186528, "grad_norm": 26.70963454516576, "learning_rate": 9.92575628599213e-06, "loss": 1.0855, "step": 530 }, { "epoch": 1.3756476683937824, "grad_norm": 24.819351173466824, "learning_rate": 9.851698134123095e-06, "loss": 1.0972, "step": 531 }, { "epoch": 1.378238341968912, "grad_norm": 22.100465399566815, "learning_rate": 9.777826908504126e-06, "loss": 1.08, "step": 532 }, { "epoch": 1.3808290155440415, "grad_norm": 29.31574709406259, "learning_rate": 9.704143969803392e-06, "loss": 1.0835, "step": 533 }, { "epoch": 1.383419689119171, "grad_norm": 25.551326748473052, "learning_rate": 9.630650675220892e-06, "loss": 1.0396, "step": 534 }, { "epoch": 1.3860103626943006, "grad_norm": 59.07595627892596, "learning_rate": 9.557348378463503e-06, "loss": 1.0814, "step": 535 }, { "epoch": 1.38860103626943, "grad_norm": 24.96501978981908, "learning_rate": 9.484238429720018e-06, "loss": 1.0187, "step": 536 }, { "epoch": 1.3911917098445596, "grad_norm": 42.530604702279234, "learning_rate": 9.411322175636298e-06, "loss": 1.074, "step": 537 }, { "epoch": 1.3937823834196892, "grad_norm": 34.91129065632851, "learning_rate": 9.338600959290414e-06, "loss": 1.0878, "step": 538 }, { "epoch": 1.3963730569948187, "grad_norm": 32.07525956876426, "learning_rate": 9.266076120167992e-06, "loss": 1.0962, "step": 539 }, { "epoch": 1.3989637305699483, "grad_norm": 40.18387743296675, "learning_rate": 9.193748994137462e-06, "loss": 1.1033, "step": 540 }, { "epoch": 1.4015544041450778, "grad_norm": 66.68031460980451, "learning_rate": 9.121620913425508e-06, "loss": 1.1466, "step": 541 }, { "epoch": 1.4041450777202074, "grad_norm": 34.07506059584738, "learning_rate": 9.04969320659249e-06, "loss": 1.1184, "step": 542 }, { "epoch": 1.406735751295337, "grad_norm": 17.130845779169075, "learning_rate": 8.977967198508001e-06, "loss": 1.0803, "step": 543 }, { "epoch": 1.4093264248704664, "grad_norm": 22.4457025132615, "learning_rate": 8.906444210326441e-06, "loss": 1.0745, "step": 544 }, { "epoch": 1.411917098445596, "grad_norm": 73.43971735356851, "learning_rate": 8.83512555946271e-06, "loss": 1.0717, "step": 545 }, { "epoch": 1.4145077720207253, "grad_norm": 38.16321297719761, "learning_rate": 8.764012559567899e-06, "loss": 1.1371, "step": 546 }, { "epoch": 1.4170984455958548, "grad_norm": 56.14718024907725, "learning_rate": 8.693106520505147e-06, "loss": 1.0185, "step": 547 }, { "epoch": 1.4196891191709844, "grad_norm": 53.3812598790062, "learning_rate": 8.622408748325461e-06, "loss": 1.0859, "step": 548 }, { "epoch": 1.422279792746114, "grad_norm": 39.69041631433326, "learning_rate": 8.551920545243704e-06, "loss": 1.1146, "step": 549 }, { "epoch": 1.4248704663212435, "grad_norm": 24.099260758984773, "learning_rate": 8.481643209614576e-06, "loss": 1.0968, "step": 550 }, { "epoch": 1.427461139896373, "grad_norm": 22.623850373369237, "learning_rate": 8.411578035908728e-06, "loss": 1.0642, "step": 551 }, { "epoch": 1.4300518134715026, "grad_norm": 25.343746374404027, "learning_rate": 8.341726314688875e-06, "loss": 1.0815, "step": 552 }, { "epoch": 1.432642487046632, "grad_norm": 35.82641011588973, "learning_rate": 8.272089332586089e-06, "loss": 1.1012, "step": 553 }, { "epoch": 1.4352331606217616, "grad_norm": 24.81161215784662, "learning_rate": 8.20266837227603e-06, "loss": 1.1086, "step": 554 }, { "epoch": 1.4378238341968912, "grad_norm": 54.18243481591251, "learning_rate": 8.133464712455364e-06, "loss": 1.0704, "step": 555 }, { "epoch": 1.4404145077720207, "grad_norm": 23.602598217141395, "learning_rate": 8.064479627818213e-06, "loss": 1.1519, "step": 556 }, { "epoch": 1.4430051813471503, "grad_norm": 31.124404868409982, "learning_rate": 7.995714389032638e-06, "loss": 1.0705, "step": 557 }, { "epoch": 1.4455958549222798, "grad_norm": 24.14171016995626, "learning_rate": 7.927170262717284e-06, "loss": 1.1083, "step": 558 }, { "epoch": 1.4481865284974094, "grad_norm": 47.987203109917175, "learning_rate": 7.858848511417998e-06, "loss": 1.0836, "step": 559 }, { "epoch": 1.450777202072539, "grad_norm": 25.871447098066056, "learning_rate": 7.790750393584616e-06, "loss": 1.0787, "step": 560 }, { "epoch": 1.4533678756476685, "grad_norm": 23.820249113937482, "learning_rate": 7.72287716354776e-06, "loss": 1.1165, "step": 561 }, { "epoch": 1.455958549222798, "grad_norm": 48.04131308947624, "learning_rate": 7.65523007149575e-06, "loss": 1.0819, "step": 562 }, { "epoch": 1.4585492227979275, "grad_norm": 29.273494083692352, "learning_rate": 7.587810363451544e-06, "loss": 1.0302, "step": 563 }, { "epoch": 1.4611398963730569, "grad_norm": 120.01571222366722, "learning_rate": 7.5206192812498345e-06, "loss": 1.1291, "step": 564 }, { "epoch": 1.4637305699481864, "grad_norm": 33.16947662083338, "learning_rate": 7.4536580625141244e-06, "loss": 1.0842, "step": 565 }, { "epoch": 1.466321243523316, "grad_norm": 29.979556378166713, "learning_rate": 7.386927940633981e-06, "loss": 1.1116, "step": 566 }, { "epoch": 1.4689119170984455, "grad_norm": 27.172344859281896, "learning_rate": 7.32043014474227e-06, "loss": 1.0676, "step": 567 }, { "epoch": 1.471502590673575, "grad_norm": 30.208548637757318, "learning_rate": 7.254165899692554e-06, "loss": 1.1104, "step": 568 }, { "epoch": 1.4740932642487046, "grad_norm": 19.385421184583773, "learning_rate": 7.188136426036498e-06, "loss": 1.0085, "step": 569 }, { "epoch": 1.4766839378238341, "grad_norm": 30.350787749309685, "learning_rate": 7.12234294000143e-06, "loss": 1.0584, "step": 570 }, { "epoch": 1.4792746113989637, "grad_norm": 31.520305600900198, "learning_rate": 7.056786653467882e-06, "loss": 1.0831, "step": 571 }, { "epoch": 1.4818652849740932, "grad_norm": 46.13006972574487, "learning_rate": 6.991468773947321e-06, "loss": 1.1761, "step": 572 }, { "epoch": 1.4844559585492227, "grad_norm": 26.72340868362835, "learning_rate": 6.926390504559879e-06, "loss": 1.0605, "step": 573 }, { "epoch": 1.4870466321243523, "grad_norm": 25.992965411102556, "learning_rate": 6.861553044012206e-06, "loss": 1.1015, "step": 574 }, { "epoch": 1.4896373056994818, "grad_norm": 38.60187420279626, "learning_rate": 6.796957586575364e-06, "loss": 1.1232, "step": 575 }, { "epoch": 1.4922279792746114, "grad_norm": 21.7618591565717, "learning_rate": 6.732605322062869e-06, "loss": 1.1196, "step": 576 }, { "epoch": 1.494818652849741, "grad_norm": 28.233093007170996, "learning_rate": 6.668497435808736e-06, "loss": 1.1451, "step": 577 }, { "epoch": 1.4974093264248705, "grad_norm": 28.061514297823816, "learning_rate": 6.604635108645683e-06, "loss": 1.0832, "step": 578 }, { "epoch": 1.5, "grad_norm": 35.34503147975386, "learning_rate": 6.5410195168833425e-06, "loss": 1.118, "step": 579 }, { "epoch": 1.5025906735751295, "grad_norm": 31.940516004139344, "learning_rate": 6.477651832286633e-06, "loss": 1.1052, "step": 580 }, { "epoch": 1.505181347150259, "grad_norm": 25.647504733675635, "learning_rate": 6.414533222054138e-06, "loss": 1.1055, "step": 581 }, { "epoch": 1.5077720207253886, "grad_norm": 68.16422579698298, "learning_rate": 6.3516648487966456e-06, "loss": 1.0784, "step": 582 }, { "epoch": 1.5077720207253886, "eval_loss": 1.0824710130691528, "eval_runtime": 37.4923, "eval_samples_per_second": 19.844, "eval_steps_per_second": 1.254, "step": 582 }, { "epoch": 1.5103626943005182, "grad_norm": 46.95363643283118, "learning_rate": 6.289047870515692e-06, "loss": 1.1271, "step": 583 }, { "epoch": 1.5129533678756477, "grad_norm": 37.80701104174098, "learning_rate": 6.226683440582268e-06, "loss": 1.126, "step": 584 }, { "epoch": 1.5155440414507773, "grad_norm": 32.03225059321182, "learning_rate": 6.164572707715564e-06, "loss": 1.0152, "step": 585 }, { "epoch": 1.5181347150259068, "grad_norm": 31.21438627768379, "learning_rate": 6.102716815961787e-06, "loss": 1.1595, "step": 586 }, { "epoch": 1.5207253886010363, "grad_norm": 23.55515793723355, "learning_rate": 6.041116904673125e-06, "loss": 1.0943, "step": 587 }, { "epoch": 1.5233160621761659, "grad_norm": 26.92022994571063, "learning_rate": 5.979774108486751e-06, "loss": 1.0554, "step": 588 }, { "epoch": 1.5259067357512954, "grad_norm": 24.957086694295352, "learning_rate": 5.918689557303885e-06, "loss": 1.0711, "step": 589 }, { "epoch": 1.528497409326425, "grad_norm": 87.48440577770464, "learning_rate": 5.857864376269051e-06, "loss": 1.1679, "step": 590 }, { "epoch": 1.5310880829015545, "grad_norm": 21.756969247026838, "learning_rate": 5.7972996857492896e-06, "loss": 1.0716, "step": 591 }, { "epoch": 1.533678756476684, "grad_norm": 33.92695136944769, "learning_rate": 5.736996601313545e-06, "loss": 1.0376, "step": 592 }, { "epoch": 1.5362694300518136, "grad_norm": 32.738888590276794, "learning_rate": 5.676956233712139e-06, "loss": 1.0245, "step": 593 }, { "epoch": 1.5388601036269431, "grad_norm": 22.38597679049821, "learning_rate": 5.617179688856271e-06, "loss": 1.1103, "step": 594 }, { "epoch": 1.5414507772020727, "grad_norm": 30.168619654124416, "learning_rate": 5.557668067797677e-06, "loss": 1.2007, "step": 595 }, { "epoch": 1.5440414507772022, "grad_norm": 24.460334668593116, "learning_rate": 5.498422466708349e-06, "loss": 1.0842, "step": 596 }, { "epoch": 1.5466321243523318, "grad_norm": 25.877463433966412, "learning_rate": 5.439443976860306e-06, "loss": 1.0537, "step": 597 }, { "epoch": 1.549222797927461, "grad_norm": 27.67111694532404, "learning_rate": 5.38073368460555e-06, "loss": 1.0863, "step": 598 }, { "epoch": 1.5518134715025906, "grad_norm": 43.112045139256026, "learning_rate": 5.32229267135602e-06, "loss": 1.1168, "step": 599 }, { "epoch": 1.5544041450777202, "grad_norm": 31.60344278763487, "learning_rate": 5.2641220135636685e-06, "loss": 1.0939, "step": 600 }, { "epoch": 1.5569948186528497, "grad_norm": 37.795536334167195, "learning_rate": 5.206222782700667e-06, "loss": 1.1084, "step": 601 }, { "epoch": 1.5595854922279793, "grad_norm": 27.529824319458413, "learning_rate": 5.1485960452396266e-06, "loss": 1.0755, "step": 602 }, { "epoch": 1.5621761658031088, "grad_norm": 29.172376961452496, "learning_rate": 5.091242862634e-06, "loss": 1.0231, "step": 603 }, { "epoch": 1.5647668393782384, "grad_norm": 24.94560254083931, "learning_rate": 5.0341642912984844e-06, "loss": 1.0782, "step": 604 }, { "epoch": 1.567357512953368, "grad_norm": 31.79546143794924, "learning_rate": 4.977361382589607e-06, "loss": 1.1202, "step": 605 }, { "epoch": 1.5699481865284974, "grad_norm": 39.3795372477718, "learning_rate": 4.920835182786316e-06, "loss": 1.0349, "step": 606 }, { "epoch": 1.572538860103627, "grad_norm": 31.308429467189708, "learning_rate": 4.864586733070755e-06, "loss": 1.0582, "step": 607 }, { "epoch": 1.5751295336787565, "grad_norm": 32.82748366949945, "learning_rate": 4.808617069509034e-06, "loss": 1.1246, "step": 608 }, { "epoch": 1.577720207253886, "grad_norm": 24.281936328515055, "learning_rate": 4.752927223032196e-06, "loss": 1.0679, "step": 609 }, { "epoch": 1.5803108808290154, "grad_norm": 111.23884469313498, "learning_rate": 4.697518219417188e-06, "loss": 1.1319, "step": 610 }, { "epoch": 1.582901554404145, "grad_norm": 35.484299416160596, "learning_rate": 4.6423910792680005e-06, "loss": 1.1348, "step": 611 }, { "epoch": 1.5854922279792745, "grad_norm": 27.135342529418295, "learning_rate": 4.587546817996826e-06, "loss": 1.0948, "step": 612 }, { "epoch": 1.588082901554404, "grad_norm": 81.98158494527004, "learning_rate": 4.532986445805405e-06, "loss": 1.0864, "step": 613 }, { "epoch": 1.5906735751295336, "grad_norm": 61.490418707157346, "learning_rate": 4.478710967666371e-06, "loss": 1.0693, "step": 614 }, { "epoch": 1.593264248704663, "grad_norm": 25.633018846282962, "learning_rate": 4.424721383304791e-06, "loss": 1.1084, "step": 615 }, { "epoch": 1.5958549222797926, "grad_norm": 28.194280804517373, "learning_rate": 4.371018687179689e-06, "loss": 1.1722, "step": 616 }, { "epoch": 1.5984455958549222, "grad_norm": 27.8080566828581, "learning_rate": 4.317603868465794e-06, "loss": 1.1171, "step": 617 }, { "epoch": 1.6010362694300517, "grad_norm": 42.959036729178806, "learning_rate": 4.264477911035265e-06, "loss": 1.074, "step": 618 }, { "epoch": 1.6036269430051813, "grad_norm": 23.937218136554392, "learning_rate": 4.211641793439609e-06, "loss": 1.13, "step": 619 }, { "epoch": 1.6062176165803108, "grad_norm": 43.913677975121566, "learning_rate": 4.159096488891623e-06, "loss": 1.1671, "step": 620 }, { "epoch": 1.6088082901554404, "grad_norm": 48.107566289352114, "learning_rate": 4.106842965247497e-06, "loss": 1.1071, "step": 621 }, { "epoch": 1.61139896373057, "grad_norm": 28.25790913819402, "learning_rate": 4.054882184988971e-06, "loss": 1.0716, "step": 622 }, { "epoch": 1.6139896373056994, "grad_norm": 26.59960827233381, "learning_rate": 4.003215105205613e-06, "loss": 1.146, "step": 623 }, { "epoch": 1.616580310880829, "grad_norm": 22.79614250574067, "learning_rate": 3.951842677577171e-06, "loss": 1.0761, "step": 624 }, { "epoch": 1.6191709844559585, "grad_norm": 24.24036779343114, "learning_rate": 3.900765848356083e-06, "loss": 1.1037, "step": 625 }, { "epoch": 1.621761658031088, "grad_norm": 27.295669679621373, "learning_rate": 3.849985558349998e-06, "loss": 1.1015, "step": 626 }, { "epoch": 1.6243523316062176, "grad_norm": 54.413225233914176, "learning_rate": 3.799502742904497e-06, "loss": 1.0318, "step": 627 }, { "epoch": 1.6269430051813472, "grad_norm": 38.84848713400369, "learning_rate": 3.749318331885825e-06, "loss": 1.1147, "step": 628 }, { "epoch": 1.6295336787564767, "grad_norm": 23.912199342429506, "learning_rate": 3.699433249663775e-06, "loss": 1.1439, "step": 629 }, { "epoch": 1.6321243523316062, "grad_norm": 48.95526983090661, "learning_rate": 3.649848415094681e-06, "loss": 1.0229, "step": 630 }, { "epoch": 1.6347150259067358, "grad_norm": 32.099897123524585, "learning_rate": 3.60056474150446e-06, "loss": 1.0589, "step": 631 }, { "epoch": 1.6373056994818653, "grad_norm": 31.802660850585973, "learning_rate": 3.551583136671817e-06, "loss": 1.1137, "step": 632 }, { "epoch": 1.6398963730569949, "grad_norm": 34.2655686599537, "learning_rate": 3.5029045028115105e-06, "loss": 1.1318, "step": 633 }, { "epoch": 1.6424870466321244, "grad_norm": 191.48847051006786, "learning_rate": 3.4545297365577437e-06, "loss": 1.0921, "step": 634 }, { "epoch": 1.645077720207254, "grad_norm": 24.236450154622357, "learning_rate": 3.406459728947622e-06, "loss": 1.0851, "step": 635 }, { "epoch": 1.6476683937823835, "grad_norm": 38.819342476228876, "learning_rate": 3.358695365404785e-06, "loss": 1.0962, "step": 636 }, { "epoch": 1.650259067357513, "grad_norm": 31.53545103406636, "learning_rate": 3.3112375257230547e-06, "loss": 1.0994, "step": 637 }, { "epoch": 1.6528497409326426, "grad_norm": 71.55299438562814, "learning_rate": 3.2640870840502646e-06, "loss": 1.08, "step": 638 }, { "epoch": 1.6554404145077721, "grad_norm": 57.94234006640972, "learning_rate": 3.2172449088721235e-06, "loss": 1.0921, "step": 639 }, { "epoch": 1.6580310880829017, "grad_norm": 58.15229256885828, "learning_rate": 3.1707118629962607e-06, "loss": 1.0981, "step": 640 }, { "epoch": 1.6606217616580312, "grad_norm": 25.105795165561457, "learning_rate": 3.1244888035362875e-06, "loss": 1.101, "step": 641 }, { "epoch": 1.6632124352331608, "grad_norm": 33.15366058006866, "learning_rate": 3.0785765818960534e-06, "loss": 1.0517, "step": 642 }, { "epoch": 1.6658031088082903, "grad_norm": 35.79893709161297, "learning_rate": 3.0329760437539233e-06, "loss": 1.0886, "step": 643 }, { "epoch": 1.6683937823834198, "grad_norm": 49.59918009099835, "learning_rate": 2.9876880290472376e-06, "loss": 1.0756, "step": 644 }, { "epoch": 1.6709844559585494, "grad_norm": 21.485142494367135, "learning_rate": 2.942713371956809e-06, "loss": 1.1017, "step": 645 }, { "epoch": 1.6735751295336787, "grad_norm": 29.23169287520316, "learning_rate": 2.8980529008915793e-06, "loss": 1.1241, "step": 646 }, { "epoch": 1.6761658031088082, "grad_norm": 27.913868608886553, "learning_rate": 2.853707438473352e-06, "loss": 1.0841, "step": 647 }, { "epoch": 1.6787564766839378, "grad_norm": 18.438597602055644, "learning_rate": 2.8096778015216484e-06, "loss": 1.0891, "step": 648 }, { "epoch": 1.6813471502590673, "grad_norm": 54.0556941620233, "learning_rate": 2.7659648010386365e-06, "loss": 1.0589, "step": 649 }, { "epoch": 1.6839378238341969, "grad_norm": 108.10101848740734, "learning_rate": 2.7225692421942306e-06, "loss": 1.0766, "step": 650 }, { "epoch": 1.6865284974093264, "grad_norm": 106.58835736628185, "learning_rate": 2.679491924311226e-06, "loss": 1.1144, "step": 651 }, { "epoch": 1.689119170984456, "grad_norm": 31.53371570516213, "learning_rate": 2.6367336408506063e-06, "loss": 1.02, "step": 652 }, { "epoch": 1.6917098445595855, "grad_norm": 36.263088086669775, "learning_rate": 2.594295179396895e-06, "loss": 1.0679, "step": 653 }, { "epoch": 1.694300518134715, "grad_norm": 24.47507184337666, "learning_rate": 2.5521773216436875e-06, "loss": 1.1092, "step": 654 }, { "epoch": 1.6968911917098446, "grad_norm": 33.05899532106974, "learning_rate": 2.5103808433792075e-06, "loss": 1.053, "step": 655 }, { "epoch": 1.6994818652849741, "grad_norm": 29.132344102799873, "learning_rate": 2.468906514472065e-06, "loss": 1.0518, "step": 656 }, { "epoch": 1.7020725388601037, "grad_norm": 43.48960854254409, "learning_rate": 2.4277550988570362e-06, "loss": 1.0537, "step": 657 }, { "epoch": 1.704663212435233, "grad_norm": 28.13627467897817, "learning_rate": 2.3869273545210158e-06, "loss": 1.0558, "step": 658 }, { "epoch": 1.7072538860103625, "grad_norm": 33.18164212520423, "learning_rate": 2.3464240334890496e-06, "loss": 1.054, "step": 659 }, { "epoch": 1.709844559585492, "grad_norm": 41.884394437273144, "learning_rate": 2.3062458818104804e-06, "loss": 1.0871, "step": 660 }, { "epoch": 1.7124352331606216, "grad_norm": 27.119840736470916, "learning_rate": 2.266393639545197e-06, "loss": 1.0743, "step": 661 }, { "epoch": 1.7150259067357512, "grad_norm": 20.70474999023591, "learning_rate": 2.22686804075003e-06, "loss": 1.0718, "step": 662 }, { "epoch": 1.7176165803108807, "grad_norm": 21.469651089617198, "learning_rate": 2.187669813465192e-06, "loss": 1.0584, "step": 663 }, { "epoch": 1.7202072538860103, "grad_norm": 29.901704269591495, "learning_rate": 2.1487996797009103e-06, "loss": 1.1175, "step": 664 }, { "epoch": 1.7227979274611398, "grad_norm": 75.06310533674302, "learning_rate": 2.110258355424093e-06, "loss": 1.124, "step": 665 }, { "epoch": 1.7253886010362693, "grad_norm": 34.13349153293387, "learning_rate": 2.0720465505451524e-06, "loss": 1.1395, "step": 666 }, { "epoch": 1.7279792746113989, "grad_norm": 26.83922350447555, "learning_rate": 2.0341649689049458e-06, "loss": 1.0449, "step": 667 }, { "epoch": 1.7305699481865284, "grad_norm": 37.284339589086024, "learning_rate": 1.9966143082617797e-06, "loss": 1.0332, "step": 668 }, { "epoch": 1.733160621761658, "grad_norm": 46.453238969399074, "learning_rate": 1.959395260278587e-06, "loss": 1.1303, "step": 669 }, { "epoch": 1.7357512953367875, "grad_norm": 22.743791018223284, "learning_rate": 1.922508510510166e-06, "loss": 1.0993, "step": 670 }, { "epoch": 1.738341968911917, "grad_norm": 27.788137087891727, "learning_rate": 1.885954738390572e-06, "loss": 1.1234, "step": 671 }, { "epoch": 1.7409326424870466, "grad_norm": 34.03637743502625, "learning_rate": 1.8497346172205733e-06, "loss": 1.085, "step": 672 }, { "epoch": 1.7435233160621761, "grad_norm": 30.308363072599853, "learning_rate": 1.8138488141552856e-06, "loss": 1.0348, "step": 673 }, { "epoch": 1.7461139896373057, "grad_norm": 26.81612464278571, "learning_rate": 1.7782979901918507e-06, "loss": 1.0672, "step": 674 }, { "epoch": 1.7487046632124352, "grad_norm": 46.96340147563577, "learning_rate": 1.7430828001572897e-06, "loss": 1.0807, "step": 675 }, { "epoch": 1.7512953367875648, "grad_norm": 30.87064631308438, "learning_rate": 1.7082038926964162e-06, "loss": 1.1411, "step": 676 }, { "epoch": 1.7538860103626943, "grad_norm": 79.59411718865987, "learning_rate": 1.6736619102599073e-06, "loss": 1.0234, "step": 677 }, { "epoch": 1.7564766839378239, "grad_norm": 30.875792565440594, "learning_rate": 1.6394574890924574e-06, "loss": 1.1506, "step": 678 }, { "epoch": 1.7590673575129534, "grad_norm": 34.227935587917464, "learning_rate": 1.605591259221071e-06, "loss": 1.0981, "step": 679 }, { "epoch": 1.7590673575129534, "eval_loss": 1.0809757709503174, "eval_runtime": 37.9729, "eval_samples_per_second": 19.593, "eval_steps_per_second": 1.238, "step": 679 }, { "epoch": 1.761658031088083, "grad_norm": 31.849171622198522, "learning_rate": 1.572063844443441e-06, "loss": 1.1227, "step": 680 }, { "epoch": 1.7642487046632125, "grad_norm": 32.75765881856165, "learning_rate": 1.5388758623164802e-06, "loss": 1.0842, "step": 681 }, { "epoch": 1.766839378238342, "grad_norm": 27.83779558188967, "learning_rate": 1.5060279241449304e-06, "loss": 1.0419, "step": 682 }, { "epoch": 1.7694300518134716, "grad_norm": 30.646833576522408, "learning_rate": 1.4735206349701003e-06, "loss": 1.0983, "step": 683 }, { "epoch": 1.7720207253886011, "grad_norm": 29.748071428344947, "learning_rate": 1.4413545935587415e-06, "loss": 1.1276, "step": 684 }, { "epoch": 1.7746113989637307, "grad_norm": 32.57104117085742, "learning_rate": 1.4095303923919956e-06, "loss": 1.0728, "step": 685 }, { "epoch": 1.7772020725388602, "grad_norm": 32.02209671450587, "learning_rate": 1.3780486176544905e-06, "loss": 1.1148, "step": 686 }, { "epoch": 1.7797927461139897, "grad_norm": 31.902388050458736, "learning_rate": 1.3469098492235521e-06, "loss": 1.0873, "step": 687 }, { "epoch": 1.7823834196891193, "grad_norm": 33.159581668201604, "learning_rate": 1.316114660658505e-06, "loss": 1.0308, "step": 688 }, { "epoch": 1.7849740932642488, "grad_norm": 25.531240947030152, "learning_rate": 1.2856636191901296e-06, "loss": 1.0893, "step": 689 }, { "epoch": 1.7875647668393784, "grad_norm": 25.382870674663973, "learning_rate": 1.255557285710185e-06, "loss": 1.1089, "step": 690 }, { "epoch": 1.790155440414508, "grad_norm": 26.184606368046406, "learning_rate": 1.225796214761117e-06, "loss": 1.1515, "step": 691 }, { "epoch": 1.7927461139896375, "grad_norm": 27.78595815725415, "learning_rate": 1.196380954525802e-06, "loss": 1.0871, "step": 692 }, { "epoch": 1.795336787564767, "grad_norm": 32.137607036645285, "learning_rate": 1.1673120468174837e-06, "loss": 1.1396, "step": 693 }, { "epoch": 1.7979274611398963, "grad_norm": 31.931928767500203, "learning_rate": 1.1385900270697658e-06, "loss": 1.1175, "step": 694 }, { "epoch": 1.8005181347150259, "grad_norm": 36.61199052966244, "learning_rate": 1.110215424326775e-06, "loss": 1.1867, "step": 695 }, { "epoch": 1.8031088082901554, "grad_norm": 49.9081839820131, "learning_rate": 1.0821887612333959e-06, "loss": 1.1266, "step": 696 }, { "epoch": 1.805699481865285, "grad_norm": 25.346034138603734, "learning_rate": 1.0545105540256628e-06, "loss": 1.0614, "step": 697 }, { "epoch": 1.8082901554404145, "grad_norm": 47.53838459679947, "learning_rate": 1.0271813125212237e-06, "loss": 1.1314, "step": 698 }, { "epoch": 1.810880829015544, "grad_norm": 30.496460286583815, "learning_rate": 1.0002015401099797e-06, "loss": 1.1067, "step": 699 }, { "epoch": 1.8134715025906736, "grad_norm": 29.929097539381686, "learning_rate": 9.735717337447981e-07, "loss": 1.0424, "step": 700 }, { "epoch": 1.8160621761658031, "grad_norm": 30.887132457194266, "learning_rate": 9.4729238393235e-07, "loss": 1.1248, "step": 701 }, { "epoch": 1.8186528497409327, "grad_norm": 24.26916275448189, "learning_rate": 9.21363974724101e-07, "loss": 1.0577, "step": 702 }, { "epoch": 1.8212435233160622, "grad_norm": 40.34641617989283, "learning_rate": 8.957869837073673e-07, "loss": 1.1639, "step": 703 }, { "epoch": 1.8238341968911918, "grad_norm": 34.3133374466777, "learning_rate": 8.705618819965411e-07, "loss": 1.0866, "step": 704 }, { "epoch": 1.8264248704663213, "grad_norm": 25.164299615685284, "learning_rate": 8.456891342243945e-07, "loss": 1.1232, "step": 705 }, { "epoch": 1.8290155440414506, "grad_norm": 129.91297199628124, "learning_rate": 8.211691985335357e-07, "loss": 1.1542, "step": 706 }, { "epoch": 1.8316062176165802, "grad_norm": 23.928927141144797, "learning_rate": 7.970025265679648e-07, "loss": 1.0813, "step": 707 }, { "epoch": 1.8341968911917097, "grad_norm": 22.631504479886225, "learning_rate": 7.731895634647513e-07, "loss": 1.1164, "step": 708 }, { "epoch": 1.8367875647668392, "grad_norm": 84.2359250723018, "learning_rate": 7.497307478458382e-07, "loss": 1.1081, "step": 709 }, { "epoch": 1.8393782383419688, "grad_norm": 51.39142883893451, "learning_rate": 7.266265118099669e-07, "loss": 1.105, "step": 710 }, { "epoch": 1.8419689119170983, "grad_norm": 41.18280727079993, "learning_rate": 7.038772809247075e-07, "loss": 1.1211, "step": 711 }, { "epoch": 1.8445595854922279, "grad_norm": 34.330855277813534, "learning_rate": 6.814834742186361e-07, "loss": 1.0783, "step": 712 }, { "epoch": 1.8471502590673574, "grad_norm": 46.858780552576334, "learning_rate": 6.594455041735925e-07, "loss": 1.0214, "step": 713 }, { "epoch": 1.849740932642487, "grad_norm": 94.2712798319484, "learning_rate": 6.377637767171152e-07, "loss": 1.098, "step": 714 }, { "epoch": 1.8523316062176165, "grad_norm": 33.00073975184253, "learning_rate": 6.164386912149289e-07, "loss": 1.0906, "step": 715 }, { "epoch": 1.854922279792746, "grad_norm": 30.030119862133272, "learning_rate": 5.954706404636179e-07, "loss": 1.1073, "step": 716 }, { "epoch": 1.8575129533678756, "grad_norm": 46.42282973245658, "learning_rate": 5.748600106833735e-07, "loss": 1.1553, "step": 717 }, { "epoch": 1.8601036269430051, "grad_norm": 26.48910946182044, "learning_rate": 5.546071815108845e-07, "loss": 1.0704, "step": 718 }, { "epoch": 1.8626943005181347, "grad_norm": 29.34093197155635, "learning_rate": 5.347125259923491e-07, "loss": 1.1, "step": 719 }, { "epoch": 1.8652849740932642, "grad_norm": 24.689130499541356, "learning_rate": 5.151764105766011e-07, "loss": 1.067, "step": 720 }, { "epoch": 1.8678756476683938, "grad_norm": 21.25619644617847, "learning_rate": 4.959991951083498e-07, "loss": 1.1125, "step": 721 }, { "epoch": 1.8704663212435233, "grad_norm": 23.946272802272112, "learning_rate": 4.771812328215708e-07, "loss": 1.0798, "step": 722 }, { "epoch": 1.8730569948186528, "grad_norm": 33.286030816378954, "learning_rate": 4.587228703329838e-07, "loss": 1.0756, "step": 723 }, { "epoch": 1.8756476683937824, "grad_norm": 109.02542545414109, "learning_rate": 4.40624447635678e-07, "loss": 1.073, "step": 724 }, { "epoch": 1.878238341968912, "grad_norm": 133.80505789447585, "learning_rate": 4.228862980928439e-07, "loss": 1.1218, "step": 725 }, { "epoch": 1.8808290155440415, "grad_norm": 28.671374209715793, "learning_rate": 4.0550874843163337e-07, "loss": 1.1546, "step": 726 }, { "epoch": 1.883419689119171, "grad_norm": 20.092775273550536, "learning_rate": 3.8849211873714266e-07, "loss": 1.0608, "step": 727 }, { "epoch": 1.8860103626943006, "grad_norm": 18.87195408427635, "learning_rate": 3.7183672244652135e-07, "loss": 1.0437, "step": 728 }, { "epoch": 1.88860103626943, "grad_norm": 24.985644120932864, "learning_rate": 3.5554286634318814e-07, "loss": 1.0989, "step": 729 }, { "epoch": 1.8911917098445596, "grad_norm": 24.09887960702925, "learning_rate": 3.3961085055119083e-07, "loss": 1.0347, "step": 730 }, { "epoch": 1.8937823834196892, "grad_norm": 98.50926523613283, "learning_rate": 3.2404096852967305e-07, "loss": 1.1163, "step": 731 }, { "epoch": 1.8963730569948187, "grad_norm": 42.45357973111845, "learning_rate": 3.0883350706746973e-07, "loss": 1.1497, "step": 732 }, { "epoch": 1.8989637305699483, "grad_norm": 25.430184794482617, "learning_rate": 2.9398874627782014e-07, "loss": 1.0154, "step": 733 }, { "epoch": 1.9015544041450778, "grad_norm": 32.56552224066898, "learning_rate": 2.7950695959322093e-07, "loss": 1.0976, "step": 734 }, { "epoch": 1.9041450777202074, "grad_norm": 25.518391980867197, "learning_rate": 2.653884137603702e-07, "loss": 1.1122, "step": 735 }, { "epoch": 1.906735751295337, "grad_norm": 20.537146853099735, "learning_rate": 2.516333688352801e-07, "loss": 1.0592, "step": 736 }, { "epoch": 1.9093264248704664, "grad_norm": 25.28898033119641, "learning_rate": 2.382420781784589e-07, "loss": 1.0706, "step": 737 }, { "epoch": 1.911917098445596, "grad_norm": 55.74230904177274, "learning_rate": 2.2521478845025867e-07, "loss": 1.1706, "step": 738 }, { "epoch": 1.9145077720207255, "grad_norm": 42.768439146141375, "learning_rate": 2.1255173960634146e-07, "loss": 1.0917, "step": 739 }, { "epoch": 1.917098445595855, "grad_norm": 31.627146067352545, "learning_rate": 2.0025316489323597e-07, "loss": 1.0842, "step": 740 }, { "epoch": 1.9196891191709846, "grad_norm": 67.01614151937272, "learning_rate": 1.8831929084406119e-07, "loss": 1.1287, "step": 741 }, { "epoch": 1.922279792746114, "grad_norm": 56.931018082229045, "learning_rate": 1.7675033727434288e-07, "loss": 1.148, "step": 742 }, { "epoch": 1.9248704663212435, "grad_norm": 35.24107640275113, "learning_rate": 1.655465172779702e-07, "loss": 1.0814, "step": 743 }, { "epoch": 1.927461139896373, "grad_norm": 28.45308969334642, "learning_rate": 1.547080372232679e-07, "loss": 1.1092, "step": 744 }, { "epoch": 1.9300518134715026, "grad_norm": 67.36918357149847, "learning_rate": 1.44235096749199e-07, "loss": 1.1332, "step": 745 }, { "epoch": 1.932642487046632, "grad_norm": 33.50866269131509, "learning_rate": 1.3412788876167925e-07, "loss": 1.0884, "step": 746 }, { "epoch": 1.9352331606217616, "grad_norm": 34.359505767271465, "learning_rate": 1.2438659943003306e-07, "loss": 0.9982, "step": 747 }, { "epoch": 1.9378238341968912, "grad_norm": 44.805290236152125, "learning_rate": 1.1501140818355627e-07, "loss": 1.065, "step": 748 }, { "epoch": 1.9404145077720207, "grad_norm": 35.70322964853727, "learning_rate": 1.0600248770821886e-07, "loss": 1.1435, "step": 749 }, { "epoch": 1.9430051813471503, "grad_norm": 37.7037381444634, "learning_rate": 9.736000394348299e-08, "loss": 1.1085, "step": 750 }, { "epoch": 1.9455958549222798, "grad_norm": 19.88028370873119, "learning_rate": 8.908411607923884e-08, "loss": 1.0903, "step": 751 }, { "epoch": 1.9481865284974094, "grad_norm": 22.037441897095253, "learning_rate": 8.117497655287798e-08, "loss": 1.0621, "step": 752 }, { "epoch": 1.950777202072539, "grad_norm": 36.597366625713235, "learning_rate": 7.363273104648904e-08, "loss": 1.134, "step": 753 }, { "epoch": 1.9533678756476682, "grad_norm": 36.91544331752125, "learning_rate": 6.645751848417093e-08, "loss": 1.0894, "step": 754 }, { "epoch": 1.9559585492227978, "grad_norm": 30.791496804716704, "learning_rate": 5.964947102946594e-08, "loss": 1.0774, "step": 755 }, { "epoch": 1.9585492227979273, "grad_norm": 24.76204564200231, "learning_rate": 5.320871408294403e-08, "loss": 1.1167, "step": 756 }, { "epoch": 1.9611398963730569, "grad_norm": 31.78111531944549, "learning_rate": 4.713536627987347e-08, "loss": 1.0709, "step": 757 }, { "epoch": 1.9637305699481864, "grad_norm": 36.388018093644106, "learning_rate": 4.1429539488047066e-08, "loss": 1.0492, "step": 758 }, { "epoch": 1.966321243523316, "grad_norm": 27.235358627643226, "learning_rate": 3.6091338805719356e-08, "loss": 1.1128, "step": 759 }, { "epoch": 1.9689119170984455, "grad_norm": 26.526882273916378, "learning_rate": 3.1120862559670396e-08, "loss": 1.1129, "step": 760 }, { "epoch": 1.971502590673575, "grad_norm": 28.962449597773997, "learning_rate": 2.651820230338942e-08, "loss": 1.1286, "step": 761 }, { "epoch": 1.9740932642487046, "grad_norm": 104.33848533313731, "learning_rate": 2.2283442815402845e-08, "loss": 1.117, "step": 762 }, { "epoch": 1.9766839378238341, "grad_norm": 179.66099272542536, "learning_rate": 1.8416662097693326e-08, "loss": 1.0788, "step": 763 }, { "epoch": 1.9792746113989637, "grad_norm": 28.438877123785307, "learning_rate": 1.491793137427866e-08, "loss": 1.1436, "step": 764 }, { "epoch": 1.9818652849740932, "grad_norm": 44.454308819411644, "learning_rate": 1.1787315089895057e-08, "loss": 1.1108, "step": 765 }, { "epoch": 1.9844559585492227, "grad_norm": 53.23249975862293, "learning_rate": 9.024870908802552e-09, "loss": 0.9971, "step": 766 }, { "epoch": 1.9870466321243523, "grad_norm": 35.2043549019015, "learning_rate": 6.630649713739168e-09, "loss": 1.1205, "step": 767 }, { "epoch": 1.9896373056994818, "grad_norm": 22.286284343829376, "learning_rate": 4.6046956049639045e-09, "loss": 1.0848, "step": 768 }, { "epoch": 1.9922279792746114, "grad_norm": 24.94719200433733, "learning_rate": 2.94704589946182e-09, "loss": 1.1308, "step": 769 }, { "epoch": 1.994818652849741, "grad_norm": 41.684623957583106, "learning_rate": 1.657731130246809e-09, "loss": 1.1555, "step": 770 }, { "epoch": 1.9974093264248705, "grad_norm": 55.480495348949425, "learning_rate": 7.367750458020518e-10, "loss": 1.129, "step": 771 }, { "epoch": 2.0, "grad_norm": 43.2148652279276, "learning_rate": 1.8419460964258505e-10, "loss": 1.0835, "step": 772 } ], "logging_steps": 1, "max_steps": 772, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 193, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3363988309999616e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }