diff --git "a/checkpoint-579/trainer_state.json" "b/checkpoint-579/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-579/trainer_state.json" @@ -0,0 +1,4135 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5, + "eval_steps": 97, + "global_step": 579, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025906735751295338, + "grad_norm": 758.2562349755826, + "learning_rate": 0.0, + "loss": 1.3719, + "step": 1 + }, + { + "epoch": 0.0025906735751295338, + "eval_loss": 1.3159157037734985, + "eval_runtime": 36.907, + "eval_samples_per_second": 20.159, + "eval_steps_per_second": 1.273, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 666.308184823038, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.36, + "step": 2 + }, + { + "epoch": 0.007772020725388601, + "grad_norm": 211.0771195353068, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3746, + "step": 3 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 431.5114709683218, + "learning_rate": 3e-06, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.012953367875647668, + "grad_norm": 230.87468433791625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3837, + "step": 5 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 635.1636587738542, + "learning_rate": 5e-06, + "loss": 1.3761, + "step": 6 + }, + { + "epoch": 0.018134715025906734, + "grad_norm": 791.5536958334704, + "learning_rate": 6e-06, + "loss": 1.2855, + "step": 7 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 667.7197994216477, + "learning_rate": 7e-06, + "loss": 1.3267, + "step": 8 + }, + { + "epoch": 0.023316062176165803, + "grad_norm": 254.3855973692125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2977, + "step": 9 + }, + { + "epoch": 0.025906735751295335, + "grad_norm": 162.29347257682093, + "learning_rate": 9e-06, + "loss": 1.3522, + "step": 10 + }, + { + "epoch": 0.02849740932642487, + "grad_norm": 352.6352930651456, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 11 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 148.2629265526552, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3342, + "step": 12 + }, + { + "epoch": 0.03367875647668394, + "grad_norm": 249.88753789723657, + "learning_rate": 1.2e-05, + "loss": 1.2983, + "step": 13 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 184.03358422636597, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3291, + "step": 14 + }, + { + "epoch": 0.038860103626943004, + "grad_norm": 198.4491469860763, + "learning_rate": 1.4e-05, + "loss": 1.4014, + "step": 15 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 680.9537058769038, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3775, + "step": 16 + }, + { + "epoch": 0.04404145077720207, + "grad_norm": 563.0247638614801, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3228, + "step": 17 + }, + { + "epoch": 0.046632124352331605, + "grad_norm": 271.985463813746, + "learning_rate": 1.7e-05, + "loss": 1.3695, + "step": 18 + }, + { + "epoch": 0.04922279792746114, + "grad_norm": 399.51218452223316, + "learning_rate": 1.8e-05, + "loss": 1.2556, + "step": 19 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 160.70697055826656, + "learning_rate": 1.9e-05, + "loss": 1.2982, + "step": 20 + }, + { + "epoch": 0.054404145077720206, + "grad_norm": 227.8927504687491, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 21 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 550.1538868076032, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2603, + "step": 22 + }, + { + "epoch": 0.05958549222797927, + "grad_norm": 291.8994359919024, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3663, + "step": 23 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 120.60677833129643, + "learning_rate": 2.3e-05, + "loss": 1.3129, + "step": 24 + }, + { + "epoch": 0.06476683937823834, + "grad_norm": 414.4006662101242, + "learning_rate": 2.4e-05, + "loss": 1.3037, + "step": 25 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 141.48324465317884, + "learning_rate": 2.5e-05, + "loss": 1.3095, + "step": 26 + }, + { + "epoch": 0.06994818652849741, + "grad_norm": 147.86066819937994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2372, + "step": 27 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 214.47337614964576, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3384, + "step": 28 + }, + { + "epoch": 0.07512953367875648, + "grad_norm": 898.4324889241673, + "learning_rate": 2.8e-05, + "loss": 1.2003, + "step": 29 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 128.83026557596128, + "learning_rate": 2.9e-05, + "loss": 1.2172, + "step": 30 + }, + { + "epoch": 0.08031088082901554, + "grad_norm": 183.0777862405529, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.2674, + "step": 31 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 119.01841833358732, + "learning_rate": 3.1e-05, + "loss": 1.2554, + "step": 32 + }, + { + "epoch": 0.08549222797927461, + "grad_norm": 117.65980267542858, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2716, + "step": 33 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 82.40151099433953, + "learning_rate": 3.3e-05, + "loss": 1.2019, + "step": 34 + }, + { + "epoch": 0.09067357512953368, + "grad_norm": 82.61816783653785, + "learning_rate": 3.4e-05, + "loss": 1.2424, + "step": 35 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 136.42743433868276, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.2066, + "step": 36 + }, + { + "epoch": 0.09585492227979274, + "grad_norm": 36.775911657584444, + "learning_rate": 3.6e-05, + "loss": 1.2485, + "step": 37 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 56.55022603284064, + "learning_rate": 3.7000000000000005e-05, + "loss": 1.2112, + "step": 38 + }, + { + "epoch": 0.10103626943005181, + "grad_norm": 50.09896932886107, + "learning_rate": 3.8e-05, + "loss": 1.2027, + "step": 39 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 54.2661481198025, + "learning_rate": 3.9e-05, + "loss": 1.2673, + "step": 40 + }, + { + "epoch": 0.10621761658031088, + "grad_norm": 60.04145981731815, + "learning_rate": 4e-05, + "loss": 1.1648, + "step": 41 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 169.47741055545822, + "learning_rate": 3.999981580539036e-05, + "loss": 1.2393, + "step": 42 + }, + { + "epoch": 0.11139896373056994, + "grad_norm": 43.64716987307323, + "learning_rate": 3.9999263224954204e-05, + "loss": 1.2906, + "step": 43 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 51.3206609767585, + "learning_rate": 3.999834226886976e-05, + "loss": 1.1807, + "step": 44 + }, + { + "epoch": 0.11658031088082901, + "grad_norm": 38.95055887413869, + "learning_rate": 3.999705295410054e-05, + "loss": 1.1825, + "step": 45 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 40.59968974426338, + "learning_rate": 3.999539530439504e-05, + "loss": 1.193, + "step": 46 + }, + { + "epoch": 0.12176165803108809, + "grad_norm": 34.5796571445333, + "learning_rate": 3.9993369350286265e-05, + "loss": 1.2127, + "step": 47 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.97693356149241, + "learning_rate": 3.99909751290912e-05, + "loss": 1.1543, + "step": 48 + }, + { + "epoch": 0.12694300518134716, + "grad_norm": 82.9217015858092, + "learning_rate": 3.9988212684910107e-05, + "loss": 1.2329, + "step": 49 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 49.256542144400214, + "learning_rate": 3.9985082068625724e-05, + "loss": 1.212, + "step": 50 + }, + { + "epoch": 0.13212435233160622, + "grad_norm": 45.025980435259484, + "learning_rate": 3.998158333790231e-05, + "loss": 1.2129, + "step": 51 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 45.98465689592428, + "learning_rate": 3.99777165571846e-05, + "loss": 1.1709, + "step": 52 + }, + { + "epoch": 0.13730569948186527, + "grad_norm": 43.481241408477906, + "learning_rate": 3.997348179769661e-05, + "loss": 1.1614, + "step": 53 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 82.17633750834132, + "learning_rate": 3.996887913744033e-05, + "loss": 1.2205, + "step": 54 + }, + { + "epoch": 0.14248704663212436, + "grad_norm": 53.0176514970764, + "learning_rate": 3.9963908661194285e-05, + "loss": 1.1204, + "step": 55 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 67.86382426995611, + "learning_rate": 3.995857046051196e-05, + "loss": 1.1839, + "step": 56 + }, + { + "epoch": 0.14766839378238342, + "grad_norm": 31.282407703790597, + "learning_rate": 3.995286463372013e-05, + "loss": 1.2126, + "step": 57 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 52.200764429265604, + "learning_rate": 3.994679128591706e-05, + "loss": 1.2036, + "step": 58 + }, + { + "epoch": 0.15284974093264247, + "grad_norm": 60.706608653531895, + "learning_rate": 3.9940350528970535e-05, + "loss": 1.1848, + "step": 59 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 47.31754062899529, + "learning_rate": 3.993354248151583e-05, + "loss": 1.0869, + "step": 60 + }, + { + "epoch": 0.15803108808290156, + "grad_norm": 49.42450836392811, + "learning_rate": 3.9926367268953514e-05, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 38.791167030088886, + "learning_rate": 3.991882502344712e-05, + "loss": 1.1881, + "step": 62 + }, + { + "epoch": 0.16321243523316062, + "grad_norm": 56.16339499737216, + "learning_rate": 3.991091588392077e-05, + "loss": 1.1518, + "step": 63 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 861.8559063020828, + "learning_rate": 3.990263999605652e-05, + "loss": 1.1614, + "step": 64 + }, + { + "epoch": 0.16839378238341968, + "grad_norm": 50.92822786500888, + "learning_rate": 3.989399751229179e-05, + "loss": 1.1998, + "step": 65 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 31.04121324055666, + "learning_rate": 3.988498859181645e-05, + "loss": 1.1795, + "step": 66 + }, + { + "epoch": 0.17357512953367876, + "grad_norm": 50.33061983380845, + "learning_rate": 3.9875613400569975e-05, + "loss": 1.1742, + "step": 67 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 75.20462514003519, + "learning_rate": 3.986587211123833e-05, + "loss": 1.1856, + "step": 68 + }, + { + "epoch": 0.17875647668393782, + "grad_norm": 38.82139317052205, + "learning_rate": 3.98557649032508e-05, + "loss": 1.1529, + "step": 69 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 36.55988806615175, + "learning_rate": 3.984529196277674e-05, + "loss": 1.1884, + "step": 70 + }, + { + "epoch": 0.18393782383419688, + "grad_norm": 104.8931793971097, + "learning_rate": 3.983445348272203e-05, + "loss": 1.2182, + "step": 71 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 36.50395409234617, + "learning_rate": 3.982324966272566e-05, + "loss": 1.1609, + "step": 72 + }, + { + "epoch": 0.18911917098445596, + "grad_norm": 35.019191693448626, + "learning_rate": 3.981168070915594e-05, + "loss": 1.173, + "step": 73 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 33.378390048053596, + "learning_rate": 3.979974683510677e-05, + "loss": 1.173, + "step": 74 + }, + { + "epoch": 0.19430051813471502, + "grad_norm": 43.356840136984154, + "learning_rate": 3.978744826039366e-05, + "loss": 1.2032, + "step": 75 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 31.285725922510768, + "learning_rate": 3.977478521154974e-05, + "loss": 1.1569, + "step": 76 + }, + { + "epoch": 0.19948186528497408, + "grad_norm": 35.19264482867074, + "learning_rate": 3.9761757921821544e-05, + "loss": 1.1365, + "step": 77 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 44.66037256551279, + "learning_rate": 3.974836663116472e-05, + "loss": 1.164, + "step": 78 + }, + { + "epoch": 0.20466321243523317, + "grad_norm": 68.91101457952654, + "learning_rate": 3.973461158623963e-05, + "loss": 1.2256, + "step": 79 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 45.866521854583, + "learning_rate": 3.9720493040406786e-05, + "loss": 1.1697, + "step": 80 + }, + { + "epoch": 0.20984455958549222, + "grad_norm": 59.63095169617338, + "learning_rate": 3.970601125372218e-05, + "loss": 1.2094, + "step": 81 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 39.085597271064216, + "learning_rate": 3.9691166492932535e-05, + "loss": 1.1048, + "step": 82 + }, + { + "epoch": 0.21502590673575128, + "grad_norm": 36.40256073477861, + "learning_rate": 3.9675959031470336e-05, + "loss": 1.248, + "step": 83 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 29.846921716586085, + "learning_rate": 3.966038914944881e-05, + "loss": 1.1718, + "step": 84 + }, + { + "epoch": 0.22020725388601037, + "grad_norm": 50.87052190327881, + "learning_rate": 3.964445713365682e-05, + "loss": 1.1529, + "step": 85 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 35.32915760431302, + "learning_rate": 3.9628163277553486e-05, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.22538860103626943, + "grad_norm": 157.5587514654703, + "learning_rate": 3.961150788126286e-05, + "loss": 1.2194, + "step": 87 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 25.03485489120971, + "learning_rate": 3.9594491251568376e-05, + "loss": 1.1392, + "step": 88 + }, + { + "epoch": 0.23056994818652848, + "grad_norm": 80.55933867045263, + "learning_rate": 3.957711370190716e-05, + "loss": 1.1819, + "step": 89 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 272.22874004071406, + "learning_rate": 3.9559375552364325e-05, + "loss": 1.0998, + "step": 90 + }, + { + "epoch": 0.23575129533678757, + "grad_norm": 91.94671663482514, + "learning_rate": 3.954127712966702e-05, + "loss": 1.2494, + "step": 91 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 54.31533598131098, + "learning_rate": 3.952281876717843e-05, + "loss": 1.1385, + "step": 92 + }, + { + "epoch": 0.24093264248704663, + "grad_norm": 103.20789745908105, + "learning_rate": 3.950400080489165e-05, + "loss": 1.1398, + "step": 93 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 45.14746362545893, + "learning_rate": 3.94848235894234e-05, + "loss": 1.2697, + "step": 94 + }, + { + "epoch": 0.24611398963730569, + "grad_norm": 21.271923336142002, + "learning_rate": 3.9465287474007654e-05, + "loss": 1.1397, + "step": 95 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 93.89786795431422, + "learning_rate": 3.944539281848912e-05, + "loss": 1.1542, + "step": 96 + }, + { + "epoch": 0.25129533678756477, + "grad_norm": 32.38768349342839, + "learning_rate": 3.942513998931663e-05, + "loss": 1.1693, + "step": 97 + }, + { + "epoch": 0.25129533678756477, + "eval_loss": 1.1344976425170898, + "eval_runtime": 37.8807, + "eval_samples_per_second": 19.641, + "eval_steps_per_second": 1.241, + "step": 97 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 91.41293468177638, + "learning_rate": 3.940452935953639e-05, + "loss": 1.1724, + "step": 98 + }, + { + "epoch": 0.25647668393782386, + "grad_norm": 39.20645478419229, + "learning_rate": 3.9383561308785075e-05, + "loss": 1.1583, + "step": 99 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 35.32804513153546, + "learning_rate": 3.9362236223282885e-05, + "loss": 1.158, + "step": 100 + }, + { + "epoch": 0.2616580310880829, + "grad_norm": 35.24783762804842, + "learning_rate": 3.934055449582641e-05, + "loss": 1.1552, + "step": 101 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 33.743808031979775, + "learning_rate": 3.931851652578137e-05, + "loss": 1.264, + "step": 102 + }, + { + "epoch": 0.266839378238342, + "grad_norm": 113.49798793226394, + "learning_rate": 3.92961227190753e-05, + "loss": 1.2361, + "step": 103 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 31.813807349410364, + "learning_rate": 3.9273373488190036e-05, + "loss": 1.1246, + "step": 104 + }, + { + "epoch": 0.27202072538860106, + "grad_norm": 29.391695486306187, + "learning_rate": 3.925026925215417e-05, + "loss": 1.1142, + "step": 105 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 33.79933331839905, + "learning_rate": 3.922681043653526e-05, + "loss": 1.1401, + "step": 106 + }, + { + "epoch": 0.2772020725388601, + "grad_norm": 39.09509012730907, + "learning_rate": 3.920299747343204e-05, + "loss": 1.1822, + "step": 107 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 37.81471938433609, + "learning_rate": 3.9178830801466465e-05, + "loss": 1.1592, + "step": 108 + }, + { + "epoch": 0.2823834196891192, + "grad_norm": 69.07753778460207, + "learning_rate": 3.915431086577561e-05, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 28.864787246081605, + "learning_rate": 3.912943811800347e-05, + "loss": 1.1179, + "step": 110 + }, + { + "epoch": 0.28756476683937826, + "grad_norm": 28.842042951717836, + "learning_rate": 3.910421301629264e-05, + "loss": 1.1317, + "step": 111 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 51.475482074695506, + "learning_rate": 3.9078636025275904e-05, + "loss": 1.1451, + "step": 112 + }, + { + "epoch": 0.2927461139896373, + "grad_norm": 33.48279556713943, + "learning_rate": 3.9052707616067654e-05, + "loss": 1.1554, + "step": 113 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 21.279603575929844, + "learning_rate": 3.9026428266255205e-05, + "loss": 1.1636, + "step": 114 + }, + { + "epoch": 0.2979274611398964, + "grad_norm": 36.226178034876675, + "learning_rate": 3.899979845989003e-05, + "loss": 1.1966, + "step": 115 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 29.90506353145981, + "learning_rate": 3.897281868747878e-05, + "loss": 1.1888, + "step": 116 + }, + { + "epoch": 0.30310880829015546, + "grad_norm": 36.04602777809767, + "learning_rate": 3.894548944597434e-05, + "loss": 1.2066, + "step": 117 + }, + { + "epoch": 0.30569948186528495, + "grad_norm": 36.42793844948301, + "learning_rate": 3.8917811238766606e-05, + "loss": 1.1712, + "step": 118 + }, + { + "epoch": 0.3082901554404145, + "grad_norm": 58.788967662325696, + "learning_rate": 3.888978457567323e-05, + "loss": 1.1225, + "step": 119 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 29.357299816022326, + "learning_rate": 3.886140997293024e-05, + "loss": 1.1315, + "step": 120 + }, + { + "epoch": 0.3134715025906736, + "grad_norm": 95.08345317107502, + "learning_rate": 3.883268795318252e-05, + "loss": 1.1852, + "step": 121 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 33.6623824593179, + "learning_rate": 3.88036190454742e-05, + "loss": 1.16, + "step": 122 + }, + { + "epoch": 0.31865284974093266, + "grad_norm": 42.587546987131105, + "learning_rate": 3.8774203785238886e-05, + "loss": 1.1374, + "step": 123 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 33.360649853064245, + "learning_rate": 3.8744442714289816e-05, + "loss": 1.1757, + "step": 124 + }, + { + "epoch": 0.3238341968911917, + "grad_norm": 49.09256643961471, + "learning_rate": 3.8714336380809874e-05, + "loss": 1.1782, + "step": 125 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 31.505007051172793, + "learning_rate": 3.86838853393415e-05, + "loss": 1.195, + "step": 126 + }, + { + "epoch": 0.3290155440414508, + "grad_norm": 34.36735417254799, + "learning_rate": 3.865309015077645e-05, + "loss": 1.1078, + "step": 127 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 36.63220606142181, + "learning_rate": 3.862195138234551e-05, + "loss": 1.1319, + "step": 128 + }, + { + "epoch": 0.33419689119170987, + "grad_norm": 53.324986862513676, + "learning_rate": 3.859046960760801e-05, + "loss": 1.2301, + "step": 129 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 47.41445409144979, + "learning_rate": 3.855864540644126e-05, + "loss": 1.2366, + "step": 130 + }, + { + "epoch": 0.3393782383419689, + "grad_norm": 32.57355122427366, + "learning_rate": 3.8526479365029906e-05, + "loss": 1.142, + "step": 131 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 28.445824333644715, + "learning_rate": 3.849397207585508e-05, + "loss": 1.0847, + "step": 132 + }, + { + "epoch": 0.344559585492228, + "grad_norm": 49.23062726715889, + "learning_rate": 3.846112413768353e-05, + "loss": 1.2241, + "step": 133 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 53.424206543788074, + "learning_rate": 3.842793615555657e-05, + "loss": 1.2392, + "step": 134 + }, + { + "epoch": 0.34974093264248707, + "grad_norm": 38.19316140175426, + "learning_rate": 3.8394408740778934e-05, + "loss": 1.1208, + "step": 135 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 32.35931252369273, + "learning_rate": 3.836054251090755e-05, + "loss": 1.1604, + "step": 136 + }, + { + "epoch": 0.3549222797927461, + "grad_norm": 37.90085344799495, + "learning_rate": 3.83263380897401e-05, + "loss": 1.1134, + "step": 137 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 44.49191588319939, + "learning_rate": 3.829179610730359e-05, + "loss": 1.1281, + "step": 138 + }, + { + "epoch": 0.3601036269430052, + "grad_norm": 141.98524430756757, + "learning_rate": 3.8256917199842715e-05, + "loss": 1.0928, + "step": 139 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 30.887093976524472, + "learning_rate": 3.822170200980815e-05, + "loss": 1.0936, + "step": 140 + }, + { + "epoch": 0.36528497409326427, + "grad_norm": 21.980521878837745, + "learning_rate": 3.818615118584472e-05, + "loss": 1.1368, + "step": 141 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 538.6650762618656, + "learning_rate": 3.815026538277943e-05, + "loss": 1.0918, + "step": 142 + }, + { + "epoch": 0.3704663212435233, + "grad_norm": 40.842881572203, + "learning_rate": 3.811404526160943e-05, + "loss": 1.1705, + "step": 143 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 26.891553492377298, + "learning_rate": 3.8077491489489835e-05, + "loss": 1.1468, + "step": 144 + }, + { + "epoch": 0.3756476683937824, + "grad_norm": 45.138483181178074, + "learning_rate": 3.8040604739721415e-05, + "loss": 1.1679, + "step": 145 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 35.133763086168244, + "learning_rate": 3.8003385691738227e-05, + "loss": 1.1029, + "step": 146 + }, + { + "epoch": 0.38082901554404147, + "grad_norm": 36.941250802707344, + "learning_rate": 3.7965835031095065e-05, + "loss": 1.1491, + "step": 147 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 90.1080256703095, + "learning_rate": 3.792795344945485e-05, + "loss": 1.1212, + "step": 148 + }, + { + "epoch": 0.3860103626943005, + "grad_norm": 39.70360899750413, + "learning_rate": 3.7889741644575914e-05, + "loss": 1.15, + "step": 149 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 28.229369877304094, + "learning_rate": 3.78512003202991e-05, + "loss": 1.1111, + "step": 150 + }, + { + "epoch": 0.3911917098445596, + "grad_norm": 31.611752191925987, + "learning_rate": 3.7812330186534815e-05, + "loss": 1.1366, + "step": 151 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 38.196015586772425, + "learning_rate": 3.777313195924998e-05, + "loss": 1.1433, + "step": 152 + }, + { + "epoch": 0.3963730569948187, + "grad_norm": 22.732638044547453, + "learning_rate": 3.773360636045481e-05, + "loss": 1.1125, + "step": 153 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 90.19158665385014, + "learning_rate": 3.7693754118189525e-05, + "loss": 1.1242, + "step": 154 + }, + { + "epoch": 0.4015544041450777, + "grad_norm": 42.43479974993017, + "learning_rate": 3.765357596651095e-05, + "loss": 1.1191, + "step": 155 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 88.0076735720364, + "learning_rate": 3.761307264547899e-05, + "loss": 1.1718, + "step": 156 + }, + { + "epoch": 0.4067357512953368, + "grad_norm": 30.782507703935767, + "learning_rate": 3.757224490114297e-05, + "loss": 1.109, + "step": 157 + }, + { + "epoch": 0.40932642487046633, + "grad_norm": 69.89871106113397, + "learning_rate": 3.7531093485527943e-05, + "loss": 1.1018, + "step": 158 + }, + { + "epoch": 0.4119170984455959, + "grad_norm": 37.339006645717305, + "learning_rate": 3.7489619156620796e-05, + "loss": 1.1358, + "step": 159 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 28.06388054378899, + "learning_rate": 3.744782267835632e-05, + "loss": 1.0847, + "step": 160 + }, + { + "epoch": 0.4170984455958549, + "grad_norm": 54.05874281297702, + "learning_rate": 3.740570482060311e-05, + "loss": 1.1682, + "step": 161 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 32.299093265328835, + "learning_rate": 3.73632663591494e-05, + "loss": 1.1413, + "step": 162 + }, + { + "epoch": 0.422279792746114, + "grad_norm": 31.213652090157694, + "learning_rate": 3.732050807568878e-05, + "loss": 1.1313, + "step": 163 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 40.01090035937505, + "learning_rate": 3.727743075780578e-05, + "loss": 1.1513, + "step": 164 + }, + { + "epoch": 0.4274611398963731, + "grad_norm": 47.11352577964853, + "learning_rate": 3.723403519896136e-05, + "loss": 1.2192, + "step": 165 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 28.645086506093037, + "learning_rate": 3.7190322198478355e-05, + "loss": 1.1097, + "step": 166 + }, + { + "epoch": 0.4326424870466321, + "grad_norm": 35.28541113925116, + "learning_rate": 3.7146292561526654e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 58.30281063037669, + "learning_rate": 3.7101947099108425e-05, + "loss": 1.1829, + "step": 168 + }, + { + "epoch": 0.4378238341968912, + "grad_norm": 26.33563548968379, + "learning_rate": 3.70572866280432e-05, + "loss": 1.147, + "step": 169 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 57.00052875402651, + "learning_rate": 3.701231197095277e-05, + "loss": 1.1212, + "step": 170 + }, + { + "epoch": 0.4430051813471503, + "grad_norm": 23.672828037237174, + "learning_rate": 3.696702395624608e-05, + "loss": 1.1152, + "step": 171 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 41.1264174112964, + "learning_rate": 3.692142341810395e-05, + "loss": 1.1154, + "step": 172 + }, + { + "epoch": 0.4481865284974093, + "grad_norm": 26.72177706144361, + "learning_rate": 3.6875511196463715e-05, + "loss": 1.1725, + "step": 173 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 95.4088800585977, + "learning_rate": 3.682928813700375e-05, + "loss": 1.1339, + "step": 174 + }, + { + "epoch": 0.4533678756476684, + "grad_norm": 34.33666578349465, + "learning_rate": 3.678275509112788e-05, + "loss": 1.1867, + "step": 175 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 31.032304531003014, + "learning_rate": 3.6735912915949745e-05, + "loss": 1.1386, + "step": 176 + }, + { + "epoch": 0.4585492227979275, + "grad_norm": 55.22043313188224, + "learning_rate": 3.6688762474276945e-05, + "loss": 1.1102, + "step": 177 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 29.82713377876857, + "learning_rate": 3.6641304634595216e-05, + "loss": 1.1564, + "step": 178 + }, + { + "epoch": 0.4637305699481865, + "grad_norm": 35.71025459541737, + "learning_rate": 3.659354027105238e-05, + "loss": 1.0939, + "step": 179 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 52.41175655642653, + "learning_rate": 3.6545470263442265e-05, + "loss": 1.1578, + "step": 180 + }, + { + "epoch": 0.4689119170984456, + "grad_norm": 27.682485766528306, + "learning_rate": 3.649709549718849e-05, + "loss": 1.1875, + "step": 181 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 36.53293663303487, + "learning_rate": 3.6448416863328186e-05, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.4740932642487047, + "grad_norm": 31.45177998538027, + "learning_rate": 3.639943525849555e-05, + "loss": 1.113, + "step": 183 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 28.323097072885673, + "learning_rate": 3.635015158490533e-05, + "loss": 1.1159, + "step": 184 + }, + { + "epoch": 0.4792746113989637, + "grad_norm": 47.75573754341213, + "learning_rate": 3.6300566750336225e-05, + "loss": 1.1305, + "step": 185 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 21.384095061494357, + "learning_rate": 3.625068166811418e-05, + "loss": 1.1369, + "step": 186 + }, + { + "epoch": 0.4844559585492228, + "grad_norm": 30.714645036809546, + "learning_rate": 3.6200497257095504e-05, + "loss": 1.1858, + "step": 187 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 35.12161426399798, + "learning_rate": 3.615001444165001e-05, + "loss": 1.1293, + "step": 188 + }, + { + "epoch": 0.4896373056994819, + "grad_norm": 116.83443661381396, + "learning_rate": 3.6099234151643924e-05, + "loss": 1.1515, + "step": 189 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 55.47885243409044, + "learning_rate": 3.604815732242283e-05, + "loss": 1.112, + "step": 190 + }, + { + "epoch": 0.4948186528497409, + "grad_norm": 32.332747429034285, + "learning_rate": 3.5996784894794394e-05, + "loss": 1.1661, + "step": 191 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 33.039210183180046, + "learning_rate": 3.594511781501103e-05, + "loss": 1.1244, + "step": 192 + }, + { + "epoch": 0.5, + "grad_norm": 21.325687337182504, + "learning_rate": 3.58931570347525e-05, + "loss": 1.1634, + "step": 193 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 51.37599478469561, + "learning_rate": 3.584090351110838e-05, + "loss": 1.2106, + "step": 194 + }, + { + "epoch": 0.5025906735751295, + "eval_loss": 1.1119717359542847, + "eval_runtime": 49.6027, + "eval_samples_per_second": 14.999, + "eval_steps_per_second": 0.948, + "step": 194 + }, + { + "epoch": 0.5051813471502591, + "grad_norm": 42.105169991612456, + "learning_rate": 3.57883582065604e-05, + "loss": 1.1303, + "step": 195 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 37.14457014578168, + "learning_rate": 3.573552208896474e-05, + "loss": 1.1483, + "step": 196 + }, + { + "epoch": 0.5103626943005182, + "grad_norm": 28.56241612018119, + "learning_rate": 3.568239613153421e-05, + "loss": 1.0843, + "step": 197 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 35.399304035761865, + "learning_rate": 3.5628981312820315e-05, + "loss": 1.1177, + "step": 198 + }, + { + "epoch": 0.5155440414507773, + "grad_norm": 25.91156850470446, + "learning_rate": 3.557527861669522e-05, + "loss": 1.1215, + "step": 199 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 43.509516777992324, + "learning_rate": 3.552128903233363e-05, + "loss": 1.1532, + "step": 200 + }, + { + "epoch": 0.5207253886010362, + "grad_norm": 38.18164449834795, + "learning_rate": 3.54670135541946e-05, + "loss": 1.1142, + "step": 201 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 48.576743289054534, + "learning_rate": 3.541245318200318e-05, + "loss": 1.1152, + "step": 202 + }, + { + "epoch": 0.5259067357512953, + "grad_norm": 38.65411737007163, + "learning_rate": 3.5357608920732e-05, + "loss": 1.1607, + "step": 203 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 35.663493907396834, + "learning_rate": 3.530248178058282e-05, + "loss": 1.1273, + "step": 204 + }, + { + "epoch": 0.5310880829015544, + "grad_norm": 26.829817821665976, + "learning_rate": 3.5247072776967805e-05, + "loss": 1.1174, + "step": 205 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 39.79604912152638, + "learning_rate": 3.519138293049097e-05, + "loss": 1.1811, + "step": 206 + }, + { + "epoch": 0.5362694300518135, + "grad_norm": 32.26179097390416, + "learning_rate": 3.513541326692925e-05, + "loss": 1.1346, + "step": 207 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 24.35769329902787, + "learning_rate": 3.5079164817213684e-05, + "loss": 1.1061, + "step": 208 + }, + { + "epoch": 0.5414507772020726, + "grad_norm": 26.645546258363844, + "learning_rate": 3.5022638617410396e-05, + "loss": 1.0514, + "step": 209 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 105.19676603444857, + "learning_rate": 3.496583570870152e-05, + "loss": 1.1474, + "step": 210 + }, + { + "epoch": 0.5466321243523317, + "grad_norm": 61.600623030405885, + "learning_rate": 3.4908757137366006e-05, + "loss": 1.104, + "step": 211 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 31.65460129853052, + "learning_rate": 3.485140395476038e-05, + "loss": 1.0737, + "step": 212 + }, + { + "epoch": 0.5518134715025906, + "grad_norm": 26.860379117211497, + "learning_rate": 3.4793777217299346e-05, + "loss": 1.1119, + "step": 213 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 39.89324262309783, + "learning_rate": 3.473587798643633e-05, + "loss": 1.1626, + "step": 214 + }, + { + "epoch": 0.5569948186528497, + "grad_norm": 39.77638257731599, + "learning_rate": 3.467770732864399e-05, + "loss": 1.1545, + "step": 215 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 30.994657564291458, + "learning_rate": 3.461926631539445e-05, + "loss": 1.1646, + "step": 216 + }, + { + "epoch": 0.5621761658031088, + "grad_norm": 51.99674092516571, + "learning_rate": 3.4560556023139695e-05, + "loss": 1.1638, + "step": 217 + }, + { + "epoch": 0.5647668393782384, + "grad_norm": 58.5132713002146, + "learning_rate": 3.450157753329166e-05, + "loss": 1.1461, + "step": 218 + }, + { + "epoch": 0.5673575129533679, + "grad_norm": 30.712469030418482, + "learning_rate": 3.4442331932202326e-05, + "loss": 1.1583, + "step": 219 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 47.00217426642832, + "learning_rate": 3.438282031114374e-05, + "loss": 1.1154, + "step": 220 + }, + { + "epoch": 0.572538860103627, + "grad_norm": 37.33927961163222, + "learning_rate": 3.432304376628787e-05, + "loss": 1.1372, + "step": 221 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 28.858636933974392, + "learning_rate": 3.4263003398686464e-05, + "loss": 1.0488, + "step": 222 + }, + { + "epoch": 0.5777202072538861, + "grad_norm": 37.842230890171486, + "learning_rate": 3.420270031425072e-05, + "loss": 1.1892, + "step": 223 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 32.65394945357516, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.1218, + "step": 224 + }, + { + "epoch": 0.582901554404145, + "grad_norm": 115.22040829465772, + "learning_rate": 3.4081310442696114e-05, + "loss": 1.1546, + "step": 225 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 31.20514468446119, + "learning_rate": 3.402022589151325e-05, + "loss": 1.0969, + "step": 226 + }, + { + "epoch": 0.5880829015544041, + "grad_norm": 52.8397361926395, + "learning_rate": 3.395888309532687e-05, + "loss": 1.1218, + "step": 227 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 51.7991692917308, + "learning_rate": 3.3897283184038215e-05, + "loss": 1.1395, + "step": 228 + }, + { + "epoch": 0.5932642487046632, + "grad_norm": 33.56775233970504, + "learning_rate": 3.3835427292284445e-05, + "loss": 1.1107, + "step": 229 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 46.081120788214314, + "learning_rate": 3.3773316559417734e-05, + "loss": 1.1472, + "step": 230 + }, + { + "epoch": 0.5984455958549223, + "grad_norm": 41.72558170492288, + "learning_rate": 3.371095212948431e-05, + "loss": 1.1871, + "step": 231 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 34.27957927587091, + "learning_rate": 3.364833515120336e-05, + "loss": 1.1376, + "step": 232 + }, + { + "epoch": 0.6036269430051814, + "grad_norm": 36.58452602010953, + "learning_rate": 3.358546677794586e-05, + "loss": 1.1885, + "step": 233 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 28.010809914189192, + "learning_rate": 3.352234816771337e-05, + "loss": 1.102, + "step": 234 + }, + { + "epoch": 0.6088082901554405, + "grad_norm": 24.78419558611963, + "learning_rate": 3.3458980483116664e-05, + "loss": 1.0818, + "step": 235 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 28.12830040081226, + "learning_rate": 3.3395364891354316e-05, + "loss": 1.1862, + "step": 236 + }, + { + "epoch": 0.6139896373056994, + "grad_norm": 37.94181651161551, + "learning_rate": 3.333150256419127e-05, + "loss": 1.147, + "step": 237 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 21.809518482701854, + "learning_rate": 3.3267394677937134e-05, + "loss": 1.0994, + "step": 238 + }, + { + "epoch": 0.6191709844559585, + "grad_norm": 32.12135773753589, + "learning_rate": 3.320304241342464e-05, + "loss": 1.1531, + "step": 239 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 51.959731073524054, + "learning_rate": 3.31384469559878e-05, + "loss": 1.1717, + "step": 240 + }, + { + "epoch": 0.6243523316062176, + "grad_norm": 28.045815836372345, + "learning_rate": 3.307360949544012e-05, + "loss": 1.1814, + "step": 241 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 39.55208384578746, + "learning_rate": 3.300853122605268e-05, + "loss": 1.1483, + "step": 242 + }, + { + "epoch": 0.6295336787564767, + "grad_norm": 29.799974205160808, + "learning_rate": 3.294321334653213e-05, + "loss": 1.1838, + "step": 243 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 124.31035254102245, + "learning_rate": 3.2877657059998584e-05, + "loss": 1.0698, + "step": 244 + }, + { + "epoch": 0.6347150259067358, + "grad_norm": 37.989925180187655, + "learning_rate": 3.281186357396351e-05, + "loss": 1.0984, + "step": 245 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 55.72599333657572, + "learning_rate": 3.274583410030745e-05, + "loss": 1.2333, + "step": 246 + }, + { + "epoch": 0.6398963730569949, + "grad_norm": 46.77079456439719, + "learning_rate": 3.267956985525774e-05, + "loss": 1.2157, + "step": 247 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 33.62329915252562, + "learning_rate": 3.261307205936603e-05, + "loss": 1.1752, + "step": 248 + }, + { + "epoch": 0.6450777202072538, + "grad_norm": 34.11794183225494, + "learning_rate": 3.2546341937485884e-05, + "loss": 1.1265, + "step": 249 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 36.027636323913896, + "learning_rate": 3.247938071875017e-05, + "loss": 1.103, + "step": 250 + }, + { + "epoch": 0.6502590673575129, + "grad_norm": 35.393219337329946, + "learning_rate": 3.2412189636548456e-05, + "loss": 1.1148, + "step": 251 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 31.578919022569924, + "learning_rate": 3.234476992850425e-05, + "loss": 1.1149, + "step": 252 + }, + { + "epoch": 0.655440414507772, + "grad_norm": 28.93717647736964, + "learning_rate": 3.227712283645224e-05, + "loss": 1.1425, + "step": 253 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 34.170026750703684, + "learning_rate": 3.2209249606415394e-05, + "loss": 1.1591, + "step": 254 + }, + { + "epoch": 0.6606217616580311, + "grad_norm": 27.52194954061608, + "learning_rate": 3.214115148858201e-05, + "loss": 1.1704, + "step": 255 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 81.65404753769732, + "learning_rate": 3.207282973728273e-05, + "loss": 1.161, + "step": 256 + }, + { + "epoch": 0.6658031088082902, + "grad_norm": 57.45351536522683, + "learning_rate": 3.200428561096737e-05, + "loss": 1.116, + "step": 257 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 30.968529074463714, + "learning_rate": 3.193552037218179e-05, + "loss": 1.1265, + "step": 258 + }, + { + "epoch": 0.6709844559585493, + "grad_norm": 37.8817748068655, + "learning_rate": 3.186653528754464e-05, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 29.197031189172545, + "learning_rate": 3.179733162772398e-05, + "loss": 1.1045, + "step": 260 + }, + { + "epoch": 0.6761658031088082, + "grad_norm": 36.56253841299107, + "learning_rate": 3.172791066741392e-05, + "loss": 1.1539, + "step": 261 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 25.799921116950998, + "learning_rate": 3.165827368531113e-05, + "loss": 1.0796, + "step": 262 + }, + { + "epoch": 0.6813471502590673, + "grad_norm": 82.81825216532526, + "learning_rate": 3.1588421964091276e-05, + "loss": 1.142, + "step": 263 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 31.100074747569124, + "learning_rate": 3.151835679038542e-05, + "loss": 1.0908, + "step": 264 + }, + { + "epoch": 0.6865284974093264, + "grad_norm": 25.57297200703221, + "learning_rate": 3.14480794547563e-05, + "loss": 1.1436, + "step": 265 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 23.92492773149328, + "learning_rate": 3.137759125167455e-05, + "loss": 1.1202, + "step": 266 + }, + { + "epoch": 0.6917098445595855, + "grad_norm": 22.14274360766396, + "learning_rate": 3.130689347949486e-05, + "loss": 1.1113, + "step": 267 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 26.68725288649902, + "learning_rate": 3.123598744043211e-05, + "loss": 1.1517, + "step": 268 + }, + { + "epoch": 0.6968911917098446, + "grad_norm": 25.559817524659362, + "learning_rate": 3.1164874440537295e-05, + "loss": 1.0976, + "step": 269 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 28.89996834100355, + "learning_rate": 3.109355578967356e-05, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.7020725388601037, + "grad_norm": 32.09658045195569, + "learning_rate": 3.1022032801492e-05, + "loss": 1.1161, + "step": 271 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 30.623705646213768, + "learning_rate": 3.095030679340751e-05, + "loss": 1.1993, + "step": 272 + }, + { + "epoch": 0.7072538860103627, + "grad_norm": 41.71263710932429, + "learning_rate": 3.0878379086574494e-05, + "loss": 1.1624, + "step": 273 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 34.68352639470226, + "learning_rate": 3.0806251005862535e-05, + "loss": 1.1156, + "step": 274 + }, + { + "epoch": 0.7124352331606217, + "grad_norm": 23.52580702428812, + "learning_rate": 3.073392387983202e-05, + "loss": 1.0963, + "step": 275 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 28.10687988214902, + "learning_rate": 3.0661399040709584e-05, + "loss": 1.1095, + "step": 276 + }, + { + "epoch": 0.7176165803108808, + "grad_norm": 66.72288729975841, + "learning_rate": 3.05886778243637e-05, + "loss": 1.0865, + "step": 277 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 25.775217430321934, + "learning_rate": 3.051576157027998e-05, + "loss": 1.1058, + "step": 278 + }, + { + "epoch": 0.7227979274611399, + "grad_norm": 36.82942099016794, + "learning_rate": 3.0442651621536502e-05, + "loss": 1.1211, + "step": 279 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 27.878820856521013, + "learning_rate": 3.0369349324779115e-05, + "loss": 1.1471, + "step": 280 + }, + { + "epoch": 0.727979274611399, + "grad_norm": 31.293156717285573, + "learning_rate": 3.0295856030196618e-05, + "loss": 1.0748, + "step": 281 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 39.315952115194435, + "learning_rate": 3.022217309149588e-05, + "loss": 1.0993, + "step": 282 + }, + { + "epoch": 0.7331606217616581, + "grad_norm": 36.79954071435495, + "learning_rate": 3.0148301865876913e-05, + "loss": 1.1045, + "step": 283 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 26.127389502147167, + "learning_rate": 3.0074243714007875e-05, + "loss": 1.1424, + "step": 284 + }, + { + "epoch": 0.7383419689119171, + "grad_norm": 25.608778060317068, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1055, + "step": 285 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 36.22629669671894, + "learning_rate": 2.992557209138249e-05, + "loss": 1.0845, + "step": 286 + }, + { + "epoch": 0.7435233160621761, + "grad_norm": 35.30642111132886, + "learning_rate": 2.9850961359077293e-05, + "loss": 1.204, + "step": 287 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 29.765894622087952, + "learning_rate": 2.977616917737388e-05, + "loss": 1.168, + "step": 288 + }, + { + "epoch": 0.7487046632124352, + "grad_norm": 27.194683587397567, + "learning_rate": 2.9701196923903927e-05, + "loss": 1.1236, + "step": 289 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 63.09779240191165, + "learning_rate": 2.9626045979615928e-05, + "loss": 1.1395, + "step": 290 + }, + { + "epoch": 0.7538860103626943, + "grad_norm": 25.014233377763066, + "learning_rate": 2.9550717728749768e-05, + "loss": 1.1054, + "step": 291 + }, + { + "epoch": 0.7538860103626943, + "eval_loss": 1.0996382236480713, + "eval_runtime": 37.9545, + "eval_samples_per_second": 19.602, + "eval_steps_per_second": 1.238, + "step": 291 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 27.481891737318097, + "learning_rate": 2.947521355881122e-05, + "loss": 1.1252, + "step": 292 + }, + { + "epoch": 0.7590673575129534, + "grad_norm": 67.57807413949878, + "learning_rate": 2.9399534860546404e-05, + "loss": 1.1761, + "step": 293 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 65.66834495909988, + "learning_rate": 2.932368302791614e-05, + "loss": 1.0551, + "step": 294 + }, + { + "epoch": 0.7642487046632125, + "grad_norm": 30.051210942517116, + "learning_rate": 2.92476594580703e-05, + "loss": 1.138, + "step": 295 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 22.693089678510507, + "learning_rate": 2.917146555132206e-05, + "loss": 1.1495, + "step": 296 + }, + { + "epoch": 0.7694300518134715, + "grad_norm": 53.84166280540606, + "learning_rate": 2.909510271112212e-05, + "loss": 1.1409, + "step": 297 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 32.69106061524578, + "learning_rate": 2.9018572344032823e-05, + "loss": 1.1709, + "step": 298 + }, + { + "epoch": 0.7746113989637305, + "grad_norm": 39.44484991312582, + "learning_rate": 2.8941875859702283e-05, + "loss": 1.1138, + "step": 299 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 31.51857596969122, + "learning_rate": 2.88650146708384e-05, + "loss": 1.1931, + "step": 300 + }, + { + "epoch": 0.7797927461139896, + "grad_norm": 70.51218412614058, + "learning_rate": 2.878799019318283e-05, + "loss": 1.155, + "step": 301 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 80.27969224752457, + "learning_rate": 2.8710803845484955e-05, + "loss": 1.1425, + "step": 302 + }, + { + "epoch": 0.7849740932642487, + "grad_norm": 28.16560857981767, + "learning_rate": 2.8633457049475678e-05, + "loss": 1.1072, + "step": 303 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 41.15138307552231, + "learning_rate": 2.855595122984129e-05, + "loss": 1.1492, + "step": 304 + }, + { + "epoch": 0.7901554404145078, + "grad_norm": 23.894217282116276, + "learning_rate": 2.847828781419722e-05, + "loss": 1.1136, + "step": 305 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 25.005501120810248, + "learning_rate": 2.8400468233061708e-05, + "loss": 1.0921, + "step": 306 + }, + { + "epoch": 0.7953367875647669, + "grad_norm": 30.91791938195468, + "learning_rate": 2.832249391982949e-05, + "loss": 1.1098, + "step": 307 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 44.776563922922726, + "learning_rate": 2.8244366310745398e-05, + "loss": 1.1845, + "step": 308 + }, + { + "epoch": 0.8005181347150259, + "grad_norm": 19.059329544784376, + "learning_rate": 2.816608684487787e-05, + "loss": 1.169, + "step": 309 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 63.97334641962602, + "learning_rate": 2.8087656964092472e-05, + "loss": 1.124, + "step": 310 + }, + { + "epoch": 0.805699481865285, + "grad_norm": 30.878848859015882, + "learning_rate": 2.8009078113025335e-05, + "loss": 1.2087, + "step": 311 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 34.63835471543836, + "learning_rate": 2.7930351739056533e-05, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.810880829015544, + "grad_norm": 30.03178182445718, + "learning_rate": 2.7851479292283442e-05, + "loss": 1.1321, + "step": 313 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 38.42236523356876, + "learning_rate": 2.7772462225494013e-05, + "loss": 1.1557, + "step": 314 + }, + { + "epoch": 0.8160621761658031, + "grad_norm": 39.179683790956744, + "learning_rate": 2.7693301994140026e-05, + "loss": 1.1201, + "step": 315 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 38.32243159447327, + "learning_rate": 2.761400005631028e-05, + "loss": 1.1105, + "step": 316 + }, + { + "epoch": 0.8212435233160622, + "grad_norm": 39.913808227411835, + "learning_rate": 2.7534557872703705e-05, + "loss": 1.1598, + "step": 317 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 69.73521867812421, + "learning_rate": 2.7454976906602513e-05, + "loss": 1.1145, + "step": 318 + }, + { + "epoch": 0.8264248704663213, + "grad_norm": 65.55887588207746, + "learning_rate": 2.7375258623845207e-05, + "loss": 1.1255, + "step": 319 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 30.980111545641563, + "learning_rate": 2.7295404492799575e-05, + "loss": 1.122, + "step": 320 + }, + { + "epoch": 0.8316062176165803, + "grad_norm": 30.12179911444832, + "learning_rate": 2.721541598433567e-05, + "loss": 1.113, + "step": 321 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 28.329434659508582, + "learning_rate": 2.7135294571798706e-05, + "loss": 1.0498, + "step": 322 + }, + { + "epoch": 0.8367875647668394, + "grad_norm": 25.114787597049578, + "learning_rate": 2.70550417309819e-05, + "loss": 1.0633, + "step": 323 + }, + { + "epoch": 0.8393782383419689, + "grad_norm": 27.754037709590385, + "learning_rate": 2.6974658940099337e-05, + "loss": 1.1585, + "step": 324 + }, + { + "epoch": 0.8419689119170984, + "grad_norm": 29.489888159179444, + "learning_rate": 2.6894147679758678e-05, + "loss": 1.1259, + "step": 325 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 24.426102194202898, + "learning_rate": 2.6813509432933957e-05, + "loss": 1.1515, + "step": 326 + }, + { + "epoch": 0.8471502590673575, + "grad_norm": 24.75197483331429, + "learning_rate": 2.673274568493821e-05, + "loss": 1.15, + "step": 327 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 40.604864626683366, + "learning_rate": 2.6651857923396132e-05, + "loss": 1.1219, + "step": 328 + }, + { + "epoch": 0.8523316062176166, + "grad_norm": 34.694568404196026, + "learning_rate": 2.6570847638216698e-05, + "loss": 1.103, + "step": 329 + }, + { + "epoch": 0.8549222797927462, + "grad_norm": 48.715136403425035, + "learning_rate": 2.648971632156569e-05, + "loss": 1.1675, + "step": 330 + }, + { + "epoch": 0.8575129533678757, + "grad_norm": 97.77526410121799, + "learning_rate": 2.6408465467838225e-05, + "loss": 1.1502, + "step": 331 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 54.697215318949276, + "learning_rate": 2.632709657363124e-05, + "loss": 1.1446, + "step": 332 + }, + { + "epoch": 0.8626943005181347, + "grad_norm": 38.09192002041798, + "learning_rate": 2.6245611137715897e-05, + "loss": 1.1333, + "step": 333 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 46.713623556984956, + "learning_rate": 2.6164010661010007e-05, + "loss": 1.1252, + "step": 334 + }, + { + "epoch": 0.8678756476683938, + "grad_norm": 46.40552686286593, + "learning_rate": 2.6082296646550364e-05, + "loss": 1.121, + "step": 335 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 37.57424454065957, + "learning_rate": 2.6000470599465065e-05, + "loss": 1.1671, + "step": 336 + }, + { + "epoch": 0.8730569948186528, + "grad_norm": 38.580777053099204, + "learning_rate": 2.5918534026945787e-05, + "loss": 1.0849, + "step": 337 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 154.3106712010981, + "learning_rate": 2.5836488438220044e-05, + "loss": 1.0663, + "step": 338 + }, + { + "epoch": 0.8782383419689119, + "grad_norm": 34.21394067951015, + "learning_rate": 2.575433534452334e-05, + "loss": 1.0895, + "step": 339 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 36.291611242733886, + "learning_rate": 2.5672076259071385e-05, + "loss": 1.1242, + "step": 340 + }, + { + "epoch": 0.883419689119171, + "grad_norm": 29.411623389655112, + "learning_rate": 2.558971269703219e-05, + "loss": 1.1005, + "step": 341 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 30.24903086761753, + "learning_rate": 2.5507246175498174e-05, + "loss": 1.1134, + "step": 342 + }, + { + "epoch": 0.8886010362694301, + "grad_norm": 22.032293114161938, + "learning_rate": 2.5424678213458202e-05, + "loss": 1.1121, + "step": 343 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 34.997361528376956, + "learning_rate": 2.5342010331769635e-05, + "loss": 1.1341, + "step": 344 + }, + { + "epoch": 0.8937823834196891, + "grad_norm": 28.212824875732352, + "learning_rate": 2.5259244053130295e-05, + "loss": 1.0748, + "step": 345 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 23.870011592985897, + "learning_rate": 2.5176380902050418e-05, + "loss": 1.0643, + "step": 346 + }, + { + "epoch": 0.8989637305699482, + "grad_norm": 26.10018699309748, + "learning_rate": 2.5093422404824574e-05, + "loss": 1.1662, + "step": 347 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 30.191468778559166, + "learning_rate": 2.5010370089503578e-05, + "loss": 1.1023, + "step": 348 + }, + { + "epoch": 0.9041450777202072, + "grad_norm": 55.799581973427415, + "learning_rate": 2.4927225485866297e-05, + "loss": 1.1538, + "step": 349 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 35.7030284720465, + "learning_rate": 2.4843990125391516e-05, + "loss": 1.1, + "step": 350 + }, + { + "epoch": 0.9093264248704663, + "grad_norm": 28.61763302791738, + "learning_rate": 2.4760665541229712e-05, + "loss": 1.0914, + "step": 351 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 33.34233685155311, + "learning_rate": 2.467725326817481e-05, + "loss": 1.0862, + "step": 352 + }, + { + "epoch": 0.9145077720207254, + "grad_norm": 25.441052078480084, + "learning_rate": 2.4593754842635917e-05, + "loss": 1.1422, + "step": 353 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 24.217974454985058, + "learning_rate": 2.451017180260902e-05, + "loss": 1.132, + "step": 354 + }, + { + "epoch": 0.9196891191709845, + "grad_norm": 57.986011465793155, + "learning_rate": 2.4426505687648653e-05, + "loss": 1.2082, + "step": 355 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 34.058264716876195, + "learning_rate": 2.4342758038839573e-05, + "loss": 1.1679, + "step": 356 + }, + { + "epoch": 0.9248704663212435, + "grad_norm": 28.621514922275253, + "learning_rate": 2.4258930398768317e-05, + "loss": 1.1319, + "step": 357 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 35.33355417283227, + "learning_rate": 2.4175024311494835e-05, + "loss": 1.0705, + "step": 358 + }, + { + "epoch": 0.9300518134715026, + "grad_norm": 46.579572933583265, + "learning_rate": 2.4091041322524023e-05, + "loss": 1.0842, + "step": 359 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 35.494740787672974, + "learning_rate": 2.4006982978777263e-05, + "loss": 1.1072, + "step": 360 + }, + { + "epoch": 0.9352331606217616, + "grad_norm": 44.56606839509262, + "learning_rate": 2.392285082856394e-05, + "loss": 1.1125, + "step": 361 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 46.26363869084929, + "learning_rate": 2.3838646421552917e-05, + "loss": 1.1268, + "step": 362 + }, + { + "epoch": 0.9404145077720207, + "grad_norm": 89.17676267680146, + "learning_rate": 2.3754371308743975e-05, + "loss": 1.0893, + "step": 363 + }, + { + "epoch": 0.9430051813471503, + "grad_norm": 34.87700187494181, + "learning_rate": 2.367002704243927e-05, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.9455958549222798, + "grad_norm": 32.92806939217504, + "learning_rate": 2.3585615176214716e-05, + "loss": 1.1488, + "step": 365 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 27.27458755248548, + "learning_rate": 2.3501137264891396e-05, + "loss": 1.0874, + "step": 366 + }, + { + "epoch": 0.9507772020725389, + "grad_norm": 24.959123789739834, + "learning_rate": 2.3416594864506887e-05, + "loss": 1.1783, + "step": 367 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 31.838670988369724, + "learning_rate": 2.333198953228664e-05, + "loss": 1.0759, + "step": 368 + }, + { + "epoch": 0.9559585492227979, + "grad_norm": 28.112870222863155, + "learning_rate": 2.3247322826615276e-05, + "loss": 1.1481, + "step": 369 + }, + { + "epoch": 0.9585492227979274, + "grad_norm": 35.08461098450067, + "learning_rate": 2.316259630700787e-05, + "loss": 1.0953, + "step": 370 + }, + { + "epoch": 0.961139896373057, + "grad_norm": 37.80899503618479, + "learning_rate": 2.307781153408124e-05, + "loss": 1.1224, + "step": 371 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 31.644978122007387, + "learning_rate": 2.2992970069525202e-05, + "loss": 1.1608, + "step": 372 + }, + { + "epoch": 0.966321243523316, + "grad_norm": 23.51029318210938, + "learning_rate": 2.29080734760738e-05, + "loss": 1.0914, + "step": 373 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 28.97240481418573, + "learning_rate": 2.2823123317476522e-05, + "loss": 1.1117, + "step": 374 + }, + { + "epoch": 0.9715025906735751, + "grad_norm": 36.613893678320395, + "learning_rate": 2.273812115846951e-05, + "loss": 1.1118, + "step": 375 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 26.402979304578093, + "learning_rate": 2.2653068564746692e-05, + "loss": 1.13, + "step": 376 + }, + { + "epoch": 0.9766839378238342, + "grad_norm": 114.3000444613392, + "learning_rate": 2.2567967102931025e-05, + "loss": 1.1539, + "step": 377 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 26.861359932396834, + "learning_rate": 2.2482818340545534e-05, + "loss": 1.0566, + "step": 378 + }, + { + "epoch": 0.9818652849740933, + "grad_norm": 32.75509374223994, + "learning_rate": 2.2397623845984548e-05, + "loss": 1.1746, + "step": 379 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 34.11964206838379, + "learning_rate": 2.2312385188484718e-05, + "loss": 1.0834, + "step": 380 + }, + { + "epoch": 0.9870466321243523, + "grad_norm": 38.019564122226434, + "learning_rate": 2.2227103938096176e-05, + "loss": 1.1074, + "step": 381 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 39.5073811375391, + "learning_rate": 2.2141781665653584e-05, + "loss": 1.1082, + "step": 382 + }, + { + "epoch": 0.9922279792746114, + "grad_norm": 298.4258332795163, + "learning_rate": 2.205641994274721e-05, + "loss": 1.125, + "step": 383 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 36.444415670935506, + "learning_rate": 2.1971020341693973e-05, + "loss": 1.0935, + "step": 384 + }, + { + "epoch": 0.9974093264248705, + "grad_norm": 28.96533429210575, + "learning_rate": 2.188558443550849e-05, + "loss": 1.0957, + "step": 385 + }, + { + "epoch": 1.0, + "grad_norm": 66.41241684127401, + "learning_rate": 2.180011379787411e-05, + "loss": 1.1335, + "step": 386 + }, + { + "epoch": 1.0025906735751295, + "grad_norm": 28.75549619538953, + "learning_rate": 2.1714610003113887e-05, + "loss": 1.1316, + "step": 387 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 26.911837500852275, + "learning_rate": 2.1629074626161647e-05, + "loss": 1.1026, + "step": 388 + }, + { + "epoch": 1.005181347150259, + "eval_loss": 1.0908173322677612, + "eval_runtime": 37.7642, + "eval_samples_per_second": 19.701, + "eval_steps_per_second": 1.245, + "step": 388 + }, + { + "epoch": 1.0077720207253886, + "grad_norm": 34.28722746775385, + "learning_rate": 2.1543509242532932e-05, + "loss": 1.1104, + "step": 389 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 37.97709310694863, + "learning_rate": 2.145791542829597e-05, + "loss": 1.0663, + "step": 390 + }, + { + "epoch": 1.0129533678756477, + "grad_norm": 39.379668162327384, + "learning_rate": 2.1372294760042686e-05, + "loss": 1.1405, + "step": 391 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 27.136201219298698, + "learning_rate": 2.1286648814859636e-05, + "loss": 1.0963, + "step": 392 + }, + { + "epoch": 1.0181347150259068, + "grad_norm": 39.34261641469313, + "learning_rate": 2.120097917029897e-05, + "loss": 1.1276, + "step": 393 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 46.77583801285328, + "learning_rate": 2.1115287404349357e-05, + "loss": 1.1171, + "step": 394 + }, + { + "epoch": 1.0233160621761659, + "grad_norm": 55.10335066695868, + "learning_rate": 2.1029575095406933e-05, + "loss": 1.0831, + "step": 395 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 76.88533851789373, + "learning_rate": 2.0943843822246234e-05, + "loss": 1.0925, + "step": 396 + }, + { + "epoch": 1.028497409326425, + "grad_norm": 29.604569209708462, + "learning_rate": 2.0858095163991094e-05, + "loss": 1.1259, + "step": 397 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 37.71348366628868, + "learning_rate": 2.077233070008557e-05, + "loss": 1.0792, + "step": 398 + }, + { + "epoch": 1.0336787564766838, + "grad_norm": 26.866133194031644, + "learning_rate": 2.0686552010264872e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 35.739274800620635, + "learning_rate": 2.060076067452622e-05, + "loss": 1.0837, + "step": 400 + }, + { + "epoch": 1.038860103626943, + "grad_norm": 24.479129391259896, + "learning_rate": 2.0514958273099778e-05, + "loss": 1.073, + "step": 401 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 50.49963650108008, + "learning_rate": 2.042914638641952e-05, + "loss": 1.0912, + "step": 402 + }, + { + "epoch": 1.044041450777202, + "grad_norm": 35.6875451072032, + "learning_rate": 2.0343326595094154e-05, + "loss": 1.0936, + "step": 403 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 30.212298193414487, + "learning_rate": 2.0257500479877965e-05, + "loss": 1.089, + "step": 404 + }, + { + "epoch": 1.049222797927461, + "grad_norm": 28.65828720015124, + "learning_rate": 2.0171669621641743e-05, + "loss": 1.1727, + "step": 405 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 39.2199058392425, + "learning_rate": 2.0085835601343627e-05, + "loss": 1.1493, + "step": 406 + }, + { + "epoch": 1.0544041450777202, + "grad_norm": 110.01204177059546, + "learning_rate": 2e-05, + "loss": 1.1245, + "step": 407 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 43.427381349600374, + "learning_rate": 1.9914164398656383e-05, + "loss": 1.1183, + "step": 408 + }, + { + "epoch": 1.0595854922279793, + "grad_norm": 64.78768909817894, + "learning_rate": 1.9828330378358264e-05, + "loss": 1.1528, + "step": 409 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 26.50257915912425, + "learning_rate": 1.974249952012204e-05, + "loss": 1.1568, + "step": 410 + }, + { + "epoch": 1.0647668393782384, + "grad_norm": 27.63159204178893, + "learning_rate": 1.9656673404905852e-05, + "loss": 1.1071, + "step": 411 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 27.0795355533723, + "learning_rate": 1.957085361358049e-05, + "loss": 1.0809, + "step": 412 + }, + { + "epoch": 1.0699481865284974, + "grad_norm": 41.84795332660821, + "learning_rate": 1.9485041726900232e-05, + "loss": 1.0744, + "step": 413 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 143.2109134427192, + "learning_rate": 1.939923932547379e-05, + "loss": 1.0905, + "step": 414 + }, + { + "epoch": 1.0751295336787565, + "grad_norm": 89.55384065946154, + "learning_rate": 1.931344798973513e-05, + "loss": 1.1012, + "step": 415 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 31.072074793068015, + "learning_rate": 1.922766929991443e-05, + "loss": 1.1141, + "step": 416 + }, + { + "epoch": 1.0803108808290156, + "grad_norm": 29.82683189045969, + "learning_rate": 1.914190483600891e-05, + "loss": 1.0842, + "step": 417 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 30.09708662586305, + "learning_rate": 1.9056156177753776e-05, + "loss": 1.1088, + "step": 418 + }, + { + "epoch": 1.0854922279792747, + "grad_norm": 27.637437518920503, + "learning_rate": 1.897042490459307e-05, + "loss": 1.058, + "step": 419 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 69.34285700381683, + "learning_rate": 1.8884712595650653e-05, + "loss": 1.0314, + "step": 420 + }, + { + "epoch": 1.0906735751295338, + "grad_norm": 25.644927284592956, + "learning_rate": 1.8799020829701036e-05, + "loss": 1.0916, + "step": 421 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 30.3898986852319, + "learning_rate": 1.871335118514037e-05, + "loss": 1.0797, + "step": 422 + }, + { + "epoch": 1.0958549222797926, + "grad_norm": 22.271334693423444, + "learning_rate": 1.862770523995732e-05, + "loss": 1.1134, + "step": 423 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 35.85874616678876, + "learning_rate": 1.854208457170404e-05, + "loss": 1.0927, + "step": 424 + }, + { + "epoch": 1.1010362694300517, + "grad_norm": 43.06832041948097, + "learning_rate": 1.8456490757467075e-05, + "loss": 1.093, + "step": 425 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 37.83777637993467, + "learning_rate": 1.8370925373838356e-05, + "loss": 1.1268, + "step": 426 + }, + { + "epoch": 1.1062176165803108, + "grad_norm": 23.798059023605177, + "learning_rate": 1.8285389996886113e-05, + "loss": 1.0989, + "step": 427 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 25.443104465500795, + "learning_rate": 1.8199886202125897e-05, + "loss": 1.0581, + "step": 428 + }, + { + "epoch": 1.11139896373057, + "grad_norm": 23.76241444847441, + "learning_rate": 1.8114415564491513e-05, + "loss": 1.0908, + "step": 429 + }, + { + "epoch": 1.1139896373056994, + "grad_norm": 26.5600693044426, + "learning_rate": 1.8028979658306033e-05, + "loss": 1.1321, + "step": 430 + }, + { + "epoch": 1.116580310880829, + "grad_norm": 44.854375199828986, + "learning_rate": 1.794358005725279e-05, + "loss": 1.0762, + "step": 431 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 28.05797777410846, + "learning_rate": 1.785821833434642e-05, + "loss": 1.0698, + "step": 432 + }, + { + "epoch": 1.121761658031088, + "grad_norm": 26.488479630212364, + "learning_rate": 1.7772896061903824e-05, + "loss": 1.1223, + "step": 433 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 32.77084542157883, + "learning_rate": 1.768761481151529e-05, + "loss": 1.0984, + "step": 434 + }, + { + "epoch": 1.1269430051813472, + "grad_norm": 39.13198413130026, + "learning_rate": 1.7602376154015456e-05, + "loss": 1.1551, + "step": 435 + }, + { + "epoch": 1.1295336787564767, + "grad_norm": 23.878966995283953, + "learning_rate": 1.751718165945447e-05, + "loss": 1.1133, + "step": 436 + }, + { + "epoch": 1.1321243523316062, + "grad_norm": 33.90472985566232, + "learning_rate": 1.743203289706898e-05, + "loss": 1.1219, + "step": 437 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 23.340369938533712, + "learning_rate": 1.734693143525331e-05, + "loss": 1.1244, + "step": 438 + }, + { + "epoch": 1.1373056994818653, + "grad_norm": 105.6885206147852, + "learning_rate": 1.7261878841530494e-05, + "loss": 1.0788, + "step": 439 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 28.453526076458317, + "learning_rate": 1.717687668252348e-05, + "loss": 1.1576, + "step": 440 + }, + { + "epoch": 1.1424870466321244, + "grad_norm": 36.1473991485961, + "learning_rate": 1.7091926523926205e-05, + "loss": 1.0859, + "step": 441 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 27.043461146902448, + "learning_rate": 1.7007029930474804e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 1.1476683937823835, + "grad_norm": 28.066170619981435, + "learning_rate": 1.6922188465918763e-05, + "loss": 1.1279, + "step": 443 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 38.62445822837212, + "learning_rate": 1.6837403692992136e-05, + "loss": 1.1275, + "step": 444 + }, + { + "epoch": 1.1528497409326426, + "grad_norm": 28.077258963587767, + "learning_rate": 1.6752677173384734e-05, + "loss": 1.1004, + "step": 445 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 42.1405744301338, + "learning_rate": 1.6668010467713363e-05, + "loss": 1.1141, + "step": 446 + }, + { + "epoch": 1.1580310880829017, + "grad_norm": 26.827291684301034, + "learning_rate": 1.658340513549312e-05, + "loss": 1.1216, + "step": 447 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 30.863489441619983, + "learning_rate": 1.649886273510861e-05, + "loss": 1.1898, + "step": 448 + }, + { + "epoch": 1.1632124352331605, + "grad_norm": 27.73579733476068, + "learning_rate": 1.641438482378529e-05, + "loss": 1.0971, + "step": 449 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 32.84347174567353, + "learning_rate": 1.6329972957560736e-05, + "loss": 1.0579, + "step": 450 + }, + { + "epoch": 1.1683937823834196, + "grad_norm": 30.06456192962641, + "learning_rate": 1.6245628691256032e-05, + "loss": 1.1057, + "step": 451 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 36.554506394377846, + "learning_rate": 1.616135357844709e-05, + "loss": 1.1008, + "step": 452 + }, + { + "epoch": 1.1735751295336787, + "grad_norm": 27.358643056184114, + "learning_rate": 1.6077149171436063e-05, + "loss": 1.101, + "step": 453 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 111.13373813893604, + "learning_rate": 1.599301702122274e-05, + "loss": 1.0688, + "step": 454 + }, + { + "epoch": 1.1787564766839378, + "grad_norm": 33.94168250727336, + "learning_rate": 1.590895867747599e-05, + "loss": 1.0721, + "step": 455 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 53.93978395349692, + "learning_rate": 1.582497568850517e-05, + "loss": 1.0584, + "step": 456 + }, + { + "epoch": 1.1839378238341969, + "grad_norm": 29.19245794937285, + "learning_rate": 1.574106960123169e-05, + "loss": 1.067, + "step": 457 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 28.06897801999048, + "learning_rate": 1.5657241961160434e-05, + "loss": 1.0899, + "step": 458 + }, + { + "epoch": 1.189119170984456, + "grad_norm": 52.31256652964293, + "learning_rate": 1.557349431235135e-05, + "loss": 1.0925, + "step": 459 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 65.39771110845307, + "learning_rate": 1.5489828197390988e-05, + "loss": 1.1448, + "step": 460 + }, + { + "epoch": 1.194300518134715, + "grad_norm": 27.062780348557254, + "learning_rate": 1.5406245157364093e-05, + "loss": 1.0871, + "step": 461 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 41.667025056250424, + "learning_rate": 1.5322746731825195e-05, + "loss": 1.048, + "step": 462 + }, + { + "epoch": 1.1994818652849741, + "grad_norm": 24.936669803360665, + "learning_rate": 1.5239334458770291e-05, + "loss": 1.1243, + "step": 463 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 26.65392149600558, + "learning_rate": 1.5156009874608484e-05, + "loss": 1.0919, + "step": 464 + }, + { + "epoch": 1.2046632124352332, + "grad_norm": 48.57730651937978, + "learning_rate": 1.5072774514133708e-05, + "loss": 1.1259, + "step": 465 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 31.34891257114439, + "learning_rate": 1.4989629910496424e-05, + "loss": 1.0733, + "step": 466 + }, + { + "epoch": 1.2098445595854923, + "grad_norm": 24.541559850584985, + "learning_rate": 1.4906577595175428e-05, + "loss": 1.1166, + "step": 467 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 20.4345832961354, + "learning_rate": 1.4823619097949584e-05, + "loss": 1.0916, + "step": 468 + }, + { + "epoch": 1.2150259067357512, + "grad_norm": 28.860712194727487, + "learning_rate": 1.4740755946869708e-05, + "loss": 1.1043, + "step": 469 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 25.71820242946282, + "learning_rate": 1.4657989668230363e-05, + "loss": 1.0949, + "step": 470 + }, + { + "epoch": 1.2202072538860103, + "grad_norm": 51.16994773097077, + "learning_rate": 1.4575321786541801e-05, + "loss": 1.141, + "step": 471 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 32.70442309640389, + "learning_rate": 1.4492753824501833e-05, + "loss": 1.1127, + "step": 472 + }, + { + "epoch": 1.2253886010362693, + "grad_norm": 21.913285172411495, + "learning_rate": 1.4410287302967813e-05, + "loss": 1.084, + "step": 473 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 34.45727214001296, + "learning_rate": 1.4327923740928613e-05, + "loss": 1.0836, + "step": 474 + }, + { + "epoch": 1.2305699481865284, + "grad_norm": 26.768013926034776, + "learning_rate": 1.4245664655476663e-05, + "loss": 1.1264, + "step": 475 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 28.401965255935572, + "learning_rate": 1.4163511561779956e-05, + "loss": 1.0805, + "step": 476 + }, + { + "epoch": 1.2357512953367875, + "grad_norm": 29.19935757288793, + "learning_rate": 1.4081465973054216e-05, + "loss": 1.0825, + "step": 477 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 24.55918541541201, + "learning_rate": 1.3999529400534941e-05, + "loss": 1.1164, + "step": 478 + }, + { + "epoch": 1.2409326424870466, + "grad_norm": 25.35635406268312, + "learning_rate": 1.3917703353449646e-05, + "loss": 1.1334, + "step": 479 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 45.453901005004184, + "learning_rate": 1.3835989338989996e-05, + "loss": 1.1387, + "step": 480 + }, + { + "epoch": 1.2461139896373057, + "grad_norm": 21.67852694202104, + "learning_rate": 1.375438886228411e-05, + "loss": 1.0846, + "step": 481 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 171.2474074894732, + "learning_rate": 1.3672903426368773e-05, + "loss": 1.1388, + "step": 482 + }, + { + "epoch": 1.2512953367875648, + "grad_norm": 43.18223835070906, + "learning_rate": 1.3591534532161781e-05, + "loss": 1.1483, + "step": 483 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 29.447332565856644, + "learning_rate": 1.3510283678434317e-05, + "loss": 1.07, + "step": 484 + }, + { + "epoch": 1.2564766839378239, + "grad_norm": 28.600251051615228, + "learning_rate": 1.3429152361783307e-05, + "loss": 1.0798, + "step": 485 + }, + { + "epoch": 1.2564766839378239, + "eval_loss": 1.085669755935669, + "eval_runtime": 38.1134, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.233, + "step": 485 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 47.124643074410464, + "learning_rate": 1.3348142076603876e-05, + "loss": 1.0875, + "step": 486 + }, + { + "epoch": 1.261658031088083, + "grad_norm": 42.06019726307143, + "learning_rate": 1.3267254315061797e-05, + "loss": 1.1429, + "step": 487 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 18.950734630756962, + "learning_rate": 1.318649056706605e-05, + "loss": 1.0747, + "step": 488 + }, + { + "epoch": 1.266839378238342, + "grad_norm": 31.903949502516806, + "learning_rate": 1.3105852320241326e-05, + "loss": 1.1041, + "step": 489 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 22.957473008085927, + "learning_rate": 1.3025341059900675e-05, + "loss": 1.1046, + "step": 490 + }, + { + "epoch": 1.2720207253886011, + "grad_norm": 22.325983256563678, + "learning_rate": 1.2944958269018103e-05, + "loss": 1.0643, + "step": 491 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 29.689383331974955, + "learning_rate": 1.2864705428201307e-05, + "loss": 1.0949, + "step": 492 + }, + { + "epoch": 1.2772020725388602, + "grad_norm": 25.338298442945575, + "learning_rate": 1.2784584015664337e-05, + "loss": 1.0725, + "step": 493 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 31.591732488078588, + "learning_rate": 1.2704595507200435e-05, + "loss": 1.0347, + "step": 494 + }, + { + "epoch": 1.2823834196891193, + "grad_norm": 42.96243570696118, + "learning_rate": 1.26247413761548e-05, + "loss": 1.1196, + "step": 495 + }, + { + "epoch": 1.2849740932642488, + "grad_norm": 26.559546676266024, + "learning_rate": 1.254502309339749e-05, + "loss": 1.0187, + "step": 496 + }, + { + "epoch": 1.2875647668393784, + "grad_norm": 27.58444017584016, + "learning_rate": 1.2465442127296297e-05, + "loss": 1.0985, + "step": 497 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 36.53028730423797, + "learning_rate": 1.2385999943689732e-05, + "loss": 1.068, + "step": 498 + }, + { + "epoch": 1.2927461139896372, + "grad_norm": 38.94837307599113, + "learning_rate": 1.2306698005859975e-05, + "loss": 1.0736, + "step": 499 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 36.67208266195125, + "learning_rate": 1.2227537774505996e-05, + "loss": 1.119, + "step": 500 + }, + { + "epoch": 1.2979274611398963, + "grad_norm": 31.086410648635283, + "learning_rate": 1.2148520707716567e-05, + "loss": 1.1094, + "step": 501 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 27.96977481605826, + "learning_rate": 1.2069648260943473e-05, + "loss": 1.1345, + "step": 502 + }, + { + "epoch": 1.3031088082901554, + "grad_norm": 22.89450502840197, + "learning_rate": 1.1990921886974669e-05, + "loss": 1.12, + "step": 503 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 18.54206032224653, + "learning_rate": 1.1912343035907535e-05, + "loss": 1.0929, + "step": 504 + }, + { + "epoch": 1.3082901554404145, + "grad_norm": 38.9386007237313, + "learning_rate": 1.1833913155122132e-05, + "loss": 1.1381, + "step": 505 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 37.05899458809635, + "learning_rate": 1.1755633689254609e-05, + "loss": 1.0535, + "step": 506 + }, + { + "epoch": 1.3134715025906736, + "grad_norm": 27.716372794195156, + "learning_rate": 1.1677506080170512e-05, + "loss": 1.1342, + "step": 507 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 40.42306246079416, + "learning_rate": 1.1599531766938306e-05, + "loss": 1.0887, + "step": 508 + }, + { + "epoch": 1.3186528497409327, + "grad_norm": 98.56681767405578, + "learning_rate": 1.1521712185802789e-05, + "loss": 1.0954, + "step": 509 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 34.42816933350743, + "learning_rate": 1.1444048770158718e-05, + "loss": 1.0512, + "step": 510 + }, + { + "epoch": 1.3238341968911918, + "grad_norm": 52.457523653614096, + "learning_rate": 1.136654295052433e-05, + "loss": 1.1599, + "step": 511 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 26.832339531661276, + "learning_rate": 1.1289196154515048e-05, + "loss": 1.0602, + "step": 512 + }, + { + "epoch": 1.3290155440414508, + "grad_norm": 32.746047673769816, + "learning_rate": 1.1212009806817163e-05, + "loss": 1.1544, + "step": 513 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 37.44483451702055, + "learning_rate": 1.1134985329161608e-05, + "loss": 1.1421, + "step": 514 + }, + { + "epoch": 1.33419689119171, + "grad_norm": 28.625976525737606, + "learning_rate": 1.1058124140297718e-05, + "loss": 1.0858, + "step": 515 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 38.64141195246213, + "learning_rate": 1.0981427655967183e-05, + "loss": 1.0983, + "step": 516 + }, + { + "epoch": 1.3393782383419688, + "grad_norm": 29.989753893533425, + "learning_rate": 1.0904897288877891e-05, + "loss": 1.1269, + "step": 517 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 48.63990665515511, + "learning_rate": 1.0828534448677942e-05, + "loss": 1.0844, + "step": 518 + }, + { + "epoch": 1.3445595854922279, + "grad_norm": 25.477227318250847, + "learning_rate": 1.0752340541929711e-05, + "loss": 1.0742, + "step": 519 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 26.363588814537763, + "learning_rate": 1.0676316972083867e-05, + "loss": 1.0533, + "step": 520 + }, + { + "epoch": 1.349740932642487, + "grad_norm": 34.59968737708606, + "learning_rate": 1.060046513945361e-05, + "loss": 1.0983, + "step": 521 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 52.51652561846762, + "learning_rate": 1.0524786441188786e-05, + "loss": 1.1319, + "step": 522 + }, + { + "epoch": 1.354922279792746, + "grad_norm": 21.360221214301127, + "learning_rate": 1.0449282271250239e-05, + "loss": 1.0627, + "step": 523 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 37.00053933682603, + "learning_rate": 1.0373954020384073e-05, + "loss": 1.096, + "step": 524 + }, + { + "epoch": 1.3601036269430051, + "grad_norm": 39.212240822687484, + "learning_rate": 1.029880307609608e-05, + "loss": 1.0512, + "step": 525 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 24.89842378385804, + "learning_rate": 1.0223830822626124e-05, + "loss": 1.0538, + "step": 526 + }, + { + "epoch": 1.3652849740932642, + "grad_norm": 29.14416894424653, + "learning_rate": 1.0149038640922715e-05, + "loss": 1.1538, + "step": 527 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 31.688722122648855, + "learning_rate": 1.0074427908617515e-05, + "loss": 1.171, + "step": 528 + }, + { + "epoch": 1.3704663212435233, + "grad_norm": 41.918909004413734, + "learning_rate": 1.0000000000000006e-05, + "loss": 1.1203, + "step": 529 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 26.70963454516576, + "learning_rate": 9.92575628599213e-06, + "loss": 1.0855, + "step": 530 + }, + { + "epoch": 1.3756476683937824, + "grad_norm": 24.819351173466824, + "learning_rate": 9.851698134123095e-06, + "loss": 1.0972, + "step": 531 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 22.100465399566815, + "learning_rate": 9.777826908504126e-06, + "loss": 1.08, + "step": 532 + }, + { + "epoch": 1.3808290155440415, + "grad_norm": 29.31574709406259, + "learning_rate": 9.704143969803392e-06, + "loss": 1.0835, + "step": 533 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 25.551326748473052, + "learning_rate": 9.630650675220892e-06, + "loss": 1.0396, + "step": 534 + }, + { + "epoch": 1.3860103626943006, + "grad_norm": 59.07595627892596, + "learning_rate": 9.557348378463503e-06, + "loss": 1.0814, + "step": 535 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 24.96501978981908, + "learning_rate": 9.484238429720018e-06, + "loss": 1.0187, + "step": 536 + }, + { + "epoch": 1.3911917098445596, + "grad_norm": 42.530604702279234, + "learning_rate": 9.411322175636298e-06, + "loss": 1.074, + "step": 537 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 34.91129065632851, + "learning_rate": 9.338600959290414e-06, + "loss": 1.0878, + "step": 538 + }, + { + "epoch": 1.3963730569948187, + "grad_norm": 32.07525956876426, + "learning_rate": 9.266076120167992e-06, + "loss": 1.0962, + "step": 539 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 40.18387743296675, + "learning_rate": 9.193748994137462e-06, + "loss": 1.1033, + "step": 540 + }, + { + "epoch": 1.4015544041450778, + "grad_norm": 66.68031460980451, + "learning_rate": 9.121620913425508e-06, + "loss": 1.1466, + "step": 541 + }, + { + "epoch": 1.4041450777202074, + "grad_norm": 34.07506059584738, + "learning_rate": 9.04969320659249e-06, + "loss": 1.1184, + "step": 542 + }, + { + "epoch": 1.406735751295337, + "grad_norm": 17.130845779169075, + "learning_rate": 8.977967198508001e-06, + "loss": 1.0803, + "step": 543 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 22.4457025132615, + "learning_rate": 8.906444210326441e-06, + "loss": 1.0745, + "step": 544 + }, + { + "epoch": 1.411917098445596, + "grad_norm": 73.43971735356851, + "learning_rate": 8.83512555946271e-06, + "loss": 1.0717, + "step": 545 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 38.16321297719761, + "learning_rate": 8.764012559567899e-06, + "loss": 1.1371, + "step": 546 + }, + { + "epoch": 1.4170984455958548, + "grad_norm": 56.14718024907725, + "learning_rate": 8.693106520505147e-06, + "loss": 1.0185, + "step": 547 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 53.3812598790062, + "learning_rate": 8.622408748325461e-06, + "loss": 1.0859, + "step": 548 + }, + { + "epoch": 1.422279792746114, + "grad_norm": 39.69041631433326, + "learning_rate": 8.551920545243704e-06, + "loss": 1.1146, + "step": 549 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 24.099260758984773, + "learning_rate": 8.481643209614576e-06, + "loss": 1.0968, + "step": 550 + }, + { + "epoch": 1.427461139896373, + "grad_norm": 22.623850373369237, + "learning_rate": 8.411578035908728e-06, + "loss": 1.0642, + "step": 551 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 25.343746374404027, + "learning_rate": 8.341726314688875e-06, + "loss": 1.0815, + "step": 552 + }, + { + "epoch": 1.432642487046632, + "grad_norm": 35.82641011588973, + "learning_rate": 8.272089332586089e-06, + "loss": 1.1012, + "step": 553 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 24.81161215784662, + "learning_rate": 8.20266837227603e-06, + "loss": 1.1086, + "step": 554 + }, + { + "epoch": 1.4378238341968912, + "grad_norm": 54.18243481591251, + "learning_rate": 8.133464712455364e-06, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 23.602598217141395, + "learning_rate": 8.064479627818213e-06, + "loss": 1.1519, + "step": 556 + }, + { + "epoch": 1.4430051813471503, + "grad_norm": 31.124404868409982, + "learning_rate": 7.995714389032638e-06, + "loss": 1.0705, + "step": 557 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 24.14171016995626, + "learning_rate": 7.927170262717284e-06, + "loss": 1.1083, + "step": 558 + }, + { + "epoch": 1.4481865284974094, + "grad_norm": 47.987203109917175, + "learning_rate": 7.858848511417998e-06, + "loss": 1.0836, + "step": 559 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 25.871447098066056, + "learning_rate": 7.790750393584616e-06, + "loss": 1.0787, + "step": 560 + }, + { + "epoch": 1.4533678756476685, + "grad_norm": 23.820249113937482, + "learning_rate": 7.72287716354776e-06, + "loss": 1.1165, + "step": 561 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 48.04131308947624, + "learning_rate": 7.65523007149575e-06, + "loss": 1.0819, + "step": 562 + }, + { + "epoch": 1.4585492227979275, + "grad_norm": 29.273494083692352, + "learning_rate": 7.587810363451544e-06, + "loss": 1.0302, + "step": 563 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 120.01571222366722, + "learning_rate": 7.5206192812498345e-06, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 1.4637305699481864, + "grad_norm": 33.16947662083338, + "learning_rate": 7.4536580625141244e-06, + "loss": 1.0842, + "step": 565 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 29.979556378166713, + "learning_rate": 7.386927940633981e-06, + "loss": 1.1116, + "step": 566 + }, + { + "epoch": 1.4689119170984455, + "grad_norm": 27.172344859281896, + "learning_rate": 7.32043014474227e-06, + "loss": 1.0676, + "step": 567 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 30.208548637757318, + "learning_rate": 7.254165899692554e-06, + "loss": 1.1104, + "step": 568 + }, + { + "epoch": 1.4740932642487046, + "grad_norm": 19.385421184583773, + "learning_rate": 7.188136426036498e-06, + "loss": 1.0085, + "step": 569 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 30.350787749309685, + "learning_rate": 7.12234294000143e-06, + "loss": 1.0584, + "step": 570 + }, + { + "epoch": 1.4792746113989637, + "grad_norm": 31.520305600900198, + "learning_rate": 7.056786653467882e-06, + "loss": 1.0831, + "step": 571 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 46.13006972574487, + "learning_rate": 6.991468773947321e-06, + "loss": 1.1761, + "step": 572 + }, + { + "epoch": 1.4844559585492227, + "grad_norm": 26.72340868362835, + "learning_rate": 6.926390504559879e-06, + "loss": 1.0605, + "step": 573 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 25.992965411102556, + "learning_rate": 6.861553044012206e-06, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 1.4896373056994818, + "grad_norm": 38.60187420279626, + "learning_rate": 6.796957586575364e-06, + "loss": 1.1232, + "step": 575 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 21.7618591565717, + "learning_rate": 6.732605322062869e-06, + "loss": 1.1196, + "step": 576 + }, + { + "epoch": 1.494818652849741, + "grad_norm": 28.233093007170996, + "learning_rate": 6.668497435808736e-06, + "loss": 1.1451, + "step": 577 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 28.061514297823816, + "learning_rate": 6.604635108645683e-06, + "loss": 1.0832, + "step": 578 + }, + { + "epoch": 1.5, + "grad_norm": 35.34503147975386, + "learning_rate": 6.5410195168833425e-06, + "loss": 1.118, + "step": 579 + } + ], + "logging_steps": 1, + "max_steps": 772, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 193, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0022991232499712e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}