diff --git "a/checkpoint-682/trainer_state.json" "b/checkpoint-682/trainer_state.json" deleted file mode 100644--- "a/checkpoint-682/trainer_state.json" +++ /dev/null @@ -1,4871 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.9917174177831911, - "eval_steps": 86, - "global_step": 682, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0029254022428083864, - "grad_norm": 56.61502838582006, - "learning_rate": 1.4705882352941178e-07, - "loss": 1.336, - "step": 1 - }, - { - "epoch": 0.0029254022428083864, - "eval_loss": 1.7113733291625977, - "eval_runtime": 65.4428, - "eval_samples_per_second": 14.807, - "eval_steps_per_second": 2.475, - "step": 1 - }, - { - "epoch": 0.005850804485616773, - "grad_norm": 51.81724700444768, - "learning_rate": 2.9411764705882356e-07, - "loss": 1.3698, - "step": 2 - }, - { - "epoch": 0.008776206728425159, - "grad_norm": 47.60979608727599, - "learning_rate": 4.4117647058823536e-07, - "loss": 1.34, - "step": 3 - }, - { - "epoch": 0.011701608971233545, - "grad_norm": 48.80875038414928, - "learning_rate": 5.882352941176471e-07, - "loss": 1.3392, - "step": 4 - }, - { - "epoch": 0.01462701121404193, - "grad_norm": 53.27305939885202, - "learning_rate": 7.352941176470589e-07, - "loss": 1.3409, - "step": 5 - }, - { - "epoch": 0.017552413456850317, - "grad_norm": 47.925702270624, - "learning_rate": 8.823529411764707e-07, - "loss": 1.3373, - "step": 6 - }, - { - "epoch": 0.020477815699658702, - "grad_norm": 48.37389105675583, - "learning_rate": 1.0294117647058825e-06, - "loss": 1.3137, - "step": 7 - }, - { - "epoch": 0.02340321794246709, - "grad_norm": 39.37861206526259, - "learning_rate": 1.1764705882352942e-06, - "loss": 1.3369, - "step": 8 - }, - { - "epoch": 0.026328620185275476, - "grad_norm": 25.61893601522815, - "learning_rate": 1.323529411764706e-06, - "loss": 1.3237, - "step": 9 - }, - { - "epoch": 0.02925402242808386, - "grad_norm": 24.674790384396417, - "learning_rate": 1.4705882352941177e-06, - "loss": 1.3056, - "step": 10 - }, - { - "epoch": 0.03217942467089225, - "grad_norm": 24.91679564935755, - "learning_rate": 1.6176470588235297e-06, - "loss": 1.3015, - "step": 11 - }, - { - "epoch": 0.035104826913700635, - "grad_norm": 16.05920632953936, - "learning_rate": 1.7647058823529414e-06, - "loss": 1.2387, - "step": 12 - }, - { - "epoch": 0.03803022915650902, - "grad_norm": 17.21678860344456, - "learning_rate": 1.9117647058823528e-06, - "loss": 1.2592, - "step": 13 - }, - { - "epoch": 0.040955631399317405, - "grad_norm": 9.449613433058063, - "learning_rate": 2.058823529411765e-06, - "loss": 1.2485, - "step": 14 - }, - { - "epoch": 0.04388103364212579, - "grad_norm": 6.021711489397231, - "learning_rate": 2.2058823529411767e-06, - "loss": 1.2225, - "step": 15 - }, - { - "epoch": 0.04680643588493418, - "grad_norm": 5.216307369247847, - "learning_rate": 2.3529411764705885e-06, - "loss": 1.2108, - "step": 16 - }, - { - "epoch": 0.04973183812774257, - "grad_norm": 3.0940157822273058, - "learning_rate": 2.5e-06, - "loss": 1.1818, - "step": 17 - }, - { - "epoch": 0.05265724037055095, - "grad_norm": 2.91820920733528, - "learning_rate": 2.647058823529412e-06, - "loss": 1.1673, - "step": 18 - }, - { - "epoch": 0.05558264261335934, - "grad_norm": 1.8161119748737125, - "learning_rate": 2.7941176470588237e-06, - "loss": 1.1415, - "step": 19 - }, - { - "epoch": 0.05850804485616772, - "grad_norm": 1.7624989349292328, - "learning_rate": 2.9411764705882355e-06, - "loss": 1.1534, - "step": 20 - }, - { - "epoch": 0.06143344709897611, - "grad_norm": 1.8082316113855639, - "learning_rate": 3.0882352941176476e-06, - "loss": 1.1441, - "step": 21 - }, - { - "epoch": 0.0643588493417845, - "grad_norm": 1.2223752752561972, - "learning_rate": 3.2352941176470594e-06, - "loss": 1.1258, - "step": 22 - }, - { - "epoch": 0.06728425158459288, - "grad_norm": 1.1968274873672384, - "learning_rate": 3.382352941176471e-06, - "loss": 1.1235, - "step": 23 - }, - { - "epoch": 0.07020965382740127, - "grad_norm": 1.6582129576098923, - "learning_rate": 3.529411764705883e-06, - "loss": 1.1197, - "step": 24 - }, - { - "epoch": 0.07313505607020965, - "grad_norm": 1.4716149175252013, - "learning_rate": 3.6764705882352946e-06, - "loss": 1.1242, - "step": 25 - }, - { - "epoch": 0.07606045831301804, - "grad_norm": 1.406278144326081, - "learning_rate": 3.8235294117647055e-06, - "loss": 1.1048, - "step": 26 - }, - { - "epoch": 0.07898586055582643, - "grad_norm": 1.2947816562679035, - "learning_rate": 3.970588235294118e-06, - "loss": 1.078, - "step": 27 - }, - { - "epoch": 0.08191126279863481, - "grad_norm": 1.3762114230758338, - "learning_rate": 4.11764705882353e-06, - "loss": 1.082, - "step": 28 - }, - { - "epoch": 0.0848366650414432, - "grad_norm": 0.924113394021786, - "learning_rate": 4.264705882352942e-06, - "loss": 1.0763, - "step": 29 - }, - { - "epoch": 0.08776206728425158, - "grad_norm": 1.2229665744650906, - "learning_rate": 4.411764705882353e-06, - "loss": 1.0967, - "step": 30 - }, - { - "epoch": 0.09068746952705997, - "grad_norm": 0.7673671598420616, - "learning_rate": 4.558823529411765e-06, - "loss": 1.092, - "step": 31 - }, - { - "epoch": 0.09361287176986836, - "grad_norm": 1.0178922272446433, - "learning_rate": 4.705882352941177e-06, - "loss": 1.0787, - "step": 32 - }, - { - "epoch": 0.09653827401267674, - "grad_norm": 0.9452497607845647, - "learning_rate": 4.852941176470589e-06, - "loss": 1.068, - "step": 33 - }, - { - "epoch": 0.09946367625548513, - "grad_norm": 0.8975537032410301, - "learning_rate": 5e-06, - "loss": 1.0575, - "step": 34 - }, - { - "epoch": 0.10238907849829351, - "grad_norm": 0.8883502117498131, - "learning_rate": 5.147058823529411e-06, - "loss": 1.0562, - "step": 35 - }, - { - "epoch": 0.1053144807411019, - "grad_norm": 0.7786108075639211, - "learning_rate": 5.294117647058824e-06, - "loss": 1.0628, - "step": 36 - }, - { - "epoch": 0.10823988298391028, - "grad_norm": 0.7164270179895762, - "learning_rate": 5.441176470588236e-06, - "loss": 1.0647, - "step": 37 - }, - { - "epoch": 0.11116528522671867, - "grad_norm": 0.6489067271106712, - "learning_rate": 5.588235294117647e-06, - "loss": 1.0377, - "step": 38 - }, - { - "epoch": 0.11409068746952707, - "grad_norm": 0.7092316432914678, - "learning_rate": 5.735294117647059e-06, - "loss": 1.0349, - "step": 39 - }, - { - "epoch": 0.11701608971233544, - "grad_norm": 0.8243939653767051, - "learning_rate": 5.882352941176471e-06, - "loss": 1.051, - "step": 40 - }, - { - "epoch": 0.11994149195514384, - "grad_norm": 0.6741750939661741, - "learning_rate": 6.029411764705883e-06, - "loss": 1.0402, - "step": 41 - }, - { - "epoch": 0.12286689419795221, - "grad_norm": 0.7267097354641292, - "learning_rate": 6.176470588235295e-06, - "loss": 1.0288, - "step": 42 - }, - { - "epoch": 0.1257922964407606, - "grad_norm": 0.8103776243322237, - "learning_rate": 6.323529411764706e-06, - "loss": 1.0408, - "step": 43 - }, - { - "epoch": 0.128717698683569, - "grad_norm": 0.7774449672806364, - "learning_rate": 6.470588235294119e-06, - "loss": 1.0151, - "step": 44 - }, - { - "epoch": 0.13164310092637738, - "grad_norm": 0.6835822537993219, - "learning_rate": 6.61764705882353e-06, - "loss": 1.0032, - "step": 45 - }, - { - "epoch": 0.13456850316918575, - "grad_norm": 0.7671516033152175, - "learning_rate": 6.764705882352942e-06, - "loss": 1.0031, - "step": 46 - }, - { - "epoch": 0.13749390541199416, - "grad_norm": 0.6872713961130001, - "learning_rate": 6.911764705882353e-06, - "loss": 1.0366, - "step": 47 - }, - { - "epoch": 0.14041930765480254, - "grad_norm": 0.6944081716419906, - "learning_rate": 7.058823529411766e-06, - "loss": 1.0206, - "step": 48 - }, - { - "epoch": 0.14334470989761092, - "grad_norm": 0.6724442918688381, - "learning_rate": 7.205882352941177e-06, - "loss": 1.0054, - "step": 49 - }, - { - "epoch": 0.1462701121404193, - "grad_norm": 0.70069910168755, - "learning_rate": 7.352941176470589e-06, - "loss": 0.984, - "step": 50 - }, - { - "epoch": 0.1491955143832277, - "grad_norm": 0.8321780541795489, - "learning_rate": 7.500000000000001e-06, - "loss": 1.0222, - "step": 51 - }, - { - "epoch": 0.15212091662603608, - "grad_norm": 0.7461017416034388, - "learning_rate": 7.647058823529411e-06, - "loss": 0.9958, - "step": 52 - }, - { - "epoch": 0.15504631886884446, - "grad_norm": 0.7638996030200154, - "learning_rate": 7.794117647058825e-06, - "loss": 1.0099, - "step": 53 - }, - { - "epoch": 0.15797172111165286, - "grad_norm": 0.8833995259882905, - "learning_rate": 7.941176470588236e-06, - "loss": 0.9979, - "step": 54 - }, - { - "epoch": 0.16089712335446124, - "grad_norm": 0.7840192819559926, - "learning_rate": 8.088235294117648e-06, - "loss": 0.9948, - "step": 55 - }, - { - "epoch": 0.16382252559726962, - "grad_norm": 0.7644797684213736, - "learning_rate": 8.23529411764706e-06, - "loss": 1.0009, - "step": 56 - }, - { - "epoch": 0.166747927840078, - "grad_norm": 0.688025084080775, - "learning_rate": 8.382352941176472e-06, - "loss": 1.0178, - "step": 57 - }, - { - "epoch": 0.1696733300828864, - "grad_norm": 0.7265519968150936, - "learning_rate": 8.529411764705883e-06, - "loss": 0.9973, - "step": 58 - }, - { - "epoch": 0.17259873232569478, - "grad_norm": 0.683233517732171, - "learning_rate": 8.676470588235295e-06, - "loss": 0.9954, - "step": 59 - }, - { - "epoch": 0.17552413456850316, - "grad_norm": 0.704502531642411, - "learning_rate": 8.823529411764707e-06, - "loss": 0.9876, - "step": 60 - }, - { - "epoch": 0.17844953681131157, - "grad_norm": 0.743891308875026, - "learning_rate": 8.970588235294119e-06, - "loss": 0.9763, - "step": 61 - }, - { - "epoch": 0.18137493905411994, - "grad_norm": 0.7598121118829709, - "learning_rate": 9.11764705882353e-06, - "loss": 0.9587, - "step": 62 - }, - { - "epoch": 0.18430034129692832, - "grad_norm": 0.8066231503341631, - "learning_rate": 9.264705882352942e-06, - "loss": 0.9989, - "step": 63 - }, - { - "epoch": 0.18722574353973673, - "grad_norm": 0.7979897760646133, - "learning_rate": 9.411764705882354e-06, - "loss": 1.0033, - "step": 64 - }, - { - "epoch": 0.1901511457825451, - "grad_norm": 0.7334146161657895, - "learning_rate": 9.558823529411766e-06, - "loss": 0.9882, - "step": 65 - }, - { - "epoch": 0.19307654802535348, - "grad_norm": 0.676521441443209, - "learning_rate": 9.705882352941177e-06, - "loss": 0.9789, - "step": 66 - }, - { - "epoch": 0.19600195026816186, - "grad_norm": 0.7552083619322828, - "learning_rate": 9.852941176470589e-06, - "loss": 0.9834, - "step": 67 - }, - { - "epoch": 0.19892735251097027, - "grad_norm": 0.7613688019506333, - "learning_rate": 1e-05, - "loss": 0.984, - "step": 68 - }, - { - "epoch": 0.20185275475377865, - "grad_norm": 0.740622944338628, - "learning_rate": 9.999934551143319e-06, - "loss": 0.9401, - "step": 69 - }, - { - "epoch": 0.20477815699658702, - "grad_norm": 0.7221110895270001, - "learning_rate": 9.9997382062867e-06, - "loss": 0.9822, - "step": 70 - }, - { - "epoch": 0.20770355923939543, - "grad_norm": 0.7826701133526415, - "learning_rate": 9.999410970570358e-06, - "loss": 0.9631, - "step": 71 - }, - { - "epoch": 0.2106289614822038, - "grad_norm": 0.8740335657819791, - "learning_rate": 9.998952852561176e-06, - "loss": 0.9726, - "step": 72 - }, - { - "epoch": 0.2135543637250122, - "grad_norm": 0.7252364315085953, - "learning_rate": 9.998363864252474e-06, - "loss": 0.9686, - "step": 73 - }, - { - "epoch": 0.21647976596782056, - "grad_norm": 0.6747540030851431, - "learning_rate": 9.997644021063698e-06, - "loss": 0.9739, - "step": 74 - }, - { - "epoch": 0.21940516821062897, - "grad_norm": 0.7824714311195131, - "learning_rate": 9.99679334184001e-06, - "loss": 0.9655, - "step": 75 - }, - { - "epoch": 0.22233057045343735, - "grad_norm": 0.7945498898412752, - "learning_rate": 9.995811848851807e-06, - "loss": 0.964, - "step": 76 - }, - { - "epoch": 0.22525597269624573, - "grad_norm": 0.6630711865020658, - "learning_rate": 9.994699567794123e-06, - "loss": 0.9911, - "step": 77 - }, - { - "epoch": 0.22818137493905413, - "grad_norm": 0.6575181705021402, - "learning_rate": 9.99345652778597e-06, - "loss": 0.9723, - "step": 78 - }, - { - "epoch": 0.2311067771818625, - "grad_norm": 0.8716226724365626, - "learning_rate": 9.992082761369567e-06, - "loss": 0.9545, - "step": 79 - }, - { - "epoch": 0.2340321794246709, - "grad_norm": 0.7058306465830354, - "learning_rate": 9.990578304509488e-06, - "loss": 0.9483, - "step": 80 - }, - { - "epoch": 0.23695758166747927, - "grad_norm": 0.7161030185458794, - "learning_rate": 9.988943196591727e-06, - "loss": 0.9826, - "step": 81 - }, - { - "epoch": 0.23988298391028767, - "grad_norm": 0.7246834402057637, - "learning_rate": 9.987177480422663e-06, - "loss": 0.9743, - "step": 82 - }, - { - "epoch": 0.24280838615309605, - "grad_norm": 0.7675354559101374, - "learning_rate": 9.985281202227936e-06, - "loss": 0.9611, - "step": 83 - }, - { - "epoch": 0.24573378839590443, - "grad_norm": 0.6584357041279372, - "learning_rate": 9.983254411651242e-06, - "loss": 0.9474, - "step": 84 - }, - { - "epoch": 0.24865919063871283, - "grad_norm": 0.7664607604465142, - "learning_rate": 9.981097161753032e-06, - "loss": 0.9618, - "step": 85 - }, - { - "epoch": 0.2515845928815212, - "grad_norm": 0.6944551302100264, - "learning_rate": 9.978809509009121e-06, - "loss": 0.9631, - "step": 86 - }, - { - "epoch": 0.2515845928815212, - "eval_loss": 1.4097907543182373, - "eval_runtime": 65.4251, - "eval_samples_per_second": 14.811, - "eval_steps_per_second": 2.476, - "step": 86 - }, - { - "epoch": 0.2545099951243296, - "grad_norm": 0.6240545271387105, - "learning_rate": 9.976391513309212e-06, - "loss": 0.9676, - "step": 87 - }, - { - "epoch": 0.257435397367138, - "grad_norm": 0.7423832349267365, - "learning_rate": 9.973843237955328e-06, - "loss": 0.979, - "step": 88 - }, - { - "epoch": 0.26036079960994635, - "grad_norm": 0.6689567084685912, - "learning_rate": 9.971164749660149e-06, - "loss": 0.9611, - "step": 89 - }, - { - "epoch": 0.26328620185275475, - "grad_norm": 0.6301374142385534, - "learning_rate": 9.968356118545277e-06, - "loss": 0.9702, - "step": 90 - }, - { - "epoch": 0.26621160409556316, - "grad_norm": 0.7418455357086029, - "learning_rate": 9.96541741813939e-06, - "loss": 0.9615, - "step": 91 - }, - { - "epoch": 0.2691370063383715, - "grad_norm": 0.6892741528738031, - "learning_rate": 9.962348725376318e-06, - "loss": 0.938, - "step": 92 - }, - { - "epoch": 0.2720624085811799, - "grad_norm": 0.6706415096539721, - "learning_rate": 9.959150120593035e-06, - "loss": 0.9629, - "step": 93 - }, - { - "epoch": 0.2749878108239883, - "grad_norm": 0.7046784189658271, - "learning_rate": 9.955821687527554e-06, - "loss": 0.9664, - "step": 94 - }, - { - "epoch": 0.27791321306679667, - "grad_norm": 0.7485580596660371, - "learning_rate": 9.952363513316727e-06, - "loss": 0.9482, - "step": 95 - }, - { - "epoch": 0.2808386153096051, - "grad_norm": 0.6878028657133218, - "learning_rate": 9.948775688493974e-06, - "loss": 0.9498, - "step": 96 - }, - { - "epoch": 0.2837640175524135, - "grad_norm": 0.6680669216250931, - "learning_rate": 9.945058306986911e-06, - "loss": 0.9435, - "step": 97 - }, - { - "epoch": 0.28668941979522183, - "grad_norm": 0.6898717996278425, - "learning_rate": 9.941211466114883e-06, - "loss": 0.9684, - "step": 98 - }, - { - "epoch": 0.28961482203803024, - "grad_norm": 0.6484486515830594, - "learning_rate": 9.937235266586425e-06, - "loss": 0.9434, - "step": 99 - }, - { - "epoch": 0.2925402242808386, - "grad_norm": 0.7331156843054286, - "learning_rate": 9.933129812496623e-06, - "loss": 0.9491, - "step": 100 - }, - { - "epoch": 0.295465626523647, - "grad_norm": 0.6159411880792645, - "learning_rate": 9.928895211324387e-06, - "loss": 0.9414, - "step": 101 - }, - { - "epoch": 0.2983910287664554, - "grad_norm": 0.6680220411471304, - "learning_rate": 9.924531573929641e-06, - "loss": 0.9293, - "step": 102 - }, - { - "epoch": 0.30131643100926375, - "grad_norm": 0.639276890448279, - "learning_rate": 9.920039014550413e-06, - "loss": 0.9545, - "step": 103 - }, - { - "epoch": 0.30424183325207216, - "grad_norm": 0.6428680336292386, - "learning_rate": 9.915417650799855e-06, - "loss": 0.9506, - "step": 104 - }, - { - "epoch": 0.30716723549488056, - "grad_norm": 0.6169710049517818, - "learning_rate": 9.910667603663156e-06, - "loss": 0.9786, - "step": 105 - }, - { - "epoch": 0.3100926377376889, - "grad_norm": 0.684509417371433, - "learning_rate": 9.905788997494377e-06, - "loss": 0.9525, - "step": 106 - }, - { - "epoch": 0.3130180399804973, - "grad_norm": 0.6503641190326895, - "learning_rate": 9.9007819600132e-06, - "loss": 0.9325, - "step": 107 - }, - { - "epoch": 0.3159434422233057, - "grad_norm": 0.6372262818423288, - "learning_rate": 9.89564662230157e-06, - "loss": 0.9469, - "step": 108 - }, - { - "epoch": 0.3188688444661141, - "grad_norm": 0.741895816894913, - "learning_rate": 9.890383118800287e-06, - "loss": 0.9433, - "step": 109 - }, - { - "epoch": 0.3217942467089225, - "grad_norm": 0.6678635900965723, - "learning_rate": 9.884991587305459e-06, - "loss": 0.9507, - "step": 110 - }, - { - "epoch": 0.3247196489517309, - "grad_norm": 0.6557776782736461, - "learning_rate": 9.87947216896492e-06, - "loss": 0.9489, - "step": 111 - }, - { - "epoch": 0.32764505119453924, - "grad_norm": 0.8386523679095728, - "learning_rate": 9.873825008274514e-06, - "loss": 0.9513, - "step": 112 - }, - { - "epoch": 0.33057045343734764, - "grad_norm": 0.6871663810393576, - "learning_rate": 9.868050253074328e-06, - "loss": 0.9461, - "step": 113 - }, - { - "epoch": 0.333495855680156, - "grad_norm": 0.668413921617873, - "learning_rate": 9.862148054544812e-06, - "loss": 0.9658, - "step": 114 - }, - { - "epoch": 0.3364212579229644, - "grad_norm": 0.7873362433736362, - "learning_rate": 9.85611856720282e-06, - "loss": 0.9397, - "step": 115 - }, - { - "epoch": 0.3393466601657728, - "grad_norm": 0.607062349214138, - "learning_rate": 9.849961948897582e-06, - "loss": 0.9241, - "step": 116 - }, - { - "epoch": 0.34227206240858116, - "grad_norm": 0.6608850440766699, - "learning_rate": 9.843678360806542e-06, - "loss": 0.9459, - "step": 117 - }, - { - "epoch": 0.34519746465138956, - "grad_norm": 0.648707994851676, - "learning_rate": 9.837267967431164e-06, - "loss": 0.9375, - "step": 118 - }, - { - "epoch": 0.34812286689419797, - "grad_norm": 0.7079372926717654, - "learning_rate": 9.830730936592615e-06, - "loss": 0.9182, - "step": 119 - }, - { - "epoch": 0.3510482691370063, - "grad_norm": 0.7662791064726716, - "learning_rate": 9.824067439427374e-06, - "loss": 0.9202, - "step": 120 - }, - { - "epoch": 0.3539736713798147, - "grad_norm": 0.6847400348854458, - "learning_rate": 9.81727765038275e-06, - "loss": 0.9554, - "step": 121 - }, - { - "epoch": 0.35689907362262313, - "grad_norm": 0.7143572416493496, - "learning_rate": 9.810361747212313e-06, - "loss": 0.9479, - "step": 122 - }, - { - "epoch": 0.3598244758654315, - "grad_norm": 0.796457278880475, - "learning_rate": 9.803319910971248e-06, - "loss": 0.9514, - "step": 123 - }, - { - "epoch": 0.3627498781082399, - "grad_norm": 0.7165439305935597, - "learning_rate": 9.796152326011604e-06, - "loss": 0.9301, - "step": 124 - }, - { - "epoch": 0.3656752803510483, - "grad_norm": 0.7105461017783379, - "learning_rate": 9.788859179977478e-06, - "loss": 0.9197, - "step": 125 - }, - { - "epoch": 0.36860068259385664, - "grad_norm": 0.6678675689530374, - "learning_rate": 9.781440663800099e-06, - "loss": 0.9327, - "step": 126 - }, - { - "epoch": 0.37152608483666505, - "grad_norm": 0.6998315944401519, - "learning_rate": 9.77389697169283e-06, - "loss": 0.9424, - "step": 127 - }, - { - "epoch": 0.37445148707947346, - "grad_norm": 0.7063153026165354, - "learning_rate": 9.766228301146074e-06, - "loss": 0.9117, - "step": 128 - }, - { - "epoch": 0.3773768893222818, - "grad_norm": 0.665683830431616, - "learning_rate": 9.758434852922124e-06, - "loss": 0.9338, - "step": 129 - }, - { - "epoch": 0.3803022915650902, - "grad_norm": 0.6511286552329526, - "learning_rate": 9.75051683104989e-06, - "loss": 0.9239, - "step": 130 - }, - { - "epoch": 0.38322769380789856, - "grad_norm": 0.7263937749419739, - "learning_rate": 9.742474442819561e-06, - "loss": 0.9469, - "step": 131 - }, - { - "epoch": 0.38615309605070697, - "grad_norm": 0.6877891571625458, - "learning_rate": 9.734307898777187e-06, - "loss": 0.9448, - "step": 132 - }, - { - "epoch": 0.3890784982935154, - "grad_norm": 0.652680067409742, - "learning_rate": 9.726017412719151e-06, - "loss": 0.9387, - "step": 133 - }, - { - "epoch": 0.3920039005363237, - "grad_norm": 0.7368224595734011, - "learning_rate": 9.71760320168659e-06, - "loss": 0.947, - "step": 134 - }, - { - "epoch": 0.39492930277913213, - "grad_norm": 0.5904957774047593, - "learning_rate": 9.7090654859597e-06, - "loss": 0.9169, - "step": 135 - }, - { - "epoch": 0.39785470502194054, - "grad_norm": 0.7590457981077249, - "learning_rate": 9.700404489051974e-06, - "loss": 0.9451, - "step": 136 - }, - { - "epoch": 0.4007801072647489, - "grad_norm": 0.6735223362511448, - "learning_rate": 9.69162043770435e-06, - "loss": 0.9099, - "step": 137 - }, - { - "epoch": 0.4037055095075573, - "grad_norm": 0.6866553871773976, - "learning_rate": 9.682713561879275e-06, - "loss": 0.9114, - "step": 138 - }, - { - "epoch": 0.4066309117503657, - "grad_norm": 0.6661362317941144, - "learning_rate": 9.673684094754686e-06, - "loss": 0.9263, - "step": 139 - }, - { - "epoch": 0.40955631399317405, - "grad_norm": 0.6773611746763274, - "learning_rate": 9.664532272717902e-06, - "loss": 0.9119, - "step": 140 - }, - { - "epoch": 0.41248171623598245, - "grad_norm": 0.6330232345134237, - "learning_rate": 9.655258335359438e-06, - "loss": 0.9235, - "step": 141 - }, - { - "epoch": 0.41540711847879086, - "grad_norm": 0.6364881510059489, - "learning_rate": 9.645862525466734e-06, - "loss": 0.9299, - "step": 142 - }, - { - "epoch": 0.4183325207215992, - "grad_norm": 0.7507102668694946, - "learning_rate": 9.636345089017795e-06, - "loss": 0.9526, - "step": 143 - }, - { - "epoch": 0.4212579229644076, - "grad_norm": 0.6526196555099488, - "learning_rate": 9.626706275174754e-06, - "loss": 0.9398, - "step": 144 - }, - { - "epoch": 0.42418332520721597, - "grad_norm": 0.6700118261014734, - "learning_rate": 9.61694633627735e-06, - "loss": 0.9124, - "step": 145 - }, - { - "epoch": 0.4271087274500244, - "grad_norm": 0.6335057158130035, - "learning_rate": 9.607065527836324e-06, - "loss": 0.9603, - "step": 146 - }, - { - "epoch": 0.4300341296928328, - "grad_norm": 0.6277095717664852, - "learning_rate": 9.597064108526715e-06, - "loss": 0.9221, - "step": 147 - }, - { - "epoch": 0.43295953193564113, - "grad_norm": 0.6712875310890567, - "learning_rate": 9.58694234018111e-06, - "loss": 0.9204, - "step": 148 - }, - { - "epoch": 0.43588493417844953, - "grad_norm": 0.6592731934180556, - "learning_rate": 9.576700487782775e-06, - "loss": 0.9438, - "step": 149 - }, - { - "epoch": 0.43881033642125794, - "grad_norm": 0.6479345778232325, - "learning_rate": 9.566338819458726e-06, - "loss": 0.937, - "step": 150 - }, - { - "epoch": 0.4417357386640663, - "grad_norm": 0.6617896622163715, - "learning_rate": 9.555857606472692e-06, - "loss": 0.9499, - "step": 151 - }, - { - "epoch": 0.4446611409068747, - "grad_norm": 0.6245931528735381, - "learning_rate": 9.545257123218043e-06, - "loss": 0.9215, - "step": 152 - }, - { - "epoch": 0.4475865431496831, - "grad_norm": 0.6579890487283353, - "learning_rate": 9.534537647210582e-06, - "loss": 0.9517, - "step": 153 - }, - { - "epoch": 0.45051194539249145, - "grad_norm": 0.695847847970159, - "learning_rate": 9.523699459081285e-06, - "loss": 0.9305, - "step": 154 - }, - { - "epoch": 0.45343734763529986, - "grad_norm": 0.5946097480044219, - "learning_rate": 9.512742842568964e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.45636274987810826, - "grad_norm": 0.6524432888789063, - "learning_rate": 9.501668084512827e-06, - "loss": 0.9122, - "step": 156 - }, - { - "epoch": 0.4592881521209166, - "grad_norm": 0.6715958526339862, - "learning_rate": 9.490475474844976e-06, - "loss": 0.9336, - "step": 157 - }, - { - "epoch": 0.462213554363725, - "grad_norm": 0.6010391743428036, - "learning_rate": 9.479165306582811e-06, - "loss": 0.9139, - "step": 158 - }, - { - "epoch": 0.46513895660653337, - "grad_norm": 0.7089381455175933, - "learning_rate": 9.467737875821368e-06, - "loss": 0.9295, - "step": 159 - }, - { - "epoch": 0.4680643588493418, - "grad_norm": 0.7289713827153292, - "learning_rate": 9.456193481725555e-06, - "loss": 0.9095, - "step": 160 - }, - { - "epoch": 0.4709897610921502, - "grad_norm": 0.6768392933447552, - "learning_rate": 9.444532426522334e-06, - "loss": 0.916, - "step": 161 - }, - { - "epoch": 0.47391516333495853, - "grad_norm": 0.6859161203017968, - "learning_rate": 9.432755015492794e-06, - "loss": 0.9191, - "step": 162 - }, - { - "epoch": 0.47684056557776694, - "grad_norm": 0.6293363193961834, - "learning_rate": 9.42086155696417e-06, - "loss": 0.9315, - "step": 163 - }, - { - "epoch": 0.47976596782057535, - "grad_norm": 0.6705616648771443, - "learning_rate": 9.408852362301768e-06, - "loss": 0.9252, - "step": 164 - }, - { - "epoch": 0.4826913700633837, - "grad_norm": 0.5708431906967768, - "learning_rate": 9.396727745900811e-06, - "loss": 0.9251, - "step": 165 - }, - { - "epoch": 0.4856167723061921, - "grad_norm": 0.6083895665414887, - "learning_rate": 9.384488025178214e-06, - "loss": 0.9294, - "step": 166 - }, - { - "epoch": 0.4885421745490005, - "grad_norm": 0.6313624264612219, - "learning_rate": 9.372133520564264e-06, - "loss": 0.934, - "step": 167 - }, - { - "epoch": 0.49146757679180886, - "grad_norm": 0.6152317431277435, - "learning_rate": 9.359664555494244e-06, - "loss": 0.9426, - "step": 168 - }, - { - "epoch": 0.49439297903461726, - "grad_norm": 0.5950351172049964, - "learning_rate": 9.347081456399958e-06, - "loss": 0.92, - "step": 169 - }, - { - "epoch": 0.49731838127742567, - "grad_norm": 0.6063209229661293, - "learning_rate": 9.334384552701183e-06, - "loss": 0.9158, - "step": 170 - }, - { - "epoch": 0.500243783520234, - "grad_norm": 0.654811581491953, - "learning_rate": 9.321574176797055e-06, - "loss": 0.9137, - "step": 171 - }, - { - "epoch": 0.5031691857630424, - "grad_norm": 0.6443091025598243, - "learning_rate": 9.308650664057352e-06, - "loss": 0.9347, - "step": 172 - }, - { - "epoch": 0.5031691857630424, - "eval_loss": 1.382827877998352, - "eval_runtime": 175.4317, - "eval_samples_per_second": 5.524, - "eval_steps_per_second": 0.923, - "step": 172 - }, - { - "epoch": 0.5060945880058508, - "grad_norm": 0.6026207581703545, - "learning_rate": 9.295614352813732e-06, - "loss": 0.9563, - "step": 173 - }, - { - "epoch": 0.5090199902486592, - "grad_norm": 0.6335528865651444, - "learning_rate": 9.282465584350856e-06, - "loss": 0.9351, - "step": 174 - }, - { - "epoch": 0.5119453924914675, - "grad_norm": 0.6241890440961206, - "learning_rate": 9.269204702897476e-06, - "loss": 0.9218, - "step": 175 - }, - { - "epoch": 0.514870794734276, - "grad_norm": 0.6159111786845223, - "learning_rate": 9.2558320556174e-06, - "loss": 0.929, - "step": 176 - }, - { - "epoch": 0.5177961969770843, - "grad_norm": 0.5980336848061285, - "learning_rate": 9.242347992600416e-06, - "loss": 0.9304, - "step": 177 - }, - { - "epoch": 0.5207215992198927, - "grad_norm": 0.6410233986431451, - "learning_rate": 9.22875286685313e-06, - "loss": 0.9122, - "step": 178 - }, - { - "epoch": 0.5236470014627012, - "grad_norm": 0.6597655503440497, - "learning_rate": 9.215047034289716e-06, - "loss": 0.9097, - "step": 179 - }, - { - "epoch": 0.5265724037055095, - "grad_norm": 0.6154485141199222, - "learning_rate": 9.201230853722603e-06, - "loss": 0.9153, - "step": 180 - }, - { - "epoch": 0.5294978059483179, - "grad_norm": 0.6414319139886607, - "learning_rate": 9.187304686853078e-06, - "loss": 0.9176, - "step": 181 - }, - { - "epoch": 0.5324232081911263, - "grad_norm": 0.6454394793945475, - "learning_rate": 9.173268898261822e-06, - "loss": 0.9532, - "step": 182 - }, - { - "epoch": 0.5353486104339347, - "grad_norm": 0.6318137898521786, - "learning_rate": 9.159123855399364e-06, - "loss": 0.9154, - "step": 183 - }, - { - "epoch": 0.538274012676743, - "grad_norm": 0.673121635478712, - "learning_rate": 9.144869928576451e-06, - "loss": 0.9205, - "step": 184 - }, - { - "epoch": 0.5411994149195515, - "grad_norm": 0.69673708523528, - "learning_rate": 9.130507490954375e-06, - "loss": 0.8939, - "step": 185 - }, - { - "epoch": 0.5441248171623598, - "grad_norm": 0.6235430793813558, - "learning_rate": 9.11603691853518e-06, - "loss": 0.9286, - "step": 186 - }, - { - "epoch": 0.5470502194051682, - "grad_norm": 0.671802703912287, - "learning_rate": 9.101458590151837e-06, - "loss": 0.917, - "step": 187 - }, - { - "epoch": 0.5499756216479766, - "grad_norm": 0.7068672175457833, - "learning_rate": 9.086772887458314e-06, - "loss": 0.9066, - "step": 188 - }, - { - "epoch": 0.552901023890785, - "grad_norm": 0.6408445103986982, - "learning_rate": 9.071980194919592e-06, - "loss": 0.9104, - "step": 189 - }, - { - "epoch": 0.5558264261335933, - "grad_norm": 0.6433394608747284, - "learning_rate": 9.057080899801598e-06, - "loss": 0.888, - "step": 190 - }, - { - "epoch": 0.5587518283764018, - "grad_norm": 0.5837415931981027, - "learning_rate": 9.042075392161062e-06, - "loss": 0.9336, - "step": 191 - }, - { - "epoch": 0.5616772306192102, - "grad_norm": 0.6199039250762649, - "learning_rate": 9.026964064835312e-06, - "loss": 0.9328, - "step": 192 - }, - { - "epoch": 0.5646026328620185, - "grad_norm": 0.6319233508882481, - "learning_rate": 9.011747313431988e-06, - "loss": 0.905, - "step": 193 - }, - { - "epoch": 0.567528035104827, - "grad_norm": 0.601081786670075, - "learning_rate": 8.996425536318683e-06, - "loss": 0.9074, - "step": 194 - }, - { - "epoch": 0.5704534373476353, - "grad_norm": 0.6033456681131416, - "learning_rate": 8.980999134612512e-06, - "loss": 0.8969, - "step": 195 - }, - { - "epoch": 0.5733788395904437, - "grad_norm": 0.5954276802274396, - "learning_rate": 8.96546851216962e-06, - "loss": 0.8876, - "step": 196 - }, - { - "epoch": 0.576304241833252, - "grad_norm": 0.5790973908218665, - "learning_rate": 8.949834075574595e-06, - "loss": 0.9295, - "step": 197 - }, - { - "epoch": 0.5792296440760605, - "grad_norm": 0.6403087898665544, - "learning_rate": 8.934096234129843e-06, - "loss": 0.9303, - "step": 198 - }, - { - "epoch": 0.5821550463188688, - "grad_norm": 0.6474848524459008, - "learning_rate": 8.918255399844855e-06, - "loss": 0.9118, - "step": 199 - }, - { - "epoch": 0.5850804485616772, - "grad_norm": 0.6317608858369101, - "learning_rate": 8.902311987425422e-06, - "loss": 0.9259, - "step": 200 - }, - { - "epoch": 0.5880058508044856, - "grad_norm": 0.6299668796988298, - "learning_rate": 8.886266414262797e-06, - "loss": 0.9113, - "step": 201 - }, - { - "epoch": 0.590931253047294, - "grad_norm": 0.6478367513320172, - "learning_rate": 8.870119100422743e-06, - "loss": 0.9131, - "step": 202 - }, - { - "epoch": 0.5938566552901023, - "grad_norm": 0.6750176409365689, - "learning_rate": 8.853870468634554e-06, - "loss": 0.9136, - "step": 203 - }, - { - "epoch": 0.5967820575329108, - "grad_norm": 0.6034423133153598, - "learning_rate": 8.837520944279976e-06, - "loss": 0.9299, - "step": 204 - }, - { - "epoch": 0.5997074597757192, - "grad_norm": 0.7246936174658474, - "learning_rate": 8.821070955382082e-06, - "loss": 0.934, - "step": 205 - }, - { - "epoch": 0.6026328620185275, - "grad_norm": 0.5842207824302007, - "learning_rate": 8.804520932594061e-06, - "loss": 0.8955, - "step": 206 - }, - { - "epoch": 0.605558264261336, - "grad_norm": 0.6464732426728426, - "learning_rate": 8.787871309187936e-06, - "loss": 0.9143, - "step": 207 - }, - { - "epoch": 0.6084836665041443, - "grad_norm": 0.593796254274566, - "learning_rate": 8.771122521043236e-06, - "loss": 0.9366, - "step": 208 - }, - { - "epoch": 0.6114090687469527, - "grad_norm": 0.6413805961118216, - "learning_rate": 8.754275006635573e-06, - "loss": 0.9141, - "step": 209 - }, - { - "epoch": 0.6143344709897611, - "grad_norm": 0.5915119964358855, - "learning_rate": 8.737329207025172e-06, - "loss": 0.9263, - "step": 210 - }, - { - "epoch": 0.6172598732325695, - "grad_norm": 0.602387309046525, - "learning_rate": 8.720285565845313e-06, - "loss": 0.9252, - "step": 211 - }, - { - "epoch": 0.6201852754753778, - "grad_norm": 0.5724972842726512, - "learning_rate": 8.703144529290733e-06, - "loss": 0.9126, - "step": 212 - }, - { - "epoch": 0.6231106777181863, - "grad_norm": 0.6072757135591695, - "learning_rate": 8.685906546105925e-06, - "loss": 0.9132, - "step": 213 - }, - { - "epoch": 0.6260360799609946, - "grad_norm": 0.5918104303300601, - "learning_rate": 8.668572067573409e-06, - "loss": 0.9041, - "step": 214 - }, - { - "epoch": 0.628961482203803, - "grad_norm": 0.7069632584780451, - "learning_rate": 8.651141547501904e-06, - "loss": 0.9147, - "step": 215 - }, - { - "epoch": 0.6318868844466115, - "grad_norm": 0.5939488489914676, - "learning_rate": 8.633615442214452e-06, - "loss": 0.8861, - "step": 216 - }, - { - "epoch": 0.6348122866894198, - "grad_norm": 0.594163983383459, - "learning_rate": 8.615994210536479e-06, - "loss": 0.904, - "step": 217 - }, - { - "epoch": 0.6377376889322282, - "grad_norm": 0.6115411803684507, - "learning_rate": 8.598278313783765e-06, - "loss": 0.906, - "step": 218 - }, - { - "epoch": 0.6406630911750366, - "grad_norm": 0.6098845828745101, - "learning_rate": 8.580468215750392e-06, - "loss": 0.9144, - "step": 219 - }, - { - "epoch": 0.643588493417845, - "grad_norm": 0.655945088038022, - "learning_rate": 8.562564382696578e-06, - "loss": 0.9141, - "step": 220 - }, - { - "epoch": 0.6465138956606533, - "grad_norm": 0.6384785223163155, - "learning_rate": 8.544567283336484e-06, - "loss": 0.9107, - "step": 221 - }, - { - "epoch": 0.6494392979034618, - "grad_norm": 0.6868191616720155, - "learning_rate": 8.52647738882594e-06, - "loss": 0.893, - "step": 222 - }, - { - "epoch": 0.6523647001462701, - "grad_norm": 0.626480892609364, - "learning_rate": 8.508295172750116e-06, - "loss": 0.9305, - "step": 223 - }, - { - "epoch": 0.6552901023890785, - "grad_norm": 0.6699010228701286, - "learning_rate": 8.490021111111108e-06, - "loss": 0.9023, - "step": 224 - }, - { - "epoch": 0.6582155046318869, - "grad_norm": 0.5986398715684018, - "learning_rate": 8.471655682315496e-06, - "loss": 0.905, - "step": 225 - }, - { - "epoch": 0.6611409068746953, - "grad_norm": 0.6561568745480283, - "learning_rate": 8.453199367161804e-06, - "loss": 0.9048, - "step": 226 - }, - { - "epoch": 0.6640663091175036, - "grad_norm": 0.6270511738126053, - "learning_rate": 8.434652648827925e-06, - "loss": 0.8712, - "step": 227 - }, - { - "epoch": 0.666991711360312, - "grad_norm": 0.6107362295918622, - "learning_rate": 8.41601601285846e-06, - "loss": 0.9044, - "step": 228 - }, - { - "epoch": 0.6699171136031205, - "grad_norm": 0.6799488575081768, - "learning_rate": 8.397289947152021e-06, - "loss": 0.8803, - "step": 229 - }, - { - "epoch": 0.6728425158459288, - "grad_norm": 0.5784070848804973, - "learning_rate": 8.378474941948437e-06, - "loss": 0.9148, - "step": 230 - }, - { - "epoch": 0.6757679180887372, - "grad_norm": 0.6629135318112295, - "learning_rate": 8.359571489815946e-06, - "loss": 0.9213, - "step": 231 - }, - { - "epoch": 0.6786933203315456, - "grad_norm": 0.5852740864772179, - "learning_rate": 8.340580085638275e-06, - "loss": 0.9095, - "step": 232 - }, - { - "epoch": 0.681618722574354, - "grad_norm": 0.64996093688458, - "learning_rate": 8.321501226601702e-06, - "loss": 0.9169, - "step": 233 - }, - { - "epoch": 0.6845441248171623, - "grad_norm": 0.5694164027851655, - "learning_rate": 8.302335412182034e-06, - "loss": 0.891, - "step": 234 - }, - { - "epoch": 0.6874695270599708, - "grad_norm": 0.6356418602057949, - "learning_rate": 8.283083144131523e-06, - "loss": 0.9105, - "step": 235 - }, - { - "epoch": 0.6903949293027791, - "grad_norm": 0.6154729150264902, - "learning_rate": 8.263744926465744e-06, - "loss": 0.9277, - "step": 236 - }, - { - "epoch": 0.6933203315455875, - "grad_norm": 0.6117307726030774, - "learning_rate": 8.24432126545039e-06, - "loss": 0.9106, - "step": 237 - }, - { - "epoch": 0.6962457337883959, - "grad_norm": 0.5948671774603161, - "learning_rate": 8.224812669588028e-06, - "loss": 0.9091, - "step": 238 - }, - { - "epoch": 0.6991711360312043, - "grad_norm": 0.6747894010908736, - "learning_rate": 8.20521964960477e-06, - "loss": 0.9099, - "step": 239 - }, - { - "epoch": 0.7020965382740126, - "grad_norm": 0.6067781389366469, - "learning_rate": 8.185542718436923e-06, - "loss": 0.8824, - "step": 240 - }, - { - "epoch": 0.7050219405168211, - "grad_norm": 0.6218954056208628, - "learning_rate": 8.165782391217543e-06, - "loss": 0.9268, - "step": 241 - }, - { - "epoch": 0.7079473427596294, - "grad_norm": 0.6047649631286887, - "learning_rate": 8.145939185262963e-06, - "loss": 0.9068, - "step": 242 - }, - { - "epoch": 0.7108727450024378, - "grad_norm": 0.6627849155859088, - "learning_rate": 8.126013620059236e-06, - "loss": 0.8658, - "step": 243 - }, - { - "epoch": 0.7137981472452463, - "grad_norm": 0.5743150371347046, - "learning_rate": 8.106006217248552e-06, - "loss": 0.9165, - "step": 244 - }, - { - "epoch": 0.7167235494880546, - "grad_norm": 0.6372225785944298, - "learning_rate": 8.08591750061556e-06, - "loss": 0.8766, - "step": 245 - }, - { - "epoch": 0.719648951730863, - "grad_norm": 0.622673683239682, - "learning_rate": 8.065747996073681e-06, - "loss": 0.8979, - "step": 246 - }, - { - "epoch": 0.7225743539736714, - "grad_norm": 0.6078854821295643, - "learning_rate": 8.045498231651314e-06, - "loss": 0.894, - "step": 247 - }, - { - "epoch": 0.7254997562164798, - "grad_norm": 0.6137768530335086, - "learning_rate": 8.025168737478034e-06, - "loss": 0.9017, - "step": 248 - }, - { - "epoch": 0.7284251584592881, - "grad_norm": 0.5890466187531522, - "learning_rate": 8.004760045770702e-06, - "loss": 0.9028, - "step": 249 - }, - { - "epoch": 0.7313505607020966, - "grad_norm": 0.6534358893635163, - "learning_rate": 7.98427269081953e-06, - "loss": 0.9123, - "step": 250 - }, - { - "epoch": 0.7342759629449049, - "grad_norm": 0.5810190162761808, - "learning_rate": 7.963707208974103e-06, - "loss": 0.8841, - "step": 251 - }, - { - "epoch": 0.7372013651877133, - "grad_norm": 0.65324837663941, - "learning_rate": 7.943064138629332e-06, - "loss": 0.8962, - "step": 252 - }, - { - "epoch": 0.7401267674305217, - "grad_norm": 0.6566544273903392, - "learning_rate": 7.922344020211357e-06, - "loss": 0.8839, - "step": 253 - }, - { - "epoch": 0.7430521696733301, - "grad_norm": 0.6238544855656891, - "learning_rate": 7.9015473961634e-06, - "loss": 0.8777, - "step": 254 - }, - { - "epoch": 0.7459775719161384, - "grad_norm": 0.6095828534395317, - "learning_rate": 7.880674810931572e-06, - "loss": 0.9006, - "step": 255 - }, - { - "epoch": 0.7489029741589469, - "grad_norm": 0.6434213102088783, - "learning_rate": 7.859726810950606e-06, - "loss": 0.9302, - "step": 256 - }, - { - "epoch": 0.7518283764017553, - "grad_norm": 0.6997507230121648, - "learning_rate": 7.83870394462956e-06, - "loss": 0.8709, - "step": 257 - }, - { - "epoch": 0.7547537786445636, - "grad_norm": 0.6092872815949443, - "learning_rate": 7.817606762337465e-06, - "loss": 0.9142, - "step": 258 - }, - { - "epoch": 0.7547537786445636, - "eval_loss": 1.3693060874938965, - "eval_runtime": 65.5257, - "eval_samples_per_second": 14.788, - "eval_steps_per_second": 2.472, - "step": 258 - }, - { - "epoch": 0.757679180887372, - "grad_norm": 0.6246678441323336, - "learning_rate": 7.796435816388899e-06, - "loss": 0.9005, - "step": 259 - }, - { - "epoch": 0.7606045831301804, - "grad_norm": 0.5912377668528622, - "learning_rate": 7.77519166102955e-06, - "loss": 0.9017, - "step": 260 - }, - { - "epoch": 0.7635299853729888, - "grad_norm": 0.6053057054784722, - "learning_rate": 7.753874852421685e-06, - "loss": 0.8919, - "step": 261 - }, - { - "epoch": 0.7664553876157971, - "grad_norm": 0.5744146885343826, - "learning_rate": 7.73248594862961e-06, - "loss": 0.9064, - "step": 262 - }, - { - "epoch": 0.7693807898586056, - "grad_norm": 0.6319612148391044, - "learning_rate": 7.711025509605041e-06, - "loss": 0.9031, - "step": 263 - }, - { - "epoch": 0.7723061921014139, - "grad_norm": 0.5912614698781004, - "learning_rate": 7.689494097172457e-06, - "loss": 0.901, - "step": 264 - }, - { - "epoch": 0.7752315943442223, - "grad_norm": 0.6664929332095557, - "learning_rate": 7.66789227501439e-06, - "loss": 0.899, - "step": 265 - }, - { - "epoch": 0.7781569965870307, - "grad_norm": 0.6153709745040032, - "learning_rate": 7.646220608656662e-06, - "loss": 0.8814, - "step": 266 - }, - { - "epoch": 0.7810823988298391, - "grad_norm": 0.6256834772904273, - "learning_rate": 7.624479665453593e-06, - "loss": 0.898, - "step": 267 - }, - { - "epoch": 0.7840078010726474, - "grad_norm": 0.629527156196636, - "learning_rate": 7.602670014573128e-06, - "loss": 0.8804, - "step": 268 - }, - { - "epoch": 0.7869332033154559, - "grad_norm": 0.6326221554834266, - "learning_rate": 7.580792226981954e-06, - "loss": 0.8719, - "step": 269 - }, - { - "epoch": 0.7898586055582643, - "grad_norm": 0.6228147000647801, - "learning_rate": 7.558846875430548e-06, - "loss": 0.8951, - "step": 270 - }, - { - "epoch": 0.7927840078010726, - "grad_norm": 0.6300762602114698, - "learning_rate": 7.536834534438174e-06, - "loss": 0.8786, - "step": 271 - }, - { - "epoch": 0.7957094100438811, - "grad_norm": 0.602370277875063, - "learning_rate": 7.514755780277854e-06, - "loss": 0.899, - "step": 272 - }, - { - "epoch": 0.7986348122866894, - "grad_norm": 0.668424013856032, - "learning_rate": 7.492611190961272e-06, - "loss": 0.922, - "step": 273 - }, - { - "epoch": 0.8015602145294978, - "grad_norm": 0.643421950775731, - "learning_rate": 7.470401346223653e-06, - "loss": 0.9129, - "step": 274 - }, - { - "epoch": 0.8044856167723062, - "grad_norm": 0.6598339736650571, - "learning_rate": 7.448126827508573e-06, - "loss": 0.8857, - "step": 275 - }, - { - "epoch": 0.8074110190151146, - "grad_norm": 0.5798833085074291, - "learning_rate": 7.425788217952744e-06, - "loss": 0.9137, - "step": 276 - }, - { - "epoch": 0.8103364212579229, - "grad_norm": 0.6402512533541488, - "learning_rate": 7.403386102370751e-06, - "loss": 0.9064, - "step": 277 - }, - { - "epoch": 0.8132618235007314, - "grad_norm": 0.5779865664446158, - "learning_rate": 7.380921067239732e-06, - "loss": 0.8877, - "step": 278 - }, - { - "epoch": 0.8161872257435397, - "grad_norm": 0.6252271460860866, - "learning_rate": 7.3583937006840335e-06, - "loss": 0.887, - "step": 279 - }, - { - "epoch": 0.8191126279863481, - "grad_norm": 0.5885152139853308, - "learning_rate": 7.335804592459811e-06, - "loss": 0.8857, - "step": 280 - }, - { - "epoch": 0.8220380302291566, - "grad_norm": 0.6246237578334921, - "learning_rate": 7.313154333939587e-06, - "loss": 0.8807, - "step": 281 - }, - { - "epoch": 0.8249634324719649, - "grad_norm": 0.6141579913787804, - "learning_rate": 7.2904435180967695e-06, - "loss": 0.8912, - "step": 282 - }, - { - "epoch": 0.8278888347147733, - "grad_norm": 0.6181601204601509, - "learning_rate": 7.26767273949013e-06, - "loss": 0.884, - "step": 283 - }, - { - "epoch": 0.8308142369575817, - "grad_norm": 0.5658821051405528, - "learning_rate": 7.244842594248244e-06, - "loss": 0.8924, - "step": 284 - }, - { - "epoch": 0.8337396392003901, - "grad_norm": 0.598596513563131, - "learning_rate": 7.221953680053867e-06, - "loss": 0.8869, - "step": 285 - }, - { - "epoch": 0.8366650414431984, - "grad_norm": 0.5757584762578442, - "learning_rate": 7.1990065961283075e-06, - "loss": 0.8883, - "step": 286 - }, - { - "epoch": 0.8395904436860068, - "grad_norm": 0.6121626641719572, - "learning_rate": 7.1760019432157295e-06, - "loss": 0.9127, - "step": 287 - }, - { - "epoch": 0.8425158459288152, - "grad_norm": 0.6040818769702724, - "learning_rate": 7.1529403235674236e-06, - "loss": 0.8925, - "step": 288 - }, - { - "epoch": 0.8454412481716236, - "grad_norm": 0.6694613001668741, - "learning_rate": 7.129822340926045e-06, - "loss": 0.8911, - "step": 289 - }, - { - "epoch": 0.8483666504144319, - "grad_norm": 0.6263429758841262, - "learning_rate": 7.106648600509809e-06, - "loss": 0.8704, - "step": 290 - }, - { - "epoch": 0.8512920526572404, - "grad_norm": 0.6284317810383815, - "learning_rate": 7.083419708996641e-06, - "loss": 0.93, - "step": 291 - }, - { - "epoch": 0.8542174549000487, - "grad_norm": 0.6255860929144157, - "learning_rate": 7.060136274508296e-06, - "loss": 0.9188, - "step": 292 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 0.6875778643587599, - "learning_rate": 7.036798906594442e-06, - "loss": 0.8848, - "step": 293 - }, - { - "epoch": 0.8600682593856656, - "grad_norm": 0.6292538167195054, - "learning_rate": 7.0134082162167e-06, - "loss": 0.8798, - "step": 294 - }, - { - "epoch": 0.8629936616284739, - "grad_norm": 0.6780000891153708, - "learning_rate": 6.989964815732643e-06, - "loss": 0.8935, - "step": 295 - }, - { - "epoch": 0.8659190638712823, - "grad_norm": 0.590176250826063, - "learning_rate": 6.9664693188797776e-06, - "loss": 0.9009, - "step": 296 - }, - { - "epoch": 0.8688444661140907, - "grad_norm": 0.5785779594775584, - "learning_rate": 6.942922340759465e-06, - "loss": 0.9051, - "step": 297 - }, - { - "epoch": 0.8717698683568991, - "grad_norm": 0.5978845535479155, - "learning_rate": 6.9193244978208226e-06, - "loss": 0.9006, - "step": 298 - }, - { - "epoch": 0.8746952705997074, - "grad_norm": 0.5859143049414189, - "learning_rate": 6.895676407844587e-06, - "loss": 0.9037, - "step": 299 - }, - { - "epoch": 0.8776206728425159, - "grad_norm": 0.5839240106472464, - "learning_rate": 6.8719786899269426e-06, - "loss": 0.8989, - "step": 300 - }, - { - "epoch": 0.8805460750853242, - "grad_norm": 0.6103420392295683, - "learning_rate": 6.848231964463301e-06, - "loss": 0.8953, - "step": 301 - }, - { - "epoch": 0.8834714773281326, - "grad_norm": 0.5686560770840599, - "learning_rate": 6.8244368531320795e-06, - "loss": 0.8766, - "step": 302 - }, - { - "epoch": 0.886396879570941, - "grad_norm": 0.61072247574655, - "learning_rate": 6.800593978878407e-06, - "loss": 0.8762, - "step": 303 - }, - { - "epoch": 0.8893222818137494, - "grad_norm": 0.6158369105071173, - "learning_rate": 6.776703965897831e-06, - "loss": 0.9028, - "step": 304 - }, - { - "epoch": 0.8922476840565577, - "grad_norm": 0.5937764317875888, - "learning_rate": 6.752767439619961e-06, - "loss": 0.9047, - "step": 305 - }, - { - "epoch": 0.8951730862993662, - "grad_norm": 0.5936048944160339, - "learning_rate": 6.728785026692113e-06, - "loss": 0.9093, - "step": 306 - }, - { - "epoch": 0.8980984885421746, - "grad_norm": 0.5761666127145109, - "learning_rate": 6.704757354962888e-06, - "loss": 0.8963, - "step": 307 - }, - { - "epoch": 0.9010238907849829, - "grad_norm": 0.5698243858767273, - "learning_rate": 6.680685053465743e-06, - "loss": 0.889, - "step": 308 - }, - { - "epoch": 0.9039492930277914, - "grad_norm": 0.6187986831036327, - "learning_rate": 6.656568752402521e-06, - "loss": 0.8793, - "step": 309 - }, - { - "epoch": 0.9068746952705997, - "grad_norm": 0.599039810186881, - "learning_rate": 6.632409083126959e-06, - "loss": 0.9014, - "step": 310 - }, - { - "epoch": 0.9098000975134081, - "grad_norm": 0.5965723397933764, - "learning_rate": 6.608206678128143e-06, - "loss": 0.9036, - "step": 311 - }, - { - "epoch": 0.9127254997562165, - "grad_norm": 0.5993919939223842, - "learning_rate": 6.583962171013974e-06, - "loss": 0.8921, - "step": 312 - }, - { - "epoch": 0.9156509019990249, - "grad_norm": 0.5924590487598618, - "learning_rate": 6.559676196494555e-06, - "loss": 0.9203, - "step": 313 - }, - { - "epoch": 0.9185763042418332, - "grad_norm": 0.5637168699409383, - "learning_rate": 6.535349390365597e-06, - "loss": 0.8827, - "step": 314 - }, - { - "epoch": 0.9215017064846417, - "grad_norm": 0.6068559586638762, - "learning_rate": 6.510982389491756e-06, - "loss": 0.9011, - "step": 315 - }, - { - "epoch": 0.92442710872745, - "grad_norm": 0.5818145636426237, - "learning_rate": 6.486575831789974e-06, - "loss": 0.8927, - "step": 316 - }, - { - "epoch": 0.9273525109702584, - "grad_norm": 0.6129278738869518, - "learning_rate": 6.462130356212768e-06, - "loss": 0.9059, - "step": 317 - }, - { - "epoch": 0.9302779132130667, - "grad_norm": 0.5842516474759907, - "learning_rate": 6.437646602731509e-06, - "loss": 0.8781, - "step": 318 - }, - { - "epoch": 0.9332033154558752, - "grad_norm": 0.5740360820130547, - "learning_rate": 6.413125212319664e-06, - "loss": 0.9031, - "step": 319 - }, - { - "epoch": 0.9361287176986836, - "grad_norm": 0.6305695619214373, - "learning_rate": 6.388566826936025e-06, - "loss": 0.9111, - "step": 320 - }, - { - "epoch": 0.9390541199414919, - "grad_norm": 0.6403885483257377, - "learning_rate": 6.363972089507886e-06, - "loss": 0.8765, - "step": 321 - }, - { - "epoch": 0.9419795221843004, - "grad_norm": 0.634525294542331, - "learning_rate": 6.3393416439142255e-06, - "loss": 0.9096, - "step": 322 - }, - { - "epoch": 0.9449049244271087, - "grad_norm": 0.629802499730145, - "learning_rate": 6.314676134968845e-06, - "loss": 0.8842, - "step": 323 - }, - { - "epoch": 0.9478303266699171, - "grad_norm": 0.6138154866450481, - "learning_rate": 6.28997620840349e-06, - "loss": 0.9042, - "step": 324 - }, - { - "epoch": 0.9507557289127255, - "grad_norm": 0.5889081155134466, - "learning_rate": 6.26524251085094e-06, - "loss": 0.8978, - "step": 325 - }, - { - "epoch": 0.9536811311555339, - "grad_norm": 0.5885045686141304, - "learning_rate": 6.240475689828087e-06, - "loss": 0.8669, - "step": 326 - }, - { - "epoch": 0.9566065333983422, - "grad_norm": 0.5813834952894164, - "learning_rate": 6.21567639371898e-06, - "loss": 0.8573, - "step": 327 - }, - { - "epoch": 0.9595319356411507, - "grad_norm": 0.6297897811406099, - "learning_rate": 6.190845271757846e-06, - "loss": 0.901, - "step": 328 - }, - { - "epoch": 0.962457337883959, - "grad_norm": 0.5833991294096019, - "learning_rate": 6.165982974012104e-06, - "loss": 0.9206, - "step": 329 - }, - { - "epoch": 0.9653827401267674, - "grad_norm": 0.5693519759193476, - "learning_rate": 6.141090151365341e-06, - "loss": 0.8581, - "step": 330 - }, - { - "epoch": 0.9683081423695759, - "grad_norm": 0.6086478543119002, - "learning_rate": 6.116167455500265e-06, - "loss": 0.8945, - "step": 331 - }, - { - "epoch": 0.9712335446123842, - "grad_norm": 0.5650924803920132, - "learning_rate": 6.0912155388816584e-06, - "loss": 0.8941, - "step": 332 - }, - { - "epoch": 0.9741589468551926, - "grad_norm": 0.5903130575046514, - "learning_rate": 6.066235054739289e-06, - "loss": 0.8899, - "step": 333 - }, - { - "epoch": 0.977084349098001, - "grad_norm": 0.5975948126480749, - "learning_rate": 6.041226657050804e-06, - "loss": 0.8607, - "step": 334 - }, - { - "epoch": 0.9800097513408094, - "grad_norm": 0.6025105840404439, - "learning_rate": 6.01619100052462e-06, - "loss": 0.8885, - "step": 335 - }, - { - "epoch": 0.9829351535836177, - "grad_norm": 0.6046007091634541, - "learning_rate": 5.991128740582774e-06, - "loss": 0.8897, - "step": 336 - }, - { - "epoch": 0.9858605558264262, - "grad_norm": 0.5434521802533933, - "learning_rate": 5.966040533343772e-06, - "loss": 0.8844, - "step": 337 - }, - { - "epoch": 0.9887859580692345, - "grad_norm": 0.6303574796088046, - "learning_rate": 5.9409270356054025e-06, - "loss": 0.8802, - "step": 338 - }, - { - "epoch": 0.9917113603120429, - "grad_norm": 0.5819203211883127, - "learning_rate": 5.915788904827553e-06, - "loss": 0.8948, - "step": 339 - }, - { - "epoch": 0.9946367625548513, - "grad_norm": 0.5729927595936578, - "learning_rate": 5.890626799114991e-06, - "loss": 0.9198, - "step": 340 - }, - { - "epoch": 0.9975621647976597, - "grad_norm": 0.5643044236514093, - "learning_rate": 5.865441377200137e-06, - "loss": 0.8738, - "step": 341 - }, - { - "epoch": 1.000487567040468, - "grad_norm": 0.5893972684647703, - "learning_rate": 5.840233298425818e-06, - "loss": 0.8821, - "step": 342 - }, - { - "epoch": 1.0007308160779538, - "grad_norm": 0.68110454506424, - "learning_rate": 5.815003222728007e-06, - "loss": 0.8728, - "step": 343 - }, - { - "epoch": 1.0036540803897687, - "grad_norm": 0.8602651253070958, - "learning_rate": 5.789751810618551e-06, - "loss": 0.7967, - "step": 344 - }, - { - "epoch": 1.0036540803897687, - "eval_loss": 1.365891695022583, - "eval_runtime": 65.455, - "eval_samples_per_second": 14.804, - "eval_steps_per_second": 2.475, - "step": 344 - }, - { - "epoch": 1.0065773447015833, - "grad_norm": 0.6693783832444782, - "learning_rate": 5.764479723167867e-06, - "loss": 0.8162, - "step": 345 - }, - { - "epoch": 1.0095006090133982, - "grad_norm": 0.8649026077052362, - "learning_rate": 5.739187621987649e-06, - "loss": 0.7811, - "step": 346 - }, - { - "epoch": 1.012423873325213, - "grad_norm": 0.7283056665598604, - "learning_rate": 5.713876169213538e-06, - "loss": 0.8023, - "step": 347 - }, - { - "epoch": 1.015347137637028, - "grad_norm": 0.6464070682314531, - "learning_rate": 5.688546027487793e-06, - "loss": 0.8299, - "step": 348 - }, - { - "epoch": 1.0182704019488429, - "grad_norm": 0.7459265332536604, - "learning_rate": 5.663197859941938e-06, - "loss": 0.7923, - "step": 349 - }, - { - "epoch": 1.0211936662606578, - "grad_norm": 0.6961432197550056, - "learning_rate": 5.637832330179409e-06, - "loss": 0.7985, - "step": 350 - }, - { - "epoch": 1.0241169305724727, - "grad_norm": 0.6700485502738509, - "learning_rate": 5.612450102258175e-06, - "loss": 0.7976, - "step": 351 - }, - { - "epoch": 1.0270401948842875, - "grad_norm": 0.7702215817383979, - "learning_rate": 5.587051840673356e-06, - "loss": 0.8131, - "step": 352 - }, - { - "epoch": 1.0299634591961022, - "grad_norm": 0.6852662863129292, - "learning_rate": 5.56163821033982e-06, - "loss": 0.8029, - "step": 353 - }, - { - "epoch": 1.032886723507917, - "grad_norm": 0.7217757781141521, - "learning_rate": 5.536209876574793e-06, - "loss": 0.7978, - "step": 354 - }, - { - "epoch": 1.035809987819732, - "grad_norm": 0.7300263042947811, - "learning_rate": 5.510767505080419e-06, - "loss": 0.8166, - "step": 355 - }, - { - "epoch": 1.0387332521315469, - "grad_norm": 0.6920161152507025, - "learning_rate": 5.4853117619263496e-06, - "loss": 0.8128, - "step": 356 - }, - { - "epoch": 1.0416565164433618, - "grad_norm": 0.7747760542004816, - "learning_rate": 5.4598433135323015e-06, - "loss": 0.8232, - "step": 357 - }, - { - "epoch": 1.0445797807551767, - "grad_norm": 0.6763340836215797, - "learning_rate": 5.434362826650603e-06, - "loss": 0.8125, - "step": 358 - }, - { - "epoch": 1.0475030450669915, - "grad_norm": 0.6209130990558949, - "learning_rate": 5.4088709683487494e-06, - "loss": 0.808, - "step": 359 - }, - { - "epoch": 1.0504263093788064, - "grad_norm": 0.6893839443256197, - "learning_rate": 5.383368405991932e-06, - "loss": 0.8136, - "step": 360 - }, - { - "epoch": 1.053349573690621, - "grad_norm": 0.6233006238408513, - "learning_rate": 5.357855807225573e-06, - "loss": 0.8202, - "step": 361 - }, - { - "epoch": 1.056272838002436, - "grad_norm": 0.6163295638238943, - "learning_rate": 5.332333839957835e-06, - "loss": 0.7972, - "step": 362 - }, - { - "epoch": 1.0591961023142509, - "grad_norm": 0.6239471219126078, - "learning_rate": 5.3068031723421545e-06, - "loss": 0.7933, - "step": 363 - }, - { - "epoch": 1.0621193666260658, - "grad_norm": 0.6491549577786548, - "learning_rate": 5.281264472759731e-06, - "loss": 0.8144, - "step": 364 - }, - { - "epoch": 1.0650426309378807, - "grad_norm": 0.6527091363470504, - "learning_rate": 5.255718409802041e-06, - "loss": 0.7687, - "step": 365 - }, - { - "epoch": 1.0679658952496955, - "grad_norm": 0.6385672567759509, - "learning_rate": 5.230165652253329e-06, - "loss": 0.8159, - "step": 366 - }, - { - "epoch": 1.0708891595615104, - "grad_norm": 0.5864111560675286, - "learning_rate": 5.2046068690731035e-06, - "loss": 0.7831, - "step": 367 - }, - { - "epoch": 1.0738124238733253, - "grad_norm": 0.6194785781749127, - "learning_rate": 5.179042729378616e-06, - "loss": 0.8052, - "step": 368 - }, - { - "epoch": 1.07673568818514, - "grad_norm": 0.6362356799862054, - "learning_rate": 5.153473902427355e-06, - "loss": 0.8196, - "step": 369 - }, - { - "epoch": 1.0796589524969549, - "grad_norm": 0.5942323267279191, - "learning_rate": 5.127901057599517e-06, - "loss": 0.8161, - "step": 370 - }, - { - "epoch": 1.0825822168087698, - "grad_norm": 0.5742483554036059, - "learning_rate": 5.1023248643804845e-06, - "loss": 0.8228, - "step": 371 - }, - { - "epoch": 1.0855054811205846, - "grad_norm": 0.5860306244571696, - "learning_rate": 5.076745992343297e-06, - "loss": 0.8212, - "step": 372 - }, - { - "epoch": 1.0884287454323995, - "grad_norm": 0.5773619726397791, - "learning_rate": 5.0511651111311285e-06, - "loss": 0.8018, - "step": 373 - }, - { - "epoch": 1.0913520097442144, - "grad_norm": 0.6105870953754982, - "learning_rate": 5.025582890439752e-06, - "loss": 0.8227, - "step": 374 - }, - { - "epoch": 1.0942752740560293, - "grad_norm": 0.608222048968305, - "learning_rate": 5e-06, - "loss": 0.7955, - "step": 375 - }, - { - "epoch": 1.0971985383678442, - "grad_norm": 0.602520252718154, - "learning_rate": 4.97441710956025e-06, - "loss": 0.8038, - "step": 376 - }, - { - "epoch": 1.1001218026796589, - "grad_norm": 0.647759737480258, - "learning_rate": 4.948834888868872e-06, - "loss": 0.7891, - "step": 377 - }, - { - "epoch": 1.1030450669914738, - "grad_norm": 0.6466942994000808, - "learning_rate": 4.9232540076567034e-06, - "loss": 0.838, - "step": 378 - }, - { - "epoch": 1.1059683313032886, - "grad_norm": 0.7597398060034541, - "learning_rate": 4.897675135619517e-06, - "loss": 0.7992, - "step": 379 - }, - { - "epoch": 1.1088915956151035, - "grad_norm": 0.5916241197525671, - "learning_rate": 4.872098942400484e-06, - "loss": 0.834, - "step": 380 - }, - { - "epoch": 1.1118148599269184, - "grad_norm": 0.7488543256724444, - "learning_rate": 4.846526097572646e-06, - "loss": 0.8056, - "step": 381 - }, - { - "epoch": 1.1147381242387333, - "grad_norm": 0.6127152632610644, - "learning_rate": 4.820957270621385e-06, - "loss": 0.8152, - "step": 382 - }, - { - "epoch": 1.1176613885505482, - "grad_norm": 0.6772622229366005, - "learning_rate": 4.795393130926899e-06, - "loss": 0.8112, - "step": 383 - }, - { - "epoch": 1.1205846528623629, - "grad_norm": 0.6100582490809515, - "learning_rate": 4.769834347746672e-06, - "loss": 0.7971, - "step": 384 - }, - { - "epoch": 1.1235079171741778, - "grad_norm": 0.6138527497182587, - "learning_rate": 4.744281590197961e-06, - "loss": 0.8096, - "step": 385 - }, - { - "epoch": 1.1264311814859926, - "grad_norm": 0.5946729261402381, - "learning_rate": 4.71873552724027e-06, - "loss": 0.7972, - "step": 386 - }, - { - "epoch": 1.1293544457978075, - "grad_norm": 0.6272869943876259, - "learning_rate": 4.693196827657848e-06, - "loss": 0.8099, - "step": 387 - }, - { - "epoch": 1.1322777101096224, - "grad_norm": 0.6709827503445973, - "learning_rate": 4.667666160042166e-06, - "loss": 0.8035, - "step": 388 - }, - { - "epoch": 1.1352009744214373, - "grad_norm": 0.5802507520284231, - "learning_rate": 4.642144192774429e-06, - "loss": 0.8259, - "step": 389 - }, - { - "epoch": 1.1381242387332522, - "grad_norm": 0.5934148999822147, - "learning_rate": 4.616631594008069e-06, - "loss": 0.7784, - "step": 390 - }, - { - "epoch": 1.141047503045067, - "grad_norm": 0.670256603061476, - "learning_rate": 4.591129031651252e-06, - "loss": 0.7982, - "step": 391 - }, - { - "epoch": 1.143970767356882, - "grad_norm": 0.5729733544495296, - "learning_rate": 4.5656371733493986e-06, - "loss": 0.7902, - "step": 392 - }, - { - "epoch": 1.1468940316686966, - "grad_norm": 0.6506904279982785, - "learning_rate": 4.540156686467699e-06, - "loss": 0.8144, - "step": 393 - }, - { - "epoch": 1.1498172959805115, - "grad_norm": 0.6305703446057582, - "learning_rate": 4.514688238073651e-06, - "loss": 0.8346, - "step": 394 - }, - { - "epoch": 1.1527405602923264, - "grad_norm": 0.6207821437251902, - "learning_rate": 4.489232494919583e-06, - "loss": 0.814, - "step": 395 - }, - { - "epoch": 1.1556638246041413, - "grad_norm": 0.6326520891011468, - "learning_rate": 4.463790123425209e-06, - "loss": 0.8173, - "step": 396 - }, - { - "epoch": 1.1585870889159562, - "grad_norm": 0.6361956657928358, - "learning_rate": 4.438361789660182e-06, - "loss": 0.8024, - "step": 397 - }, - { - "epoch": 1.161510353227771, - "grad_norm": 0.6274266062590439, - "learning_rate": 4.412948159326647e-06, - "loss": 0.8029, - "step": 398 - }, - { - "epoch": 1.164433617539586, - "grad_norm": 0.5964467349901271, - "learning_rate": 4.3875498977418255e-06, - "loss": 0.8064, - "step": 399 - }, - { - "epoch": 1.1673568818514006, - "grad_norm": 0.7270366961339101, - "learning_rate": 4.362167669820593e-06, - "loss": 0.8061, - "step": 400 - }, - { - "epoch": 1.1702801461632155, - "grad_norm": 0.5862080854450132, - "learning_rate": 4.3368021400580635e-06, - "loss": 0.8113, - "step": 401 - }, - { - "epoch": 1.1732034104750304, - "grad_norm": 0.6141962978102556, - "learning_rate": 4.311453972512209e-06, - "loss": 0.8161, - "step": 402 - }, - { - "epoch": 1.1761266747868453, - "grad_norm": 0.6096769412968969, - "learning_rate": 4.286123830786463e-06, - "loss": 0.7977, - "step": 403 - }, - { - "epoch": 1.1790499390986602, - "grad_norm": 0.5895182827045679, - "learning_rate": 4.260812378012353e-06, - "loss": 0.8123, - "step": 404 - }, - { - "epoch": 1.181973203410475, - "grad_norm": 0.6004537708021467, - "learning_rate": 4.235520276832134e-06, - "loss": 0.797, - "step": 405 - }, - { - "epoch": 1.18489646772229, - "grad_norm": 0.5948518568661362, - "learning_rate": 4.2102481893814504e-06, - "loss": 0.8172, - "step": 406 - }, - { - "epoch": 1.1878197320341048, - "grad_norm": 0.5912718596576074, - "learning_rate": 4.1849967772719935e-06, - "loss": 0.7908, - "step": 407 - }, - { - "epoch": 1.1907429963459195, - "grad_norm": 0.5912188183434732, - "learning_rate": 4.159766701574184e-06, - "loss": 0.7847, - "step": 408 - }, - { - "epoch": 1.1936662606577344, - "grad_norm": 0.6458915180295917, - "learning_rate": 4.1345586227998645e-06, - "loss": 0.7956, - "step": 409 - }, - { - "epoch": 1.1965895249695493, - "grad_norm": 0.5860427146707542, - "learning_rate": 4.109373200885011e-06, - "loss": 0.8171, - "step": 410 - }, - { - "epoch": 1.1995127892813642, - "grad_norm": 0.5965859399277259, - "learning_rate": 4.084211095172448e-06, - "loss": 0.8057, - "step": 411 - }, - { - "epoch": 1.202436053593179, - "grad_norm": 0.6145061256713188, - "learning_rate": 4.059072964394599e-06, - "loss": 0.7856, - "step": 412 - }, - { - "epoch": 1.205359317904994, - "grad_norm": 0.5938060609434931, - "learning_rate": 4.0339594666562294e-06, - "loss": 0.8035, - "step": 413 - }, - { - "epoch": 1.2082825822168088, - "grad_norm": 0.6074730571618651, - "learning_rate": 4.008871259417227e-06, - "loss": 0.7978, - "step": 414 - }, - { - "epoch": 1.2112058465286237, - "grad_norm": 0.5955227016960192, - "learning_rate": 3.983808999475381e-06, - "loss": 0.8164, - "step": 415 - }, - { - "epoch": 1.2141291108404384, - "grad_norm": 0.5890660617212938, - "learning_rate": 3.958773342949196e-06, - "loss": 0.8189, - "step": 416 - }, - { - "epoch": 1.2170523751522533, - "grad_norm": 0.6069302365393437, - "learning_rate": 3.933764945260712e-06, - "loss": 0.8037, - "step": 417 - }, - { - "epoch": 1.2199756394640682, - "grad_norm": 0.596287098938014, - "learning_rate": 3.9087844611183415e-06, - "loss": 0.8057, - "step": 418 - }, - { - "epoch": 1.222898903775883, - "grad_norm": 0.6002784512679291, - "learning_rate": 3.883832544499735e-06, - "loss": 0.7997, - "step": 419 - }, - { - "epoch": 1.225822168087698, - "grad_norm": 0.6099781756811369, - "learning_rate": 3.858909848634661e-06, - "loss": 0.8128, - "step": 420 - }, - { - "epoch": 1.2287454323995128, - "grad_norm": 0.6974709875164911, - "learning_rate": 3.8340170259878965e-06, - "loss": 0.7665, - "step": 421 - }, - { - "epoch": 1.2316686967113277, - "grad_norm": 0.5941348378805758, - "learning_rate": 3.8091547282421544e-06, - "loss": 0.8018, - "step": 422 - }, - { - "epoch": 1.2345919610231424, - "grad_norm": 0.6157891176270607, - "learning_rate": 3.7843236062810216e-06, - "loss": 0.7911, - "step": 423 - }, - { - "epoch": 1.2375152253349573, - "grad_norm": 0.6544854607872745, - "learning_rate": 3.7595243101719126e-06, - "loss": 0.8172, - "step": 424 - }, - { - "epoch": 1.2404384896467722, - "grad_norm": 0.5745985979840479, - "learning_rate": 3.7347574891490597e-06, - "loss": 0.804, - "step": 425 - }, - { - "epoch": 1.243361753958587, - "grad_norm": 0.6354083348942862, - "learning_rate": 3.71002379159651e-06, - "loss": 0.7796, - "step": 426 - }, - { - "epoch": 1.246285018270402, - "grad_norm": 0.620970843336737, - "learning_rate": 3.6853238650311553e-06, - "loss": 0.8224, - "step": 427 - }, - { - "epoch": 1.2492082825822168, - "grad_norm": 0.6232417871814647, - "learning_rate": 3.6606583560857757e-06, - "loss": 0.818, - "step": 428 - }, - { - "epoch": 1.2521315468940317, - "grad_norm": 0.6127780408174556, - "learning_rate": 3.636027910492115e-06, - "loss": 0.8117, - "step": 429 - }, - { - "epoch": 1.2550548112058466, - "grad_norm": 0.5789315663515364, - "learning_rate": 3.611433173063976e-06, - "loss": 0.7912, - "step": 430 - }, - { - "epoch": 1.2550548112058466, - "eval_loss": 1.3727632761001587, - "eval_runtime": 65.3881, - "eval_samples_per_second": 14.819, - "eval_steps_per_second": 2.478, - "step": 430 - }, - { - "epoch": 1.2579780755176615, - "grad_norm": 0.6009615498767048, - "learning_rate": 3.5868747876803356e-06, - "loss": 0.7952, - "step": 431 - }, - { - "epoch": 1.2609013398294762, - "grad_norm": 0.6045746410233208, - "learning_rate": 3.562353397268492e-06, - "loss": 0.7934, - "step": 432 - }, - { - "epoch": 1.263824604141291, - "grad_norm": 0.5931695254070988, - "learning_rate": 3.5378696437872334e-06, - "loss": 0.8137, - "step": 433 - }, - { - "epoch": 1.266747868453106, - "grad_norm": 0.6149684612885844, - "learning_rate": 3.5134241682100266e-06, - "loss": 0.7945, - "step": 434 - }, - { - "epoch": 1.2696711327649208, - "grad_norm": 0.572462214001696, - "learning_rate": 3.4890176105082436e-06, - "loss": 0.8232, - "step": 435 - }, - { - "epoch": 1.2725943970767357, - "grad_norm": 0.5900294807807364, - "learning_rate": 3.464650609634403e-06, - "loss": 0.812, - "step": 436 - }, - { - "epoch": 1.2755176613885506, - "grad_norm": 0.5719536650263874, - "learning_rate": 3.4403238035054453e-06, - "loss": 0.7951, - "step": 437 - }, - { - "epoch": 1.2784409257003655, - "grad_norm": 0.5654407271189463, - "learning_rate": 3.4160378289860272e-06, - "loss": 0.8344, - "step": 438 - }, - { - "epoch": 1.2813641900121802, - "grad_norm": 0.6141919442543494, - "learning_rate": 3.391793321871857e-06, - "loss": 0.8152, - "step": 439 - }, - { - "epoch": 1.284287454323995, - "grad_norm": 0.5873637470479766, - "learning_rate": 3.3675909168730435e-06, - "loss": 0.7867, - "step": 440 - }, - { - "epoch": 1.28721071863581, - "grad_norm": 0.5902445618697549, - "learning_rate": 3.343431247597479e-06, - "loss": 0.7765, - "step": 441 - }, - { - "epoch": 1.2901339829476248, - "grad_norm": 0.6015910173648741, - "learning_rate": 3.319314946534258e-06, - "loss": 0.8018, - "step": 442 - }, - { - "epoch": 1.2930572472594397, - "grad_norm": 0.5712364823565985, - "learning_rate": 3.295242645037112e-06, - "loss": 0.8085, - "step": 443 - }, - { - "epoch": 1.2959805115712546, - "grad_norm": 0.5562180227854343, - "learning_rate": 3.271214973307887e-06, - "loss": 0.8129, - "step": 444 - }, - { - "epoch": 1.2989037758830695, - "grad_norm": 0.5839696563715857, - "learning_rate": 3.2472325603800382e-06, - "loss": 0.7825, - "step": 445 - }, - { - "epoch": 1.3018270401948842, - "grad_norm": 0.5749209131454011, - "learning_rate": 3.2232960341021703e-06, - "loss": 0.7829, - "step": 446 - }, - { - "epoch": 1.3047503045066993, - "grad_norm": 0.5833897427603767, - "learning_rate": 3.1994060211215936e-06, - "loss": 0.7875, - "step": 447 - }, - { - "epoch": 1.307673568818514, - "grad_norm": 0.5642135859057068, - "learning_rate": 3.1755631468679217e-06, - "loss": 0.7904, - "step": 448 - }, - { - "epoch": 1.3105968331303288, - "grad_norm": 0.644206424516224, - "learning_rate": 3.1517680355366985e-06, - "loss": 0.7944, - "step": 449 - }, - { - "epoch": 1.3135200974421437, - "grad_norm": 0.6001327772482385, - "learning_rate": 3.1280213100730595e-06, - "loss": 0.805, - "step": 450 - }, - { - "epoch": 1.3164433617539586, - "grad_norm": 0.5813006153323479, - "learning_rate": 3.104323592155413e-06, - "loss": 0.8016, - "step": 451 - }, - { - "epoch": 1.3193666260657735, - "grad_norm": 0.5928090466919959, - "learning_rate": 3.08067550217918e-06, - "loss": 0.7969, - "step": 452 - }, - { - "epoch": 1.3222898903775884, - "grad_norm": 0.5908564631070854, - "learning_rate": 3.057077659240538e-06, - "loss": 0.8087, - "step": 453 - }, - { - "epoch": 1.3252131546894033, - "grad_norm": 0.5768938363404773, - "learning_rate": 3.033530681120225e-06, - "loss": 0.7975, - "step": 454 - }, - { - "epoch": 1.328136419001218, - "grad_norm": 0.5898657463165522, - "learning_rate": 3.0100351842673593e-06, - "loss": 0.8066, - "step": 455 - }, - { - "epoch": 1.3310596833130328, - "grad_norm": 0.6028044807784166, - "learning_rate": 2.9865917837833025e-06, - "loss": 0.792, - "step": 456 - }, - { - "epoch": 1.3339829476248477, - "grad_norm": 0.5902269088580138, - "learning_rate": 2.963201093405559e-06, - "loss": 0.7963, - "step": 457 - }, - { - "epoch": 1.3369062119366626, - "grad_norm": 0.570908075119228, - "learning_rate": 2.939863725491706e-06, - "loss": 0.7971, - "step": 458 - }, - { - "epoch": 1.3398294762484775, - "grad_norm": 0.6173543812499931, - "learning_rate": 2.916580291003361e-06, - "loss": 0.8191, - "step": 459 - }, - { - "epoch": 1.3427527405602924, - "grad_norm": 0.6343832043531719, - "learning_rate": 2.893351399490194e-06, - "loss": 0.8074, - "step": 460 - }, - { - "epoch": 1.3456760048721073, - "grad_norm": 0.5782617555335829, - "learning_rate": 2.870177659073958e-06, - "loss": 0.7978, - "step": 461 - }, - { - "epoch": 1.348599269183922, - "grad_norm": 0.6172015597729136, - "learning_rate": 2.84705967643258e-06, - "loss": 0.7965, - "step": 462 - }, - { - "epoch": 1.351522533495737, - "grad_norm": 0.6359340039521935, - "learning_rate": 2.8239980567842734e-06, - "loss": 0.8076, - "step": 463 - }, - { - "epoch": 1.3544457978075517, - "grad_norm": 0.6717197639966513, - "learning_rate": 2.800993403871694e-06, - "loss": 0.7873, - "step": 464 - }, - { - "epoch": 1.3573690621193666, - "grad_norm": 0.5964892377721963, - "learning_rate": 2.778046319946135e-06, - "loss": 0.7964, - "step": 465 - }, - { - "epoch": 1.3602923264311815, - "grad_norm": 0.6014805489255499, - "learning_rate": 2.75515740575176e-06, - "loss": 0.785, - "step": 466 - }, - { - "epoch": 1.3632155907429964, - "grad_norm": 0.59722861045332, - "learning_rate": 2.7323272605098718e-06, - "loss": 0.8169, - "step": 467 - }, - { - "epoch": 1.3661388550548113, - "grad_norm": 0.5872035783585831, - "learning_rate": 2.709556481903234e-06, - "loss": 0.7847, - "step": 468 - }, - { - "epoch": 1.3690621193666261, - "grad_norm": 0.5758494465266432, - "learning_rate": 2.686845666060416e-06, - "loss": 0.8166, - "step": 469 - }, - { - "epoch": 1.371985383678441, - "grad_norm": 0.5970395782412861, - "learning_rate": 2.6641954075401904e-06, - "loss": 0.7806, - "step": 470 - }, - { - "epoch": 1.3749086479902557, - "grad_norm": 0.5973735252514245, - "learning_rate": 2.6416062993159673e-06, - "loss": 0.8073, - "step": 471 - }, - { - "epoch": 1.3778319123020706, - "grad_norm": 0.6080027009026632, - "learning_rate": 2.6190789327602695e-06, - "loss": 0.8054, - "step": 472 - }, - { - "epoch": 1.3807551766138855, - "grad_norm": 0.575058281647644, - "learning_rate": 2.5966138976292525e-06, - "loss": 0.8219, - "step": 473 - }, - { - "epoch": 1.3836784409257004, - "grad_norm": 0.5511199798733908, - "learning_rate": 2.5742117820472585e-06, - "loss": 0.8058, - "step": 474 - }, - { - "epoch": 1.3866017052375152, - "grad_norm": 0.5894915190468311, - "learning_rate": 2.5518731724914296e-06, - "loss": 0.8003, - "step": 475 - }, - { - "epoch": 1.3895249695493301, - "grad_norm": 0.6008815228155614, - "learning_rate": 2.529598653776349e-06, - "loss": 0.7676, - "step": 476 - }, - { - "epoch": 1.392448233861145, - "grad_norm": 0.5582304902542878, - "learning_rate": 2.5073888090387288e-06, - "loss": 0.8153, - "step": 477 - }, - { - "epoch": 1.3953714981729597, - "grad_norm": 0.5361237402188473, - "learning_rate": 2.485244219722148e-06, - "loss": 0.7952, - "step": 478 - }, - { - "epoch": 1.3982947624847748, - "grad_norm": 0.5689438925320016, - "learning_rate": 2.4631654655618293e-06, - "loss": 0.8035, - "step": 479 - }, - { - "epoch": 1.4012180267965895, - "grad_norm": 0.5874254989274982, - "learning_rate": 2.4411531245694546e-06, - "loss": 0.7966, - "step": 480 - }, - { - "epoch": 1.4041412911084044, - "grad_norm": 0.5749456696437114, - "learning_rate": 2.4192077730180475e-06, - "loss": 0.8173, - "step": 481 - }, - { - "epoch": 1.4070645554202192, - "grad_norm": 0.5672450900355861, - "learning_rate": 2.3973299854268743e-06, - "loss": 0.7861, - "step": 482 - }, - { - "epoch": 1.4099878197320341, - "grad_norm": 0.5822534798540568, - "learning_rate": 2.3755203345464093e-06, - "loss": 0.8107, - "step": 483 - }, - { - "epoch": 1.412911084043849, - "grad_norm": 0.5735638363168305, - "learning_rate": 2.3537793913433386e-06, - "loss": 0.8111, - "step": 484 - }, - { - "epoch": 1.415834348355664, - "grad_norm": 0.5630609970931171, - "learning_rate": 2.3321077249856123e-06, - "loss": 0.8195, - "step": 485 - }, - { - "epoch": 1.4187576126674788, - "grad_norm": 0.5673965152389683, - "learning_rate": 2.3105059028275467e-06, - "loss": 0.7669, - "step": 486 - }, - { - "epoch": 1.4216808769792935, - "grad_norm": 0.5941423353872316, - "learning_rate": 2.288974490394962e-06, - "loss": 0.805, - "step": 487 - }, - { - "epoch": 1.4246041412911084, - "grad_norm": 0.583271460902054, - "learning_rate": 2.2675140513703924e-06, - "loss": 0.8062, - "step": 488 - }, - { - "epoch": 1.4275274056029232, - "grad_norm": 0.5569237007034656, - "learning_rate": 2.246125147578316e-06, - "loss": 0.8198, - "step": 489 - }, - { - "epoch": 1.4304506699147381, - "grad_norm": 0.556510419136389, - "learning_rate": 2.224808338970452e-06, - "loss": 0.8003, - "step": 490 - }, - { - "epoch": 1.433373934226553, - "grad_norm": 0.585029225572491, - "learning_rate": 2.2035641836111014e-06, - "loss": 0.8344, - "step": 491 - }, - { - "epoch": 1.436297198538368, - "grad_norm": 0.5623539594731303, - "learning_rate": 2.182393237662536e-06, - "loss": 0.795, - "step": 492 - }, - { - "epoch": 1.4392204628501828, - "grad_norm": 0.5730620202071608, - "learning_rate": 2.161296055370441e-06, - "loss": 0.8072, - "step": 493 - }, - { - "epoch": 1.4421437271619975, - "grad_norm": 0.5858480193047055, - "learning_rate": 2.140273189049396e-06, - "loss": 0.808, - "step": 494 - }, - { - "epoch": 1.4450669914738123, - "grad_norm": 0.5833871713302182, - "learning_rate": 2.1193251890684297e-06, - "loss": 0.7781, - "step": 495 - }, - { - "epoch": 1.4479902557856272, - "grad_norm": 0.5679251391544968, - "learning_rate": 2.0984526038366005e-06, - "loss": 0.7848, - "step": 496 - }, - { - "epoch": 1.4509135200974421, - "grad_norm": 0.5628769222538363, - "learning_rate": 2.0776559797886445e-06, - "loss": 0.8082, - "step": 497 - }, - { - "epoch": 1.453836784409257, - "grad_norm": 0.6023793540211936, - "learning_rate": 2.0569358613706685e-06, - "loss": 0.8097, - "step": 498 - }, - { - "epoch": 1.456760048721072, - "grad_norm": 0.5794155584920526, - "learning_rate": 2.036292791025899e-06, - "loss": 0.7773, - "step": 499 - }, - { - "epoch": 1.4596833130328868, - "grad_norm": 0.5696461615256238, - "learning_rate": 2.0157273091804725e-06, - "loss": 0.7882, - "step": 500 - }, - { - "epoch": 1.4626065773447015, - "grad_norm": 0.5827853659037967, - "learning_rate": 1.995239954229301e-06, - "loss": 0.7998, - "step": 501 - }, - { - "epoch": 1.4655298416565166, - "grad_norm": 0.5615811380760124, - "learning_rate": 1.9748312625219674e-06, - "loss": 0.7792, - "step": 502 - }, - { - "epoch": 1.4684531059683312, - "grad_norm": 0.5702951187471944, - "learning_rate": 1.954501768348687e-06, - "loss": 0.8027, - "step": 503 - }, - { - "epoch": 1.4713763702801461, - "grad_norm": 0.5859521759939277, - "learning_rate": 1.9342520039263206e-06, - "loss": 0.7939, - "step": 504 - }, - { - "epoch": 1.474299634591961, - "grad_norm": 0.5622948101159606, - "learning_rate": 1.9140824993844396e-06, - "loss": 0.8005, - "step": 505 - }, - { - "epoch": 1.477222898903776, - "grad_norm": 0.5709515740173828, - "learning_rate": 1.8939937827514509e-06, - "loss": 0.7931, - "step": 506 - }, - { - "epoch": 1.4801461632155908, - "grad_norm": 0.5744268405944757, - "learning_rate": 1.8739863799407644e-06, - "loss": 0.7715, - "step": 507 - }, - { - "epoch": 1.4830694275274057, - "grad_norm": 0.5658023138017119, - "learning_rate": 1.8540608147370386e-06, - "loss": 0.8154, - "step": 508 - }, - { - "epoch": 1.4859926918392206, - "grad_norm": 0.5683407498717216, - "learning_rate": 1.8342176087824576e-06, - "loss": 0.8085, - "step": 509 - }, - { - "epoch": 1.4889159561510352, - "grad_norm": 0.5689375651699262, - "learning_rate": 1.814457281563078e-06, - "loss": 0.8053, - "step": 510 - }, - { - "epoch": 1.4918392204628501, - "grad_norm": 0.5766874459611464, - "learning_rate": 1.7947803503952298e-06, - "loss": 0.8122, - "step": 511 - }, - { - "epoch": 1.494762484774665, - "grad_norm": 0.5738977689581138, - "learning_rate": 1.7751873304119743e-06, - "loss": 0.7907, - "step": 512 - }, - { - "epoch": 1.49768574908648, - "grad_norm": 0.5912437120374228, - "learning_rate": 1.7556787345496102e-06, - "loss": 0.7699, - "step": 513 - }, - { - "epoch": 1.5006090133982948, - "grad_norm": 0.5807184894644836, - "learning_rate": 1.7362550735342575e-06, - "loss": 0.8012, - "step": 514 - }, - { - "epoch": 1.5035322777101097, - "grad_norm": 0.5796763127763196, - "learning_rate": 1.7169168558684784e-06, - "loss": 0.7855, - "step": 515 - }, - { - "epoch": 1.5064555420219246, - "grad_norm": 0.5711139940080867, - "learning_rate": 1.6976645878179677e-06, - "loss": 0.7957, - "step": 516 - }, - { - "epoch": 1.5064555420219246, - "eval_loss": 1.3729921579360962, - "eval_runtime": 65.3779, - "eval_samples_per_second": 14.822, - "eval_steps_per_second": 2.478, - "step": 516 - }, - { - "epoch": 1.5093788063337392, - "grad_norm": 0.565406176459499, - "learning_rate": 1.6784987733982978e-06, - "loss": 0.7772, - "step": 517 - }, - { - "epoch": 1.5123020706455543, - "grad_norm": 0.5792656566869839, - "learning_rate": 1.6594199143617252e-06, - "loss": 0.798, - "step": 518 - }, - { - "epoch": 1.515225334957369, - "grad_norm": 0.6141774984650593, - "learning_rate": 1.6404285101840567e-06, - "loss": 0.8104, - "step": 519 - }, - { - "epoch": 1.5181485992691839, - "grad_norm": 0.571296298879974, - "learning_rate": 1.621525058051564e-06, - "loss": 0.797, - "step": 520 - }, - { - "epoch": 1.5210718635809988, - "grad_norm": 0.5695495052273746, - "learning_rate": 1.6027100528479816e-06, - "loss": 0.8464, - "step": 521 - }, - { - "epoch": 1.5239951278928137, - "grad_norm": 0.5835018650178468, - "learning_rate": 1.5839839871415403e-06, - "loss": 0.7886, - "step": 522 - }, - { - "epoch": 1.5269183922046285, - "grad_norm": 0.6345187051641058, - "learning_rate": 1.5653473511720762e-06, - "loss": 0.8144, - "step": 523 - }, - { - "epoch": 1.5298416565164432, - "grad_norm": 0.5652000093656501, - "learning_rate": 1.5468006328381968e-06, - "loss": 0.7828, - "step": 524 - }, - { - "epoch": 1.5327649208282583, - "grad_norm": 0.5584288669041867, - "learning_rate": 1.5283443176845053e-06, - "loss": 0.814, - "step": 525 - }, - { - "epoch": 1.535688185140073, - "grad_norm": 0.594558247637692, - "learning_rate": 1.509978888888894e-06, - "loss": 0.8035, - "step": 526 - }, - { - "epoch": 1.538611449451888, - "grad_norm": 0.6043871959994697, - "learning_rate": 1.4917048272498862e-06, - "loss": 0.7994, - "step": 527 - }, - { - "epoch": 1.5415347137637028, - "grad_norm": 0.6124817051177354, - "learning_rate": 1.4735226111740603e-06, - "loss": 0.7827, - "step": 528 - }, - { - "epoch": 1.5444579780755177, - "grad_norm": 0.5734031250327122, - "learning_rate": 1.4554327166635173e-06, - "loss": 0.7859, - "step": 529 - }, - { - "epoch": 1.5473812423873325, - "grad_norm": 0.5645313610687146, - "learning_rate": 1.4374356173034232e-06, - "loss": 0.7807, - "step": 530 - }, - { - "epoch": 1.5503045066991474, - "grad_norm": 0.5466051542548562, - "learning_rate": 1.4195317842496081e-06, - "loss": 0.7557, - "step": 531 - }, - { - "epoch": 1.5532277710109623, - "grad_norm": 0.5609559828543491, - "learning_rate": 1.4017216862162358e-06, - "loss": 0.7927, - "step": 532 - }, - { - "epoch": 1.556151035322777, - "grad_norm": 0.6046966850745031, - "learning_rate": 1.3840057894635239e-06, - "loss": 0.7796, - "step": 533 - }, - { - "epoch": 1.559074299634592, - "grad_norm": 0.5651293261693624, - "learning_rate": 1.3663845577855489e-06, - "loss": 0.81, - "step": 534 - }, - { - "epoch": 1.5619975639464068, - "grad_norm": 0.565172387672836, - "learning_rate": 1.348858452498098e-06, - "loss": 0.7936, - "step": 535 - }, - { - "epoch": 1.5649208282582217, - "grad_norm": 0.5583131688271231, - "learning_rate": 1.3314279324265922e-06, - "loss": 0.791, - "step": 536 - }, - { - "epoch": 1.5678440925700365, - "grad_norm": 0.5641499114882876, - "learning_rate": 1.3140934538940754e-06, - "loss": 0.7965, - "step": 537 - }, - { - "epoch": 1.5707673568818514, - "grad_norm": 0.5611630143037816, - "learning_rate": 1.2968554707092684e-06, - "loss": 0.7894, - "step": 538 - }, - { - "epoch": 1.5736906211936663, - "grad_norm": 0.5819581090048779, - "learning_rate": 1.2797144341546886e-06, - "loss": 0.8112, - "step": 539 - }, - { - "epoch": 1.576613885505481, - "grad_norm": 0.5505606683684824, - "learning_rate": 1.262670792974831e-06, - "loss": 0.8149, - "step": 540 - }, - { - "epoch": 1.579537149817296, - "grad_norm": 0.5546492045824807, - "learning_rate": 1.2457249933644289e-06, - "loss": 0.8161, - "step": 541 - }, - { - "epoch": 1.5824604141291108, - "grad_norm": 0.5743697061886555, - "learning_rate": 1.2288774789567659e-06, - "loss": 0.801, - "step": 542 - }, - { - "epoch": 1.5853836784409256, - "grad_norm": 0.5901849741770847, - "learning_rate": 1.212128690812065e-06, - "loss": 0.8005, - "step": 543 - }, - { - "epoch": 1.5883069427527405, - "grad_norm": 0.5547631751970594, - "learning_rate": 1.1954790674059401e-06, - "loss": 0.7835, - "step": 544 - }, - { - "epoch": 1.5912302070645554, - "grad_norm": 0.5471908249775947, - "learning_rate": 1.1789290446179168e-06, - "loss": 0.8024, - "step": 545 - }, - { - "epoch": 1.5941534713763703, - "grad_norm": 0.5416328400001001, - "learning_rate": 1.1624790557200255e-06, - "loss": 0.7736, - "step": 546 - }, - { - "epoch": 1.597076735688185, - "grad_norm": 0.5726606497356488, - "learning_rate": 1.1461295313654486e-06, - "loss": 0.818, - "step": 547 - }, - { - "epoch": 1.6, - "grad_norm": 0.5674174196876537, - "learning_rate": 1.129880899577258e-06, - "loss": 0.8045, - "step": 548 - }, - { - "epoch": 1.6029232643118148, - "grad_norm": 0.5412849489916108, - "learning_rate": 1.1137335857372045e-06, - "loss": 0.7867, - "step": 549 - }, - { - "epoch": 1.6058465286236299, - "grad_norm": 0.5478307274016229, - "learning_rate": 1.097688012574578e-06, - "loss": 0.7959, - "step": 550 - }, - { - "epoch": 1.6087697929354445, - "grad_norm": 0.5721374560063277, - "learning_rate": 1.0817446001551467e-06, - "loss": 0.7816, - "step": 551 - }, - { - "epoch": 1.6116930572472594, - "grad_norm": 0.5589042015324429, - "learning_rate": 1.0659037658701576e-06, - "loss": 0.7948, - "step": 552 - }, - { - "epoch": 1.6146163215590743, - "grad_norm": 0.5665702488629619, - "learning_rate": 1.0501659244254053e-06, - "loss": 0.8034, - "step": 553 - }, - { - "epoch": 1.6175395858708892, - "grad_norm": 0.542393514153079, - "learning_rate": 1.0345314878303826e-06, - "loss": 0.8034, - "step": 554 - }, - { - "epoch": 1.620462850182704, - "grad_norm": 0.5702730710287914, - "learning_rate": 1.019000865387489e-06, - "loss": 0.8023, - "step": 555 - }, - { - "epoch": 1.6233861144945188, - "grad_norm": 0.5582049722213746, - "learning_rate": 1.0035744636813188e-06, - "loss": 0.7928, - "step": 556 - }, - { - "epoch": 1.6263093788063339, - "grad_norm": 0.5698270653733194, - "learning_rate": 9.882526865680125e-07, - "loss": 0.8115, - "step": 557 - }, - { - "epoch": 1.6292326431181485, - "grad_norm": 0.5417472484118098, - "learning_rate": 9.730359351646885e-07, - "loss": 0.8037, - "step": 558 - }, - { - "epoch": 1.6321559074299634, - "grad_norm": 0.5579811422659328, - "learning_rate": 9.579246078389404e-07, - "loss": 0.804, - "step": 559 - }, - { - "epoch": 1.6350791717417783, - "grad_norm": 0.5582565241558918, - "learning_rate": 9.42919100198404e-07, - "loss": 0.805, - "step": 560 - }, - { - "epoch": 1.6380024360535932, - "grad_norm": 0.5630923483130202, - "learning_rate": 9.28019805080409e-07, - "loss": 0.7621, - "step": 561 - }, - { - "epoch": 1.640925700365408, - "grad_norm": 0.554999857236509, - "learning_rate": 9.132271125416875e-07, - "loss": 0.7989, - "step": 562 - }, - { - "epoch": 1.6438489646772227, - "grad_norm": 0.5503314126678172, - "learning_rate": 8.985414098481643e-07, - "loss": 0.8051, - "step": 563 - }, - { - "epoch": 1.6467722289890379, - "grad_norm": 0.5486584898884448, - "learning_rate": 8.839630814648204e-07, - "loss": 0.7978, - "step": 564 - }, - { - "epoch": 1.6496954933008525, - "grad_norm": 0.5680902840851282, - "learning_rate": 8.694925090456268e-07, - "loss": 0.781, - "step": 565 - }, - { - "epoch": 1.6526187576126676, - "grad_norm": 0.5633446162141174, - "learning_rate": 8.551300714235494e-07, - "loss": 0.7955, - "step": 566 - }, - { - "epoch": 1.6555420219244823, - "grad_norm": 0.5776751766396825, - "learning_rate": 8.408761446006381e-07, - "loss": 0.7994, - "step": 567 - }, - { - "epoch": 1.6584652862362972, - "grad_norm": 0.5726237065590929, - "learning_rate": 8.267311017381779e-07, - "loss": 0.7961, - "step": 568 - }, - { - "epoch": 1.661388550548112, - "grad_norm": 0.5520396730996392, - "learning_rate": 8.126953131469229e-07, - "loss": 0.8106, - "step": 569 - }, - { - "epoch": 1.664311814859927, - "grad_norm": 0.5549872707674027, - "learning_rate": 7.987691462773983e-07, - "loss": 0.8111, - "step": 570 - }, - { - "epoch": 1.6672350791717419, - "grad_norm": 0.5875671745972523, - "learning_rate": 7.84952965710285e-07, - "loss": 0.7812, - "step": 571 - }, - { - "epoch": 1.6701583434835565, - "grad_norm": 0.5459123012014073, - "learning_rate": 7.712471331468718e-07, - "loss": 0.7908, - "step": 572 - }, - { - "epoch": 1.6730816077953716, - "grad_norm": 0.5733921108835325, - "learning_rate": 7.576520073995858e-07, - "loss": 0.814, - "step": 573 - }, - { - "epoch": 1.6760048721071863, - "grad_norm": 0.5629500916763001, - "learning_rate": 7.441679443826022e-07, - "loss": 0.7807, - "step": 574 - }, - { - "epoch": 1.6789281364190012, - "grad_norm": 0.5774799237532218, - "learning_rate": 7.307952971025245e-07, - "loss": 0.8243, - "step": 575 - }, - { - "epoch": 1.681851400730816, - "grad_norm": 0.5599476562864103, - "learning_rate": 7.175344156491432e-07, - "loss": 0.7929, - "step": 576 - }, - { - "epoch": 1.684774665042631, - "grad_norm": 0.5612867129741338, - "learning_rate": 7.043856471862692e-07, - "loss": 0.8138, - "step": 577 - }, - { - "epoch": 1.6876979293544458, - "grad_norm": 0.5678404737687553, - "learning_rate": 6.913493359426476e-07, - "loss": 0.7984, - "step": 578 - }, - { - "epoch": 1.6906211936662605, - "grad_norm": 0.5536617424718886, - "learning_rate": 6.784258232029473e-07, - "loss": 0.8155, - "step": 579 - }, - { - "epoch": 1.6935444579780756, - "grad_norm": 0.5592434257345329, - "learning_rate": 6.656154472988174e-07, - "loss": 0.8002, - "step": 580 - }, - { - "epoch": 1.6964677222898903, - "grad_norm": 0.5383379786819102, - "learning_rate": 6.529185436000435e-07, - "loss": 0.8048, - "step": 581 - }, - { - "epoch": 1.6993909866017054, - "grad_norm": 0.5529241606848215, - "learning_rate": 6.403354445057569e-07, - "loss": 0.7806, - "step": 582 - }, - { - "epoch": 1.70231425091352, - "grad_norm": 0.5522679012780315, - "learning_rate": 6.278664794357369e-07, - "loss": 0.799, - "step": 583 - }, - { - "epoch": 1.705237515225335, - "grad_norm": 0.5386503966088895, - "learning_rate": 6.155119748217874e-07, - "loss": 0.7912, - "step": 584 - }, - { - "epoch": 1.7081607795371498, - "grad_norm": 0.5528415532861808, - "learning_rate": 6.032722540991897e-07, - "loss": 0.7802, - "step": 585 - }, - { - "epoch": 1.7110840438489647, - "grad_norm": 0.5479801792595665, - "learning_rate": 5.911476376982333e-07, - "loss": 0.8089, - "step": 586 - }, - { - "epoch": 1.7140073081607796, - "grad_norm": 0.5493407119719024, - "learning_rate": 5.79138443035831e-07, - "loss": 0.8171, - "step": 587 - }, - { - "epoch": 1.7169305724725943, - "grad_norm": 0.5478947861028238, - "learning_rate": 5.67244984507207e-07, - "loss": 0.8022, - "step": 588 - }, - { - "epoch": 1.7198538367844094, - "grad_norm": 0.5678203988994112, - "learning_rate": 5.554675734776666e-07, - "loss": 0.7938, - "step": 589 - }, - { - "epoch": 1.722777101096224, - "grad_norm": 0.5547706153338823, - "learning_rate": 5.43806518274444e-07, - "loss": 0.7812, - "step": 590 - }, - { - "epoch": 1.725700365408039, - "grad_norm": 0.5420125657053475, - "learning_rate": 5.322621241786325e-07, - "loss": 0.7976, - "step": 591 - }, - { - "epoch": 1.7286236297198538, - "grad_norm": 0.5480819734230008, - "learning_rate": 5.208346934171898e-07, - "loss": 0.7917, - "step": 592 - }, - { - "epoch": 1.7315468940316687, - "grad_norm": 0.5481736497317199, - "learning_rate": 5.095245251550257e-07, - "loss": 0.8205, - "step": 593 - }, - { - "epoch": 1.7344701583434836, - "grad_norm": 0.5562578075897706, - "learning_rate": 4.983319154871741e-07, - "loss": 0.8096, - "step": 594 - }, - { - "epoch": 1.7373934226552983, - "grad_norm": 0.5835157779299706, - "learning_rate": 4.87257157431037e-07, - "loss": 0.7891, - "step": 595 - }, - { - "epoch": 1.7403166869671134, - "grad_norm": 0.546986297315357, - "learning_rate": 4.763005409187155e-07, - "loss": 0.8173, - "step": 596 - }, - { - "epoch": 1.743239951278928, - "grad_norm": 0.550677582279951, - "learning_rate": 4.654623527894192e-07, - "loss": 0.7931, - "step": 597 - }, - { - "epoch": 1.746163215590743, - "grad_norm": 0.5573466505687258, - "learning_rate": 4.5474287678195785e-07, - "loss": 0.8092, - "step": 598 - }, - { - "epoch": 1.7490864799025578, - "grad_norm": 0.5626582035488121, - "learning_rate": 4.441423935273087e-07, - "loss": 0.8024, - "step": 599 - }, - { - "epoch": 1.7520097442143727, - "grad_norm": 0.5427943084328981, - "learning_rate": 4.336611805412766e-07, - "loss": 0.8031, - "step": 600 - }, - { - "epoch": 1.7549330085261876, - "grad_norm": 0.5373310678367565, - "learning_rate": 4.232995122172245e-07, - "loss": 0.7968, - "step": 601 - }, - { - "epoch": 1.7578562728380023, - "grad_norm": 0.5683595129842872, - "learning_rate": 4.130576598188907e-07, - "loss": 0.7951, - "step": 602 - }, - { - "epoch": 1.7578562728380023, - "eval_loss": 1.3712928295135498, - "eval_runtime": 65.5109, - "eval_samples_per_second": 14.791, - "eval_steps_per_second": 2.473, - "step": 602 - }, - { - "epoch": 1.7607795371498174, - "grad_norm": 0.5557264560397862, - "learning_rate": 4.029358914732862e-07, - "loss": 0.8006, - "step": 603 - }, - { - "epoch": 1.763702801461632, - "grad_norm": 0.5654569835636465, - "learning_rate": 3.929344721636774e-07, - "loss": 0.8074, - "step": 604 - }, - { - "epoch": 1.7666260657734472, - "grad_norm": 0.5444513248898442, - "learning_rate": 3.830536637226495e-07, - "loss": 0.8105, - "step": 605 - }, - { - "epoch": 1.7695493300852618, - "grad_norm": 0.5517616074003878, - "learning_rate": 3.732937248252472e-07, - "loss": 0.7761, - "step": 606 - }, - { - "epoch": 1.7724725943970767, - "grad_norm": 0.5458114020073324, - "learning_rate": 3.6365491098220683e-07, - "loss": 0.7726, - "step": 607 - }, - { - "epoch": 1.7753958587088916, - "grad_norm": 0.5455753814843181, - "learning_rate": 3.5413747453326766e-07, - "loss": 0.7888, - "step": 608 - }, - { - "epoch": 1.7783191230207065, - "grad_norm": 0.5458705191532202, - "learning_rate": 3.4474166464056327e-07, - "loss": 0.8093, - "step": 609 - }, - { - "epoch": 1.7812423873325214, - "grad_norm": 0.5781120106352705, - "learning_rate": 3.3546772728209944e-07, - "loss": 0.797, - "step": 610 - }, - { - "epoch": 1.784165651644336, - "grad_norm": 0.550989780518617, - "learning_rate": 3.2631590524531466e-07, - "loss": 0.7864, - "step": 611 - }, - { - "epoch": 1.7870889159561512, - "grad_norm": 0.5472308067663272, - "learning_rate": 3.172864381207252e-07, - "loss": 0.7749, - "step": 612 - }, - { - "epoch": 1.7900121802679658, - "grad_norm": 0.546457202866733, - "learning_rate": 3.0837956229565146e-07, - "loss": 0.7992, - "step": 613 - }, - { - "epoch": 1.7929354445797807, - "grad_norm": 0.5634760074822219, - "learning_rate": 2.995955109480275e-07, - "loss": 0.7927, - "step": 614 - }, - { - "epoch": 1.7958587088915956, - "grad_norm": 0.5473493176360688, - "learning_rate": 2.909345140403019e-07, - "loss": 0.811, - "step": 615 - }, - { - "epoch": 1.7987819732034105, - "grad_norm": 0.5605341671034767, - "learning_rate": 2.8239679831341126e-07, - "loss": 0.7963, - "step": 616 - }, - { - "epoch": 1.8017052375152254, - "grad_norm": 0.5440992569777762, - "learning_rate": 2.739825872808505e-07, - "loss": 0.808, - "step": 617 - }, - { - "epoch": 1.80462850182704, - "grad_norm": 0.5481480560745237, - "learning_rate": 2.656921012228153e-07, - "loss": 0.8086, - "step": 618 - }, - { - "epoch": 1.8075517661388552, - "grad_norm": 0.5493180019568236, - "learning_rate": 2.575255571804391e-07, - "loss": 0.8116, - "step": 619 - }, - { - "epoch": 1.8104750304506698, - "grad_norm": 0.560573897234777, - "learning_rate": 2.49483168950112e-07, - "loss": 0.8138, - "step": 620 - }, - { - "epoch": 1.813398294762485, - "grad_norm": 0.5640910581601931, - "learning_rate": 2.4156514707787683e-07, - "loss": 0.8114, - "step": 621 - }, - { - "epoch": 1.8163215590742996, - "grad_norm": 0.5411636499272622, - "learning_rate": 2.3377169885392737e-07, - "loss": 0.7896, - "step": 622 - }, - { - "epoch": 1.8192448233861145, - "grad_norm": 0.5336917707791728, - "learning_rate": 2.2610302830717302e-07, - "loss": 0.7834, - "step": 623 - }, - { - "epoch": 1.8221680876979294, - "grad_norm": 0.560440289551308, - "learning_rate": 2.1855933619990167e-07, - "loss": 0.7762, - "step": 624 - }, - { - "epoch": 1.8250913520097443, - "grad_norm": 0.5380057617702301, - "learning_rate": 2.111408200225229e-07, - "loss": 0.781, - "step": 625 - }, - { - "epoch": 1.8280146163215591, - "grad_norm": 0.5365628702514941, - "learning_rate": 2.038476739883982e-07, - "loss": 0.7903, - "step": 626 - }, - { - "epoch": 1.8309378806333738, - "grad_norm": 0.5403634255666717, - "learning_rate": 1.96680089028754e-07, - "loss": 0.8218, - "step": 627 - }, - { - "epoch": 1.833861144945189, - "grad_norm": 0.5519636117104554, - "learning_rate": 1.8963825278768776e-07, - "loss": 0.7809, - "step": 628 - }, - { - "epoch": 1.8367844092570036, - "grad_norm": 0.5527641215629426, - "learning_rate": 1.827223496172509e-07, - "loss": 0.7927, - "step": 629 - }, - { - "epoch": 1.8397076735688185, - "grad_norm": 0.5452268766681745, - "learning_rate": 1.7593256057262642e-07, - "loss": 0.8176, - "step": 630 - }, - { - "epoch": 1.8426309378806334, - "grad_norm": 0.5447122583732273, - "learning_rate": 1.6926906340738568e-07, - "loss": 0.8101, - "step": 631 - }, - { - "epoch": 1.8455542021924483, - "grad_norm": 0.571978575668362, - "learning_rate": 1.627320325688375e-07, - "loss": 0.7932, - "step": 632 - }, - { - "epoch": 1.8484774665042631, - "grad_norm": 0.5627331109300794, - "learning_rate": 1.5632163919346077e-07, - "loss": 0.8091, - "step": 633 - }, - { - "epoch": 1.8514007308160778, - "grad_norm": 0.5624338396181234, - "learning_rate": 1.5003805110241963e-07, - "loss": 0.7924, - "step": 634 - }, - { - "epoch": 1.854323995127893, - "grad_norm": 0.5555064792697382, - "learning_rate": 1.438814327971788e-07, - "loss": 0.8153, - "step": 635 - }, - { - "epoch": 1.8572472594397076, - "grad_norm": 0.5481418092928929, - "learning_rate": 1.3785194545518965e-07, - "loss": 0.7958, - "step": 636 - }, - { - "epoch": 1.8601705237515227, - "grad_norm": 0.5567365893755013, - "learning_rate": 1.3194974692567254e-07, - "loss": 0.804, - "step": 637 - }, - { - "epoch": 1.8630937880633374, - "grad_norm": 0.5546768686267454, - "learning_rate": 1.261749917254862e-07, - "loss": 0.7952, - "step": 638 - }, - { - "epoch": 1.8660170523751523, - "grad_norm": 0.5657655388451881, - "learning_rate": 1.2052783103508104e-07, - "loss": 0.7968, - "step": 639 - }, - { - "epoch": 1.8689403166869671, - "grad_norm": 0.543022031896953, - "learning_rate": 1.1500841269454166e-07, - "loss": 0.8117, - "step": 640 - }, - { - "epoch": 1.871863580998782, - "grad_norm": 0.5514475359589998, - "learning_rate": 1.0961688119971447e-07, - "loss": 0.7979, - "step": 641 - }, - { - "epoch": 1.874786845310597, - "grad_norm": 0.5555814897175765, - "learning_rate": 1.0435337769843012e-07, - "loss": 0.8057, - "step": 642 - }, - { - "epoch": 1.8777101096224116, - "grad_norm": 0.5500005285162863, - "learning_rate": 9.921803998680202e-08, - "loss": 0.804, - "step": 643 - }, - { - "epoch": 1.8806333739342267, - "grad_norm": 0.5502401343376152, - "learning_rate": 9.42110025056231e-08, - "loss": 0.8072, - "step": 644 - }, - { - "epoch": 1.8835566382460414, - "grad_norm": 0.5591300521244459, - "learning_rate": 8.933239633684531e-08, - "loss": 0.8061, - "step": 645 - }, - { - "epoch": 1.8864799025578562, - "grad_norm": 0.5576906288465678, - "learning_rate": 8.458234920014685e-08, - "loss": 0.8044, - "step": 646 - }, - { - "epoch": 1.8894031668696711, - "grad_norm": 0.539696174418833, - "learning_rate": 7.996098544958863e-08, - "loss": 0.7732, - "step": 647 - }, - { - "epoch": 1.892326431181486, - "grad_norm": 0.5324355982304806, - "learning_rate": 7.546842607036086e-08, - "loss": 0.7825, - "step": 648 - }, - { - "epoch": 1.895249695493301, - "grad_norm": 0.5536750938709009, - "learning_rate": 7.110478867561332e-08, - "loss": 0.8269, - "step": 649 - }, - { - "epoch": 1.8981729598051156, - "grad_norm": 0.5464539956275944, - "learning_rate": 6.687018750337726e-08, - "loss": 0.7878, - "step": 650 - }, - { - "epoch": 1.9010962241169307, - "grad_norm": 0.5744225595981897, - "learning_rate": 6.276473341357558e-08, - "loss": 0.8342, - "step": 651 - }, - { - "epoch": 1.9040194884287454, - "grad_norm": 0.5439117198062463, - "learning_rate": 5.878853388511796e-08, - "loss": 0.8092, - "step": 652 - }, - { - "epoch": 1.9069427527405602, - "grad_norm": 0.5604604058278684, - "learning_rate": 5.494169301309027e-08, - "loss": 0.8129, - "step": 653 - }, - { - "epoch": 1.9098660170523751, - "grad_norm": 0.5496376371773798, - "learning_rate": 5.122431150602625e-08, - "loss": 0.7992, - "step": 654 - }, - { - "epoch": 1.91278928136419, - "grad_norm": 0.5431909281162958, - "learning_rate": 4.7636486683274585e-08, - "loss": 0.7869, - "step": 655 - }, - { - "epoch": 1.915712545676005, - "grad_norm": 0.5480429034794597, - "learning_rate": 4.417831247244819e-08, - "loss": 0.8026, - "step": 656 - }, - { - "epoch": 1.9186358099878196, - "grad_norm": 0.5487230201400353, - "learning_rate": 4.084987940696561e-08, - "loss": 0.8072, - "step": 657 - }, - { - "epoch": 1.9215590742996347, - "grad_norm": 0.5485859368785915, - "learning_rate": 3.7651274623683454e-08, - "loss": 0.8085, - "step": 658 - }, - { - "epoch": 1.9244823386114494, - "grad_norm": 0.5506557337352462, - "learning_rate": 3.4582581860612144e-08, - "loss": 0.7935, - "step": 659 - }, - { - "epoch": 1.9274056029232645, - "grad_norm": 0.5567801755660966, - "learning_rate": 3.164388145472375e-08, - "loss": 0.8013, - "step": 660 - }, - { - "epoch": 1.9303288672350791, - "grad_norm": 0.5353178830443843, - "learning_rate": 2.8835250339851463e-08, - "loss": 0.8265, - "step": 661 - }, - { - "epoch": 1.933252131546894, - "grad_norm": 0.5707903317589319, - "learning_rate": 2.6156762044673435e-08, - "loss": 0.8012, - "step": 662 - }, - { - "epoch": 1.936175395858709, - "grad_norm": 0.5370757996768764, - "learning_rate": 2.3608486690788745e-08, - "loss": 0.7786, - "step": 663 - }, - { - "epoch": 1.9390986601705238, - "grad_norm": 0.5506663519049135, - "learning_rate": 2.1190490990879997e-08, - "loss": 0.8157, - "step": 664 - }, - { - "epoch": 1.9420219244823387, - "grad_norm": 0.5563685534564358, - "learning_rate": 1.8902838246969147e-08, - "loss": 0.8045, - "step": 665 - }, - { - "epoch": 1.9449451887941533, - "grad_norm": 0.5590283271247752, - "learning_rate": 1.6745588348758836e-08, - "loss": 0.7784, - "step": 666 - }, - { - "epoch": 1.9478684531059685, - "grad_norm": 0.5433875826003821, - "learning_rate": 1.4718797772065308e-08, - "loss": 0.782, - "step": 667 - }, - { - "epoch": 1.9507917174177831, - "grad_norm": 0.5377709363211058, - "learning_rate": 1.2822519577337934e-08, - "loss": 0.7732, - "step": 668 - }, - { - "epoch": 1.953714981729598, - "grad_norm": 0.5547233788315695, - "learning_rate": 1.1056803408273086e-08, - "loss": 0.7925, - "step": 669 - }, - { - "epoch": 1.956638246041413, - "grad_norm": 0.5265543464527667, - "learning_rate": 9.421695490512418e-09, - "loss": 0.7721, - "step": 670 - }, - { - "epoch": 1.9595615103532278, - "grad_norm": 0.5472775339764673, - "learning_rate": 7.91723863043381e-09, - "loss": 0.8027, - "step": 671 - }, - { - "epoch": 1.9624847746650427, - "grad_norm": 0.5304979167160336, - "learning_rate": 6.543472214030066e-09, - "loss": 0.7873, - "step": 672 - }, - { - "epoch": 1.9654080389768573, - "grad_norm": 0.556360161329628, - "learning_rate": 5.300432205876949e-09, - "loss": 0.8141, - "step": 673 - }, - { - "epoch": 1.9683313032886725, - "grad_norm": 0.5406193054564133, - "learning_rate": 4.188151148193931e-09, - "loss": 0.8245, - "step": 674 - }, - { - "epoch": 1.9712545676004871, - "grad_norm": 0.55228239018778, - "learning_rate": 3.206658159989884e-09, - "loss": 0.7906, - "step": 675 - }, - { - "epoch": 1.9741778319123022, - "grad_norm": 0.5469662294340832, - "learning_rate": 2.355978936303127e-09, - "loss": 0.8227, - "step": 676 - }, - { - "epoch": 1.977101096224117, - "grad_norm": 0.5510036445718073, - "learning_rate": 1.6361357475258577e-09, - "loss": 0.8159, - "step": 677 - }, - { - "epoch": 1.9800243605359318, - "grad_norm": 0.5513896517514993, - "learning_rate": 1.0471474388240588e-09, - "loss": 0.7983, - "step": 678 - }, - { - "epoch": 1.9829476248477467, - "grad_norm": 0.5424914150028971, - "learning_rate": 5.890294296428955e-10, - "loss": 0.7835, - "step": 679 - }, - { - "epoch": 1.9858708891595616, - "grad_norm": 0.5397929907446958, - "learning_rate": 2.617937133009285e-10, - "loss": 0.786, - "step": 680 - }, - { - "epoch": 1.9887941534713764, - "grad_norm": 0.5455780668882535, - "learning_rate": 6.544885668036128e-11, - "loss": 0.7874, - "step": 681 - }, - { - "epoch": 1.9917174177831911, - "grad_norm": 0.5487145812291044, - "learning_rate": 0.0, - "loss": 0.8062, - "step": 682 - } - ], - "logging_steps": 1, - "max_steps": 682, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 171, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 8.721458602411295e+17, - "train_batch_size": 3, - "trial_name": null, - "trial_params": null -}