diff --git "a/checkpoint-14000/trainer_state.json" "b/checkpoint-14000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-14000/trainer_state.json" @@ -0,0 +1,98033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6264885274469938, + "eval_steps": 500, + "global_step": 14000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011617775196049956, + "grad_norm": 1.1400724649429321, + "learning_rate": 0.0001, + "loss": 1.9564, + "step": 1 + }, + { + "epoch": 0.00023235550392099912, + "grad_norm": 0.5516552925109863, + "learning_rate": 0.0001, + "loss": 1.9434, + "step": 2 + }, + { + "epoch": 0.0003485332558814987, + "grad_norm": 2.07415771484375, + "learning_rate": 0.0001, + "loss": 1.9873, + "step": 3 + }, + { + "epoch": 0.00046471100784199824, + "grad_norm": 0.6526162028312683, + "learning_rate": 0.0001, + "loss": 1.9786, + "step": 4 + }, + { + "epoch": 0.0005808887598024978, + "grad_norm": 0.5003806352615356, + "learning_rate": 0.0001, + "loss": 1.8828, + "step": 5 + }, + { + "epoch": 0.0006970665117629974, + "grad_norm": 0.5392526388168335, + "learning_rate": 0.0001, + "loss": 1.8913, + "step": 6 + }, + { + "epoch": 0.000813244263723497, + "grad_norm": 0.4057665765285492, + "learning_rate": 0.0001, + "loss": 1.7552, + "step": 7 + }, + { + "epoch": 0.0009294220156839965, + "grad_norm": 0.42006543278694153, + "learning_rate": 0.0001, + "loss": 2.0082, + "step": 8 + }, + { + "epoch": 0.001045599767644496, + "grad_norm": 1.1451431512832642, + "learning_rate": 0.0001, + "loss": 1.8661, + "step": 9 + }, + { + "epoch": 0.0011617775196049957, + "grad_norm": 0.44178342819213867, + "learning_rate": 0.0001, + "loss": 1.914, + "step": 10 + }, + { + "epoch": 0.0012779552715654952, + "grad_norm": 0.3882751762866974, + "learning_rate": 0.0001, + "loss": 1.902, + "step": 11 + }, + { + "epoch": 0.0013941330235259948, + "grad_norm": 0.35292503237724304, + "learning_rate": 0.0001, + "loss": 1.89, + "step": 12 + }, + { + "epoch": 0.0015103107754864944, + "grad_norm": 0.4178386926651001, + "learning_rate": 0.0001, + "loss": 1.8888, + "step": 13 + }, + { + "epoch": 0.001626488527446994, + "grad_norm": 0.39220985770225525, + "learning_rate": 0.0001, + "loss": 1.7775, + "step": 14 + }, + { + "epoch": 0.0017426662794074934, + "grad_norm": 0.4134705364704132, + "learning_rate": 0.0001, + "loss": 1.8794, + "step": 15 + }, + { + "epoch": 0.001858844031367993, + "grad_norm": 0.3392408788204193, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 16 + }, + { + "epoch": 0.0019750217833284928, + "grad_norm": 0.3672613203525543, + "learning_rate": 0.0001, + "loss": 2.0741, + "step": 17 + }, + { + "epoch": 0.002091199535288992, + "grad_norm": 0.4194653630256653, + "learning_rate": 0.0001, + "loss": 1.9617, + "step": 18 + }, + { + "epoch": 0.002207377287249492, + "grad_norm": 0.38323554396629333, + "learning_rate": 0.0001, + "loss": 1.8911, + "step": 19 + }, + { + "epoch": 0.0023235550392099913, + "grad_norm": 0.35623419284820557, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 20 + }, + { + "epoch": 0.0024397327911704907, + "grad_norm": 0.3997187614440918, + "learning_rate": 0.0001, + "loss": 2.1459, + "step": 21 + }, + { + "epoch": 0.0025559105431309905, + "grad_norm": 0.31873300671577454, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 22 + }, + { + "epoch": 0.00267208829509149, + "grad_norm": 0.36987119913101196, + "learning_rate": 0.0001, + "loss": 1.8538, + "step": 23 + }, + { + "epoch": 0.0027882660470519897, + "grad_norm": 0.38269954919815063, + "learning_rate": 0.0001, + "loss": 1.9971, + "step": 24 + }, + { + "epoch": 0.002904443799012489, + "grad_norm": 0.3547232449054718, + "learning_rate": 0.0001, + "loss": 1.9081, + "step": 25 + }, + { + "epoch": 0.003020621550972989, + "grad_norm": 0.33162739872932434, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 26 + }, + { + "epoch": 0.003136799302933488, + "grad_norm": 0.3379284143447876, + "learning_rate": 0.0001, + "loss": 1.856, + "step": 27 + }, + { + "epoch": 0.003252977054893988, + "grad_norm": 0.35247287154197693, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 28 + }, + { + "epoch": 0.0033691548068544874, + "grad_norm": 0.3893706500530243, + "learning_rate": 0.0001, + "loss": 2.0282, + "step": 29 + }, + { + "epoch": 0.0034853325588149867, + "grad_norm": 0.39306116104125977, + "learning_rate": 0.0001, + "loss": 1.7966, + "step": 30 + }, + { + "epoch": 0.0036015103107754865, + "grad_norm": 0.36439645290374756, + "learning_rate": 0.0001, + "loss": 1.9552, + "step": 31 + }, + { + "epoch": 0.003717688062735986, + "grad_norm": 0.3758845925331116, + "learning_rate": 0.0001, + "loss": 1.9639, + "step": 32 + }, + { + "epoch": 0.0038338658146964857, + "grad_norm": 0.3357931971549988, + "learning_rate": 0.0001, + "loss": 1.8929, + "step": 33 + }, + { + "epoch": 0.0039500435666569855, + "grad_norm": 0.3393707573413849, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 34 + }, + { + "epoch": 0.004066221318617485, + "grad_norm": 0.3457406163215637, + "learning_rate": 0.0001, + "loss": 1.7516, + "step": 35 + }, + { + "epoch": 0.004182399070577984, + "grad_norm": 0.33674880862236023, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 36 + }, + { + "epoch": 0.004298576822538484, + "grad_norm": 0.33614569902420044, + "learning_rate": 0.0001, + "loss": 1.8855, + "step": 37 + }, + { + "epoch": 0.004414754574498984, + "grad_norm": 0.34421899914741516, + "learning_rate": 0.0001, + "loss": 1.8091, + "step": 38 + }, + { + "epoch": 0.004530932326459483, + "grad_norm": 0.39817526936531067, + "learning_rate": 0.0001, + "loss": 1.941, + "step": 39 + }, + { + "epoch": 0.004647110078419983, + "grad_norm": 0.3206139802932739, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 40 + }, + { + "epoch": 0.004763287830380482, + "grad_norm": 0.37263521552085876, + "learning_rate": 0.0001, + "loss": 1.8275, + "step": 41 + }, + { + "epoch": 0.004879465582340981, + "grad_norm": 0.3837355077266693, + "learning_rate": 0.0001, + "loss": 1.9595, + "step": 42 + }, + { + "epoch": 0.004995643334301482, + "grad_norm": 0.33184581995010376, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 43 + }, + { + "epoch": 0.005111821086261981, + "grad_norm": 0.32549911737442017, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 44 + }, + { + "epoch": 0.00522799883822248, + "grad_norm": 0.35189497470855713, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 45 + }, + { + "epoch": 0.00534417659018298, + "grad_norm": 0.34523481130599976, + "learning_rate": 0.0001, + "loss": 1.9006, + "step": 46 + }, + { + "epoch": 0.00546035434214348, + "grad_norm": 0.3432101905345917, + "learning_rate": 0.0001, + "loss": 2.0098, + "step": 47 + }, + { + "epoch": 0.005576532094103979, + "grad_norm": 0.3387945294380188, + "learning_rate": 0.0001, + "loss": 1.8139, + "step": 48 + }, + { + "epoch": 0.005692709846064479, + "grad_norm": 0.33659058809280396, + "learning_rate": 0.0001, + "loss": 1.8409, + "step": 49 + }, + { + "epoch": 0.005808887598024978, + "grad_norm": 0.32735776901245117, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 50 + }, + { + "epoch": 0.005925065349985477, + "grad_norm": 0.35173168778419495, + "learning_rate": 0.0001, + "loss": 1.8177, + "step": 51 + }, + { + "epoch": 0.006041243101945978, + "grad_norm": 0.3859328031539917, + "learning_rate": 0.0001, + "loss": 2.0822, + "step": 52 + }, + { + "epoch": 0.006157420853906477, + "grad_norm": 0.3218703866004944, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 53 + }, + { + "epoch": 0.006273598605866976, + "grad_norm": 0.34024327993392944, + "learning_rate": 0.0001, + "loss": 1.8432, + "step": 54 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 0.3500777781009674, + "learning_rate": 0.0001, + "loss": 1.946, + "step": 55 + }, + { + "epoch": 0.006505954109787976, + "grad_norm": 0.35199397802352905, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 56 + }, + { + "epoch": 0.006622131861748475, + "grad_norm": 0.3386881947517395, + "learning_rate": 0.0001, + "loss": 1.8424, + "step": 57 + }, + { + "epoch": 0.006738309613708975, + "grad_norm": 0.333283931016922, + "learning_rate": 0.0001, + "loss": 1.8125, + "step": 58 + }, + { + "epoch": 0.006854487365669474, + "grad_norm": 0.32521334290504456, + "learning_rate": 0.0001, + "loss": 1.7354, + "step": 59 + }, + { + "epoch": 0.0069706651176299735, + "grad_norm": 0.36899396777153015, + "learning_rate": 0.0001, + "loss": 1.8834, + "step": 60 + }, + { + "epoch": 0.007086842869590474, + "grad_norm": 0.33825087547302246, + "learning_rate": 0.0001, + "loss": 1.905, + "step": 61 + }, + { + "epoch": 0.007203020621550973, + "grad_norm": 0.3499239981174469, + "learning_rate": 0.0001, + "loss": 1.8053, + "step": 62 + }, + { + "epoch": 0.0073191983735114725, + "grad_norm": 0.3182366192340851, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 63 + }, + { + "epoch": 0.007435376125471972, + "grad_norm": 0.34978607296943665, + "learning_rate": 0.0001, + "loss": 1.9504, + "step": 64 + }, + { + "epoch": 0.007551553877432472, + "grad_norm": 0.3370610177516937, + "learning_rate": 0.0001, + "loss": 1.8897, + "step": 65 + }, + { + "epoch": 0.0076677316293929714, + "grad_norm": 0.3260805606842041, + "learning_rate": 0.0001, + "loss": 1.8774, + "step": 66 + }, + { + "epoch": 0.007783909381353471, + "grad_norm": 0.32136961817741394, + "learning_rate": 0.0001, + "loss": 1.7582, + "step": 67 + }, + { + "epoch": 0.007900087133313971, + "grad_norm": 0.34404799342155457, + "learning_rate": 0.0001, + "loss": 1.9441, + "step": 68 + }, + { + "epoch": 0.00801626488527447, + "grad_norm": 0.3686494529247284, + "learning_rate": 0.0001, + "loss": 2.0653, + "step": 69 + }, + { + "epoch": 0.00813244263723497, + "grad_norm": 0.34148523211479187, + "learning_rate": 0.0001, + "loss": 1.9369, + "step": 70 + }, + { + "epoch": 0.00824862038919547, + "grad_norm": 0.3419267535209656, + "learning_rate": 0.0001, + "loss": 1.8754, + "step": 71 + }, + { + "epoch": 0.008364798141155969, + "grad_norm": 0.34724754095077515, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 72 + }, + { + "epoch": 0.008480975893116468, + "grad_norm": 0.3402698338031769, + "learning_rate": 0.0001, + "loss": 1.8966, + "step": 73 + }, + { + "epoch": 0.008597153645076967, + "grad_norm": 0.3450843393802643, + "learning_rate": 0.0001, + "loss": 1.9082, + "step": 74 + }, + { + "epoch": 0.008713331397037467, + "grad_norm": 0.3541632294654846, + "learning_rate": 0.0001, + "loss": 1.8932, + "step": 75 + }, + { + "epoch": 0.008829509148997968, + "grad_norm": 0.3470132648944855, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 76 + }, + { + "epoch": 0.008945686900958467, + "grad_norm": 0.33801767230033875, + "learning_rate": 0.0001, + "loss": 1.8521, + "step": 77 + }, + { + "epoch": 0.009061864652918966, + "grad_norm": 0.3319573402404785, + "learning_rate": 0.0001, + "loss": 1.7786, + "step": 78 + }, + { + "epoch": 0.009178042404879466, + "grad_norm": 0.35573163628578186, + "learning_rate": 0.0001, + "loss": 1.8646, + "step": 79 + }, + { + "epoch": 0.009294220156839965, + "grad_norm": 0.36378416419029236, + "learning_rate": 0.0001, + "loss": 2.0346, + "step": 80 + }, + { + "epoch": 0.009410397908800465, + "grad_norm": 0.3243565559387207, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 81 + }, + { + "epoch": 0.009526575660760964, + "grad_norm": 0.32637423276901245, + "learning_rate": 0.0001, + "loss": 1.8714, + "step": 82 + }, + { + "epoch": 0.009642753412721463, + "grad_norm": 0.3473580479621887, + "learning_rate": 0.0001, + "loss": 1.7707, + "step": 83 + }, + { + "epoch": 0.009758931164681963, + "grad_norm": 0.3237003982067108, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 84 + }, + { + "epoch": 0.009875108916642464, + "grad_norm": 0.3218970000743866, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 85 + }, + { + "epoch": 0.009991286668602963, + "grad_norm": 0.3358110189437866, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 86 + }, + { + "epoch": 0.010107464420563463, + "grad_norm": 0.34907999634742737, + "learning_rate": 0.0001, + "loss": 1.8581, + "step": 87 + }, + { + "epoch": 0.010223642172523962, + "grad_norm": 0.3303886950016022, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 88 + }, + { + "epoch": 0.010339819924484461, + "grad_norm": 0.33527591824531555, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 89 + }, + { + "epoch": 0.01045599767644496, + "grad_norm": 0.3530234098434448, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 90 + }, + { + "epoch": 0.01057217542840546, + "grad_norm": 0.35667964816093445, + "learning_rate": 0.0001, + "loss": 1.9292, + "step": 91 + }, + { + "epoch": 0.01068835318036596, + "grad_norm": 0.32774579524993896, + "learning_rate": 0.0001, + "loss": 1.902, + "step": 92 + }, + { + "epoch": 0.010804530932326459, + "grad_norm": 0.3343551456928253, + "learning_rate": 0.0001, + "loss": 1.8557, + "step": 93 + }, + { + "epoch": 0.01092070868428696, + "grad_norm": 0.3439468741416931, + "learning_rate": 0.0001, + "loss": 1.8196, + "step": 94 + }, + { + "epoch": 0.01103688643624746, + "grad_norm": 0.31505489349365234, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 95 + }, + { + "epoch": 0.011153064188207959, + "grad_norm": 0.3402290344238281, + "learning_rate": 0.0001, + "loss": 1.7981, + "step": 96 + }, + { + "epoch": 0.011269241940168458, + "grad_norm": 0.31825143098831177, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 97 + }, + { + "epoch": 0.011385419692128957, + "grad_norm": 0.32378312945365906, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 98 + }, + { + "epoch": 0.011501597444089457, + "grad_norm": 0.33641722798347473, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 99 + }, + { + "epoch": 0.011617775196049956, + "grad_norm": 0.3626033663749695, + "learning_rate": 0.0001, + "loss": 1.9063, + "step": 100 + }, + { + "epoch": 0.011733952948010455, + "grad_norm": 0.3551986813545227, + "learning_rate": 0.0001, + "loss": 1.876, + "step": 101 + }, + { + "epoch": 0.011850130699970955, + "grad_norm": 0.364032506942749, + "learning_rate": 0.0001, + "loss": 1.8289, + "step": 102 + }, + { + "epoch": 0.011966308451931456, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 103 + }, + { + "epoch": 0.012082486203891955, + "grad_norm": 0.34352198243141174, + "learning_rate": 0.0001, + "loss": 1.8273, + "step": 104 + }, + { + "epoch": 0.012198663955852455, + "grad_norm": 0.3475358784198761, + "learning_rate": 0.0001, + "loss": 1.6767, + "step": 105 + }, + { + "epoch": 0.012314841707812954, + "grad_norm": 0.3112235367298126, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 106 + }, + { + "epoch": 0.012431019459773453, + "grad_norm": 0.3369406759738922, + "learning_rate": 0.0001, + "loss": 1.9139, + "step": 107 + }, + { + "epoch": 0.012547197211733953, + "grad_norm": 0.3227371573448181, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 108 + }, + { + "epoch": 0.012663374963694452, + "grad_norm": 0.36206531524658203, + "learning_rate": 0.0001, + "loss": 1.9441, + "step": 109 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 0.3347959518432617, + "learning_rate": 0.0001, + "loss": 1.9206, + "step": 110 + }, + { + "epoch": 0.012895730467615451, + "grad_norm": 0.32953107357025146, + "learning_rate": 0.0001, + "loss": 1.9498, + "step": 111 + }, + { + "epoch": 0.013011908219575952, + "grad_norm": 0.30254584550857544, + "learning_rate": 0.0001, + "loss": 1.578, + "step": 112 + }, + { + "epoch": 0.013128085971536451, + "grad_norm": 0.3403797149658203, + "learning_rate": 0.0001, + "loss": 1.8821, + "step": 113 + }, + { + "epoch": 0.01324426372349695, + "grad_norm": 0.31918781995773315, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 114 + }, + { + "epoch": 0.01336044147545745, + "grad_norm": 0.3319108486175537, + "learning_rate": 0.0001, + "loss": 1.7677, + "step": 115 + }, + { + "epoch": 0.01347661922741795, + "grad_norm": 0.34339022636413574, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 116 + }, + { + "epoch": 0.013592796979378449, + "grad_norm": 0.33843010663986206, + "learning_rate": 0.0001, + "loss": 1.8448, + "step": 117 + }, + { + "epoch": 0.013708974731338948, + "grad_norm": 0.32837411761283875, + "learning_rate": 0.0001, + "loss": 1.7597, + "step": 118 + }, + { + "epoch": 0.013825152483299448, + "grad_norm": 0.3620765507221222, + "learning_rate": 0.0001, + "loss": 1.8675, + "step": 119 + }, + { + "epoch": 0.013941330235259947, + "grad_norm": 0.33245301246643066, + "learning_rate": 0.0001, + "loss": 1.8119, + "step": 120 + }, + { + "epoch": 0.014057507987220448, + "grad_norm": 0.34569036960601807, + "learning_rate": 0.0001, + "loss": 1.9758, + "step": 121 + }, + { + "epoch": 0.014173685739180947, + "grad_norm": 0.374098539352417, + "learning_rate": 0.0001, + "loss": 1.78, + "step": 122 + }, + { + "epoch": 0.014289863491141447, + "grad_norm": 0.3418053686618805, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 123 + }, + { + "epoch": 0.014406041243101946, + "grad_norm": 0.3384765088558197, + "learning_rate": 0.0001, + "loss": 1.7649, + "step": 124 + }, + { + "epoch": 0.014522218995062446, + "grad_norm": 0.3590647280216217, + "learning_rate": 0.0001, + "loss": 1.9065, + "step": 125 + }, + { + "epoch": 0.014638396747022945, + "grad_norm": 0.3395555019378662, + "learning_rate": 0.0001, + "loss": 1.8293, + "step": 126 + }, + { + "epoch": 0.014754574498983444, + "grad_norm": 0.3610383868217468, + "learning_rate": 0.0001, + "loss": 1.8426, + "step": 127 + }, + { + "epoch": 0.014870752250943944, + "grad_norm": 0.33869925141334534, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 128 + }, + { + "epoch": 0.014986930002904443, + "grad_norm": 0.338553249835968, + "learning_rate": 0.0001, + "loss": 1.8638, + "step": 129 + }, + { + "epoch": 0.015103107754864944, + "grad_norm": 0.3297845125198364, + "learning_rate": 0.0001, + "loss": 1.8374, + "step": 130 + }, + { + "epoch": 0.015219285506825444, + "grad_norm": 0.3697233498096466, + "learning_rate": 0.0001, + "loss": 1.9652, + "step": 131 + }, + { + "epoch": 0.015335463258785943, + "grad_norm": 0.3488331139087677, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 132 + }, + { + "epoch": 0.015451641010746442, + "grad_norm": 0.3396114706993103, + "learning_rate": 0.0001, + "loss": 1.7741, + "step": 133 + }, + { + "epoch": 0.015567818762706942, + "grad_norm": 0.34215307235717773, + "learning_rate": 0.0001, + "loss": 1.9031, + "step": 134 + }, + { + "epoch": 0.015683996514667443, + "grad_norm": 0.32633256912231445, + "learning_rate": 0.0001, + "loss": 1.794, + "step": 135 + }, + { + "epoch": 0.015800174266627942, + "grad_norm": 0.3819684088230133, + "learning_rate": 0.0001, + "loss": 1.7509, + "step": 136 + }, + { + "epoch": 0.01591635201858844, + "grad_norm": 0.3342839479446411, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 137 + }, + { + "epoch": 0.01603252977054894, + "grad_norm": 0.32045823335647583, + "learning_rate": 0.0001, + "loss": 1.9446, + "step": 138 + }, + { + "epoch": 0.01614870752250944, + "grad_norm": 0.3185892701148987, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 139 + }, + { + "epoch": 0.01626488527446994, + "grad_norm": 0.3285033106803894, + "learning_rate": 0.0001, + "loss": 1.8363, + "step": 140 + }, + { + "epoch": 0.01638106302643044, + "grad_norm": 0.35822629928588867, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 141 + }, + { + "epoch": 0.01649724077839094, + "grad_norm": 0.3599529266357422, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 142 + }, + { + "epoch": 0.016613418530351438, + "grad_norm": 0.3105633556842804, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 143 + }, + { + "epoch": 0.016729596282311937, + "grad_norm": 0.3310804069042206, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 144 + }, + { + "epoch": 0.016845774034272436, + "grad_norm": 0.32904812693595886, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 145 + }, + { + "epoch": 0.016961951786232936, + "grad_norm": 0.3462965190410614, + "learning_rate": 0.0001, + "loss": 1.8588, + "step": 146 + }, + { + "epoch": 0.017078129538193435, + "grad_norm": 0.3536953926086426, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 147 + }, + { + "epoch": 0.017194307290153935, + "grad_norm": 0.3167782723903656, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 148 + }, + { + "epoch": 0.017310485042114434, + "grad_norm": 0.3438095450401306, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 149 + }, + { + "epoch": 0.017426662794074933, + "grad_norm": 0.32976609468460083, + "learning_rate": 0.0001, + "loss": 1.8282, + "step": 150 + }, + { + "epoch": 0.017542840546035433, + "grad_norm": 0.36160793900489807, + "learning_rate": 0.0001, + "loss": 1.982, + "step": 151 + }, + { + "epoch": 0.017659018297995936, + "grad_norm": 0.31704050302505493, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 152 + }, + { + "epoch": 0.017775196049956435, + "grad_norm": 0.3186262547969818, + "learning_rate": 0.0001, + "loss": 1.7416, + "step": 153 + }, + { + "epoch": 0.017891373801916934, + "grad_norm": 0.3476766049861908, + "learning_rate": 0.0001, + "loss": 1.822, + "step": 154 + }, + { + "epoch": 0.018007551553877434, + "grad_norm": 0.35061368346214294, + "learning_rate": 0.0001, + "loss": 1.7696, + "step": 155 + }, + { + "epoch": 0.018123729305837933, + "grad_norm": 0.33829307556152344, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 156 + }, + { + "epoch": 0.018239907057798432, + "grad_norm": 0.3412603735923767, + "learning_rate": 0.0001, + "loss": 1.8324, + "step": 157 + }, + { + "epoch": 0.01835608480975893, + "grad_norm": 0.3786666989326477, + "learning_rate": 0.0001, + "loss": 1.9237, + "step": 158 + }, + { + "epoch": 0.01847226256171943, + "grad_norm": 0.31077513098716736, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 159 + }, + { + "epoch": 0.01858844031367993, + "grad_norm": 0.32142356038093567, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 160 + }, + { + "epoch": 0.01870461806564043, + "grad_norm": 0.30329957604408264, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 161 + }, + { + "epoch": 0.01882079581760093, + "grad_norm": 0.3338776230812073, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 162 + }, + { + "epoch": 0.01893697356956143, + "grad_norm": 0.32876482605934143, + "learning_rate": 0.0001, + "loss": 1.8144, + "step": 163 + }, + { + "epoch": 0.019053151321521928, + "grad_norm": 0.32878580689430237, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 164 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 0.33372652530670166, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 165 + }, + { + "epoch": 0.019285506825442927, + "grad_norm": 0.3314981460571289, + "learning_rate": 0.0001, + "loss": 1.828, + "step": 166 + }, + { + "epoch": 0.019401684577403426, + "grad_norm": 0.3250749707221985, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 167 + }, + { + "epoch": 0.019517862329363925, + "grad_norm": 0.31878861784935, + "learning_rate": 0.0001, + "loss": 1.9088, + "step": 168 + }, + { + "epoch": 0.019634040081324425, + "grad_norm": 0.3233966827392578, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 169 + }, + { + "epoch": 0.019750217833284928, + "grad_norm": 0.3260466754436493, + "learning_rate": 0.0001, + "loss": 1.9208, + "step": 170 + }, + { + "epoch": 0.019866395585245427, + "grad_norm": 0.33691638708114624, + "learning_rate": 0.0001, + "loss": 1.8521, + "step": 171 + }, + { + "epoch": 0.019982573337205926, + "grad_norm": 0.33654195070266724, + "learning_rate": 0.0001, + "loss": 1.7191, + "step": 172 + }, + { + "epoch": 0.020098751089166426, + "grad_norm": 0.32309457659721375, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 173 + }, + { + "epoch": 0.020214928841126925, + "grad_norm": 0.33408665657043457, + "learning_rate": 0.0001, + "loss": 1.8275, + "step": 174 + }, + { + "epoch": 0.020331106593087424, + "grad_norm": 0.32850080728530884, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 175 + }, + { + "epoch": 0.020447284345047924, + "grad_norm": 0.35644981265068054, + "learning_rate": 0.0001, + "loss": 1.9859, + "step": 176 + }, + { + "epoch": 0.020563462097008423, + "grad_norm": 0.36484628915786743, + "learning_rate": 0.0001, + "loss": 1.8343, + "step": 177 + }, + { + "epoch": 0.020679639848968923, + "grad_norm": 0.3122706413269043, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 178 + }, + { + "epoch": 0.020795817600929422, + "grad_norm": 0.35981225967407227, + "learning_rate": 0.0001, + "loss": 1.9379, + "step": 179 + }, + { + "epoch": 0.02091199535288992, + "grad_norm": 0.3504844009876251, + "learning_rate": 0.0001, + "loss": 1.7112, + "step": 180 + }, + { + "epoch": 0.02102817310485042, + "grad_norm": 0.3608277142047882, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 181 + }, + { + "epoch": 0.02114435085681092, + "grad_norm": 0.3393659293651581, + "learning_rate": 0.0001, + "loss": 1.87, + "step": 182 + }, + { + "epoch": 0.02126052860877142, + "grad_norm": 0.34214162826538086, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 183 + }, + { + "epoch": 0.02137670636073192, + "grad_norm": 0.33791059255599976, + "learning_rate": 0.0001, + "loss": 1.9013, + "step": 184 + }, + { + "epoch": 0.021492884112692418, + "grad_norm": 0.3184012770652771, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 185 + }, + { + "epoch": 0.021609061864652918, + "grad_norm": 0.3612908124923706, + "learning_rate": 0.0001, + "loss": 1.8917, + "step": 186 + }, + { + "epoch": 0.021725239616613417, + "grad_norm": 0.3533594608306885, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 187 + }, + { + "epoch": 0.02184141736857392, + "grad_norm": 0.33328378200531006, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 188 + }, + { + "epoch": 0.02195759512053442, + "grad_norm": 0.3383578360080719, + "learning_rate": 0.0001, + "loss": 1.7839, + "step": 189 + }, + { + "epoch": 0.02207377287249492, + "grad_norm": 0.3100789785385132, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 190 + }, + { + "epoch": 0.022189950624455418, + "grad_norm": 0.3665405511856079, + "learning_rate": 0.0001, + "loss": 1.8639, + "step": 191 + }, + { + "epoch": 0.022306128376415917, + "grad_norm": 0.36163437366485596, + "learning_rate": 0.0001, + "loss": 1.8558, + "step": 192 + }, + { + "epoch": 0.022422306128376417, + "grad_norm": 0.367439866065979, + "learning_rate": 0.0001, + "loss": 1.9491, + "step": 193 + }, + { + "epoch": 0.022538483880336916, + "grad_norm": 0.3353922963142395, + "learning_rate": 0.0001, + "loss": 1.8423, + "step": 194 + }, + { + "epoch": 0.022654661632297415, + "grad_norm": 0.33214300870895386, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 195 + }, + { + "epoch": 0.022770839384257915, + "grad_norm": 0.33071082830429077, + "learning_rate": 0.0001, + "loss": 1.8457, + "step": 196 + }, + { + "epoch": 0.022887017136218414, + "grad_norm": 0.3185707926750183, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 197 + }, + { + "epoch": 0.023003194888178913, + "grad_norm": 0.32431602478027344, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 198 + }, + { + "epoch": 0.023119372640139413, + "grad_norm": 0.3300994634628296, + "learning_rate": 0.0001, + "loss": 1.7727, + "step": 199 + }, + { + "epoch": 0.023235550392099912, + "grad_norm": 0.3367205858230591, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 200 + }, + { + "epoch": 0.02335172814406041, + "grad_norm": 0.3295809328556061, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 201 + }, + { + "epoch": 0.02346790589602091, + "grad_norm": 0.3326011598110199, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 202 + }, + { + "epoch": 0.02358408364798141, + "grad_norm": 0.3159317374229431, + "learning_rate": 0.0001, + "loss": 1.7935, + "step": 203 + }, + { + "epoch": 0.02370026139994191, + "grad_norm": 0.35444700717926025, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 204 + }, + { + "epoch": 0.02381643915190241, + "grad_norm": 0.35230258107185364, + "learning_rate": 0.0001, + "loss": 1.7454, + "step": 205 + }, + { + "epoch": 0.023932616903862912, + "grad_norm": 0.3529949486255646, + "learning_rate": 0.0001, + "loss": 1.8424, + "step": 206 + }, + { + "epoch": 0.02404879465582341, + "grad_norm": 0.3865301012992859, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 207 + }, + { + "epoch": 0.02416497240778391, + "grad_norm": 0.32935428619384766, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 208 + }, + { + "epoch": 0.02428115015974441, + "grad_norm": 0.339444100856781, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 209 + }, + { + "epoch": 0.02439732791170491, + "grad_norm": 0.3153437077999115, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 210 + }, + { + "epoch": 0.02451350566366541, + "grad_norm": 0.3516276478767395, + "learning_rate": 0.0001, + "loss": 1.9109, + "step": 211 + }, + { + "epoch": 0.024629683415625908, + "grad_norm": 0.3484845757484436, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 212 + }, + { + "epoch": 0.024745861167586407, + "grad_norm": 0.3327982723712921, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 213 + }, + { + "epoch": 0.024862038919546907, + "grad_norm": 0.37349483370780945, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 214 + }, + { + "epoch": 0.024978216671507406, + "grad_norm": 0.3740026354789734, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 215 + }, + { + "epoch": 0.025094394423467906, + "grad_norm": 0.34118950366973877, + "learning_rate": 0.0001, + "loss": 1.7631, + "step": 216 + }, + { + "epoch": 0.025210572175428405, + "grad_norm": 0.3344680368900299, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 217 + }, + { + "epoch": 0.025326749927388904, + "grad_norm": 0.3705542981624603, + "learning_rate": 0.0001, + "loss": 1.8239, + "step": 218 + }, + { + "epoch": 0.025442927679349404, + "grad_norm": 0.3481920063495636, + "learning_rate": 0.0001, + "loss": 1.8647, + "step": 219 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 0.3217613399028778, + "learning_rate": 0.0001, + "loss": 1.7362, + "step": 220 + }, + { + "epoch": 0.025675283183270402, + "grad_norm": 0.3721480667591095, + "learning_rate": 0.0001, + "loss": 1.8672, + "step": 221 + }, + { + "epoch": 0.025791460935230902, + "grad_norm": 0.32509273290634155, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 222 + }, + { + "epoch": 0.0259076386871914, + "grad_norm": 0.33785101771354675, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 223 + }, + { + "epoch": 0.026023816439151904, + "grad_norm": 0.3636549115180969, + "learning_rate": 0.0001, + "loss": 1.8676, + "step": 224 + }, + { + "epoch": 0.026139994191112403, + "grad_norm": 0.31692421436309814, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 225 + }, + { + "epoch": 0.026256171943072903, + "grad_norm": 0.3383740186691284, + "learning_rate": 0.0001, + "loss": 1.7732, + "step": 226 + }, + { + "epoch": 0.026372349695033402, + "grad_norm": 0.355059951543808, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 227 + }, + { + "epoch": 0.0264885274469939, + "grad_norm": 0.3562302589416504, + "learning_rate": 0.0001, + "loss": 1.9306, + "step": 228 + }, + { + "epoch": 0.0266047051989544, + "grad_norm": 0.3225594162940979, + "learning_rate": 0.0001, + "loss": 1.7609, + "step": 229 + }, + { + "epoch": 0.0267208829509149, + "grad_norm": 0.33251953125, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 230 + }, + { + "epoch": 0.0268370607028754, + "grad_norm": 0.3734368085861206, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 231 + }, + { + "epoch": 0.0269532384548359, + "grad_norm": 0.33438679575920105, + "learning_rate": 0.0001, + "loss": 1.7758, + "step": 232 + }, + { + "epoch": 0.0270694162067964, + "grad_norm": 0.37446722388267517, + "learning_rate": 0.0001, + "loss": 1.929, + "step": 233 + }, + { + "epoch": 0.027185593958756898, + "grad_norm": 0.3230035603046417, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 234 + }, + { + "epoch": 0.027301771710717397, + "grad_norm": 0.36308127641677856, + "learning_rate": 0.0001, + "loss": 1.8119, + "step": 235 + }, + { + "epoch": 0.027417949462677896, + "grad_norm": 0.31774529814720154, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 236 + }, + { + "epoch": 0.027534127214638396, + "grad_norm": 0.3248523473739624, + "learning_rate": 0.0001, + "loss": 1.8632, + "step": 237 + }, + { + "epoch": 0.027650304966598895, + "grad_norm": 0.3736507296562195, + "learning_rate": 0.0001, + "loss": 1.9092, + "step": 238 + }, + { + "epoch": 0.027766482718559395, + "grad_norm": 0.3262912631034851, + "learning_rate": 0.0001, + "loss": 1.697, + "step": 239 + }, + { + "epoch": 0.027882660470519894, + "grad_norm": 0.3249903619289398, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 240 + }, + { + "epoch": 0.027998838222480393, + "grad_norm": 0.3449850380420685, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 241 + }, + { + "epoch": 0.028115015974440896, + "grad_norm": 0.41747432947158813, + "learning_rate": 0.0001, + "loss": 2.1273, + "step": 242 + }, + { + "epoch": 0.028231193726401396, + "grad_norm": 0.35569465160369873, + "learning_rate": 0.0001, + "loss": 1.8643, + "step": 243 + }, + { + "epoch": 0.028347371478361895, + "grad_norm": 0.3352862596511841, + "learning_rate": 0.0001, + "loss": 1.8427, + "step": 244 + }, + { + "epoch": 0.028463549230322394, + "grad_norm": 0.38555145263671875, + "learning_rate": 0.0001, + "loss": 1.9834, + "step": 245 + }, + { + "epoch": 0.028579726982282894, + "grad_norm": 0.3578146696090698, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 246 + }, + { + "epoch": 0.028695904734243393, + "grad_norm": 0.3353215754032135, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 247 + }, + { + "epoch": 0.028812082486203892, + "grad_norm": 0.3413390815258026, + "learning_rate": 0.0001, + "loss": 1.7664, + "step": 248 + }, + { + "epoch": 0.028928260238164392, + "grad_norm": 0.3437090218067169, + "learning_rate": 0.0001, + "loss": 1.8693, + "step": 249 + }, + { + "epoch": 0.02904443799012489, + "grad_norm": 0.3443460762500763, + "learning_rate": 0.0001, + "loss": 1.8738, + "step": 250 + }, + { + "epoch": 0.02916061574208539, + "grad_norm": 0.3518659770488739, + "learning_rate": 0.0001, + "loss": 1.8869, + "step": 251 + }, + { + "epoch": 0.02927679349404589, + "grad_norm": 0.3433094322681427, + "learning_rate": 0.0001, + "loss": 1.8771, + "step": 252 + }, + { + "epoch": 0.02939297124600639, + "grad_norm": 0.3407760262489319, + "learning_rate": 0.0001, + "loss": 1.914, + "step": 253 + }, + { + "epoch": 0.02950914899796689, + "grad_norm": 0.3268572688102722, + "learning_rate": 0.0001, + "loss": 1.8681, + "step": 254 + }, + { + "epoch": 0.029625326749927388, + "grad_norm": 0.3242436349391937, + "learning_rate": 0.0001, + "loss": 1.8188, + "step": 255 + }, + { + "epoch": 0.029741504501887887, + "grad_norm": 0.33246999979019165, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 256 + }, + { + "epoch": 0.029857682253848387, + "grad_norm": 0.3286731541156769, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 257 + }, + { + "epoch": 0.029973860005808886, + "grad_norm": 0.3115208148956299, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 258 + }, + { + "epoch": 0.030090037757769385, + "grad_norm": 0.3387967646121979, + "learning_rate": 0.0001, + "loss": 1.8529, + "step": 259 + }, + { + "epoch": 0.03020621550972989, + "grad_norm": 0.35752734541893005, + "learning_rate": 0.0001, + "loss": 1.8307, + "step": 260 + }, + { + "epoch": 0.030322393261690388, + "grad_norm": 0.3699615001678467, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 261 + }, + { + "epoch": 0.030438571013650887, + "grad_norm": 0.328961044549942, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 262 + }, + { + "epoch": 0.030554748765611386, + "grad_norm": 0.36490383744239807, + "learning_rate": 0.0001, + "loss": 1.8146, + "step": 263 + }, + { + "epoch": 0.030670926517571886, + "grad_norm": 0.32661300897598267, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 264 + }, + { + "epoch": 0.030787104269532385, + "grad_norm": 0.34228524565696716, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 265 + }, + { + "epoch": 0.030903282021492885, + "grad_norm": 0.35033437609672546, + "learning_rate": 0.0001, + "loss": 1.8297, + "step": 266 + }, + { + "epoch": 0.031019459773453384, + "grad_norm": 0.3491237759590149, + "learning_rate": 0.0001, + "loss": 1.945, + "step": 267 + }, + { + "epoch": 0.031135637525413883, + "grad_norm": 0.34657391905784607, + "learning_rate": 0.0001, + "loss": 1.823, + "step": 268 + }, + { + "epoch": 0.03125181527737438, + "grad_norm": 0.36879056692123413, + "learning_rate": 0.0001, + "loss": 1.859, + "step": 269 + }, + { + "epoch": 0.031367993029334885, + "grad_norm": 0.36880430579185486, + "learning_rate": 0.0001, + "loss": 1.7599, + "step": 270 + }, + { + "epoch": 0.03148417078129538, + "grad_norm": 0.3530198931694031, + "learning_rate": 0.0001, + "loss": 1.9935, + "step": 271 + }, + { + "epoch": 0.031600348533255884, + "grad_norm": 0.35175904631614685, + "learning_rate": 0.0001, + "loss": 1.8837, + "step": 272 + }, + { + "epoch": 0.03171652628521638, + "grad_norm": 0.3176674246788025, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 273 + }, + { + "epoch": 0.03183270403717688, + "grad_norm": 0.33333176374435425, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 274 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 0.31809431314468384, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 275 + }, + { + "epoch": 0.03206505954109788, + "grad_norm": 0.3419245183467865, + "learning_rate": 0.0001, + "loss": 1.8647, + "step": 276 + }, + { + "epoch": 0.03218123729305838, + "grad_norm": 0.3137516379356384, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 277 + }, + { + "epoch": 0.03229741504501888, + "grad_norm": 0.3313903510570526, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 278 + }, + { + "epoch": 0.032413592796979376, + "grad_norm": 0.3197391629219055, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 279 + }, + { + "epoch": 0.03252977054893988, + "grad_norm": 0.3287334442138672, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 280 + }, + { + "epoch": 0.032645948300900375, + "grad_norm": 0.3322586715221405, + "learning_rate": 0.0001, + "loss": 1.8868, + "step": 281 + }, + { + "epoch": 0.03276212605286088, + "grad_norm": 0.35943326354026794, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 282 + }, + { + "epoch": 0.032878303804821374, + "grad_norm": 0.3296755254268646, + "learning_rate": 0.0001, + "loss": 1.7838, + "step": 283 + }, + { + "epoch": 0.03299448155678188, + "grad_norm": 0.3416058123111725, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 284 + }, + { + "epoch": 0.03311065930874237, + "grad_norm": 0.3588326573371887, + "learning_rate": 0.0001, + "loss": 1.8722, + "step": 285 + }, + { + "epoch": 0.033226837060702875, + "grad_norm": 0.3292895555496216, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 286 + }, + { + "epoch": 0.03334301481266338, + "grad_norm": 0.3382332921028137, + "learning_rate": 0.0001, + "loss": 1.8179, + "step": 287 + }, + { + "epoch": 0.033459192564623874, + "grad_norm": 0.33576157689094543, + "learning_rate": 0.0001, + "loss": 1.8442, + "step": 288 + }, + { + "epoch": 0.03357537031658438, + "grad_norm": 0.3294540047645569, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 289 + }, + { + "epoch": 0.03369154806854487, + "grad_norm": 0.3160039782524109, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 290 + }, + { + "epoch": 0.033807725820505376, + "grad_norm": 0.3588349223136902, + "learning_rate": 0.0001, + "loss": 1.8304, + "step": 291 + }, + { + "epoch": 0.03392390357246587, + "grad_norm": 0.30049145221710205, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 292 + }, + { + "epoch": 0.034040081324426374, + "grad_norm": 0.3043253719806671, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 293 + }, + { + "epoch": 0.03415625907638687, + "grad_norm": 0.377269983291626, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 294 + }, + { + "epoch": 0.03427243682834737, + "grad_norm": 0.378327876329422, + "learning_rate": 0.0001, + "loss": 1.9481, + "step": 295 + }, + { + "epoch": 0.03438861458030787, + "grad_norm": 0.3419339060783386, + "learning_rate": 0.0001, + "loss": 1.8375, + "step": 296 + }, + { + "epoch": 0.03450479233226837, + "grad_norm": 0.3224831521511078, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 297 + }, + { + "epoch": 0.03462097008422887, + "grad_norm": 0.35075846314430237, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 298 + }, + { + "epoch": 0.03473714783618937, + "grad_norm": 0.345042884349823, + "learning_rate": 0.0001, + "loss": 1.8939, + "step": 299 + }, + { + "epoch": 0.03485332558814987, + "grad_norm": 0.3130955100059509, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 300 + }, + { + "epoch": 0.03496950334011037, + "grad_norm": 0.35810473561286926, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 301 + }, + { + "epoch": 0.035085681092070865, + "grad_norm": 0.33703386783599854, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 302 + }, + { + "epoch": 0.03520185884403137, + "grad_norm": 0.3259965777397156, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 303 + }, + { + "epoch": 0.03531803659599187, + "grad_norm": 0.3411068618297577, + "learning_rate": 0.0001, + "loss": 1.7623, + "step": 304 + }, + { + "epoch": 0.03543421434795237, + "grad_norm": 0.35380053520202637, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 305 + }, + { + "epoch": 0.03555039209991287, + "grad_norm": 0.32706257700920105, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 306 + }, + { + "epoch": 0.035666569851873366, + "grad_norm": 0.3144983649253845, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 307 + }, + { + "epoch": 0.03578274760383387, + "grad_norm": 0.3492032289505005, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 308 + }, + { + "epoch": 0.035898925355794364, + "grad_norm": 0.34730949997901917, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 309 + }, + { + "epoch": 0.03601510310775487, + "grad_norm": 0.31954050064086914, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 310 + }, + { + "epoch": 0.03613128085971536, + "grad_norm": 0.3348424732685089, + "learning_rate": 0.0001, + "loss": 1.7981, + "step": 311 + }, + { + "epoch": 0.036247458611675866, + "grad_norm": 0.3164363503456116, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 312 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 0.35313692688941956, + "learning_rate": 0.0001, + "loss": 1.9214, + "step": 313 + }, + { + "epoch": 0.036479814115596865, + "grad_norm": 0.3127838671207428, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 314 + }, + { + "epoch": 0.03659599186755736, + "grad_norm": 0.33738380670547485, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 315 + }, + { + "epoch": 0.03671216961951786, + "grad_norm": 0.34780019521713257, + "learning_rate": 0.0001, + "loss": 1.863, + "step": 316 + }, + { + "epoch": 0.03682834737147836, + "grad_norm": 0.3482603430747986, + "learning_rate": 0.0001, + "loss": 1.9564, + "step": 317 + }, + { + "epoch": 0.03694452512343886, + "grad_norm": 0.3411928415298462, + "learning_rate": 0.0001, + "loss": 1.8199, + "step": 318 + }, + { + "epoch": 0.03706070287539936, + "grad_norm": 0.33409395813941956, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 319 + }, + { + "epoch": 0.03717688062735986, + "grad_norm": 0.3127480447292328, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 320 + }, + { + "epoch": 0.03729305837932036, + "grad_norm": 0.3322620689868927, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 321 + }, + { + "epoch": 0.03740923613128086, + "grad_norm": 0.3270343542098999, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 322 + }, + { + "epoch": 0.03752541388324136, + "grad_norm": 0.3130471110343933, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 323 + }, + { + "epoch": 0.03764159163520186, + "grad_norm": 0.33622002601623535, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 324 + }, + { + "epoch": 0.03775776938716236, + "grad_norm": 0.3530596196651459, + "learning_rate": 0.0001, + "loss": 1.9313, + "step": 325 + }, + { + "epoch": 0.03787394713912286, + "grad_norm": 0.38460564613342285, + "learning_rate": 0.0001, + "loss": 2.0007, + "step": 326 + }, + { + "epoch": 0.03799012489108336, + "grad_norm": 0.3392890393733978, + "learning_rate": 0.0001, + "loss": 1.8334, + "step": 327 + }, + { + "epoch": 0.038106302643043856, + "grad_norm": 0.33205297589302063, + "learning_rate": 0.0001, + "loss": 1.8434, + "step": 328 + }, + { + "epoch": 0.03822248039500436, + "grad_norm": 0.34488892555236816, + "learning_rate": 0.0001, + "loss": 1.7568, + "step": 329 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.357216477394104, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 330 + }, + { + "epoch": 0.03845483589892536, + "grad_norm": 0.31771931052207947, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 331 + }, + { + "epoch": 0.03857101365088585, + "grad_norm": 0.36487412452697754, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 332 + }, + { + "epoch": 0.038687191402846356, + "grad_norm": 0.3407606780529022, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 333 + }, + { + "epoch": 0.03880336915480685, + "grad_norm": 0.31532132625579834, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 334 + }, + { + "epoch": 0.038919546906767355, + "grad_norm": 0.3287792503833771, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 335 + }, + { + "epoch": 0.03903572465872785, + "grad_norm": 0.33964601159095764, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 336 + }, + { + "epoch": 0.039151902410688354, + "grad_norm": 0.34165436029434204, + "learning_rate": 0.0001, + "loss": 1.8581, + "step": 337 + }, + { + "epoch": 0.03926808016264885, + "grad_norm": 0.348545640707016, + "learning_rate": 0.0001, + "loss": 1.7487, + "step": 338 + }, + { + "epoch": 0.03938425791460935, + "grad_norm": 0.322898268699646, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 339 + }, + { + "epoch": 0.039500435666569855, + "grad_norm": 0.3299243748188019, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 340 + }, + { + "epoch": 0.03961661341853035, + "grad_norm": 0.34560680389404297, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 341 + }, + { + "epoch": 0.039732791170490854, + "grad_norm": 0.34533241391181946, + "learning_rate": 0.0001, + "loss": 1.8813, + "step": 342 + }, + { + "epoch": 0.03984896892245135, + "grad_norm": 0.3315548598766327, + "learning_rate": 0.0001, + "loss": 1.849, + "step": 343 + }, + { + "epoch": 0.03996514667441185, + "grad_norm": 0.31113383173942566, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 344 + }, + { + "epoch": 0.04008132442637235, + "grad_norm": 0.34004050493240356, + "learning_rate": 0.0001, + "loss": 1.7723, + "step": 345 + }, + { + "epoch": 0.04019750217833285, + "grad_norm": 0.3573724925518036, + "learning_rate": 0.0001, + "loss": 1.8572, + "step": 346 + }, + { + "epoch": 0.04031367993029335, + "grad_norm": 0.3385657072067261, + "learning_rate": 0.0001, + "loss": 1.7458, + "step": 347 + }, + { + "epoch": 0.04042985768225385, + "grad_norm": 0.34114423394203186, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 348 + }, + { + "epoch": 0.040546035434214346, + "grad_norm": 0.3590448796749115, + "learning_rate": 0.0001, + "loss": 1.8671, + "step": 349 + }, + { + "epoch": 0.04066221318617485, + "grad_norm": 0.34044602513313293, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 350 + }, + { + "epoch": 0.040778390938135345, + "grad_norm": 0.34901726245880127, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 351 + }, + { + "epoch": 0.04089456869009585, + "grad_norm": 0.33901697397232056, + "learning_rate": 0.0001, + "loss": 1.9265, + "step": 352 + }, + { + "epoch": 0.041010746442056344, + "grad_norm": 0.33621305227279663, + "learning_rate": 0.0001, + "loss": 1.7498, + "step": 353 + }, + { + "epoch": 0.041126924194016846, + "grad_norm": 0.3610171973705292, + "learning_rate": 0.0001, + "loss": 1.9162, + "step": 354 + }, + { + "epoch": 0.04124310194597734, + "grad_norm": 0.33839908242225647, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 355 + }, + { + "epoch": 0.041359279697937845, + "grad_norm": 0.32283854484558105, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 356 + }, + { + "epoch": 0.04147545744989834, + "grad_norm": 0.3764745891094208, + "learning_rate": 0.0001, + "loss": 1.9003, + "step": 357 + }, + { + "epoch": 0.041591635201858844, + "grad_norm": 0.34221282601356506, + "learning_rate": 0.0001, + "loss": 1.8303, + "step": 358 + }, + { + "epoch": 0.04170781295381935, + "grad_norm": 0.36195555329322815, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 359 + }, + { + "epoch": 0.04182399070577984, + "grad_norm": 0.33597198128700256, + "learning_rate": 0.0001, + "loss": 1.7741, + "step": 360 + }, + { + "epoch": 0.041940168457740346, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 361 + }, + { + "epoch": 0.04205634620970084, + "grad_norm": 0.3321481943130493, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 362 + }, + { + "epoch": 0.042172523961661344, + "grad_norm": 0.34914496541023254, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 363 + }, + { + "epoch": 0.04228870171362184, + "grad_norm": 0.34095922112464905, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 364 + }, + { + "epoch": 0.04240487946558234, + "grad_norm": 0.35311540961265564, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 365 + }, + { + "epoch": 0.04252105721754284, + "grad_norm": 0.383943647146225, + "learning_rate": 0.0001, + "loss": 1.8434, + "step": 366 + }, + { + "epoch": 0.04263723496950334, + "grad_norm": 0.33830368518829346, + "learning_rate": 0.0001, + "loss": 1.8743, + "step": 367 + }, + { + "epoch": 0.04275341272146384, + "grad_norm": 0.3408838212490082, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 368 + }, + { + "epoch": 0.04286959047342434, + "grad_norm": 0.37213000655174255, + "learning_rate": 0.0001, + "loss": 1.9295, + "step": 369 + }, + { + "epoch": 0.042985768225384836, + "grad_norm": 0.33618566393852234, + "learning_rate": 0.0001, + "loss": 1.9901, + "step": 370 + }, + { + "epoch": 0.04310194597734534, + "grad_norm": 0.3326283097267151, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 371 + }, + { + "epoch": 0.043218123729305835, + "grad_norm": 0.3324650526046753, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 372 + }, + { + "epoch": 0.04333430148126634, + "grad_norm": 0.36444899439811707, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 373 + }, + { + "epoch": 0.043450479233226834, + "grad_norm": 0.32787513732910156, + "learning_rate": 0.0001, + "loss": 1.7763, + "step": 374 + }, + { + "epoch": 0.04356665698518734, + "grad_norm": 0.3590793311595917, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 375 + }, + { + "epoch": 0.04368283473714784, + "grad_norm": 0.3318205773830414, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 376 + }, + { + "epoch": 0.043799012489108335, + "grad_norm": 0.34203052520751953, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 377 + }, + { + "epoch": 0.04391519024106884, + "grad_norm": 0.3480004072189331, + "learning_rate": 0.0001, + "loss": 1.8196, + "step": 378 + }, + { + "epoch": 0.044031367993029334, + "grad_norm": 0.3592599928379059, + "learning_rate": 0.0001, + "loss": 1.768, + "step": 379 + }, + { + "epoch": 0.04414754574498984, + "grad_norm": 0.3415685296058655, + "learning_rate": 0.0001, + "loss": 1.8739, + "step": 380 + }, + { + "epoch": 0.04426372349695033, + "grad_norm": 0.33502671122550964, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 381 + }, + { + "epoch": 0.044379901248910836, + "grad_norm": 0.3669358193874359, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 382 + }, + { + "epoch": 0.04449607900087133, + "grad_norm": 0.35224342346191406, + "learning_rate": 0.0001, + "loss": 1.787, + "step": 383 + }, + { + "epoch": 0.044612256752831835, + "grad_norm": 0.3165675699710846, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 384 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 0.34537431597709656, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 385 + }, + { + "epoch": 0.04484461225675283, + "grad_norm": 0.3536418080329895, + "learning_rate": 0.0001, + "loss": 1.8404, + "step": 386 + }, + { + "epoch": 0.04496079000871333, + "grad_norm": 0.33434730768203735, + "learning_rate": 0.0001, + "loss": 1.8322, + "step": 387 + }, + { + "epoch": 0.04507696776067383, + "grad_norm": 0.32901090383529663, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 388 + }, + { + "epoch": 0.04519314551263433, + "grad_norm": 0.3278455436229706, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 389 + }, + { + "epoch": 0.04530932326459483, + "grad_norm": 0.34721484780311584, + "learning_rate": 0.0001, + "loss": 1.8898, + "step": 390 + }, + { + "epoch": 0.04542550101655533, + "grad_norm": 0.3322944939136505, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 391 + }, + { + "epoch": 0.04554167876851583, + "grad_norm": 0.34092095494270325, + "learning_rate": 0.0001, + "loss": 1.8761, + "step": 392 + }, + { + "epoch": 0.045657856520476325, + "grad_norm": 0.3071598708629608, + "learning_rate": 0.0001, + "loss": 1.7564, + "step": 393 + }, + { + "epoch": 0.04577403427243683, + "grad_norm": 0.3361007273197174, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 394 + }, + { + "epoch": 0.04589021202439733, + "grad_norm": 0.38204225897789, + "learning_rate": 0.0001, + "loss": 1.9099, + "step": 395 + }, + { + "epoch": 0.04600638977635783, + "grad_norm": 0.32257604598999023, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 396 + }, + { + "epoch": 0.04612256752831833, + "grad_norm": 0.3314444124698639, + "learning_rate": 0.0001, + "loss": 1.7518, + "step": 397 + }, + { + "epoch": 0.046238745280278826, + "grad_norm": 0.34747904539108276, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 398 + }, + { + "epoch": 0.04635492303223933, + "grad_norm": 0.3240096867084503, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 399 + }, + { + "epoch": 0.046471100784199824, + "grad_norm": 0.3071964979171753, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 400 + }, + { + "epoch": 0.04658727853616033, + "grad_norm": 0.3766336739063263, + "learning_rate": 0.0001, + "loss": 1.9295, + "step": 401 + }, + { + "epoch": 0.04670345628812082, + "grad_norm": 0.35144826769828796, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 402 + }, + { + "epoch": 0.046819634040081326, + "grad_norm": 0.31121134757995605, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 403 + }, + { + "epoch": 0.04693581179204182, + "grad_norm": 0.3237099051475525, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 404 + }, + { + "epoch": 0.047051989544002325, + "grad_norm": 0.3905102610588074, + "learning_rate": 0.0001, + "loss": 1.9238, + "step": 405 + }, + { + "epoch": 0.04716816729596282, + "grad_norm": 0.31731948256492615, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 406 + }, + { + "epoch": 0.047284345047923323, + "grad_norm": 0.33582621812820435, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 407 + }, + { + "epoch": 0.04740052279988382, + "grad_norm": 0.3342641592025757, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 408 + }, + { + "epoch": 0.04751670055184432, + "grad_norm": 0.3307628035545349, + "learning_rate": 0.0001, + "loss": 1.8255, + "step": 409 + }, + { + "epoch": 0.04763287830380482, + "grad_norm": 0.3463532030582428, + "learning_rate": 0.0001, + "loss": 1.7545, + "step": 410 + }, + { + "epoch": 0.04774905605576532, + "grad_norm": 0.35025402903556824, + "learning_rate": 0.0001, + "loss": 1.7729, + "step": 411 + }, + { + "epoch": 0.047865233807725824, + "grad_norm": 0.380147784948349, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 412 + }, + { + "epoch": 0.04798141155968632, + "grad_norm": 0.33736342191696167, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 413 + }, + { + "epoch": 0.04809758931164682, + "grad_norm": 0.33364975452423096, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 414 + }, + { + "epoch": 0.04821376706360732, + "grad_norm": 0.3768046498298645, + "learning_rate": 0.0001, + "loss": 1.8859, + "step": 415 + }, + { + "epoch": 0.04832994481556782, + "grad_norm": 0.3302350342273712, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 416 + }, + { + "epoch": 0.04844612256752832, + "grad_norm": 0.3570808172225952, + "learning_rate": 0.0001, + "loss": 1.8595, + "step": 417 + }, + { + "epoch": 0.04856230031948882, + "grad_norm": 0.3274199962615967, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 418 + }, + { + "epoch": 0.048678478071449316, + "grad_norm": 0.36040613055229187, + "learning_rate": 0.0001, + "loss": 1.8473, + "step": 419 + }, + { + "epoch": 0.04879465582340982, + "grad_norm": 0.3288051187992096, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 420 + }, + { + "epoch": 0.048910833575370315, + "grad_norm": 0.32802271842956543, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 421 + }, + { + "epoch": 0.04902701132733082, + "grad_norm": 0.3353809118270874, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 422 + }, + { + "epoch": 0.04914318907929131, + "grad_norm": 0.325050413608551, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 423 + }, + { + "epoch": 0.049259366831251816, + "grad_norm": 0.34243983030319214, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 424 + }, + { + "epoch": 0.04937554458321231, + "grad_norm": 0.36178770661354065, + "learning_rate": 0.0001, + "loss": 1.8266, + "step": 425 + }, + { + "epoch": 0.049491722335172815, + "grad_norm": 0.35593631863594055, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 426 + }, + { + "epoch": 0.04960790008713331, + "grad_norm": 0.3078707158565521, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 427 + }, + { + "epoch": 0.049724077839093814, + "grad_norm": 0.3557792007923126, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 428 + }, + { + "epoch": 0.04984025559105431, + "grad_norm": 0.3614198863506317, + "learning_rate": 0.0001, + "loss": 1.8574, + "step": 429 + }, + { + "epoch": 0.04995643334301481, + "grad_norm": 0.3604932427406311, + "learning_rate": 0.0001, + "loss": 1.8913, + "step": 430 + }, + { + "epoch": 0.050072611094975315, + "grad_norm": 0.34103018045425415, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 431 + }, + { + "epoch": 0.05018878884693581, + "grad_norm": 0.34125006198883057, + "learning_rate": 0.0001, + "loss": 1.8007, + "step": 432 + }, + { + "epoch": 0.050304966598896314, + "grad_norm": 0.3314953148365021, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 433 + }, + { + "epoch": 0.05042114435085681, + "grad_norm": 0.3640129566192627, + "learning_rate": 0.0001, + "loss": 1.816, + "step": 434 + }, + { + "epoch": 0.05053732210281731, + "grad_norm": 0.3177947402000427, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 435 + }, + { + "epoch": 0.05065349985477781, + "grad_norm": 0.3107048273086548, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 436 + }, + { + "epoch": 0.05076967760673831, + "grad_norm": 0.3266063332557678, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 437 + }, + { + "epoch": 0.05088585535869881, + "grad_norm": 0.32090339064598083, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 438 + }, + { + "epoch": 0.05100203311065931, + "grad_norm": 0.3660094141960144, + "learning_rate": 0.0001, + "loss": 1.9414, + "step": 439 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.3175399899482727, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 440 + }, + { + "epoch": 0.05123438861458031, + "grad_norm": 0.3207061290740967, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 441 + }, + { + "epoch": 0.051350566366540805, + "grad_norm": 0.34974828362464905, + "learning_rate": 0.0001, + "loss": 1.8292, + "step": 442 + }, + { + "epoch": 0.05146674411850131, + "grad_norm": 0.40996474027633667, + "learning_rate": 0.0001, + "loss": 1.8227, + "step": 443 + }, + { + "epoch": 0.051582921870461804, + "grad_norm": 0.33745077252388, + "learning_rate": 0.0001, + "loss": 1.7853, + "step": 444 + }, + { + "epoch": 0.051699099622422306, + "grad_norm": 0.3403610289096832, + "learning_rate": 0.0001, + "loss": 1.8465, + "step": 445 + }, + { + "epoch": 0.0518152773743828, + "grad_norm": 0.30970969796180725, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 446 + }, + { + "epoch": 0.051931455126343305, + "grad_norm": 0.3483198583126068, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 447 + }, + { + "epoch": 0.05204763287830381, + "grad_norm": 0.3291124999523163, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 448 + }, + { + "epoch": 0.052163810630264304, + "grad_norm": 0.320839524269104, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 449 + }, + { + "epoch": 0.05227998838222481, + "grad_norm": 0.3379274308681488, + "learning_rate": 0.0001, + "loss": 1.7447, + "step": 450 + }, + { + "epoch": 0.0523961661341853, + "grad_norm": 0.31182000041007996, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 451 + }, + { + "epoch": 0.052512343886145806, + "grad_norm": 0.3308543562889099, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 452 + }, + { + "epoch": 0.0526285216381063, + "grad_norm": 0.34902223944664, + "learning_rate": 0.0001, + "loss": 1.92, + "step": 453 + }, + { + "epoch": 0.052744699390066804, + "grad_norm": 0.3305058181285858, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 454 + }, + { + "epoch": 0.0528608771420273, + "grad_norm": 0.3399059772491455, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 455 + }, + { + "epoch": 0.0529770548939878, + "grad_norm": 0.36334654688835144, + "learning_rate": 0.0001, + "loss": 1.8425, + "step": 456 + }, + { + "epoch": 0.0530932326459483, + "grad_norm": 0.34613272547721863, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 457 + }, + { + "epoch": 0.0532094103979088, + "grad_norm": 0.3351539373397827, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 458 + }, + { + "epoch": 0.0533255881498693, + "grad_norm": 0.34070366621017456, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 459 + }, + { + "epoch": 0.0534417659018298, + "grad_norm": 0.3561154305934906, + "learning_rate": 0.0001, + "loss": 1.8496, + "step": 460 + }, + { + "epoch": 0.053557943653790296, + "grad_norm": 0.32410141825675964, + "learning_rate": 0.0001, + "loss": 1.7718, + "step": 461 + }, + { + "epoch": 0.0536741214057508, + "grad_norm": 0.3558378219604492, + "learning_rate": 0.0001, + "loss": 1.7761, + "step": 462 + }, + { + "epoch": 0.053790299157711295, + "grad_norm": 0.3171548843383789, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 463 + }, + { + "epoch": 0.0539064769096718, + "grad_norm": 0.34194016456604004, + "learning_rate": 0.0001, + "loss": 1.8411, + "step": 464 + }, + { + "epoch": 0.0540226546616323, + "grad_norm": 0.3174999952316284, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 465 + }, + { + "epoch": 0.0541388324135928, + "grad_norm": 0.33531078696250916, + "learning_rate": 0.0001, + "loss": 1.8138, + "step": 466 + }, + { + "epoch": 0.0542550101655533, + "grad_norm": 0.33532780408859253, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 467 + }, + { + "epoch": 0.054371187917513795, + "grad_norm": 0.40104812383651733, + "learning_rate": 0.0001, + "loss": 2.0425, + "step": 468 + }, + { + "epoch": 0.0544873656694743, + "grad_norm": 0.33999940752983093, + "learning_rate": 0.0001, + "loss": 1.7676, + "step": 469 + }, + { + "epoch": 0.054603543421434794, + "grad_norm": 0.3406895399093628, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 470 + }, + { + "epoch": 0.0547197211733953, + "grad_norm": 0.32375043630599976, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 471 + }, + { + "epoch": 0.05483589892535579, + "grad_norm": 0.3496737480163574, + "learning_rate": 0.0001, + "loss": 1.8868, + "step": 472 + }, + { + "epoch": 0.054952076677316296, + "grad_norm": 0.32865509390830994, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 473 + }, + { + "epoch": 0.05506825442927679, + "grad_norm": 0.3332083225250244, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 474 + }, + { + "epoch": 0.055184432181237295, + "grad_norm": 0.3402409851551056, + "learning_rate": 0.0001, + "loss": 1.8425, + "step": 475 + }, + { + "epoch": 0.05530060993319779, + "grad_norm": 0.37318024039268494, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 476 + }, + { + "epoch": 0.05541678768515829, + "grad_norm": 0.3607703745365143, + "learning_rate": 0.0001, + "loss": 1.8452, + "step": 477 + }, + { + "epoch": 0.05553296543711879, + "grad_norm": 0.3592975437641144, + "learning_rate": 0.0001, + "loss": 1.8463, + "step": 478 + }, + { + "epoch": 0.05564914318907929, + "grad_norm": 0.3610447645187378, + "learning_rate": 0.0001, + "loss": 1.8906, + "step": 479 + }, + { + "epoch": 0.05576532094103979, + "grad_norm": 0.33733242750167847, + "learning_rate": 0.0001, + "loss": 1.8464, + "step": 480 + }, + { + "epoch": 0.05588149869300029, + "grad_norm": 0.35732340812683105, + "learning_rate": 0.0001, + "loss": 1.8149, + "step": 481 + }, + { + "epoch": 0.05599767644496079, + "grad_norm": 0.3425617814064026, + "learning_rate": 0.0001, + "loss": 1.8263, + "step": 482 + }, + { + "epoch": 0.05611385419692129, + "grad_norm": 0.33323052525520325, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 483 + }, + { + "epoch": 0.05623003194888179, + "grad_norm": 0.35951876640319824, + "learning_rate": 0.0001, + "loss": 1.8618, + "step": 484 + }, + { + "epoch": 0.05634620970084229, + "grad_norm": 0.3292408883571625, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 485 + }, + { + "epoch": 0.05646238745280279, + "grad_norm": 0.37031203508377075, + "learning_rate": 0.0001, + "loss": 1.7308, + "step": 486 + }, + { + "epoch": 0.05657856520476329, + "grad_norm": 0.32847365736961365, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 487 + }, + { + "epoch": 0.05669474295672379, + "grad_norm": 0.3220025599002838, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 488 + }, + { + "epoch": 0.056810920708684286, + "grad_norm": 0.36746835708618164, + "learning_rate": 0.0001, + "loss": 1.6696, + "step": 489 + }, + { + "epoch": 0.05692709846064479, + "grad_norm": 0.3531992733478546, + "learning_rate": 0.0001, + "loss": 1.7423, + "step": 490 + }, + { + "epoch": 0.057043276212605284, + "grad_norm": 0.3173719346523285, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 491 + }, + { + "epoch": 0.05715945396456579, + "grad_norm": 0.35342299938201904, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 492 + }, + { + "epoch": 0.05727563171652628, + "grad_norm": 0.3307000696659088, + "learning_rate": 0.0001, + "loss": 1.7597, + "step": 493 + }, + { + "epoch": 0.057391809468486786, + "grad_norm": 0.3486880362033844, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 494 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 0.37966904044151306, + "learning_rate": 0.0001, + "loss": 1.8754, + "step": 495 + }, + { + "epoch": 0.057624164972407785, + "grad_norm": 0.3487970530986786, + "learning_rate": 0.0001, + "loss": 1.8312, + "step": 496 + }, + { + "epoch": 0.05774034272436828, + "grad_norm": 0.30827823281288147, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 497 + }, + { + "epoch": 0.057856520476328784, + "grad_norm": 0.33888596296310425, + "learning_rate": 0.0001, + "loss": 1.7648, + "step": 498 + }, + { + "epoch": 0.05797269822828928, + "grad_norm": 0.35201674699783325, + "learning_rate": 0.0001, + "loss": 1.7952, + "step": 499 + }, + { + "epoch": 0.05808887598024978, + "grad_norm": 0.35336655378341675, + "learning_rate": 0.0001, + "loss": 1.808, + "step": 500 + }, + { + "epoch": 0.058205053732210285, + "grad_norm": 0.36219102144241333, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 501 + }, + { + "epoch": 0.05832123148417078, + "grad_norm": 0.3310224711894989, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 502 + }, + { + "epoch": 0.058437409236131284, + "grad_norm": 0.372923344373703, + "learning_rate": 0.0001, + "loss": 1.7554, + "step": 503 + }, + { + "epoch": 0.05855358698809178, + "grad_norm": 0.358759343624115, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 504 + }, + { + "epoch": 0.05866976474005228, + "grad_norm": 0.41483983397483826, + "learning_rate": 0.0001, + "loss": 1.7648, + "step": 505 + }, + { + "epoch": 0.05878594249201278, + "grad_norm": 0.3552764058113098, + "learning_rate": 0.0001, + "loss": 1.8973, + "step": 506 + }, + { + "epoch": 0.05890212024397328, + "grad_norm": 0.3468693494796753, + "learning_rate": 0.0001, + "loss": 1.7336, + "step": 507 + }, + { + "epoch": 0.05901829799593378, + "grad_norm": 0.359375536441803, + "learning_rate": 0.0001, + "loss": 1.8677, + "step": 508 + }, + { + "epoch": 0.05913447574789428, + "grad_norm": 0.32384392619132996, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 509 + }, + { + "epoch": 0.059250653499854776, + "grad_norm": 0.3563079237937927, + "learning_rate": 0.0001, + "loss": 1.8236, + "step": 510 + }, + { + "epoch": 0.05936683125181528, + "grad_norm": 0.32716381549835205, + "learning_rate": 0.0001, + "loss": 1.8558, + "step": 511 + }, + { + "epoch": 0.059483009003775775, + "grad_norm": 0.3453499674797058, + "learning_rate": 0.0001, + "loss": 1.8389, + "step": 512 + }, + { + "epoch": 0.05959918675573628, + "grad_norm": 0.32241371273994446, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 513 + }, + { + "epoch": 0.05971536450769677, + "grad_norm": 0.34780648350715637, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 514 + }, + { + "epoch": 0.059831542259657276, + "grad_norm": 0.3340502977371216, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 515 + }, + { + "epoch": 0.05994772001161777, + "grad_norm": 0.331016480922699, + "learning_rate": 0.0001, + "loss": 1.8144, + "step": 516 + }, + { + "epoch": 0.060063897763578275, + "grad_norm": 0.34053540229797363, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 517 + }, + { + "epoch": 0.06018007551553877, + "grad_norm": 0.31866925954818726, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 518 + }, + { + "epoch": 0.060296253267499274, + "grad_norm": 0.36000797152519226, + "learning_rate": 0.0001, + "loss": 1.92, + "step": 519 + }, + { + "epoch": 0.06041243101945978, + "grad_norm": 0.37475964426994324, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 520 + }, + { + "epoch": 0.06052860877142027, + "grad_norm": 0.3554963767528534, + "learning_rate": 0.0001, + "loss": 1.8536, + "step": 521 + }, + { + "epoch": 0.060644786523380775, + "grad_norm": 0.34274694323539734, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 522 + }, + { + "epoch": 0.06076096427534127, + "grad_norm": 0.31418636441230774, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 523 + }, + { + "epoch": 0.060877142027301774, + "grad_norm": 0.3319328725337982, + "learning_rate": 0.0001, + "loss": 1.7644, + "step": 524 + }, + { + "epoch": 0.06099331977926227, + "grad_norm": 0.34280523657798767, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 525 + }, + { + "epoch": 0.06110949753122277, + "grad_norm": 0.3405478298664093, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 526 + }, + { + "epoch": 0.06122567528318327, + "grad_norm": 0.34357553720474243, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 527 + }, + { + "epoch": 0.06134185303514377, + "grad_norm": 0.3327886164188385, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 528 + }, + { + "epoch": 0.06145803078710427, + "grad_norm": 0.31985750794410706, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 529 + }, + { + "epoch": 0.06157420853906477, + "grad_norm": 0.32863008975982666, + "learning_rate": 0.0001, + "loss": 1.7012, + "step": 530 + }, + { + "epoch": 0.061690386291025266, + "grad_norm": 0.33859503269195557, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 531 + }, + { + "epoch": 0.06180656404298577, + "grad_norm": 0.354203999042511, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 532 + }, + { + "epoch": 0.061922741794946265, + "grad_norm": 0.3500482439994812, + "learning_rate": 0.0001, + "loss": 1.7481, + "step": 533 + }, + { + "epoch": 0.06203891954690677, + "grad_norm": 0.3380871117115021, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 534 + }, + { + "epoch": 0.062155097298867264, + "grad_norm": 0.3638668656349182, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 535 + }, + { + "epoch": 0.06227127505082777, + "grad_norm": 0.3231678903102875, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 536 + }, + { + "epoch": 0.06238745280278827, + "grad_norm": 0.3448564112186432, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 537 + }, + { + "epoch": 0.06250363055474877, + "grad_norm": 0.34526991844177246, + "learning_rate": 0.0001, + "loss": 1.7609, + "step": 538 + }, + { + "epoch": 0.06261980830670927, + "grad_norm": 0.351270467042923, + "learning_rate": 0.0001, + "loss": 1.8093, + "step": 539 + }, + { + "epoch": 0.06273598605866977, + "grad_norm": 0.33383870124816895, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 540 + }, + { + "epoch": 0.06285216381063026, + "grad_norm": 0.31945285201072693, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 541 + }, + { + "epoch": 0.06296834156259076, + "grad_norm": 0.32655853033065796, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 542 + }, + { + "epoch": 0.06308451931455127, + "grad_norm": 0.3944481313228607, + "learning_rate": 0.0001, + "loss": 1.8958, + "step": 543 + }, + { + "epoch": 0.06320069706651177, + "grad_norm": 0.38392776250839233, + "learning_rate": 0.0001, + "loss": 1.8377, + "step": 544 + }, + { + "epoch": 0.06331687481847226, + "grad_norm": 0.3638809621334076, + "learning_rate": 0.0001, + "loss": 1.8184, + "step": 545 + }, + { + "epoch": 0.06343305257043276, + "grad_norm": 0.3336743116378784, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 546 + }, + { + "epoch": 0.06354923032239326, + "grad_norm": 0.32870176434516907, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 547 + }, + { + "epoch": 0.06366540807435377, + "grad_norm": 0.3251485526561737, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 548 + }, + { + "epoch": 0.06378158582631425, + "grad_norm": 0.36087340116500854, + "learning_rate": 0.0001, + "loss": 1.8023, + "step": 549 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.35436710715293884, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 550 + }, + { + "epoch": 0.06401394133023526, + "grad_norm": 0.3625239133834839, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 551 + }, + { + "epoch": 0.06413011908219576, + "grad_norm": 0.33559513092041016, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 552 + }, + { + "epoch": 0.06424629683415625, + "grad_norm": 0.3335849642753601, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 553 + }, + { + "epoch": 0.06436247458611676, + "grad_norm": 0.3296360969543457, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 554 + }, + { + "epoch": 0.06447865233807726, + "grad_norm": 0.3487899601459503, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 555 + }, + { + "epoch": 0.06459483009003776, + "grad_norm": 0.3731647729873657, + "learning_rate": 0.0001, + "loss": 2.0035, + "step": 556 + }, + { + "epoch": 0.06471100784199826, + "grad_norm": 0.33725181221961975, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 557 + }, + { + "epoch": 0.06482718559395875, + "grad_norm": 0.3277061879634857, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 558 + }, + { + "epoch": 0.06494336334591926, + "grad_norm": 0.3473774790763855, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 559 + }, + { + "epoch": 0.06505954109787976, + "grad_norm": 0.35049760341644287, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 560 + }, + { + "epoch": 0.06517571884984026, + "grad_norm": 0.3388431668281555, + "learning_rate": 0.0001, + "loss": 1.7494, + "step": 561 + }, + { + "epoch": 0.06529189660180075, + "grad_norm": 0.35445770621299744, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 562 + }, + { + "epoch": 0.06540807435376125, + "grad_norm": 0.3392694592475891, + "learning_rate": 0.0001, + "loss": 1.8481, + "step": 563 + }, + { + "epoch": 0.06552425210572176, + "grad_norm": 0.3575725257396698, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 564 + }, + { + "epoch": 0.06564042985768226, + "grad_norm": 0.3979930877685547, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 565 + }, + { + "epoch": 0.06575660760964275, + "grad_norm": 0.3467651605606079, + "learning_rate": 0.0001, + "loss": 1.8548, + "step": 566 + }, + { + "epoch": 0.06587278536160325, + "grad_norm": 0.3466176390647888, + "learning_rate": 0.0001, + "loss": 1.9021, + "step": 567 + }, + { + "epoch": 0.06598896311356375, + "grad_norm": 0.33713239431381226, + "learning_rate": 0.0001, + "loss": 1.835, + "step": 568 + }, + { + "epoch": 0.06610514086552426, + "grad_norm": 0.3413597047328949, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 569 + }, + { + "epoch": 0.06622131861748475, + "grad_norm": 0.40804192423820496, + "learning_rate": 0.0001, + "loss": 1.9077, + "step": 570 + }, + { + "epoch": 0.06633749636944525, + "grad_norm": 0.36405694484710693, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 571 + }, + { + "epoch": 0.06645367412140575, + "grad_norm": 0.3612009882926941, + "learning_rate": 0.0001, + "loss": 1.758, + "step": 572 + }, + { + "epoch": 0.06656985187336625, + "grad_norm": 0.3526972830295563, + "learning_rate": 0.0001, + "loss": 1.9189, + "step": 573 + }, + { + "epoch": 0.06668602962532676, + "grad_norm": 0.34476232528686523, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 574 + }, + { + "epoch": 0.06680220737728725, + "grad_norm": 0.35720011591911316, + "learning_rate": 0.0001, + "loss": 1.9326, + "step": 575 + }, + { + "epoch": 0.06691838512924775, + "grad_norm": 0.35136792063713074, + "learning_rate": 0.0001, + "loss": 1.8546, + "step": 576 + }, + { + "epoch": 0.06703456288120825, + "grad_norm": 0.32413557171821594, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 577 + }, + { + "epoch": 0.06715074063316875, + "grad_norm": 0.321250319480896, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 578 + }, + { + "epoch": 0.06726691838512924, + "grad_norm": 0.34484270215034485, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 579 + }, + { + "epoch": 0.06738309613708975, + "grad_norm": 0.34433478116989136, + "learning_rate": 0.0001, + "loss": 1.7748, + "step": 580 + }, + { + "epoch": 0.06749927388905025, + "grad_norm": 0.3812878727912903, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 581 + }, + { + "epoch": 0.06761545164101075, + "grad_norm": 0.344705730676651, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 582 + }, + { + "epoch": 0.06773162939297124, + "grad_norm": 0.3333836793899536, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 583 + }, + { + "epoch": 0.06784780714493174, + "grad_norm": 0.35086333751678467, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 584 + }, + { + "epoch": 0.06796398489689225, + "grad_norm": 0.3325541019439697, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 585 + }, + { + "epoch": 0.06808016264885275, + "grad_norm": 0.35840341448783875, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 586 + }, + { + "epoch": 0.06819634040081324, + "grad_norm": 0.33432942628860474, + "learning_rate": 0.0001, + "loss": 1.8359, + "step": 587 + }, + { + "epoch": 0.06831251815277374, + "grad_norm": 0.34798067808151245, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 588 + }, + { + "epoch": 0.06842869590473424, + "grad_norm": 0.319845974445343, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 589 + }, + { + "epoch": 0.06854487365669475, + "grad_norm": 0.37829452753067017, + "learning_rate": 0.0001, + "loss": 1.8517, + "step": 590 + }, + { + "epoch": 0.06866105140865525, + "grad_norm": 0.35298144817352295, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 591 + }, + { + "epoch": 0.06877722916061574, + "grad_norm": 0.40371862053871155, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 592 + }, + { + "epoch": 0.06889340691257624, + "grad_norm": 0.3450127840042114, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 593 + }, + { + "epoch": 0.06900958466453674, + "grad_norm": 0.3465590178966522, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 594 + }, + { + "epoch": 0.06912576241649725, + "grad_norm": 0.3387679159641266, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 595 + }, + { + "epoch": 0.06924194016845774, + "grad_norm": 0.3386651277542114, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 596 + }, + { + "epoch": 0.06935811792041824, + "grad_norm": 0.3551906645298004, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 597 + }, + { + "epoch": 0.06947429567237874, + "grad_norm": 0.37498459219932556, + "learning_rate": 0.0001, + "loss": 1.8622, + "step": 598 + }, + { + "epoch": 0.06959047342433924, + "grad_norm": 0.3397311866283417, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 599 + }, + { + "epoch": 0.06970665117629973, + "grad_norm": 0.34366878867149353, + "learning_rate": 0.0001, + "loss": 1.7838, + "step": 600 + }, + { + "epoch": 0.06982282892826024, + "grad_norm": 0.3466067910194397, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 601 + }, + { + "epoch": 0.06993900668022074, + "grad_norm": 0.35218632221221924, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 602 + }, + { + "epoch": 0.07005518443218124, + "grad_norm": 0.3305307924747467, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 603 + }, + { + "epoch": 0.07017136218414173, + "grad_norm": 0.3424377143383026, + "learning_rate": 0.0001, + "loss": 1.8505, + "step": 604 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 0.33201947808265686, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 605 + }, + { + "epoch": 0.07040371768806274, + "grad_norm": 0.3492118716239929, + "learning_rate": 0.0001, + "loss": 1.8554, + "step": 606 + }, + { + "epoch": 0.07051989544002324, + "grad_norm": 0.3389701843261719, + "learning_rate": 0.0001, + "loss": 1.7442, + "step": 607 + }, + { + "epoch": 0.07063607319198374, + "grad_norm": 0.3567183315753937, + "learning_rate": 0.0001, + "loss": 1.8379, + "step": 608 + }, + { + "epoch": 0.07075225094394423, + "grad_norm": 0.342669814825058, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 609 + }, + { + "epoch": 0.07086842869590473, + "grad_norm": 0.37096837162971497, + "learning_rate": 0.0001, + "loss": 1.8548, + "step": 610 + }, + { + "epoch": 0.07098460644786524, + "grad_norm": 0.3211749494075775, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 611 + }, + { + "epoch": 0.07110078419982574, + "grad_norm": 0.3553162217140198, + "learning_rate": 0.0001, + "loss": 1.8448, + "step": 612 + }, + { + "epoch": 0.07121696195178623, + "grad_norm": 0.3596995174884796, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 613 + }, + { + "epoch": 0.07133313970374673, + "grad_norm": 0.32127490639686584, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 614 + }, + { + "epoch": 0.07144931745570723, + "grad_norm": 0.361713707447052, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 615 + }, + { + "epoch": 0.07156549520766774, + "grad_norm": 0.3255080282688141, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 616 + }, + { + "epoch": 0.07168167295962823, + "grad_norm": 0.3420798182487488, + "learning_rate": 0.0001, + "loss": 1.7544, + "step": 617 + }, + { + "epoch": 0.07179785071158873, + "grad_norm": 0.39319682121276855, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 618 + }, + { + "epoch": 0.07191402846354923, + "grad_norm": 0.33742591738700867, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 619 + }, + { + "epoch": 0.07203020621550973, + "grad_norm": 0.3327065408229828, + "learning_rate": 0.0001, + "loss": 1.7358, + "step": 620 + }, + { + "epoch": 0.07214638396747022, + "grad_norm": 0.3351439833641052, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 621 + }, + { + "epoch": 0.07226256171943073, + "grad_norm": 0.3432386815547943, + "learning_rate": 0.0001, + "loss": 1.7511, + "step": 622 + }, + { + "epoch": 0.07237873947139123, + "grad_norm": 0.35164207220077515, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 623 + }, + { + "epoch": 0.07249491722335173, + "grad_norm": 0.34505143761634827, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 624 + }, + { + "epoch": 0.07261109497531222, + "grad_norm": 0.34696313738822937, + "learning_rate": 0.0001, + "loss": 1.7991, + "step": 625 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.33465588092803955, + "learning_rate": 0.0001, + "loss": 1.8238, + "step": 626 + }, + { + "epoch": 0.07284345047923323, + "grad_norm": 0.3633832335472107, + "learning_rate": 0.0001, + "loss": 1.808, + "step": 627 + }, + { + "epoch": 0.07295962823119373, + "grad_norm": 0.3349045217037201, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 628 + }, + { + "epoch": 0.07307580598315423, + "grad_norm": 0.3547269105911255, + "learning_rate": 0.0001, + "loss": 1.7475, + "step": 629 + }, + { + "epoch": 0.07319198373511472, + "grad_norm": 0.41657668352127075, + "learning_rate": 0.0001, + "loss": 2.065, + "step": 630 + }, + { + "epoch": 0.07330816148707522, + "grad_norm": 0.33551225066185, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 631 + }, + { + "epoch": 0.07342433923903573, + "grad_norm": 0.3287172317504883, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 632 + }, + { + "epoch": 0.07354051699099623, + "grad_norm": 0.3462834060192108, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 633 + }, + { + "epoch": 0.07365669474295672, + "grad_norm": 0.32798144221305847, + "learning_rate": 0.0001, + "loss": 1.7706, + "step": 634 + }, + { + "epoch": 0.07377287249491722, + "grad_norm": 0.3188357651233673, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 635 + }, + { + "epoch": 0.07388905024687772, + "grad_norm": 0.34357234835624695, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 636 + }, + { + "epoch": 0.07400522799883823, + "grad_norm": 0.3531816005706787, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 637 + }, + { + "epoch": 0.07412140575079872, + "grad_norm": 0.3235273063182831, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 638 + }, + { + "epoch": 0.07423758350275922, + "grad_norm": 0.3293229639530182, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 639 + }, + { + "epoch": 0.07435376125471972, + "grad_norm": 0.3663218021392822, + "learning_rate": 0.0001, + "loss": 1.8909, + "step": 640 + }, + { + "epoch": 0.07446993900668022, + "grad_norm": 0.33595624566078186, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 641 + }, + { + "epoch": 0.07458611675864071, + "grad_norm": 0.3276788890361786, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 642 + }, + { + "epoch": 0.07470229451060122, + "grad_norm": 0.3766964077949524, + "learning_rate": 0.0001, + "loss": 1.9409, + "step": 643 + }, + { + "epoch": 0.07481847226256172, + "grad_norm": 0.3381744623184204, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 644 + }, + { + "epoch": 0.07493465001452222, + "grad_norm": 0.34408965706825256, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 645 + }, + { + "epoch": 0.07505082776648273, + "grad_norm": 0.33645275235176086, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 646 + }, + { + "epoch": 0.07516700551844321, + "grad_norm": 0.3625565469264984, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 647 + }, + { + "epoch": 0.07528318327040372, + "grad_norm": 0.3350875973701477, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 648 + }, + { + "epoch": 0.07539936102236422, + "grad_norm": 0.34720659255981445, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 649 + }, + { + "epoch": 0.07551553877432472, + "grad_norm": 0.3373357951641083, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 650 + }, + { + "epoch": 0.07563171652628521, + "grad_norm": 0.3362538516521454, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 651 + }, + { + "epoch": 0.07574789427824571, + "grad_norm": 0.34010645747184753, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 652 + }, + { + "epoch": 0.07586407203020622, + "grad_norm": 0.31828391551971436, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 653 + }, + { + "epoch": 0.07598024978216672, + "grad_norm": 0.32539987564086914, + "learning_rate": 0.0001, + "loss": 1.7795, + "step": 654 + }, + { + "epoch": 0.07609642753412721, + "grad_norm": 0.32553842663764954, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 655 + }, + { + "epoch": 0.07621260528608771, + "grad_norm": 0.35710620880126953, + "learning_rate": 0.0001, + "loss": 1.8839, + "step": 656 + }, + { + "epoch": 0.07632878303804821, + "grad_norm": 0.33389732241630554, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 657 + }, + { + "epoch": 0.07644496079000872, + "grad_norm": 0.3359222412109375, + "learning_rate": 0.0001, + "loss": 1.9214, + "step": 658 + }, + { + "epoch": 0.0765611385419692, + "grad_norm": 0.3710361123085022, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 659 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.34122204780578613, + "learning_rate": 0.0001, + "loss": 1.8186, + "step": 660 + }, + { + "epoch": 0.07679349404589021, + "grad_norm": 0.3368107080459595, + "learning_rate": 0.0001, + "loss": 1.8068, + "step": 661 + }, + { + "epoch": 0.07690967179785071, + "grad_norm": 0.3254035711288452, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 662 + }, + { + "epoch": 0.07702584954981122, + "grad_norm": 0.3608943223953247, + "learning_rate": 0.0001, + "loss": 1.9746, + "step": 663 + }, + { + "epoch": 0.0771420273017717, + "grad_norm": 0.3355536162853241, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 664 + }, + { + "epoch": 0.07725820505373221, + "grad_norm": 0.3630955219268799, + "learning_rate": 0.0001, + "loss": 1.7642, + "step": 665 + }, + { + "epoch": 0.07737438280569271, + "grad_norm": 0.3425436019897461, + "learning_rate": 0.0001, + "loss": 1.8083, + "step": 666 + }, + { + "epoch": 0.07749056055765322, + "grad_norm": 0.3148491382598877, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 667 + }, + { + "epoch": 0.0776067383096137, + "grad_norm": 0.3225914239883423, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 668 + }, + { + "epoch": 0.07772291606157421, + "grad_norm": 0.37254008650779724, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 669 + }, + { + "epoch": 0.07783909381353471, + "grad_norm": 0.3412669897079468, + "learning_rate": 0.0001, + "loss": 1.8073, + "step": 670 + }, + { + "epoch": 0.07795527156549521, + "grad_norm": 0.333635151386261, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 671 + }, + { + "epoch": 0.0780714493174557, + "grad_norm": 0.36147254705429077, + "learning_rate": 0.0001, + "loss": 1.8375, + "step": 672 + }, + { + "epoch": 0.0781876270694162, + "grad_norm": 0.3428162932395935, + "learning_rate": 0.0001, + "loss": 1.7767, + "step": 673 + }, + { + "epoch": 0.07830380482137671, + "grad_norm": 0.3516107499599457, + "learning_rate": 0.0001, + "loss": 1.7804, + "step": 674 + }, + { + "epoch": 0.07841998257333721, + "grad_norm": 0.37563571333885193, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 675 + }, + { + "epoch": 0.0785361603252977, + "grad_norm": 0.32837507128715515, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 676 + }, + { + "epoch": 0.0786523380772582, + "grad_norm": 0.34303468465805054, + "learning_rate": 0.0001, + "loss": 1.8518, + "step": 677 + }, + { + "epoch": 0.0787685158292187, + "grad_norm": 0.3297956883907318, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 678 + }, + { + "epoch": 0.07888469358117921, + "grad_norm": 0.34342435002326965, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 679 + }, + { + "epoch": 0.07900087133313971, + "grad_norm": 0.3558964729309082, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 680 + }, + { + "epoch": 0.0791170490851002, + "grad_norm": 0.3384302258491516, + "learning_rate": 0.0001, + "loss": 1.8352, + "step": 681 + }, + { + "epoch": 0.0792332268370607, + "grad_norm": 0.33917075395584106, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 682 + }, + { + "epoch": 0.0793494045890212, + "grad_norm": 0.37113896012306213, + "learning_rate": 0.0001, + "loss": 1.7594, + "step": 683 + }, + { + "epoch": 0.07946558234098171, + "grad_norm": 0.34009361267089844, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 684 + }, + { + "epoch": 0.0795817600929422, + "grad_norm": 0.35961294174194336, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 685 + }, + { + "epoch": 0.0796979378449027, + "grad_norm": 0.3599853813648224, + "learning_rate": 0.0001, + "loss": 1.7754, + "step": 686 + }, + { + "epoch": 0.0798141155968632, + "grad_norm": 0.33733391761779785, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 687 + }, + { + "epoch": 0.0799302933488237, + "grad_norm": 0.33123722672462463, + "learning_rate": 0.0001, + "loss": 1.7556, + "step": 688 + }, + { + "epoch": 0.0800464711007842, + "grad_norm": 0.3545984923839569, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 689 + }, + { + "epoch": 0.0801626488527447, + "grad_norm": 0.3520684242248535, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 690 + }, + { + "epoch": 0.0802788266047052, + "grad_norm": 0.364214152097702, + "learning_rate": 0.0001, + "loss": 1.8752, + "step": 691 + }, + { + "epoch": 0.0803950043566657, + "grad_norm": 0.35055190324783325, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 692 + }, + { + "epoch": 0.08051118210862619, + "grad_norm": 0.3547899127006531, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 693 + }, + { + "epoch": 0.0806273598605867, + "grad_norm": 0.31684044003486633, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 694 + }, + { + "epoch": 0.0807435376125472, + "grad_norm": 0.3523019254207611, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 695 + }, + { + "epoch": 0.0808597153645077, + "grad_norm": 0.379930317401886, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 696 + }, + { + "epoch": 0.08097589311646819, + "grad_norm": 0.32600757479667664, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 697 + }, + { + "epoch": 0.08109207086842869, + "grad_norm": 0.33486902713775635, + "learning_rate": 0.0001, + "loss": 1.7029, + "step": 698 + }, + { + "epoch": 0.0812082486203892, + "grad_norm": 0.3586275577545166, + "learning_rate": 0.0001, + "loss": 1.7762, + "step": 699 + }, + { + "epoch": 0.0813244263723497, + "grad_norm": 0.34406691789627075, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 700 + }, + { + "epoch": 0.0814406041243102, + "grad_norm": 0.35073554515838623, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 701 + }, + { + "epoch": 0.08155678187627069, + "grad_norm": 0.4370097517967224, + "learning_rate": 0.0001, + "loss": 1.9901, + "step": 702 + }, + { + "epoch": 0.08167295962823119, + "grad_norm": 0.36396339535713196, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 703 + }, + { + "epoch": 0.0817891373801917, + "grad_norm": 0.31740906834602356, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 704 + }, + { + "epoch": 0.0819053151321522, + "grad_norm": 0.36947962641716003, + "learning_rate": 0.0001, + "loss": 1.8666, + "step": 705 + }, + { + "epoch": 0.08202149288411269, + "grad_norm": 0.3637601435184479, + "learning_rate": 0.0001, + "loss": 1.8011, + "step": 706 + }, + { + "epoch": 0.08213767063607319, + "grad_norm": 0.35673728585243225, + "learning_rate": 0.0001, + "loss": 1.8199, + "step": 707 + }, + { + "epoch": 0.08225384838803369, + "grad_norm": 0.3384333550930023, + "learning_rate": 0.0001, + "loss": 1.7729, + "step": 708 + }, + { + "epoch": 0.0823700261399942, + "grad_norm": 0.3344199061393738, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 709 + }, + { + "epoch": 0.08248620389195468, + "grad_norm": 0.3451163172721863, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 710 + }, + { + "epoch": 0.08260238164391519, + "grad_norm": 0.3421926200389862, + "learning_rate": 0.0001, + "loss": 1.8762, + "step": 711 + }, + { + "epoch": 0.08271855939587569, + "grad_norm": 0.32630541920661926, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 712 + }, + { + "epoch": 0.0828347371478362, + "grad_norm": 0.35678330063819885, + "learning_rate": 0.0001, + "loss": 1.915, + "step": 713 + }, + { + "epoch": 0.08295091489979668, + "grad_norm": 0.3382868766784668, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 714 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 0.3485075831413269, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 715 + }, + { + "epoch": 0.08318327040371769, + "grad_norm": 0.31177419424057007, + "learning_rate": 0.0001, + "loss": 1.6356, + "step": 716 + }, + { + "epoch": 0.08329944815567819, + "grad_norm": 0.34105831384658813, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 717 + }, + { + "epoch": 0.0834156259076387, + "grad_norm": 0.3517579138278961, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 718 + }, + { + "epoch": 0.08353180365959918, + "grad_norm": 0.3053590953350067, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 719 + }, + { + "epoch": 0.08364798141155969, + "grad_norm": 0.35734882950782776, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 720 + }, + { + "epoch": 0.08376415916352019, + "grad_norm": 0.33821558952331543, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 721 + }, + { + "epoch": 0.08388033691548069, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0001, + "loss": 1.7397, + "step": 722 + }, + { + "epoch": 0.08399651466744118, + "grad_norm": 0.36601775884628296, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 723 + }, + { + "epoch": 0.08411269241940168, + "grad_norm": 0.34071359038352966, + "learning_rate": 0.0001, + "loss": 1.7245, + "step": 724 + }, + { + "epoch": 0.08422887017136219, + "grad_norm": 0.34587469696998596, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 725 + }, + { + "epoch": 0.08434504792332269, + "grad_norm": 0.3538304269313812, + "learning_rate": 0.0001, + "loss": 1.973, + "step": 726 + }, + { + "epoch": 0.08446122567528318, + "grad_norm": 0.3469333350658417, + "learning_rate": 0.0001, + "loss": 1.8881, + "step": 727 + }, + { + "epoch": 0.08457740342724368, + "grad_norm": 0.31623193621635437, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 728 + }, + { + "epoch": 0.08469358117920418, + "grad_norm": 0.3332175016403198, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 729 + }, + { + "epoch": 0.08480975893116469, + "grad_norm": 0.35500526428222656, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 730 + }, + { + "epoch": 0.08492593668312517, + "grad_norm": 0.35975468158721924, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 731 + }, + { + "epoch": 0.08504211443508568, + "grad_norm": 0.33805137872695923, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 732 + }, + { + "epoch": 0.08515829218704618, + "grad_norm": 0.3446786105632782, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 733 + }, + { + "epoch": 0.08527446993900668, + "grad_norm": 0.3499710261821747, + "learning_rate": 0.0001, + "loss": 1.8734, + "step": 734 + }, + { + "epoch": 0.08539064769096719, + "grad_norm": 0.33205246925354004, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 735 + }, + { + "epoch": 0.08550682544292768, + "grad_norm": 0.3275506794452667, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 736 + }, + { + "epoch": 0.08562300319488818, + "grad_norm": 0.291427344083786, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 737 + }, + { + "epoch": 0.08573918094684868, + "grad_norm": 0.3342317044734955, + "learning_rate": 0.0001, + "loss": 1.8063, + "step": 738 + }, + { + "epoch": 0.08585535869880918, + "grad_norm": 0.3613170087337494, + "learning_rate": 0.0001, + "loss": 1.8471, + "step": 739 + }, + { + "epoch": 0.08597153645076967, + "grad_norm": 0.35327574610710144, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 740 + }, + { + "epoch": 0.08608771420273018, + "grad_norm": 0.32750821113586426, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 741 + }, + { + "epoch": 0.08620389195469068, + "grad_norm": 0.37629932165145874, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 742 + }, + { + "epoch": 0.08632006970665118, + "grad_norm": 0.3730964958667755, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 743 + }, + { + "epoch": 0.08643624745861167, + "grad_norm": 0.36657699942588806, + "learning_rate": 0.0001, + "loss": 1.8363, + "step": 744 + }, + { + "epoch": 0.08655242521057217, + "grad_norm": 0.3720366358757019, + "learning_rate": 0.0001, + "loss": 1.9311, + "step": 745 + }, + { + "epoch": 0.08666860296253268, + "grad_norm": 0.34417784214019775, + "learning_rate": 0.0001, + "loss": 1.902, + "step": 746 + }, + { + "epoch": 0.08678478071449318, + "grad_norm": 0.34215980768203735, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 747 + }, + { + "epoch": 0.08690095846645367, + "grad_norm": 0.37166187167167664, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 748 + }, + { + "epoch": 0.08701713621841417, + "grad_norm": 0.3440292179584503, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 749 + }, + { + "epoch": 0.08713331397037467, + "grad_norm": 0.38115811347961426, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 750 + }, + { + "epoch": 0.08724949172233518, + "grad_norm": 0.38101625442504883, + "learning_rate": 0.0001, + "loss": 1.8616, + "step": 751 + }, + { + "epoch": 0.08736566947429568, + "grad_norm": 0.3463743031024933, + "learning_rate": 0.0001, + "loss": 1.7568, + "step": 752 + }, + { + "epoch": 0.08748184722625617, + "grad_norm": 0.36165106296539307, + "learning_rate": 0.0001, + "loss": 1.8383, + "step": 753 + }, + { + "epoch": 0.08759802497821667, + "grad_norm": 0.31763550639152527, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 754 + }, + { + "epoch": 0.08771420273017717, + "grad_norm": 0.35172948241233826, + "learning_rate": 0.0001, + "loss": 1.8296, + "step": 755 + }, + { + "epoch": 0.08783038048213768, + "grad_norm": 0.3546104431152344, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 756 + }, + { + "epoch": 0.08794655823409817, + "grad_norm": 0.3244597911834717, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 757 + }, + { + "epoch": 0.08806273598605867, + "grad_norm": 0.33493107557296753, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 758 + }, + { + "epoch": 0.08817891373801917, + "grad_norm": 0.3546777665615082, + "learning_rate": 0.0001, + "loss": 1.7593, + "step": 759 + }, + { + "epoch": 0.08829509148997967, + "grad_norm": 0.3528088629245758, + "learning_rate": 0.0001, + "loss": 1.8915, + "step": 760 + }, + { + "epoch": 0.08841126924194016, + "grad_norm": 0.32234612107276917, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 761 + }, + { + "epoch": 0.08852744699390067, + "grad_norm": 0.35947954654693604, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 762 + }, + { + "epoch": 0.08864362474586117, + "grad_norm": 0.34787270426750183, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 763 + }, + { + "epoch": 0.08875980249782167, + "grad_norm": 0.3539670407772064, + "learning_rate": 0.0001, + "loss": 1.8225, + "step": 764 + }, + { + "epoch": 0.08887598024978216, + "grad_norm": 0.3532291352748871, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 765 + }, + { + "epoch": 0.08899215800174266, + "grad_norm": 0.35020220279693604, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 766 + }, + { + "epoch": 0.08910833575370317, + "grad_norm": 0.3528810441493988, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 767 + }, + { + "epoch": 0.08922451350566367, + "grad_norm": 0.38562941551208496, + "learning_rate": 0.0001, + "loss": 1.7691, + "step": 768 + }, + { + "epoch": 0.08934069125762417, + "grad_norm": 0.3530423641204834, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 769 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.35964086651802063, + "learning_rate": 0.0001, + "loss": 1.8651, + "step": 770 + }, + { + "epoch": 0.08957304676154516, + "grad_norm": 0.3369607627391815, + "learning_rate": 0.0001, + "loss": 1.7238, + "step": 771 + }, + { + "epoch": 0.08968922451350567, + "grad_norm": 0.3420158326625824, + "learning_rate": 0.0001, + "loss": 1.7426, + "step": 772 + }, + { + "epoch": 0.08980540226546617, + "grad_norm": 0.3466860353946686, + "learning_rate": 0.0001, + "loss": 1.7715, + "step": 773 + }, + { + "epoch": 0.08992158001742666, + "grad_norm": 0.33295726776123047, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 774 + }, + { + "epoch": 0.09003775776938716, + "grad_norm": 0.2985909581184387, + "learning_rate": 0.0001, + "loss": 1.5965, + "step": 775 + }, + { + "epoch": 0.09015393552134766, + "grad_norm": 0.334733247756958, + "learning_rate": 0.0001, + "loss": 1.8092, + "step": 776 + }, + { + "epoch": 0.09027011327330817, + "grad_norm": 0.3320635259151459, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 777 + }, + { + "epoch": 0.09038629102526866, + "grad_norm": 0.33000028133392334, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 778 + }, + { + "epoch": 0.09050246877722916, + "grad_norm": 0.3425371050834656, + "learning_rate": 0.0001, + "loss": 1.8284, + "step": 779 + }, + { + "epoch": 0.09061864652918966, + "grad_norm": 0.3471533954143524, + "learning_rate": 0.0001, + "loss": 1.824, + "step": 780 + }, + { + "epoch": 0.09073482428115016, + "grad_norm": 0.34877267479896545, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 781 + }, + { + "epoch": 0.09085100203311065, + "grad_norm": 0.33842357993125916, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 782 + }, + { + "epoch": 0.09096717978507116, + "grad_norm": 0.3466620445251465, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 783 + }, + { + "epoch": 0.09108335753703166, + "grad_norm": 0.3401154577732086, + "learning_rate": 0.0001, + "loss": 1.8291, + "step": 784 + }, + { + "epoch": 0.09119953528899216, + "grad_norm": 0.3380959630012512, + "learning_rate": 0.0001, + "loss": 1.7823, + "step": 785 + }, + { + "epoch": 0.09131571304095265, + "grad_norm": 0.3219566345214844, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 786 + }, + { + "epoch": 0.09143189079291315, + "grad_norm": 0.36787810921669006, + "learning_rate": 0.0001, + "loss": 1.8798, + "step": 787 + }, + { + "epoch": 0.09154806854487366, + "grad_norm": 0.3317922055721283, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 788 + }, + { + "epoch": 0.09166424629683416, + "grad_norm": 0.32760554552078247, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 789 + }, + { + "epoch": 0.09178042404879466, + "grad_norm": 0.3528217375278473, + "learning_rate": 0.0001, + "loss": 1.8356, + "step": 790 + }, + { + "epoch": 0.09189660180075515, + "grad_norm": 0.3439396321773529, + "learning_rate": 0.0001, + "loss": 1.8907, + "step": 791 + }, + { + "epoch": 0.09201277955271565, + "grad_norm": 0.32898107171058655, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 792 + }, + { + "epoch": 0.09212895730467616, + "grad_norm": 0.35222840309143066, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 793 + }, + { + "epoch": 0.09224513505663666, + "grad_norm": 0.35788872838020325, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 794 + }, + { + "epoch": 0.09236131280859715, + "grad_norm": 0.34906426072120667, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 795 + }, + { + "epoch": 0.09247749056055765, + "grad_norm": 0.3351164162158966, + "learning_rate": 0.0001, + "loss": 1.7754, + "step": 796 + }, + { + "epoch": 0.09259366831251815, + "grad_norm": 0.34004396200180054, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 797 + }, + { + "epoch": 0.09270984606447866, + "grad_norm": 0.3525846302509308, + "learning_rate": 0.0001, + "loss": 1.852, + "step": 798 + }, + { + "epoch": 0.09282602381643915, + "grad_norm": 0.3507898449897766, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 799 + }, + { + "epoch": 0.09294220156839965, + "grad_norm": 0.33934998512268066, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 800 + }, + { + "epoch": 0.09305837932036015, + "grad_norm": 0.3526393473148346, + "learning_rate": 0.0001, + "loss": 1.7328, + "step": 801 + }, + { + "epoch": 0.09317455707232065, + "grad_norm": 0.3338293135166168, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 802 + }, + { + "epoch": 0.09329073482428114, + "grad_norm": 0.3807794451713562, + "learning_rate": 0.0001, + "loss": 1.9431, + "step": 803 + }, + { + "epoch": 0.09340691257624165, + "grad_norm": 0.33969688415527344, + "learning_rate": 0.0001, + "loss": 1.8415, + "step": 804 + }, + { + "epoch": 0.09352309032820215, + "grad_norm": 0.3313851058483124, + "learning_rate": 0.0001, + "loss": 1.7324, + "step": 805 + }, + { + "epoch": 0.09363926808016265, + "grad_norm": 0.3267270624637604, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 806 + }, + { + "epoch": 0.09375544583212315, + "grad_norm": 0.3477146327495575, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 807 + }, + { + "epoch": 0.09387162358408364, + "grad_norm": 0.36601418256759644, + "learning_rate": 0.0001, + "loss": 1.8103, + "step": 808 + }, + { + "epoch": 0.09398780133604415, + "grad_norm": 0.33894267678260803, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 809 + }, + { + "epoch": 0.09410397908800465, + "grad_norm": 0.35825762152671814, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 810 + }, + { + "epoch": 0.09422015683996515, + "grad_norm": 0.3400390148162842, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 811 + }, + { + "epoch": 0.09433633459192564, + "grad_norm": 0.32821226119995117, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 812 + }, + { + "epoch": 0.09445251234388614, + "grad_norm": 0.3435446619987488, + "learning_rate": 0.0001, + "loss": 1.7616, + "step": 813 + }, + { + "epoch": 0.09456869009584665, + "grad_norm": 0.37115979194641113, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 814 + }, + { + "epoch": 0.09468486784780715, + "grad_norm": 0.3451048731803894, + "learning_rate": 0.0001, + "loss": 1.6982, + "step": 815 + }, + { + "epoch": 0.09480104559976764, + "grad_norm": 0.3507342040538788, + "learning_rate": 0.0001, + "loss": 1.8472, + "step": 816 + }, + { + "epoch": 0.09491722335172814, + "grad_norm": 0.32728418707847595, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 817 + }, + { + "epoch": 0.09503340110368864, + "grad_norm": 0.3312884271144867, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 818 + }, + { + "epoch": 0.09514957885564915, + "grad_norm": 0.3303951025009155, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 819 + }, + { + "epoch": 0.09526575660760964, + "grad_norm": 0.3290101885795593, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 820 + }, + { + "epoch": 0.09538193435957014, + "grad_norm": 0.36628472805023193, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 821 + }, + { + "epoch": 0.09549811211153064, + "grad_norm": 0.3897988796234131, + "learning_rate": 0.0001, + "loss": 1.808, + "step": 822 + }, + { + "epoch": 0.09561428986349114, + "grad_norm": 0.3413255512714386, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 823 + }, + { + "epoch": 0.09573046761545165, + "grad_norm": 0.4127778112888336, + "learning_rate": 0.0001, + "loss": 1.8115, + "step": 824 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 0.3477415442466736, + "learning_rate": 0.0001, + "loss": 1.6128, + "step": 825 + }, + { + "epoch": 0.09596282311937264, + "grad_norm": 0.32524755597114563, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 826 + }, + { + "epoch": 0.09607900087133314, + "grad_norm": 0.36483168601989746, + "learning_rate": 0.0001, + "loss": 1.8151, + "step": 827 + }, + { + "epoch": 0.09619517862329365, + "grad_norm": 0.34030166268348694, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 828 + }, + { + "epoch": 0.09631135637525413, + "grad_norm": 0.3241477906703949, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 829 + }, + { + "epoch": 0.09642753412721464, + "grad_norm": 0.3371472954750061, + "learning_rate": 0.0001, + "loss": 1.7769, + "step": 830 + }, + { + "epoch": 0.09654371187917514, + "grad_norm": 0.3401608169078827, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 831 + }, + { + "epoch": 0.09665988963113564, + "grad_norm": 0.32106491923332214, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 832 + }, + { + "epoch": 0.09677606738309613, + "grad_norm": 0.34740883111953735, + "learning_rate": 0.0001, + "loss": 1.8344, + "step": 833 + }, + { + "epoch": 0.09689224513505663, + "grad_norm": 0.3772992789745331, + "learning_rate": 0.0001, + "loss": 1.7999, + "step": 834 + }, + { + "epoch": 0.09700842288701714, + "grad_norm": 0.3412041664123535, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 835 + }, + { + "epoch": 0.09712460063897764, + "grad_norm": 0.34832799434661865, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 836 + }, + { + "epoch": 0.09724077839093813, + "grad_norm": 0.35847747325897217, + "learning_rate": 0.0001, + "loss": 1.8176, + "step": 837 + }, + { + "epoch": 0.09735695614289863, + "grad_norm": 0.36482107639312744, + "learning_rate": 0.0001, + "loss": 1.8864, + "step": 838 + }, + { + "epoch": 0.09747313389485913, + "grad_norm": 0.40612149238586426, + "learning_rate": 0.0001, + "loss": 1.8004, + "step": 839 + }, + { + "epoch": 0.09758931164681964, + "grad_norm": 0.3522476255893707, + "learning_rate": 0.0001, + "loss": 1.8136, + "step": 840 + }, + { + "epoch": 0.09770548939878014, + "grad_norm": 0.37533196806907654, + "learning_rate": 0.0001, + "loss": 1.8091, + "step": 841 + }, + { + "epoch": 0.09782166715074063, + "grad_norm": 0.3364860415458679, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 842 + }, + { + "epoch": 0.09793784490270113, + "grad_norm": 0.34495505690574646, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 843 + }, + { + "epoch": 0.09805402265466164, + "grad_norm": 0.3652923107147217, + "learning_rate": 0.0001, + "loss": 1.8655, + "step": 844 + }, + { + "epoch": 0.09817020040662214, + "grad_norm": 0.35473141074180603, + "learning_rate": 0.0001, + "loss": 1.907, + "step": 845 + }, + { + "epoch": 0.09828637815858263, + "grad_norm": 0.36820659041404724, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 846 + }, + { + "epoch": 0.09840255591054313, + "grad_norm": 0.34225067496299744, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 847 + }, + { + "epoch": 0.09851873366250363, + "grad_norm": 0.3609171509742737, + "learning_rate": 0.0001, + "loss": 1.7822, + "step": 848 + }, + { + "epoch": 0.09863491141446414, + "grad_norm": 0.3501855731010437, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 849 + }, + { + "epoch": 0.09875108916642462, + "grad_norm": 0.32687613368034363, + "learning_rate": 0.0001, + "loss": 1.7718, + "step": 850 + }, + { + "epoch": 0.09886726691838513, + "grad_norm": 0.34075310826301575, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 851 + }, + { + "epoch": 0.09898344467034563, + "grad_norm": 0.3613976836204529, + "learning_rate": 0.0001, + "loss": 1.9505, + "step": 852 + }, + { + "epoch": 0.09909962242230613, + "grad_norm": 0.33815905451774597, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 853 + }, + { + "epoch": 0.09921580017426662, + "grad_norm": 0.35017895698547363, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 854 + }, + { + "epoch": 0.09933197792622712, + "grad_norm": 0.3758436441421509, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 855 + }, + { + "epoch": 0.09944815567818763, + "grad_norm": 0.32959380745887756, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 856 + }, + { + "epoch": 0.09956433343014813, + "grad_norm": 0.3575064539909363, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 857 + }, + { + "epoch": 0.09968051118210862, + "grad_norm": 0.3391128182411194, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 858 + }, + { + "epoch": 0.09979668893406912, + "grad_norm": 0.33528846502304077, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 859 + }, + { + "epoch": 0.09991286668602962, + "grad_norm": 0.32930776476860046, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 860 + }, + { + "epoch": 0.10002904443799013, + "grad_norm": 0.34488263726234436, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 861 + }, + { + "epoch": 0.10014522218995063, + "grad_norm": 0.36875104904174805, + "learning_rate": 0.0001, + "loss": 1.8439, + "step": 862 + }, + { + "epoch": 0.10026139994191112, + "grad_norm": 0.3462197184562683, + "learning_rate": 0.0001, + "loss": 1.7203, + "step": 863 + }, + { + "epoch": 0.10037757769387162, + "grad_norm": 0.38343024253845215, + "learning_rate": 0.0001, + "loss": 2.0117, + "step": 864 + }, + { + "epoch": 0.10049375544583213, + "grad_norm": 0.3336431682109833, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 865 + }, + { + "epoch": 0.10060993319779263, + "grad_norm": 0.3539637327194214, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 866 + }, + { + "epoch": 0.10072611094975312, + "grad_norm": 0.38433191180229187, + "learning_rate": 0.0001, + "loss": 1.8894, + "step": 867 + }, + { + "epoch": 0.10084228870171362, + "grad_norm": 0.34169065952301025, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 868 + }, + { + "epoch": 0.10095846645367412, + "grad_norm": 0.3778311610221863, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 869 + }, + { + "epoch": 0.10107464420563463, + "grad_norm": 0.3747129440307617, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 870 + }, + { + "epoch": 0.10119082195759511, + "grad_norm": 0.34259381890296936, + "learning_rate": 0.0001, + "loss": 1.7926, + "step": 871 + }, + { + "epoch": 0.10130699970955562, + "grad_norm": 0.35702332854270935, + "learning_rate": 0.0001, + "loss": 1.7456, + "step": 872 + }, + { + "epoch": 0.10142317746151612, + "grad_norm": 0.35375505685806274, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 873 + }, + { + "epoch": 0.10153935521347662, + "grad_norm": 0.35189199447631836, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 874 + }, + { + "epoch": 0.10165553296543711, + "grad_norm": 0.3281596601009369, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 875 + }, + { + "epoch": 0.10177171071739761, + "grad_norm": 0.34264156222343445, + "learning_rate": 0.0001, + "loss": 1.8293, + "step": 876 + }, + { + "epoch": 0.10188788846935812, + "grad_norm": 0.3711477518081665, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 877 + }, + { + "epoch": 0.10200406622131862, + "grad_norm": 0.31812041997909546, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 878 + }, + { + "epoch": 0.10212024397327912, + "grad_norm": 0.3646787405014038, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 879 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.3644947111606598, + "learning_rate": 0.0001, + "loss": 1.8658, + "step": 880 + }, + { + "epoch": 0.10235259947720012, + "grad_norm": 0.345739483833313, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 881 + }, + { + "epoch": 0.10246877722916062, + "grad_norm": 0.3633933663368225, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 882 + }, + { + "epoch": 0.10258495498112112, + "grad_norm": 0.35784977674484253, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 883 + }, + { + "epoch": 0.10270113273308161, + "grad_norm": 0.34802740812301636, + "learning_rate": 0.0001, + "loss": 1.7537, + "step": 884 + }, + { + "epoch": 0.10281731048504211, + "grad_norm": 0.334532231092453, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 885 + }, + { + "epoch": 0.10293348823700262, + "grad_norm": 0.37249666452407837, + "learning_rate": 0.0001, + "loss": 1.838, + "step": 886 + }, + { + "epoch": 0.10304966598896312, + "grad_norm": 0.37367671728134155, + "learning_rate": 0.0001, + "loss": 1.8214, + "step": 887 + }, + { + "epoch": 0.10316584374092361, + "grad_norm": 0.32655900716781616, + "learning_rate": 0.0001, + "loss": 1.7381, + "step": 888 + }, + { + "epoch": 0.10328202149288411, + "grad_norm": 0.33447375893592834, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 889 + }, + { + "epoch": 0.10339819924484461, + "grad_norm": 0.3553028404712677, + "learning_rate": 0.0001, + "loss": 1.9126, + "step": 890 + }, + { + "epoch": 0.10351437699680512, + "grad_norm": 0.3429213762283325, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 891 + }, + { + "epoch": 0.1036305547487656, + "grad_norm": 0.3589216470718384, + "learning_rate": 0.0001, + "loss": 1.87, + "step": 892 + }, + { + "epoch": 0.10374673250072611, + "grad_norm": 0.35310253500938416, + "learning_rate": 0.0001, + "loss": 1.7481, + "step": 893 + }, + { + "epoch": 0.10386291025268661, + "grad_norm": 0.36510980129241943, + "learning_rate": 0.0001, + "loss": 1.8996, + "step": 894 + }, + { + "epoch": 0.10397908800464711, + "grad_norm": 0.3604600429534912, + "learning_rate": 0.0001, + "loss": 1.8, + "step": 895 + }, + { + "epoch": 0.10409526575660762, + "grad_norm": 0.3504015803337097, + "learning_rate": 0.0001, + "loss": 1.706, + "step": 896 + }, + { + "epoch": 0.1042114435085681, + "grad_norm": 0.3666890561580658, + "learning_rate": 0.0001, + "loss": 1.9185, + "step": 897 + }, + { + "epoch": 0.10432762126052861, + "grad_norm": 0.32218024134635925, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 898 + }, + { + "epoch": 0.10444379901248911, + "grad_norm": 0.32854628562927246, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 899 + }, + { + "epoch": 0.10455997676444961, + "grad_norm": 0.34168779850006104, + "learning_rate": 0.0001, + "loss": 1.7293, + "step": 900 + }, + { + "epoch": 0.1046761545164101, + "grad_norm": 0.3389854431152344, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 901 + }, + { + "epoch": 0.1047923322683706, + "grad_norm": 0.3675687611103058, + "learning_rate": 0.0001, + "loss": 1.9126, + "step": 902 + }, + { + "epoch": 0.10490851002033111, + "grad_norm": 0.33231931924819946, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 903 + }, + { + "epoch": 0.10502468777229161, + "grad_norm": 0.3507463037967682, + "learning_rate": 0.0001, + "loss": 1.7924, + "step": 904 + }, + { + "epoch": 0.1051408655242521, + "grad_norm": 0.35986390709877014, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 905 + }, + { + "epoch": 0.1052570432762126, + "grad_norm": 0.36210981011390686, + "learning_rate": 0.0001, + "loss": 1.8232, + "step": 906 + }, + { + "epoch": 0.1053732210281731, + "grad_norm": 0.3577278256416321, + "learning_rate": 0.0001, + "loss": 1.7484, + "step": 907 + }, + { + "epoch": 0.10548939878013361, + "grad_norm": 0.3454272449016571, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 908 + }, + { + "epoch": 0.1056055765320941, + "grad_norm": 0.34774741530418396, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 909 + }, + { + "epoch": 0.1057217542840546, + "grad_norm": 0.3502449095249176, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 910 + }, + { + "epoch": 0.1058379320360151, + "grad_norm": 0.36542028188705444, + "learning_rate": 0.0001, + "loss": 1.8321, + "step": 911 + }, + { + "epoch": 0.1059541097879756, + "grad_norm": 0.35387906432151794, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 912 + }, + { + "epoch": 0.10607028753993611, + "grad_norm": 0.3482963442802429, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 913 + }, + { + "epoch": 0.1061864652918966, + "grad_norm": 0.3488115668296814, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 914 + }, + { + "epoch": 0.1063026430438571, + "grad_norm": 0.3296864926815033, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 915 + }, + { + "epoch": 0.1064188207958176, + "grad_norm": 0.34132471680641174, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 916 + }, + { + "epoch": 0.1065349985477781, + "grad_norm": 0.3466493487358093, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 917 + }, + { + "epoch": 0.1066511762997386, + "grad_norm": 0.3528394401073456, + "learning_rate": 0.0001, + "loss": 1.847, + "step": 918 + }, + { + "epoch": 0.1067673540516991, + "grad_norm": 0.32509297132492065, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 919 + }, + { + "epoch": 0.1068835318036596, + "grad_norm": 0.3523486852645874, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 920 + }, + { + "epoch": 0.1069997095556201, + "grad_norm": 0.3355209529399872, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 921 + }, + { + "epoch": 0.10711588730758059, + "grad_norm": 0.3474280834197998, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 922 + }, + { + "epoch": 0.1072320650595411, + "grad_norm": 0.359402596950531, + "learning_rate": 0.0001, + "loss": 1.8466, + "step": 923 + }, + { + "epoch": 0.1073482428115016, + "grad_norm": 0.3309195339679718, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 924 + }, + { + "epoch": 0.1074644205634621, + "grad_norm": 0.3497809171676636, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 925 + }, + { + "epoch": 0.10758059831542259, + "grad_norm": 0.34211966395378113, + "learning_rate": 0.0001, + "loss": 1.7996, + "step": 926 + }, + { + "epoch": 0.1076967760673831, + "grad_norm": 0.3558671772480011, + "learning_rate": 0.0001, + "loss": 1.8548, + "step": 927 + }, + { + "epoch": 0.1078129538193436, + "grad_norm": 0.3554135859012604, + "learning_rate": 0.0001, + "loss": 1.873, + "step": 928 + }, + { + "epoch": 0.1079291315713041, + "grad_norm": 0.35786765813827515, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 929 + }, + { + "epoch": 0.1080453093232646, + "grad_norm": 0.3519211709499359, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 930 + }, + { + "epoch": 0.10816148707522509, + "grad_norm": 0.37536898255348206, + "learning_rate": 0.0001, + "loss": 1.8926, + "step": 931 + }, + { + "epoch": 0.1082776648271856, + "grad_norm": 0.3422173857688904, + "learning_rate": 0.0001, + "loss": 1.8172, + "step": 932 + }, + { + "epoch": 0.1083938425791461, + "grad_norm": 0.35913097858428955, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 933 + }, + { + "epoch": 0.1085100203311066, + "grad_norm": 0.3589080274105072, + "learning_rate": 0.0001, + "loss": 1.7949, + "step": 934 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 0.37230291962623596, + "learning_rate": 0.0001, + "loss": 1.8148, + "step": 935 + }, + { + "epoch": 0.10874237583502759, + "grad_norm": 0.3313770294189453, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 936 + }, + { + "epoch": 0.1088585535869881, + "grad_norm": 0.344086229801178, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 937 + }, + { + "epoch": 0.1089747313389486, + "grad_norm": 0.3584043085575104, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 938 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 0.34426915645599365, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 939 + }, + { + "epoch": 0.10920708684286959, + "grad_norm": 0.340386301279068, + "learning_rate": 0.0001, + "loss": 1.7741, + "step": 940 + }, + { + "epoch": 0.10932326459483009, + "grad_norm": 0.3348495662212372, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 941 + }, + { + "epoch": 0.1094394423467906, + "grad_norm": 0.35593461990356445, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 942 + }, + { + "epoch": 0.10955562009875108, + "grad_norm": 0.3561510145664215, + "learning_rate": 0.0001, + "loss": 1.8853, + "step": 943 + }, + { + "epoch": 0.10967179785071159, + "grad_norm": 0.36038845777511597, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 944 + }, + { + "epoch": 0.10978797560267209, + "grad_norm": 0.3256826400756836, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 945 + }, + { + "epoch": 0.10990415335463259, + "grad_norm": 0.3304516673088074, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 946 + }, + { + "epoch": 0.11002033110659308, + "grad_norm": 0.34142622351646423, + "learning_rate": 0.0001, + "loss": 1.7302, + "step": 947 + }, + { + "epoch": 0.11013650885855358, + "grad_norm": 0.3722458481788635, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 948 + }, + { + "epoch": 0.11025268661051409, + "grad_norm": 0.35679808259010315, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 949 + }, + { + "epoch": 0.11036886436247459, + "grad_norm": 0.3507538139820099, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 950 + }, + { + "epoch": 0.11048504211443509, + "grad_norm": 0.34563156962394714, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 951 + }, + { + "epoch": 0.11060121986639558, + "grad_norm": 0.35172975063323975, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 952 + }, + { + "epoch": 0.11071739761835608, + "grad_norm": 0.3704832196235657, + "learning_rate": 0.0001, + "loss": 1.7626, + "step": 953 + }, + { + "epoch": 0.11083357537031659, + "grad_norm": 0.3332252502441406, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 954 + }, + { + "epoch": 0.11094975312227709, + "grad_norm": 0.33239227533340454, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 955 + }, + { + "epoch": 0.11106593087423758, + "grad_norm": 0.35018518567085266, + "learning_rate": 0.0001, + "loss": 1.8144, + "step": 956 + }, + { + "epoch": 0.11118210862619808, + "grad_norm": 0.34933680295944214, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 957 + }, + { + "epoch": 0.11129828637815858, + "grad_norm": 0.38236263394355774, + "learning_rate": 0.0001, + "loss": 1.8272, + "step": 958 + }, + { + "epoch": 0.11141446413011909, + "grad_norm": 0.3630216717720032, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 959 + }, + { + "epoch": 0.11153064188207958, + "grad_norm": 0.3446943461894989, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 960 + }, + { + "epoch": 0.11164681963404008, + "grad_norm": 0.40010347962379456, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 961 + }, + { + "epoch": 0.11176299738600058, + "grad_norm": 0.35464581847190857, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 962 + }, + { + "epoch": 0.11187917513796108, + "grad_norm": 0.38472074270248413, + "learning_rate": 0.0001, + "loss": 1.8237, + "step": 963 + }, + { + "epoch": 0.11199535288992157, + "grad_norm": 0.3539893329143524, + "learning_rate": 0.0001, + "loss": 1.7777, + "step": 964 + }, + { + "epoch": 0.11211153064188208, + "grad_norm": 0.3627575933933258, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 965 + }, + { + "epoch": 0.11222770839384258, + "grad_norm": 0.3388507664203644, + "learning_rate": 0.0001, + "loss": 1.7009, + "step": 966 + }, + { + "epoch": 0.11234388614580308, + "grad_norm": 0.3800560534000397, + "learning_rate": 0.0001, + "loss": 1.7116, + "step": 967 + }, + { + "epoch": 0.11246006389776358, + "grad_norm": 0.3563110828399658, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 968 + }, + { + "epoch": 0.11257624164972407, + "grad_norm": 0.34514880180358887, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 969 + }, + { + "epoch": 0.11269241940168458, + "grad_norm": 0.35564523935317993, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 970 + }, + { + "epoch": 0.11280859715364508, + "grad_norm": 0.35224324464797974, + "learning_rate": 0.0001, + "loss": 1.8569, + "step": 971 + }, + { + "epoch": 0.11292477490560558, + "grad_norm": 0.344664990901947, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 972 + }, + { + "epoch": 0.11304095265756607, + "grad_norm": 0.3399982452392578, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 973 + }, + { + "epoch": 0.11315713040952657, + "grad_norm": 0.3541862368583679, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 974 + }, + { + "epoch": 0.11327330816148708, + "grad_norm": 0.35132747888565063, + "learning_rate": 0.0001, + "loss": 1.7683, + "step": 975 + }, + { + "epoch": 0.11338948591344758, + "grad_norm": 0.3748994469642639, + "learning_rate": 0.0001, + "loss": 1.6813, + "step": 976 + }, + { + "epoch": 0.11350566366540807, + "grad_norm": 0.3247508406639099, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 977 + }, + { + "epoch": 0.11362184141736857, + "grad_norm": 0.33404844999313354, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 978 + }, + { + "epoch": 0.11373801916932907, + "grad_norm": 0.36675605177879333, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 979 + }, + { + "epoch": 0.11385419692128958, + "grad_norm": 0.37571048736572266, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 980 + }, + { + "epoch": 0.11397037467325007, + "grad_norm": 0.3701353967189789, + "learning_rate": 0.0001, + "loss": 1.8767, + "step": 981 + }, + { + "epoch": 0.11408655242521057, + "grad_norm": 0.34536612033843994, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 982 + }, + { + "epoch": 0.11420273017717107, + "grad_norm": 0.43408679962158203, + "learning_rate": 0.0001, + "loss": 1.9732, + "step": 983 + }, + { + "epoch": 0.11431890792913157, + "grad_norm": 0.34038683772087097, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 984 + }, + { + "epoch": 0.11443508568109208, + "grad_norm": 0.3477528691291809, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 985 + }, + { + "epoch": 0.11455126343305257, + "grad_norm": 0.3574272096157074, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 986 + }, + { + "epoch": 0.11466744118501307, + "grad_norm": 0.3423398733139038, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 987 + }, + { + "epoch": 0.11478361893697357, + "grad_norm": 0.35041436553001404, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 988 + }, + { + "epoch": 0.11489979668893407, + "grad_norm": 0.3532884418964386, + "learning_rate": 0.0001, + "loss": 1.8723, + "step": 989 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.3406219482421875, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 990 + }, + { + "epoch": 0.11513215219285507, + "grad_norm": 0.33296358585357666, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 991 + }, + { + "epoch": 0.11524832994481557, + "grad_norm": 0.35060951113700867, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 992 + }, + { + "epoch": 0.11536450769677607, + "grad_norm": 0.369406521320343, + "learning_rate": 0.0001, + "loss": 1.7544, + "step": 993 + }, + { + "epoch": 0.11548068544873656, + "grad_norm": 0.3404718041419983, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 994 + }, + { + "epoch": 0.11559686320069706, + "grad_norm": 0.3357236385345459, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 995 + }, + { + "epoch": 0.11571304095265757, + "grad_norm": 0.3525453805923462, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 996 + }, + { + "epoch": 0.11582921870461807, + "grad_norm": 0.35840287804603577, + "learning_rate": 0.0001, + "loss": 1.8253, + "step": 997 + }, + { + "epoch": 0.11594539645657856, + "grad_norm": 0.36817336082458496, + "learning_rate": 0.0001, + "loss": 1.8989, + "step": 998 + }, + { + "epoch": 0.11606157420853906, + "grad_norm": 0.349050372838974, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 999 + }, + { + "epoch": 0.11617775196049956, + "grad_norm": 0.37279772758483887, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 1000 + }, + { + "epoch": 0.11629392971246007, + "grad_norm": 0.36874422430992126, + "learning_rate": 0.0001, + "loss": 1.7866, + "step": 1001 + }, + { + "epoch": 0.11641010746442057, + "grad_norm": 0.3826844394207001, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 1002 + }, + { + "epoch": 0.11652628521638106, + "grad_norm": 0.34281986951828003, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 1003 + }, + { + "epoch": 0.11664246296834156, + "grad_norm": 0.3320811688899994, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 1004 + }, + { + "epoch": 0.11675864072030206, + "grad_norm": 0.3438369631767273, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 1005 + }, + { + "epoch": 0.11687481847226257, + "grad_norm": 0.35468512773513794, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 1006 + }, + { + "epoch": 0.11699099622422306, + "grad_norm": 0.3840721547603607, + "learning_rate": 0.0001, + "loss": 1.9277, + "step": 1007 + }, + { + "epoch": 0.11710717397618356, + "grad_norm": 0.3772999346256256, + "learning_rate": 0.0001, + "loss": 1.8193, + "step": 1008 + }, + { + "epoch": 0.11722335172814406, + "grad_norm": 0.32721465826034546, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 1009 + }, + { + "epoch": 0.11733952948010457, + "grad_norm": 0.3579975664615631, + "learning_rate": 0.0001, + "loss": 1.8259, + "step": 1010 + }, + { + "epoch": 0.11745570723206505, + "grad_norm": 0.35289841890335083, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 1011 + }, + { + "epoch": 0.11757188498402556, + "grad_norm": 0.35894694924354553, + "learning_rate": 0.0001, + "loss": 1.7533, + "step": 1012 + }, + { + "epoch": 0.11768806273598606, + "grad_norm": 0.35734298825263977, + "learning_rate": 0.0001, + "loss": 1.8021, + "step": 1013 + }, + { + "epoch": 0.11780424048794656, + "grad_norm": 0.3388814330101013, + "learning_rate": 0.0001, + "loss": 1.7065, + "step": 1014 + }, + { + "epoch": 0.11792041823990705, + "grad_norm": 0.3703312277793884, + "learning_rate": 0.0001, + "loss": 1.7811, + "step": 1015 + }, + { + "epoch": 0.11803659599186755, + "grad_norm": 0.3490375280380249, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 1016 + }, + { + "epoch": 0.11815277374382806, + "grad_norm": 0.36077988147735596, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 1017 + }, + { + "epoch": 0.11826895149578856, + "grad_norm": 0.366590291261673, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 1018 + }, + { + "epoch": 0.11838512924774906, + "grad_norm": 0.3408995568752289, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 1019 + }, + { + "epoch": 0.11850130699970955, + "grad_norm": 0.3684650659561157, + "learning_rate": 0.0001, + "loss": 1.7644, + "step": 1020 + }, + { + "epoch": 0.11861748475167005, + "grad_norm": 0.37491750717163086, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 1021 + }, + { + "epoch": 0.11873366250363056, + "grad_norm": 0.34040531516075134, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 1022 + }, + { + "epoch": 0.11884984025559106, + "grad_norm": 0.3536158800125122, + "learning_rate": 0.0001, + "loss": 1.811, + "step": 1023 + }, + { + "epoch": 0.11896601800755155, + "grad_norm": 0.38111457228660583, + "learning_rate": 0.0001, + "loss": 1.8178, + "step": 1024 + }, + { + "epoch": 0.11908219575951205, + "grad_norm": 0.39054590463638306, + "learning_rate": 0.0001, + "loss": 1.8101, + "step": 1025 + }, + { + "epoch": 0.11919837351147256, + "grad_norm": 0.37817317247390747, + "learning_rate": 0.0001, + "loss": 1.8686, + "step": 1026 + }, + { + "epoch": 0.11931455126343306, + "grad_norm": 0.38268011808395386, + "learning_rate": 0.0001, + "loss": 1.8499, + "step": 1027 + }, + { + "epoch": 0.11943072901539355, + "grad_norm": 0.3847000002861023, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 1028 + }, + { + "epoch": 0.11954690676735405, + "grad_norm": 0.3735695481300354, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 1029 + }, + { + "epoch": 0.11966308451931455, + "grad_norm": 0.3305729031562805, + "learning_rate": 0.0001, + "loss": 1.7738, + "step": 1030 + }, + { + "epoch": 0.11977926227127506, + "grad_norm": 0.3482242822647095, + "learning_rate": 0.0001, + "loss": 1.8291, + "step": 1031 + }, + { + "epoch": 0.11989544002323554, + "grad_norm": 0.359893798828125, + "learning_rate": 0.0001, + "loss": 1.7426, + "step": 1032 + }, + { + "epoch": 0.12001161777519605, + "grad_norm": 0.36758822202682495, + "learning_rate": 0.0001, + "loss": 1.9225, + "step": 1033 + }, + { + "epoch": 0.12012779552715655, + "grad_norm": 0.3210867643356323, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 1034 + }, + { + "epoch": 0.12024397327911705, + "grad_norm": 0.3599686622619629, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 1035 + }, + { + "epoch": 0.12036015103107754, + "grad_norm": 0.3894933760166168, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 1036 + }, + { + "epoch": 0.12047632878303804, + "grad_norm": 0.3600466847419739, + "learning_rate": 0.0001, + "loss": 1.7304, + "step": 1037 + }, + { + "epoch": 0.12059250653499855, + "grad_norm": 0.3773282766342163, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 1038 + }, + { + "epoch": 0.12070868428695905, + "grad_norm": 0.342816561460495, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 1039 + }, + { + "epoch": 0.12082486203891955, + "grad_norm": 0.34455814957618713, + "learning_rate": 0.0001, + "loss": 1.8846, + "step": 1040 + }, + { + "epoch": 0.12094103979088004, + "grad_norm": 0.3425041437149048, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 1041 + }, + { + "epoch": 0.12105721754284055, + "grad_norm": 0.3462069034576416, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 1042 + }, + { + "epoch": 0.12117339529480105, + "grad_norm": 0.3301829397678375, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 1043 + }, + { + "epoch": 0.12128957304676155, + "grad_norm": 0.32111644744873047, + "learning_rate": 0.0001, + "loss": 1.724, + "step": 1044 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 0.3677181005477905, + "learning_rate": 0.0001, + "loss": 1.7678, + "step": 1045 + }, + { + "epoch": 0.12152192855068254, + "grad_norm": 0.3687067925930023, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 1046 + }, + { + "epoch": 0.12163810630264305, + "grad_norm": 0.34119775891304016, + "learning_rate": 0.0001, + "loss": 1.7466, + "step": 1047 + }, + { + "epoch": 0.12175428405460355, + "grad_norm": 0.3656751811504364, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 1048 + }, + { + "epoch": 0.12187046180656404, + "grad_norm": 0.35760533809661865, + "learning_rate": 0.0001, + "loss": 1.8831, + "step": 1049 + }, + { + "epoch": 0.12198663955852454, + "grad_norm": 0.3624860942363739, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 1050 + }, + { + "epoch": 0.12210281731048504, + "grad_norm": 0.3888024091720581, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 1051 + }, + { + "epoch": 0.12221899506244555, + "grad_norm": 0.39359578490257263, + "learning_rate": 0.0001, + "loss": 1.9219, + "step": 1052 + }, + { + "epoch": 0.12233517281440603, + "grad_norm": 0.3813614249229431, + "learning_rate": 0.0001, + "loss": 1.7615, + "step": 1053 + }, + { + "epoch": 0.12245135056636654, + "grad_norm": 0.3501231074333191, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 1054 + }, + { + "epoch": 0.12256752831832704, + "grad_norm": 0.32617613673210144, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 1055 + }, + { + "epoch": 0.12268370607028754, + "grad_norm": 0.30450668931007385, + "learning_rate": 0.0001, + "loss": 1.4726, + "step": 1056 + }, + { + "epoch": 0.12279988382224805, + "grad_norm": 0.33408787846565247, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 1057 + }, + { + "epoch": 0.12291606157420853, + "grad_norm": 0.3771173357963562, + "learning_rate": 0.0001, + "loss": 1.8417, + "step": 1058 + }, + { + "epoch": 0.12303223932616904, + "grad_norm": 0.3441436290740967, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 1059 + }, + { + "epoch": 0.12314841707812954, + "grad_norm": 0.3676146864891052, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 1060 + }, + { + "epoch": 0.12326459483009004, + "grad_norm": 0.35352498292922974, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 1061 + }, + { + "epoch": 0.12338077258205053, + "grad_norm": 0.43162113428115845, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1062 + }, + { + "epoch": 0.12349695033401104, + "grad_norm": 0.370995432138443, + "learning_rate": 0.0001, + "loss": 1.7928, + "step": 1063 + }, + { + "epoch": 0.12361312808597154, + "grad_norm": 0.35732775926589966, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 1064 + }, + { + "epoch": 0.12372930583793204, + "grad_norm": 0.40261656045913696, + "learning_rate": 0.0001, + "loss": 2.0031, + "step": 1065 + }, + { + "epoch": 0.12384548358989253, + "grad_norm": 0.34159186482429504, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 1066 + }, + { + "epoch": 0.12396166134185303, + "grad_norm": 0.3457348942756653, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 1067 + }, + { + "epoch": 0.12407783909381354, + "grad_norm": 0.36517080664634705, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 1068 + }, + { + "epoch": 0.12419401684577404, + "grad_norm": 0.3570208251476288, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 1069 + }, + { + "epoch": 0.12431019459773453, + "grad_norm": 0.33659952878952026, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 1070 + }, + { + "epoch": 0.12442637234969503, + "grad_norm": 0.3287290930747986, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 1071 + }, + { + "epoch": 0.12454255010165553, + "grad_norm": 0.3403220772743225, + "learning_rate": 0.0001, + "loss": 1.8731, + "step": 1072 + }, + { + "epoch": 0.12465872785361604, + "grad_norm": 0.38080114126205444, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 1073 + }, + { + "epoch": 0.12477490560557654, + "grad_norm": 0.3732984662055969, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 1074 + }, + { + "epoch": 0.12489108335753703, + "grad_norm": 0.3515911102294922, + "learning_rate": 0.0001, + "loss": 1.7473, + "step": 1075 + }, + { + "epoch": 0.12500726110949753, + "grad_norm": 0.36280357837677, + "learning_rate": 0.0001, + "loss": 1.852, + "step": 1076 + }, + { + "epoch": 0.12512343886145802, + "grad_norm": 0.4288184344768524, + "learning_rate": 0.0001, + "loss": 1.9649, + "step": 1077 + }, + { + "epoch": 0.12523961661341854, + "grad_norm": 0.3732617199420929, + "learning_rate": 0.0001, + "loss": 1.9636, + "step": 1078 + }, + { + "epoch": 0.12535579436537903, + "grad_norm": 0.3731965124607086, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 1079 + }, + { + "epoch": 0.12547197211733954, + "grad_norm": 0.33766472339630127, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 1080 + }, + { + "epoch": 0.12558814986930003, + "grad_norm": 0.41299498081207275, + "learning_rate": 0.0001, + "loss": 2.008, + "step": 1081 + }, + { + "epoch": 0.12570432762126052, + "grad_norm": 0.332131952047348, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 1082 + }, + { + "epoch": 0.12582050537322104, + "grad_norm": 0.3629733920097351, + "learning_rate": 0.0001, + "loss": 1.8256, + "step": 1083 + }, + { + "epoch": 0.12593668312518153, + "grad_norm": 0.35666128993034363, + "learning_rate": 0.0001, + "loss": 1.6795, + "step": 1084 + }, + { + "epoch": 0.12605286087714201, + "grad_norm": 0.3693120777606964, + "learning_rate": 0.0001, + "loss": 1.8276, + "step": 1085 + }, + { + "epoch": 0.12616903862910253, + "grad_norm": 0.38100001215934753, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 1086 + }, + { + "epoch": 0.12628521638106302, + "grad_norm": 0.35735470056533813, + "learning_rate": 0.0001, + "loss": 1.8506, + "step": 1087 + }, + { + "epoch": 0.12640139413302354, + "grad_norm": 0.3401797413825989, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 1088 + }, + { + "epoch": 0.12651757188498403, + "grad_norm": 0.34798330068588257, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 1089 + }, + { + "epoch": 0.12663374963694451, + "grad_norm": 0.3499447703361511, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 1090 + }, + { + "epoch": 0.12674992738890503, + "grad_norm": 0.3714812099933624, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 1091 + }, + { + "epoch": 0.12686610514086552, + "grad_norm": 0.3492056429386139, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 1092 + }, + { + "epoch": 0.12698228289282604, + "grad_norm": 0.3401804566383362, + "learning_rate": 0.0001, + "loss": 1.7797, + "step": 1093 + }, + { + "epoch": 0.12709846064478653, + "grad_norm": 0.33436286449432373, + "learning_rate": 0.0001, + "loss": 1.7528, + "step": 1094 + }, + { + "epoch": 0.12721463839674702, + "grad_norm": 0.35085633397102356, + "learning_rate": 0.0001, + "loss": 1.7508, + "step": 1095 + }, + { + "epoch": 0.12733081614870753, + "grad_norm": 0.3480139970779419, + "learning_rate": 0.0001, + "loss": 1.7582, + "step": 1096 + }, + { + "epoch": 0.12744699390066802, + "grad_norm": 0.3390267789363861, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 1097 + }, + { + "epoch": 0.1275631716526285, + "grad_norm": 0.3470866084098816, + "learning_rate": 0.0001, + "loss": 1.7661, + "step": 1098 + }, + { + "epoch": 0.12767934940458903, + "grad_norm": 0.37509381771087646, + "learning_rate": 0.0001, + "loss": 1.761, + "step": 1099 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.35059359669685364, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 1100 + }, + { + "epoch": 0.12791170490851003, + "grad_norm": 0.38559988141059875, + "learning_rate": 0.0001, + "loss": 1.9108, + "step": 1101 + }, + { + "epoch": 0.12802788266047052, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 1102 + }, + { + "epoch": 0.128144060412431, + "grad_norm": 0.3547315001487732, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 1103 + }, + { + "epoch": 0.12826023816439153, + "grad_norm": 0.3657218813896179, + "learning_rate": 0.0001, + "loss": 1.8437, + "step": 1104 + }, + { + "epoch": 0.12837641591635202, + "grad_norm": 0.3548438847064972, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 1105 + }, + { + "epoch": 0.1284925936683125, + "grad_norm": 0.3553130030632019, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 1106 + }, + { + "epoch": 0.12860877142027302, + "grad_norm": 0.35484910011291504, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 1107 + }, + { + "epoch": 0.1287249491722335, + "grad_norm": 0.3301905691623688, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 1108 + }, + { + "epoch": 0.12884112692419403, + "grad_norm": 0.3722817301750183, + "learning_rate": 0.0001, + "loss": 1.8194, + "step": 1109 + }, + { + "epoch": 0.12895730467615452, + "grad_norm": 0.408559113740921, + "learning_rate": 0.0001, + "loss": 1.8952, + "step": 1110 + }, + { + "epoch": 0.129073482428115, + "grad_norm": 0.3764549791812897, + "learning_rate": 0.0001, + "loss": 1.8921, + "step": 1111 + }, + { + "epoch": 0.12918966018007552, + "grad_norm": 0.3443762958049774, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 1112 + }, + { + "epoch": 0.129305837932036, + "grad_norm": 0.37455350160598755, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 1113 + }, + { + "epoch": 0.12942201568399653, + "grad_norm": 0.35170894861221313, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 1114 + }, + { + "epoch": 0.12953819343595702, + "grad_norm": 0.33991748094558716, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 1115 + }, + { + "epoch": 0.1296543711879175, + "grad_norm": 0.3735693693161011, + "learning_rate": 0.0001, + "loss": 1.928, + "step": 1116 + }, + { + "epoch": 0.12977054893987802, + "grad_norm": 0.34423983097076416, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 1117 + }, + { + "epoch": 0.1298867266918385, + "grad_norm": 0.3688075542449951, + "learning_rate": 0.0001, + "loss": 1.8482, + "step": 1118 + }, + { + "epoch": 0.130002904443799, + "grad_norm": 0.3608585000038147, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 1119 + }, + { + "epoch": 0.13011908219575952, + "grad_norm": 0.38905078172683716, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 1120 + }, + { + "epoch": 0.13023525994772, + "grad_norm": 0.3615328371524811, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 1121 + }, + { + "epoch": 0.13035143769968052, + "grad_norm": 0.35324275493621826, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 1122 + }, + { + "epoch": 0.130467615451641, + "grad_norm": 0.3279106318950653, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 1123 + }, + { + "epoch": 0.1305837932036015, + "grad_norm": 0.37770041823387146, + "learning_rate": 0.0001, + "loss": 1.9185, + "step": 1124 + }, + { + "epoch": 0.13069997095556202, + "grad_norm": 0.3365723192691803, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 1125 + }, + { + "epoch": 0.1308161487075225, + "grad_norm": 0.35211873054504395, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 1126 + }, + { + "epoch": 0.13093232645948302, + "grad_norm": 0.3364546000957489, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1127 + }, + { + "epoch": 0.1310485042114435, + "grad_norm": 0.33624467253685, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 1128 + }, + { + "epoch": 0.131164681963404, + "grad_norm": 0.34484627842903137, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 1129 + }, + { + "epoch": 0.13128085971536452, + "grad_norm": 0.3245522677898407, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 1130 + }, + { + "epoch": 0.131397037467325, + "grad_norm": 0.3584153652191162, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 1131 + }, + { + "epoch": 0.1315132152192855, + "grad_norm": 0.35809946060180664, + "learning_rate": 0.0001, + "loss": 1.7665, + "step": 1132 + }, + { + "epoch": 0.131629392971246, + "grad_norm": 0.3418474793434143, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 1133 + }, + { + "epoch": 0.1317455707232065, + "grad_norm": 0.3623388707637787, + "learning_rate": 0.0001, + "loss": 1.7171, + "step": 1134 + }, + { + "epoch": 0.13186174847516702, + "grad_norm": 0.3962494730949402, + "learning_rate": 0.0001, + "loss": 1.9956, + "step": 1135 + }, + { + "epoch": 0.1319779262271275, + "grad_norm": 0.34545156359672546, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 1136 + }, + { + "epoch": 0.132094103979088, + "grad_norm": 0.3415702283382416, + "learning_rate": 0.0001, + "loss": 1.7397, + "step": 1137 + }, + { + "epoch": 0.1322102817310485, + "grad_norm": 0.41906920075416565, + "learning_rate": 0.0001, + "loss": 1.8653, + "step": 1138 + }, + { + "epoch": 0.132326459483009, + "grad_norm": 0.3778826892375946, + "learning_rate": 0.0001, + "loss": 1.696, + "step": 1139 + }, + { + "epoch": 0.1324426372349695, + "grad_norm": 0.3520076870918274, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 1140 + }, + { + "epoch": 0.13255881498693, + "grad_norm": 0.38221481442451477, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 1141 + }, + { + "epoch": 0.1326749927388905, + "grad_norm": 0.34587883949279785, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 1142 + }, + { + "epoch": 0.132791170490851, + "grad_norm": 0.35534965991973877, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 1143 + }, + { + "epoch": 0.1329073482428115, + "grad_norm": 0.36971423029899597, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 1144 + }, + { + "epoch": 0.133023525994772, + "grad_norm": 0.3652137219905853, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 1145 + }, + { + "epoch": 0.1331397037467325, + "grad_norm": 0.340309739112854, + "learning_rate": 0.0001, + "loss": 1.75, + "step": 1146 + }, + { + "epoch": 0.133255881498693, + "grad_norm": 0.3661729693412781, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 1147 + }, + { + "epoch": 0.1333720592506535, + "grad_norm": 0.3509800434112549, + "learning_rate": 0.0001, + "loss": 1.7748, + "step": 1148 + }, + { + "epoch": 0.133488237002614, + "grad_norm": 0.34903261065483093, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 1149 + }, + { + "epoch": 0.1336044147545745, + "grad_norm": 0.3474218249320984, + "learning_rate": 0.0001, + "loss": 1.8942, + "step": 1150 + }, + { + "epoch": 0.133720592506535, + "grad_norm": 0.3618185818195343, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 1151 + }, + { + "epoch": 0.1338367702584955, + "grad_norm": 0.3447827696800232, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 1152 + }, + { + "epoch": 0.13395294801045599, + "grad_norm": 0.35929834842681885, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 1153 + }, + { + "epoch": 0.1340691257624165, + "grad_norm": 0.378379762172699, + "learning_rate": 0.0001, + "loss": 1.8789, + "step": 1154 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 0.3777737319469452, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 1155 + }, + { + "epoch": 0.1343014812663375, + "grad_norm": 0.542852520942688, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 1156 + }, + { + "epoch": 0.134417659018298, + "grad_norm": 0.3426233232021332, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 1157 + }, + { + "epoch": 0.13453383677025849, + "grad_norm": 0.34573081135749817, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 1158 + }, + { + "epoch": 0.134650014522219, + "grad_norm": 0.35474127531051636, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 1159 + }, + { + "epoch": 0.1347661922741795, + "grad_norm": 0.3295106589794159, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 1160 + }, + { + "epoch": 0.13488237002613998, + "grad_norm": 0.380728542804718, + "learning_rate": 0.0001, + "loss": 1.8133, + "step": 1161 + }, + { + "epoch": 0.1349985477781005, + "grad_norm": 0.38181304931640625, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 1162 + }, + { + "epoch": 0.135114725530061, + "grad_norm": 0.35895970463752747, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 1163 + }, + { + "epoch": 0.1352309032820215, + "grad_norm": 0.3574581742286682, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 1164 + }, + { + "epoch": 0.135347081033982, + "grad_norm": 0.3409847319126129, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 1165 + }, + { + "epoch": 0.13546325878594248, + "grad_norm": 0.36997130513191223, + "learning_rate": 0.0001, + "loss": 1.8488, + "step": 1166 + }, + { + "epoch": 0.135579436537903, + "grad_norm": 0.34550440311431885, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 1167 + }, + { + "epoch": 0.1356956142898635, + "grad_norm": 0.36882877349853516, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 1168 + }, + { + "epoch": 0.135811792041824, + "grad_norm": 0.34376415610313416, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 1169 + }, + { + "epoch": 0.1359279697937845, + "grad_norm": 0.35478463768959045, + "learning_rate": 0.0001, + "loss": 1.836, + "step": 1170 + }, + { + "epoch": 0.13604414754574498, + "grad_norm": 0.3668883442878723, + "learning_rate": 0.0001, + "loss": 1.8465, + "step": 1171 + }, + { + "epoch": 0.1361603252977055, + "grad_norm": 0.33655285835266113, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 1172 + }, + { + "epoch": 0.136276503049666, + "grad_norm": 0.34968823194503784, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 1173 + }, + { + "epoch": 0.13639268080162648, + "grad_norm": 0.37849414348602295, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 1174 + }, + { + "epoch": 0.136508858553587, + "grad_norm": 0.3569866120815277, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 1175 + }, + { + "epoch": 0.13662503630554748, + "grad_norm": 0.34238025546073914, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 1176 + }, + { + "epoch": 0.136741214057508, + "grad_norm": 0.35172078013420105, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 1177 + }, + { + "epoch": 0.1368573918094685, + "grad_norm": 0.35582435131073, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 1178 + }, + { + "epoch": 0.13697356956142898, + "grad_norm": 0.36293935775756836, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 1179 + }, + { + "epoch": 0.1370897473133895, + "grad_norm": 0.4134126901626587, + "learning_rate": 0.0001, + "loss": 1.8926, + "step": 1180 + }, + { + "epoch": 0.13720592506534998, + "grad_norm": 0.34876781702041626, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 1181 + }, + { + "epoch": 0.1373221028173105, + "grad_norm": 0.35906705260276794, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 1182 + }, + { + "epoch": 0.137438280569271, + "grad_norm": 0.3706406354904175, + "learning_rate": 0.0001, + "loss": 1.8965, + "step": 1183 + }, + { + "epoch": 0.13755445832123148, + "grad_norm": 0.3602171242237091, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 1184 + }, + { + "epoch": 0.137670636073192, + "grad_norm": 0.3797997534275055, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 1185 + }, + { + "epoch": 0.13778681382515248, + "grad_norm": 0.3507281541824341, + "learning_rate": 0.0001, + "loss": 1.7615, + "step": 1186 + }, + { + "epoch": 0.13790299157711297, + "grad_norm": 0.3676402270793915, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 1187 + }, + { + "epoch": 0.1380191693290735, + "grad_norm": 0.36422815918922424, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 1188 + }, + { + "epoch": 0.13813534708103398, + "grad_norm": 0.3710031807422638, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 1189 + }, + { + "epoch": 0.1382515248329945, + "grad_norm": 0.34949737787246704, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 1190 + }, + { + "epoch": 0.13836770258495498, + "grad_norm": 0.38215914368629456, + "learning_rate": 0.0001, + "loss": 1.9185, + "step": 1191 + }, + { + "epoch": 0.13848388033691547, + "grad_norm": 0.3533918857574463, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 1192 + }, + { + "epoch": 0.138600058088876, + "grad_norm": 0.36544063687324524, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 1193 + }, + { + "epoch": 0.13871623584083648, + "grad_norm": 0.3464662730693817, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 1194 + }, + { + "epoch": 0.13883241359279697, + "grad_norm": 0.32799604535102844, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 1195 + }, + { + "epoch": 0.13894859134475748, + "grad_norm": 0.37455034255981445, + "learning_rate": 0.0001, + "loss": 1.895, + "step": 1196 + }, + { + "epoch": 0.13906476909671797, + "grad_norm": 0.3819703459739685, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 1197 + }, + { + "epoch": 0.1391809468486785, + "grad_norm": 0.33819907903671265, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 1198 + }, + { + "epoch": 0.13929712460063898, + "grad_norm": 0.36918461322784424, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 1199 + }, + { + "epoch": 0.13941330235259947, + "grad_norm": 0.37889495491981506, + "learning_rate": 0.0001, + "loss": 1.8902, + "step": 1200 + }, + { + "epoch": 0.13952948010455998, + "grad_norm": 0.34494003653526306, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 1201 + }, + { + "epoch": 0.13964565785652047, + "grad_norm": 0.3527129292488098, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 1202 + }, + { + "epoch": 0.139761835608481, + "grad_norm": 0.35137444734573364, + "learning_rate": 0.0001, + "loss": 1.8853, + "step": 1203 + }, + { + "epoch": 0.13987801336044148, + "grad_norm": 0.3461925685405731, + "learning_rate": 0.0001, + "loss": 1.7065, + "step": 1204 + }, + { + "epoch": 0.13999419111240197, + "grad_norm": 0.36129575967788696, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 1205 + }, + { + "epoch": 0.14011036886436248, + "grad_norm": 0.34536105394363403, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 1206 + }, + { + "epoch": 0.14022654661632297, + "grad_norm": 0.35485297441482544, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 1207 + }, + { + "epoch": 0.14034272436828346, + "grad_norm": 0.3653600811958313, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 1208 + }, + { + "epoch": 0.14045890212024398, + "grad_norm": 0.36751556396484375, + "learning_rate": 0.0001, + "loss": 1.9142, + "step": 1209 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.3849860727787018, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 1210 + }, + { + "epoch": 0.14069125762416498, + "grad_norm": 0.3888527452945709, + "learning_rate": 0.0001, + "loss": 1.7732, + "step": 1211 + }, + { + "epoch": 0.14080743537612547, + "grad_norm": 0.3473045825958252, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 1212 + }, + { + "epoch": 0.14092361312808596, + "grad_norm": 0.31697720289230347, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 1213 + }, + { + "epoch": 0.14103979088004648, + "grad_norm": 0.36347395181655884, + "learning_rate": 0.0001, + "loss": 1.8358, + "step": 1214 + }, + { + "epoch": 0.14115596863200697, + "grad_norm": 0.34787067770957947, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 1215 + }, + { + "epoch": 0.14127214638396748, + "grad_norm": 0.35520100593566895, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 1216 + }, + { + "epoch": 0.14138832413592797, + "grad_norm": 0.3633323609828949, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 1217 + }, + { + "epoch": 0.14150450188788846, + "grad_norm": 0.3367605209350586, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 1218 + }, + { + "epoch": 0.14162067963984898, + "grad_norm": 0.36629173159599304, + "learning_rate": 0.0001, + "loss": 1.8747, + "step": 1219 + }, + { + "epoch": 0.14173685739180947, + "grad_norm": 0.3563576340675354, + "learning_rate": 0.0001, + "loss": 1.8185, + "step": 1220 + }, + { + "epoch": 0.14185303514376996, + "grad_norm": 0.32516106963157654, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 1221 + }, + { + "epoch": 0.14196921289573047, + "grad_norm": 0.3476703464984894, + "learning_rate": 0.0001, + "loss": 1.7661, + "step": 1222 + }, + { + "epoch": 0.14208539064769096, + "grad_norm": 0.3780508041381836, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 1223 + }, + { + "epoch": 0.14220156839965148, + "grad_norm": 0.3513147234916687, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 1224 + }, + { + "epoch": 0.14231774615161197, + "grad_norm": 0.384937584400177, + "learning_rate": 0.0001, + "loss": 1.8593, + "step": 1225 + }, + { + "epoch": 0.14243392390357246, + "grad_norm": 0.37124475836753845, + "learning_rate": 0.0001, + "loss": 1.8555, + "step": 1226 + }, + { + "epoch": 0.14255010165553297, + "grad_norm": 0.3864074647426605, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 1227 + }, + { + "epoch": 0.14266627940749346, + "grad_norm": 0.34239131212234497, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 1228 + }, + { + "epoch": 0.14278245715945395, + "grad_norm": 0.3898089528083801, + "learning_rate": 0.0001, + "loss": 1.8215, + "step": 1229 + }, + { + "epoch": 0.14289863491141447, + "grad_norm": 0.3413831889629364, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 1230 + }, + { + "epoch": 0.14301481266337496, + "grad_norm": 0.34374555945396423, + "learning_rate": 0.0001, + "loss": 1.7268, + "step": 1231 + }, + { + "epoch": 0.14313099041533547, + "grad_norm": 0.3335866928100586, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 1232 + }, + { + "epoch": 0.14324716816729596, + "grad_norm": 0.34190499782562256, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 1233 + }, + { + "epoch": 0.14336334591925645, + "grad_norm": 0.3815557062625885, + "learning_rate": 0.0001, + "loss": 1.9198, + "step": 1234 + }, + { + "epoch": 0.14347952367121697, + "grad_norm": 0.3535671830177307, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 1235 + }, + { + "epoch": 0.14359570142317746, + "grad_norm": 0.38413700461387634, + "learning_rate": 0.0001, + "loss": 1.8972, + "step": 1236 + }, + { + "epoch": 0.14371187917513797, + "grad_norm": 0.3618411719799042, + "learning_rate": 0.0001, + "loss": 1.6467, + "step": 1237 + }, + { + "epoch": 0.14382805692709846, + "grad_norm": 0.34210920333862305, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 1238 + }, + { + "epoch": 0.14394423467905895, + "grad_norm": 0.3580038249492645, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 1239 + }, + { + "epoch": 0.14406041243101947, + "grad_norm": 0.36096274852752686, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 1240 + }, + { + "epoch": 0.14417659018297996, + "grad_norm": 0.3829607665538788, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 1241 + }, + { + "epoch": 0.14429276793494045, + "grad_norm": 0.3591415286064148, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 1242 + }, + { + "epoch": 0.14440894568690096, + "grad_norm": 0.36007246375083923, + "learning_rate": 0.0001, + "loss": 1.7336, + "step": 1243 + }, + { + "epoch": 0.14452512343886145, + "grad_norm": 0.3532988131046295, + "learning_rate": 0.0001, + "loss": 1.8504, + "step": 1244 + }, + { + "epoch": 0.14464130119082197, + "grad_norm": 0.37223345041275024, + "learning_rate": 0.0001, + "loss": 1.7329, + "step": 1245 + }, + { + "epoch": 0.14475747894278246, + "grad_norm": 0.34656253457069397, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 1246 + }, + { + "epoch": 0.14487365669474295, + "grad_norm": 0.3697267472743988, + "learning_rate": 0.0001, + "loss": 1.9745, + "step": 1247 + }, + { + "epoch": 0.14498983444670346, + "grad_norm": 0.3211793899536133, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 1248 + }, + { + "epoch": 0.14510601219866395, + "grad_norm": 0.350603848695755, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 1249 + }, + { + "epoch": 0.14522218995062444, + "grad_norm": 0.3709065318107605, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 1250 + }, + { + "epoch": 0.14533836770258496, + "grad_norm": 0.367072194814682, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 1251 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.35711991786956787, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 1252 + }, + { + "epoch": 0.14557072320650596, + "grad_norm": 0.37235334515571594, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 1253 + }, + { + "epoch": 0.14568690095846645, + "grad_norm": 0.36526671051979065, + "learning_rate": 0.0001, + "loss": 1.7217, + "step": 1254 + }, + { + "epoch": 0.14580307871042694, + "grad_norm": 0.35159561038017273, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 1255 + }, + { + "epoch": 0.14591925646238746, + "grad_norm": 0.35681840777397156, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 1256 + }, + { + "epoch": 0.14603543421434795, + "grad_norm": 0.3531685471534729, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 1257 + }, + { + "epoch": 0.14615161196630846, + "grad_norm": 0.3450145423412323, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 1258 + }, + { + "epoch": 0.14626778971826895, + "grad_norm": 0.3472040891647339, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 1259 + }, + { + "epoch": 0.14638396747022944, + "grad_norm": 0.35504022240638733, + "learning_rate": 0.0001, + "loss": 1.8701, + "step": 1260 + }, + { + "epoch": 0.14650014522218996, + "grad_norm": 0.3664184808731079, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 1261 + }, + { + "epoch": 0.14661632297415045, + "grad_norm": 0.32701820135116577, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 1262 + }, + { + "epoch": 0.14673250072611094, + "grad_norm": 0.34154212474823, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 1263 + }, + { + "epoch": 0.14684867847807145, + "grad_norm": 0.3491269052028656, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1264 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 0.351469486951828, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 1265 + }, + { + "epoch": 0.14708103398199246, + "grad_norm": 0.3573369085788727, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 1266 + }, + { + "epoch": 0.14719721173395295, + "grad_norm": 0.35862183570861816, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 1267 + }, + { + "epoch": 0.14731338948591344, + "grad_norm": 0.3771260976791382, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 1268 + }, + { + "epoch": 0.14742956723787395, + "grad_norm": 0.36234867572784424, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 1269 + }, + { + "epoch": 0.14754574498983444, + "grad_norm": 0.35228458046913147, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 1270 + }, + { + "epoch": 0.14766192274179496, + "grad_norm": 0.3665032982826233, + "learning_rate": 0.0001, + "loss": 1.772, + "step": 1271 + }, + { + "epoch": 0.14777810049375545, + "grad_norm": 0.3650833070278168, + "learning_rate": 0.0001, + "loss": 1.7147, + "step": 1272 + }, + { + "epoch": 0.14789427824571594, + "grad_norm": 0.36604440212249756, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 1273 + }, + { + "epoch": 0.14801045599767645, + "grad_norm": 0.360029935836792, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 1274 + }, + { + "epoch": 0.14812663374963694, + "grad_norm": 0.3815794885158539, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 1275 + }, + { + "epoch": 0.14824281150159743, + "grad_norm": 0.36898788809776306, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 1276 + }, + { + "epoch": 0.14835898925355795, + "grad_norm": 0.3591456115245819, + "learning_rate": 0.0001, + "loss": 1.8453, + "step": 1277 + }, + { + "epoch": 0.14847516700551844, + "grad_norm": 0.3744908273220062, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 1278 + }, + { + "epoch": 0.14859134475747895, + "grad_norm": 0.3827156722545624, + "learning_rate": 0.0001, + "loss": 1.837, + "step": 1279 + }, + { + "epoch": 0.14870752250943944, + "grad_norm": 0.34295526146888733, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 1280 + }, + { + "epoch": 0.14882370026139993, + "grad_norm": 0.36582034826278687, + "learning_rate": 0.0001, + "loss": 1.8293, + "step": 1281 + }, + { + "epoch": 0.14893987801336045, + "grad_norm": 0.34312430024147034, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 1282 + }, + { + "epoch": 0.14905605576532094, + "grad_norm": 0.3591785728931427, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 1283 + }, + { + "epoch": 0.14917223351728143, + "grad_norm": 0.3512047231197357, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 1284 + }, + { + "epoch": 0.14928841126924194, + "grad_norm": 0.35547614097595215, + "learning_rate": 0.0001, + "loss": 1.8423, + "step": 1285 + }, + { + "epoch": 0.14940458902120243, + "grad_norm": 0.3671259582042694, + "learning_rate": 0.0001, + "loss": 1.7298, + "step": 1286 + }, + { + "epoch": 0.14952076677316295, + "grad_norm": 0.3785369098186493, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 1287 + }, + { + "epoch": 0.14963694452512344, + "grad_norm": 0.3537178635597229, + "learning_rate": 0.0001, + "loss": 1.6989, + "step": 1288 + }, + { + "epoch": 0.14975312227708393, + "grad_norm": 0.35491520166397095, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 1289 + }, + { + "epoch": 0.14986930002904444, + "grad_norm": 0.3567078709602356, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 1290 + }, + { + "epoch": 0.14998547778100493, + "grad_norm": 0.35973167419433594, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 1291 + }, + { + "epoch": 0.15010165553296545, + "grad_norm": 0.3756448030471802, + "learning_rate": 0.0001, + "loss": 1.9249, + "step": 1292 + }, + { + "epoch": 0.15021783328492594, + "grad_norm": 0.3457126021385193, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 1293 + }, + { + "epoch": 0.15033401103688643, + "grad_norm": 0.3493401110172272, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 1294 + }, + { + "epoch": 0.15045018878884694, + "grad_norm": 0.38052037358283997, + "learning_rate": 0.0001, + "loss": 1.8351, + "step": 1295 + }, + { + "epoch": 0.15056636654080743, + "grad_norm": 0.326164186000824, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 1296 + }, + { + "epoch": 0.15068254429276792, + "grad_norm": 0.37613871693611145, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 1297 + }, + { + "epoch": 0.15079872204472844, + "grad_norm": 0.40426647663116455, + "learning_rate": 0.0001, + "loss": 1.8863, + "step": 1298 + }, + { + "epoch": 0.15091489979668893, + "grad_norm": 0.3410615026950836, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 1299 + }, + { + "epoch": 0.15103107754864945, + "grad_norm": 0.36211374402046204, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 1300 + }, + { + "epoch": 0.15114725530060993, + "grad_norm": 0.39096057415008545, + "learning_rate": 0.0001, + "loss": 1.8656, + "step": 1301 + }, + { + "epoch": 0.15126343305257042, + "grad_norm": 0.3603553771972656, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 1302 + }, + { + "epoch": 0.15137961080453094, + "grad_norm": 0.3809981048107147, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 1303 + }, + { + "epoch": 0.15149578855649143, + "grad_norm": 0.35240018367767334, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 1304 + }, + { + "epoch": 0.15161196630845195, + "grad_norm": 0.3340260088443756, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 1305 + }, + { + "epoch": 0.15172814406041243, + "grad_norm": 0.3688841164112091, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 1306 + }, + { + "epoch": 0.15184432181237292, + "grad_norm": 0.36552947759628296, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 1307 + }, + { + "epoch": 0.15196049956433344, + "grad_norm": 0.3461878001689911, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 1308 + }, + { + "epoch": 0.15207667731629393, + "grad_norm": 0.35376498103141785, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 1309 + }, + { + "epoch": 0.15219285506825442, + "grad_norm": 0.37778156995773315, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 1310 + }, + { + "epoch": 0.15230903282021493, + "grad_norm": 0.3635299503803253, + "learning_rate": 0.0001, + "loss": 1.7731, + "step": 1311 + }, + { + "epoch": 0.15242521057217542, + "grad_norm": 0.37583014369010925, + "learning_rate": 0.0001, + "loss": 1.8543, + "step": 1312 + }, + { + "epoch": 0.15254138832413594, + "grad_norm": 0.37398314476013184, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 1313 + }, + { + "epoch": 0.15265756607609643, + "grad_norm": 0.37770211696624756, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 1314 + }, + { + "epoch": 0.15277374382805692, + "grad_norm": 0.3403077721595764, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 1315 + }, + { + "epoch": 0.15288992158001743, + "grad_norm": 0.3504425287246704, + "learning_rate": 0.0001, + "loss": 1.7012, + "step": 1316 + }, + { + "epoch": 0.15300609933197792, + "grad_norm": 0.3553392291069031, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 1317 + }, + { + "epoch": 0.1531222770839384, + "grad_norm": 0.3788677752017975, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 1318 + }, + { + "epoch": 0.15323845483589893, + "grad_norm": 0.3600800037384033, + "learning_rate": 0.0001, + "loss": 1.7677, + "step": 1319 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.38489770889282227, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 1320 + }, + { + "epoch": 0.15347081033981994, + "grad_norm": 0.3559502959251404, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 1321 + }, + { + "epoch": 0.15358698809178042, + "grad_norm": 0.3516106605529785, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 1322 + }, + { + "epoch": 0.1537031658437409, + "grad_norm": 0.35282179713249207, + "learning_rate": 0.0001, + "loss": 1.7575, + "step": 1323 + }, + { + "epoch": 0.15381934359570143, + "grad_norm": 0.3448900878429413, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 1324 + }, + { + "epoch": 0.15393552134766192, + "grad_norm": 0.3468807339668274, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 1325 + }, + { + "epoch": 0.15405169909962244, + "grad_norm": 0.3733367621898651, + "learning_rate": 0.0001, + "loss": 1.7449, + "step": 1326 + }, + { + "epoch": 0.15416787685158292, + "grad_norm": 0.3257894217967987, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1327 + }, + { + "epoch": 0.1542840546035434, + "grad_norm": 0.3381756842136383, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 1328 + }, + { + "epoch": 0.15440023235550393, + "grad_norm": 0.3703326880931854, + "learning_rate": 0.0001, + "loss": 1.8643, + "step": 1329 + }, + { + "epoch": 0.15451641010746442, + "grad_norm": 0.3577940762042999, + "learning_rate": 0.0001, + "loss": 1.75, + "step": 1330 + }, + { + "epoch": 0.1546325878594249, + "grad_norm": 0.38018643856048584, + "learning_rate": 0.0001, + "loss": 1.8148, + "step": 1331 + }, + { + "epoch": 0.15474876561138542, + "grad_norm": 0.3477984368801117, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 1332 + }, + { + "epoch": 0.1548649433633459, + "grad_norm": 0.35512134432792664, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 1333 + }, + { + "epoch": 0.15498112111530643, + "grad_norm": 0.34343427419662476, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 1334 + }, + { + "epoch": 0.15509729886726692, + "grad_norm": 0.36737141013145447, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 1335 + }, + { + "epoch": 0.1552134766192274, + "grad_norm": 0.3362652659416199, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 1336 + }, + { + "epoch": 0.15532965437118793, + "grad_norm": 0.3790622651576996, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 1337 + }, + { + "epoch": 0.15544583212314841, + "grad_norm": 0.3796531856060028, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 1338 + }, + { + "epoch": 0.1555620098751089, + "grad_norm": 0.3766029477119446, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 1339 + }, + { + "epoch": 0.15567818762706942, + "grad_norm": 0.33512985706329346, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 1340 + }, + { + "epoch": 0.1557943653790299, + "grad_norm": 0.35915273427963257, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 1341 + }, + { + "epoch": 0.15591054313099043, + "grad_norm": 0.3571474254131317, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 1342 + }, + { + "epoch": 0.15602672088295091, + "grad_norm": 0.36832332611083984, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 1343 + }, + { + "epoch": 0.1561428986349114, + "grad_norm": 0.34558358788490295, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 1344 + }, + { + "epoch": 0.15625907638687192, + "grad_norm": 0.3483864665031433, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 1345 + }, + { + "epoch": 0.1563752541388324, + "grad_norm": 0.3357907831668854, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 1346 + }, + { + "epoch": 0.15649143189079293, + "grad_norm": 0.37820810079574585, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 1347 + }, + { + "epoch": 0.15660760964275341, + "grad_norm": 0.4033626616001129, + "learning_rate": 0.0001, + "loss": 1.8405, + "step": 1348 + }, + { + "epoch": 0.1567237873947139, + "grad_norm": 0.3691973090171814, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 1349 + }, + { + "epoch": 0.15683996514667442, + "grad_norm": 0.3847627639770508, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 1350 + }, + { + "epoch": 0.1569561428986349, + "grad_norm": 0.4157688617706299, + "learning_rate": 0.0001, + "loss": 1.8269, + "step": 1351 + }, + { + "epoch": 0.1570723206505954, + "grad_norm": 0.35120290517807007, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 1352 + }, + { + "epoch": 0.15718849840255592, + "grad_norm": 0.3579387664794922, + "learning_rate": 0.0001, + "loss": 1.7129, + "step": 1353 + }, + { + "epoch": 0.1573046761545164, + "grad_norm": 0.3572061359882355, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 1354 + }, + { + "epoch": 0.15742085390647692, + "grad_norm": 0.38334596157073975, + "learning_rate": 0.0001, + "loss": 1.8802, + "step": 1355 + }, + { + "epoch": 0.1575370316584374, + "grad_norm": 0.3814426064491272, + "learning_rate": 0.0001, + "loss": 1.8669, + "step": 1356 + }, + { + "epoch": 0.1576532094103979, + "grad_norm": 0.3972860276699066, + "learning_rate": 0.0001, + "loss": 1.8, + "step": 1357 + }, + { + "epoch": 0.15776938716235842, + "grad_norm": 0.41393837332725525, + "learning_rate": 0.0001, + "loss": 1.8818, + "step": 1358 + }, + { + "epoch": 0.1578855649143189, + "grad_norm": 0.35115018486976624, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 1359 + }, + { + "epoch": 0.15800174266627942, + "grad_norm": 0.36894285678863525, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 1360 + }, + { + "epoch": 0.1581179204182399, + "grad_norm": 0.344990074634552, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 1361 + }, + { + "epoch": 0.1582340981702004, + "grad_norm": 0.33179888129234314, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 1362 + }, + { + "epoch": 0.15835027592216092, + "grad_norm": 0.3482305407524109, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 1363 + }, + { + "epoch": 0.1584664536741214, + "grad_norm": 0.34571194648742676, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 1364 + }, + { + "epoch": 0.1585826314260819, + "grad_norm": 0.3417724668979645, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 1365 + }, + { + "epoch": 0.1586988091780424, + "grad_norm": 0.36330536007881165, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 1366 + }, + { + "epoch": 0.1588149869300029, + "grad_norm": 0.3438703417778015, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 1367 + }, + { + "epoch": 0.15893116468196342, + "grad_norm": 0.35250309109687805, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 1368 + }, + { + "epoch": 0.1590473424339239, + "grad_norm": 0.3651219606399536, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 1369 + }, + { + "epoch": 0.1591635201858844, + "grad_norm": 0.3463347256183624, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 1370 + }, + { + "epoch": 0.1592796979378449, + "grad_norm": 0.3516363799571991, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 1371 + }, + { + "epoch": 0.1593958756898054, + "grad_norm": 0.37753501534461975, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 1372 + }, + { + "epoch": 0.1595120534417659, + "grad_norm": 0.3713725507259369, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 1373 + }, + { + "epoch": 0.1596282311937264, + "grad_norm": 0.35233864188194275, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 1374 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 0.3708571195602417, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 1375 + }, + { + "epoch": 0.1598605866976474, + "grad_norm": 0.4148230254650116, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 1376 + }, + { + "epoch": 0.1599767644496079, + "grad_norm": 0.3738183081150055, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 1377 + }, + { + "epoch": 0.1600929422015684, + "grad_norm": 0.40050017833709717, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 1378 + }, + { + "epoch": 0.1602091199535289, + "grad_norm": 0.3961089551448822, + "learning_rate": 0.0001, + "loss": 1.8826, + "step": 1379 + }, + { + "epoch": 0.1603252977054894, + "grad_norm": 0.38461560010910034, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 1380 + }, + { + "epoch": 0.1604414754574499, + "grad_norm": 0.3477541208267212, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 1381 + }, + { + "epoch": 0.1605576532094104, + "grad_norm": 0.3460700213909149, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 1382 + }, + { + "epoch": 0.1606738309613709, + "grad_norm": 0.3566724956035614, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 1383 + }, + { + "epoch": 0.1607900087133314, + "grad_norm": 0.36470624804496765, + "learning_rate": 0.0001, + "loss": 1.8717, + "step": 1384 + }, + { + "epoch": 0.1609061864652919, + "grad_norm": 0.3292492926120758, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 1385 + }, + { + "epoch": 0.16102236421725238, + "grad_norm": 0.36960333585739136, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 1386 + }, + { + "epoch": 0.1611385419692129, + "grad_norm": 0.36718302965164185, + "learning_rate": 0.0001, + "loss": 1.8368, + "step": 1387 + }, + { + "epoch": 0.1612547197211734, + "grad_norm": 0.3775370419025421, + "learning_rate": 0.0001, + "loss": 1.7769, + "step": 1388 + }, + { + "epoch": 0.1613708974731339, + "grad_norm": 0.3697305917739868, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 1389 + }, + { + "epoch": 0.1614870752250944, + "grad_norm": 0.37467798590660095, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1390 + }, + { + "epoch": 0.16160325297705488, + "grad_norm": 0.3733084499835968, + "learning_rate": 0.0001, + "loss": 1.8647, + "step": 1391 + }, + { + "epoch": 0.1617194307290154, + "grad_norm": 0.3742946982383728, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 1392 + }, + { + "epoch": 0.1618356084809759, + "grad_norm": 0.3777306079864502, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 1393 + }, + { + "epoch": 0.16195178623293638, + "grad_norm": 0.3901897370815277, + "learning_rate": 0.0001, + "loss": 1.8356, + "step": 1394 + }, + { + "epoch": 0.1620679639848969, + "grad_norm": 0.38239678740501404, + "learning_rate": 0.0001, + "loss": 1.9155, + "step": 1395 + }, + { + "epoch": 0.16218414173685738, + "grad_norm": 0.3667686879634857, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 1396 + }, + { + "epoch": 0.1623003194888179, + "grad_norm": 0.38420993089675903, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 1397 + }, + { + "epoch": 0.1624164972407784, + "grad_norm": 0.36830443143844604, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 1398 + }, + { + "epoch": 0.16253267499273888, + "grad_norm": 0.35537177324295044, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 1399 + }, + { + "epoch": 0.1626488527446994, + "grad_norm": 0.35182371735572815, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 1400 + }, + { + "epoch": 0.16276503049665988, + "grad_norm": 0.36817070841789246, + "learning_rate": 0.0001, + "loss": 1.773, + "step": 1401 + }, + { + "epoch": 0.1628812082486204, + "grad_norm": 0.3580371141433716, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 1402 + }, + { + "epoch": 0.1629973860005809, + "grad_norm": 0.37469223141670227, + "learning_rate": 0.0001, + "loss": 1.7603, + "step": 1403 + }, + { + "epoch": 0.16311356375254138, + "grad_norm": 0.39260217547416687, + "learning_rate": 0.0001, + "loss": 1.9347, + "step": 1404 + }, + { + "epoch": 0.1632297415045019, + "grad_norm": 0.37076276540756226, + "learning_rate": 0.0001, + "loss": 1.755, + "step": 1405 + }, + { + "epoch": 0.16334591925646239, + "grad_norm": 0.34179428219795227, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 1406 + }, + { + "epoch": 0.16346209700842287, + "grad_norm": 0.359271377325058, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 1407 + }, + { + "epoch": 0.1635782747603834, + "grad_norm": 0.3883320093154907, + "learning_rate": 0.0001, + "loss": 1.8176, + "step": 1408 + }, + { + "epoch": 0.16369445251234388, + "grad_norm": 0.35614779591560364, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 1409 + }, + { + "epoch": 0.1638106302643044, + "grad_norm": 0.3655150830745697, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 1410 + }, + { + "epoch": 0.16392680801626489, + "grad_norm": 0.37644287943840027, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 1411 + }, + { + "epoch": 0.16404298576822537, + "grad_norm": 0.33691367506980896, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 1412 + }, + { + "epoch": 0.1641591635201859, + "grad_norm": 0.3288310766220093, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 1413 + }, + { + "epoch": 0.16427534127214638, + "grad_norm": 0.37377673387527466, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 1414 + }, + { + "epoch": 0.1643915190241069, + "grad_norm": 0.3379480242729187, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 1415 + }, + { + "epoch": 0.16450769677606739, + "grad_norm": 0.3620838522911072, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 1416 + }, + { + "epoch": 0.16462387452802787, + "grad_norm": 0.3558915853500366, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 1417 + }, + { + "epoch": 0.1647400522799884, + "grad_norm": 0.36311277747154236, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 1418 + }, + { + "epoch": 0.16485623003194888, + "grad_norm": 0.41347232460975647, + "learning_rate": 0.0001, + "loss": 1.834, + "step": 1419 + }, + { + "epoch": 0.16497240778390937, + "grad_norm": 0.3624221086502075, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 1420 + }, + { + "epoch": 0.1650885855358699, + "grad_norm": 0.3721679449081421, + "learning_rate": 0.0001, + "loss": 1.7655, + "step": 1421 + }, + { + "epoch": 0.16520476328783038, + "grad_norm": 0.3526135981082916, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 1422 + }, + { + "epoch": 0.1653209410397909, + "grad_norm": 0.4029957354068756, + "learning_rate": 0.0001, + "loss": 2.0216, + "step": 1423 + }, + { + "epoch": 0.16543711879175138, + "grad_norm": 0.36750248074531555, + "learning_rate": 0.0001, + "loss": 1.8155, + "step": 1424 + }, + { + "epoch": 0.16555329654371187, + "grad_norm": 0.358469158411026, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 1425 + }, + { + "epoch": 0.1656694742956724, + "grad_norm": 0.3403339684009552, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 1426 + }, + { + "epoch": 0.16578565204763288, + "grad_norm": 0.3741562068462372, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 1427 + }, + { + "epoch": 0.16590182979959336, + "grad_norm": 0.34802475571632385, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 1428 + }, + { + "epoch": 0.16601800755155388, + "grad_norm": 0.3652847707271576, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 1429 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.37153634428977966, + "learning_rate": 0.0001, + "loss": 1.8237, + "step": 1430 + }, + { + "epoch": 0.1662503630554749, + "grad_norm": 0.36057737469673157, + "learning_rate": 0.0001, + "loss": 1.8384, + "step": 1431 + }, + { + "epoch": 0.16636654080743538, + "grad_norm": 0.3642016053199768, + "learning_rate": 0.0001, + "loss": 1.8367, + "step": 1432 + }, + { + "epoch": 0.16648271855939586, + "grad_norm": 0.36787521839141846, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 1433 + }, + { + "epoch": 0.16659889631135638, + "grad_norm": 0.3940580189228058, + "learning_rate": 0.0001, + "loss": 1.9512, + "step": 1434 + }, + { + "epoch": 0.16671507406331687, + "grad_norm": 0.3518722355365753, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 1435 + }, + { + "epoch": 0.1668312518152774, + "grad_norm": 0.3771706223487854, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 1436 + }, + { + "epoch": 0.16694742956723788, + "grad_norm": 0.3364204466342926, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 1437 + }, + { + "epoch": 0.16706360731919837, + "grad_norm": 0.36092275381088257, + "learning_rate": 0.0001, + "loss": 1.8134, + "step": 1438 + }, + { + "epoch": 0.16717978507115888, + "grad_norm": 0.3495999276638031, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 1439 + }, + { + "epoch": 0.16729596282311937, + "grad_norm": 0.35832324624061584, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 1440 + }, + { + "epoch": 0.16741214057507986, + "grad_norm": 0.38260895013809204, + "learning_rate": 0.0001, + "loss": 1.8781, + "step": 1441 + }, + { + "epoch": 0.16752831832704038, + "grad_norm": 0.37501585483551025, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 1442 + }, + { + "epoch": 0.16764449607900087, + "grad_norm": 0.34276142716407776, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 1443 + }, + { + "epoch": 0.16776067383096138, + "grad_norm": 0.32856735587120056, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 1444 + }, + { + "epoch": 0.16787685158292187, + "grad_norm": 0.35961073637008667, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 1445 + }, + { + "epoch": 0.16799302933488236, + "grad_norm": 0.3753131628036499, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 1446 + }, + { + "epoch": 0.16810920708684288, + "grad_norm": 0.3495393693447113, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 1447 + }, + { + "epoch": 0.16822538483880337, + "grad_norm": 0.35757923126220703, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 1448 + }, + { + "epoch": 0.16834156259076388, + "grad_norm": 0.3703269064426422, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 1449 + }, + { + "epoch": 0.16845774034272437, + "grad_norm": 0.3545929789543152, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 1450 + }, + { + "epoch": 0.16857391809468486, + "grad_norm": 0.3971767723560333, + "learning_rate": 0.0001, + "loss": 1.8604, + "step": 1451 + }, + { + "epoch": 0.16869009584664538, + "grad_norm": 0.3627997040748596, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 1452 + }, + { + "epoch": 0.16880627359860587, + "grad_norm": 0.37545061111450195, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 1453 + }, + { + "epoch": 0.16892245135056635, + "grad_norm": 0.4215669631958008, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 1454 + }, + { + "epoch": 0.16903862910252687, + "grad_norm": 0.3428536355495453, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 1455 + }, + { + "epoch": 0.16915480685448736, + "grad_norm": 0.3856685161590576, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 1456 + }, + { + "epoch": 0.16927098460644788, + "grad_norm": 0.3408958315849304, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 1457 + }, + { + "epoch": 0.16938716235840837, + "grad_norm": 0.3705407381057739, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 1458 + }, + { + "epoch": 0.16950334011036886, + "grad_norm": 0.3559417128562927, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 1459 + }, + { + "epoch": 0.16961951786232937, + "grad_norm": 0.39479878544807434, + "learning_rate": 0.0001, + "loss": 1.8522, + "step": 1460 + }, + { + "epoch": 0.16973569561428986, + "grad_norm": 0.36740705370903015, + "learning_rate": 0.0001, + "loss": 1.7336, + "step": 1461 + }, + { + "epoch": 0.16985187336625035, + "grad_norm": 0.37933793663978577, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 1462 + }, + { + "epoch": 0.16996805111821087, + "grad_norm": 0.3855569064617157, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 1463 + }, + { + "epoch": 0.17008422887017136, + "grad_norm": 0.36735615134239197, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 1464 + }, + { + "epoch": 0.17020040662213187, + "grad_norm": 0.3651230037212372, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 1465 + }, + { + "epoch": 0.17031658437409236, + "grad_norm": 0.3359425663948059, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 1466 + }, + { + "epoch": 0.17043276212605285, + "grad_norm": 0.37476974725723267, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 1467 + }, + { + "epoch": 0.17054893987801337, + "grad_norm": 0.3349404036998749, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 1468 + }, + { + "epoch": 0.17066511762997386, + "grad_norm": 0.375108003616333, + "learning_rate": 0.0001, + "loss": 1.7672, + "step": 1469 + }, + { + "epoch": 0.17078129538193437, + "grad_norm": 0.36245962977409363, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 1470 + }, + { + "epoch": 0.17089747313389486, + "grad_norm": 0.3471023142337799, + "learning_rate": 0.0001, + "loss": 1.4378, + "step": 1471 + }, + { + "epoch": 0.17101365088585535, + "grad_norm": 0.37973228096961975, + "learning_rate": 0.0001, + "loss": 1.8489, + "step": 1472 + }, + { + "epoch": 0.17112982863781587, + "grad_norm": 0.3702830374240875, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 1473 + }, + { + "epoch": 0.17124600638977636, + "grad_norm": 0.37454211711883545, + "learning_rate": 0.0001, + "loss": 1.8488, + "step": 1474 + }, + { + "epoch": 0.17136218414173685, + "grad_norm": 0.3523843586444855, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 1475 + }, + { + "epoch": 0.17147836189369736, + "grad_norm": 0.38084614276885986, + "learning_rate": 0.0001, + "loss": 1.8845, + "step": 1476 + }, + { + "epoch": 0.17159453964565785, + "grad_norm": 0.35280105471611023, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 1477 + }, + { + "epoch": 0.17171071739761837, + "grad_norm": 0.34946003556251526, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 1478 + }, + { + "epoch": 0.17182689514957886, + "grad_norm": 0.3549077808856964, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 1479 + }, + { + "epoch": 0.17194307290153935, + "grad_norm": 0.34895646572113037, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 1480 + }, + { + "epoch": 0.17205925065349986, + "grad_norm": 0.3998420536518097, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 1481 + }, + { + "epoch": 0.17217542840546035, + "grad_norm": 0.3754946291446686, + "learning_rate": 0.0001, + "loss": 1.6843, + "step": 1482 + }, + { + "epoch": 0.17229160615742084, + "grad_norm": 0.36129212379455566, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 1483 + }, + { + "epoch": 0.17240778390938136, + "grad_norm": 0.3542667329311371, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 1484 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 0.3390235900878906, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 1485 + }, + { + "epoch": 0.17264013941330236, + "grad_norm": 0.36193352937698364, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 1486 + }, + { + "epoch": 0.17275631716526285, + "grad_norm": 0.3606919050216675, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 1487 + }, + { + "epoch": 0.17287249491722334, + "grad_norm": 0.4732804000377655, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 1488 + }, + { + "epoch": 0.17298867266918386, + "grad_norm": 0.3804911971092224, + "learning_rate": 0.0001, + "loss": 1.7707, + "step": 1489 + }, + { + "epoch": 0.17310485042114435, + "grad_norm": 0.37315547466278076, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 1490 + }, + { + "epoch": 0.17322102817310486, + "grad_norm": 0.3711428642272949, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 1491 + }, + { + "epoch": 0.17333720592506535, + "grad_norm": 0.33480215072631836, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 1492 + }, + { + "epoch": 0.17345338367702584, + "grad_norm": 0.3833867907524109, + "learning_rate": 0.0001, + "loss": 1.7718, + "step": 1493 + }, + { + "epoch": 0.17356956142898636, + "grad_norm": 0.35482731461524963, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 1494 + }, + { + "epoch": 0.17368573918094685, + "grad_norm": 0.35908132791519165, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 1495 + }, + { + "epoch": 0.17380191693290734, + "grad_norm": 0.34301432967185974, + "learning_rate": 0.0001, + "loss": 1.6988, + "step": 1496 + }, + { + "epoch": 0.17391809468486785, + "grad_norm": 0.34541556239128113, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 1497 + }, + { + "epoch": 0.17403427243682834, + "grad_norm": 0.36009690165519714, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 1498 + }, + { + "epoch": 0.17415045018878886, + "grad_norm": 0.3848399221897125, + "learning_rate": 0.0001, + "loss": 1.9549, + "step": 1499 + }, + { + "epoch": 0.17426662794074935, + "grad_norm": 0.35505911707878113, + "learning_rate": 0.0001, + "loss": 1.7141, + "step": 1500 + }, + { + "epoch": 0.17438280569270984, + "grad_norm": 0.3311626613140106, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 1501 + }, + { + "epoch": 0.17449898344467035, + "grad_norm": 0.37923452258110046, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 1502 + }, + { + "epoch": 0.17461516119663084, + "grad_norm": 0.3624334931373596, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 1503 + }, + { + "epoch": 0.17473133894859136, + "grad_norm": 0.3661384880542755, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 1504 + }, + { + "epoch": 0.17484751670055185, + "grad_norm": 0.3476395308971405, + "learning_rate": 0.0001, + "loss": 1.8493, + "step": 1505 + }, + { + "epoch": 0.17496369445251234, + "grad_norm": 0.3515165448188782, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 1506 + }, + { + "epoch": 0.17507987220447285, + "grad_norm": 0.36239245533943176, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 1507 + }, + { + "epoch": 0.17519604995643334, + "grad_norm": 0.3474177420139313, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 1508 + }, + { + "epoch": 0.17531222770839383, + "grad_norm": 0.34168288111686707, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 1509 + }, + { + "epoch": 0.17542840546035435, + "grad_norm": 0.3697127103805542, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 1510 + }, + { + "epoch": 0.17554458321231484, + "grad_norm": 0.36415886878967285, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 1511 + }, + { + "epoch": 0.17566076096427535, + "grad_norm": 0.3710338771343231, + "learning_rate": 0.0001, + "loss": 1.8252, + "step": 1512 + }, + { + "epoch": 0.17577693871623584, + "grad_norm": 0.3623411953449249, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 1513 + }, + { + "epoch": 0.17589311646819633, + "grad_norm": 0.3631919026374817, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 1514 + }, + { + "epoch": 0.17600929422015685, + "grad_norm": 0.37533360719680786, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 1515 + }, + { + "epoch": 0.17612547197211734, + "grad_norm": 0.36933374404907227, + "learning_rate": 0.0001, + "loss": 1.8289, + "step": 1516 + }, + { + "epoch": 0.17624164972407783, + "grad_norm": 0.3541377782821655, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 1517 + }, + { + "epoch": 0.17635782747603834, + "grad_norm": 0.38855910301208496, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 1518 + }, + { + "epoch": 0.17647400522799883, + "grad_norm": 0.3846849203109741, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 1519 + }, + { + "epoch": 0.17659018297995935, + "grad_norm": 0.4078463912010193, + "learning_rate": 0.0001, + "loss": 1.8295, + "step": 1520 + }, + { + "epoch": 0.17670636073191984, + "grad_norm": 0.3415427803993225, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 1521 + }, + { + "epoch": 0.17682253848388033, + "grad_norm": 0.36249756813049316, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 1522 + }, + { + "epoch": 0.17693871623584084, + "grad_norm": 0.3513442277908325, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 1523 + }, + { + "epoch": 0.17705489398780133, + "grad_norm": 0.42712968587875366, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 1524 + }, + { + "epoch": 0.17717107173976185, + "grad_norm": 0.3688381612300873, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 1525 + }, + { + "epoch": 0.17728724949172234, + "grad_norm": 0.38176625967025757, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 1526 + }, + { + "epoch": 0.17740342724368283, + "grad_norm": 0.3425157070159912, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 1527 + }, + { + "epoch": 0.17751960499564334, + "grad_norm": 0.362857848405838, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 1528 + }, + { + "epoch": 0.17763578274760383, + "grad_norm": 0.3613092303276062, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 1529 + }, + { + "epoch": 0.17775196049956432, + "grad_norm": 0.35249873995780945, + "learning_rate": 0.0001, + "loss": 1.7397, + "step": 1530 + }, + { + "epoch": 0.17786813825152484, + "grad_norm": 0.3414962887763977, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 1531 + }, + { + "epoch": 0.17798431600348533, + "grad_norm": 0.3660277724266052, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 1532 + }, + { + "epoch": 0.17810049375544584, + "grad_norm": 0.344135046005249, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 1533 + }, + { + "epoch": 0.17821667150740633, + "grad_norm": 0.3552722632884979, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 1534 + }, + { + "epoch": 0.17833284925936682, + "grad_norm": 0.3565710186958313, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 1535 + }, + { + "epoch": 0.17844902701132734, + "grad_norm": 0.34754517674446106, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 1536 + }, + { + "epoch": 0.17856520476328783, + "grad_norm": 0.377332478761673, + "learning_rate": 0.0001, + "loss": 1.771, + "step": 1537 + }, + { + "epoch": 0.17868138251524834, + "grad_norm": 0.34879541397094727, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 1538 + }, + { + "epoch": 0.17879756026720883, + "grad_norm": 0.3693576753139496, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 1539 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 0.3447043001651764, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 1540 + }, + { + "epoch": 0.17902991577112984, + "grad_norm": 0.3493332862854004, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 1541 + }, + { + "epoch": 0.17914609352309033, + "grad_norm": 0.3783632814884186, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 1542 + }, + { + "epoch": 0.17926227127505082, + "grad_norm": 0.33536890149116516, + "learning_rate": 0.0001, + "loss": 1.7176, + "step": 1543 + }, + { + "epoch": 0.17937844902701133, + "grad_norm": 0.3568625748157501, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 1544 + }, + { + "epoch": 0.17949462677897182, + "grad_norm": 0.3375276029109955, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 1545 + }, + { + "epoch": 0.17961080453093234, + "grad_norm": 0.3463954031467438, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 1546 + }, + { + "epoch": 0.17972698228289283, + "grad_norm": 0.38217151165008545, + "learning_rate": 0.0001, + "loss": 1.7775, + "step": 1547 + }, + { + "epoch": 0.17984316003485332, + "grad_norm": 0.3745064437389374, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 1548 + }, + { + "epoch": 0.17995933778681383, + "grad_norm": 0.3647879660129547, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 1549 + }, + { + "epoch": 0.18007551553877432, + "grad_norm": 0.34927716851234436, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 1550 + }, + { + "epoch": 0.1801916932907348, + "grad_norm": 0.35309305787086487, + "learning_rate": 0.0001, + "loss": 1.7382, + "step": 1551 + }, + { + "epoch": 0.18030787104269533, + "grad_norm": 0.385146826505661, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 1552 + }, + { + "epoch": 0.18042404879465582, + "grad_norm": 0.345344603061676, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 1553 + }, + { + "epoch": 0.18054022654661633, + "grad_norm": 0.3495193123817444, + "learning_rate": 0.0001, + "loss": 1.87, + "step": 1554 + }, + { + "epoch": 0.18065640429857682, + "grad_norm": 0.36495864391326904, + "learning_rate": 0.0001, + "loss": 1.8289, + "step": 1555 + }, + { + "epoch": 0.1807725820505373, + "grad_norm": 0.34785377979278564, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 1556 + }, + { + "epoch": 0.18088875980249783, + "grad_norm": 0.36701300740242004, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 1557 + }, + { + "epoch": 0.18100493755445832, + "grad_norm": 0.3620215356349945, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 1558 + }, + { + "epoch": 0.18112111530641883, + "grad_norm": 0.34096696972846985, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 1559 + }, + { + "epoch": 0.18123729305837932, + "grad_norm": 0.3897305428981781, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 1560 + }, + { + "epoch": 0.1813534708103398, + "grad_norm": 0.3481612503528595, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 1561 + }, + { + "epoch": 0.18146964856230033, + "grad_norm": 0.3717382550239563, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 1562 + }, + { + "epoch": 0.18158582631426082, + "grad_norm": 0.3604913353919983, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 1563 + }, + { + "epoch": 0.1817020040662213, + "grad_norm": 0.36964964866638184, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 1564 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.35544395446777344, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 1565 + }, + { + "epoch": 0.1819343595701423, + "grad_norm": 0.36883848905563354, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 1566 + }, + { + "epoch": 0.18205053732210283, + "grad_norm": 0.36678242683410645, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 1567 + }, + { + "epoch": 0.18216671507406332, + "grad_norm": 0.3713185489177704, + "learning_rate": 0.0001, + "loss": 1.8655, + "step": 1568 + }, + { + "epoch": 0.1822828928260238, + "grad_norm": 0.32219791412353516, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 1569 + }, + { + "epoch": 0.18239907057798432, + "grad_norm": 0.3948063254356384, + "learning_rate": 0.0001, + "loss": 1.7762, + "step": 1570 + }, + { + "epoch": 0.1825152483299448, + "grad_norm": 0.35549992322921753, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 1571 + }, + { + "epoch": 0.1826314260819053, + "grad_norm": 0.36282774806022644, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 1572 + }, + { + "epoch": 0.18274760383386582, + "grad_norm": 0.35417771339416504, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 1573 + }, + { + "epoch": 0.1828637815858263, + "grad_norm": 0.406324565410614, + "learning_rate": 0.0001, + "loss": 1.9048, + "step": 1574 + }, + { + "epoch": 0.18297995933778682, + "grad_norm": 0.3815106153488159, + "learning_rate": 0.0001, + "loss": 1.7483, + "step": 1575 + }, + { + "epoch": 0.1830961370897473, + "grad_norm": 0.3650597929954529, + "learning_rate": 0.0001, + "loss": 1.8492, + "step": 1576 + }, + { + "epoch": 0.1832123148417078, + "grad_norm": 0.3681640326976776, + "learning_rate": 0.0001, + "loss": 1.8177, + "step": 1577 + }, + { + "epoch": 0.18332849259366832, + "grad_norm": 0.3654261529445648, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 1578 + }, + { + "epoch": 0.1834446703456288, + "grad_norm": 0.35047435760498047, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 1579 + }, + { + "epoch": 0.18356084809758932, + "grad_norm": 0.33045920729637146, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 1580 + }, + { + "epoch": 0.1836770258495498, + "grad_norm": 0.3612997233867645, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 1581 + }, + { + "epoch": 0.1837932036015103, + "grad_norm": 0.382269948720932, + "learning_rate": 0.0001, + "loss": 1.8914, + "step": 1582 + }, + { + "epoch": 0.18390938135347082, + "grad_norm": 0.348799467086792, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 1583 + }, + { + "epoch": 0.1840255591054313, + "grad_norm": 0.3481920063495636, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 1584 + }, + { + "epoch": 0.1841417368573918, + "grad_norm": 0.38168272376060486, + "learning_rate": 0.0001, + "loss": 1.8146, + "step": 1585 + }, + { + "epoch": 0.1842579146093523, + "grad_norm": 0.3492324650287628, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 1586 + }, + { + "epoch": 0.1843740923613128, + "grad_norm": 0.38045451045036316, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 1587 + }, + { + "epoch": 0.18449027011327332, + "grad_norm": 0.3941299617290497, + "learning_rate": 0.0001, + "loss": 1.9037, + "step": 1588 + }, + { + "epoch": 0.1846064478652338, + "grad_norm": 0.3451482355594635, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 1589 + }, + { + "epoch": 0.1847226256171943, + "grad_norm": 0.36182719469070435, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 1590 + }, + { + "epoch": 0.1848388033691548, + "grad_norm": 0.3482245206832886, + "learning_rate": 0.0001, + "loss": 1.7429, + "step": 1591 + }, + { + "epoch": 0.1849549811211153, + "grad_norm": 0.3704969584941864, + "learning_rate": 0.0001, + "loss": 1.7674, + "step": 1592 + }, + { + "epoch": 0.18507115887307582, + "grad_norm": 0.3831556737422943, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 1593 + }, + { + "epoch": 0.1851873366250363, + "grad_norm": 0.3729779124259949, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 1594 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 0.38106775283813477, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 1595 + }, + { + "epoch": 0.18541969212895731, + "grad_norm": 0.3817857801914215, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 1596 + }, + { + "epoch": 0.1855358698809178, + "grad_norm": 0.35032176971435547, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 1597 + }, + { + "epoch": 0.1856520476328783, + "grad_norm": 0.3457838296890259, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 1598 + }, + { + "epoch": 0.1857682253848388, + "grad_norm": 0.37174192070961, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 1599 + }, + { + "epoch": 0.1858844031367993, + "grad_norm": 0.36577579379081726, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 1600 + }, + { + "epoch": 0.18600058088875981, + "grad_norm": 0.3587372899055481, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 1601 + }, + { + "epoch": 0.1861167586407203, + "grad_norm": 0.3637838363647461, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 1602 + }, + { + "epoch": 0.1862329363926808, + "grad_norm": 0.3631284534931183, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 1603 + }, + { + "epoch": 0.1863491141446413, + "grad_norm": 0.3679940104484558, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 1604 + }, + { + "epoch": 0.1864652918966018, + "grad_norm": 0.36848756670951843, + "learning_rate": 0.0001, + "loss": 1.7786, + "step": 1605 + }, + { + "epoch": 0.1865814696485623, + "grad_norm": 0.36538413166999817, + "learning_rate": 0.0001, + "loss": 1.7454, + "step": 1606 + }, + { + "epoch": 0.1866976474005228, + "grad_norm": 0.382051944732666, + "learning_rate": 0.0001, + "loss": 1.842, + "step": 1607 + }, + { + "epoch": 0.1868138251524833, + "grad_norm": 0.33358198404312134, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1608 + }, + { + "epoch": 0.1869300029044438, + "grad_norm": 0.3623516261577606, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 1609 + }, + { + "epoch": 0.1870461806564043, + "grad_norm": 0.37408748269081116, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 1610 + }, + { + "epoch": 0.1871623584083648, + "grad_norm": 0.38509878516197205, + "learning_rate": 0.0001, + "loss": 1.935, + "step": 1611 + }, + { + "epoch": 0.1872785361603253, + "grad_norm": 0.39082857966423035, + "learning_rate": 0.0001, + "loss": 1.7414, + "step": 1612 + }, + { + "epoch": 0.1873947139122858, + "grad_norm": 0.3575880825519562, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 1613 + }, + { + "epoch": 0.1875108916642463, + "grad_norm": 0.3531934916973114, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 1614 + }, + { + "epoch": 0.1876270694162068, + "grad_norm": 0.3784133493900299, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 1615 + }, + { + "epoch": 0.1877432471681673, + "grad_norm": 0.37965330481529236, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 1616 + }, + { + "epoch": 0.1878594249201278, + "grad_norm": 0.36382848024368286, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 1617 + }, + { + "epoch": 0.1879756026720883, + "grad_norm": 0.3851543068885803, + "learning_rate": 0.0001, + "loss": 1.7535, + "step": 1618 + }, + { + "epoch": 0.18809178042404878, + "grad_norm": 0.3497619926929474, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 1619 + }, + { + "epoch": 0.1882079581760093, + "grad_norm": 0.34661221504211426, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 1620 + }, + { + "epoch": 0.1883241359279698, + "grad_norm": 0.348257452249527, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 1621 + }, + { + "epoch": 0.1884403136799303, + "grad_norm": 0.3511572480201721, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 1622 + }, + { + "epoch": 0.1885564914318908, + "grad_norm": 0.3942771553993225, + "learning_rate": 0.0001, + "loss": 1.815, + "step": 1623 + }, + { + "epoch": 0.18867266918385128, + "grad_norm": 0.3480950593948364, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 1624 + }, + { + "epoch": 0.1887888469358118, + "grad_norm": 0.3844303786754608, + "learning_rate": 0.0001, + "loss": 1.8658, + "step": 1625 + }, + { + "epoch": 0.1889050246877723, + "grad_norm": 0.36564818024635315, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 1626 + }, + { + "epoch": 0.1890212024397328, + "grad_norm": 0.3397161662578583, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 1627 + }, + { + "epoch": 0.1891373801916933, + "grad_norm": 0.32916903495788574, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 1628 + }, + { + "epoch": 0.18925355794365378, + "grad_norm": 0.37873196601867676, + "learning_rate": 0.0001, + "loss": 1.754, + "step": 1629 + }, + { + "epoch": 0.1893697356956143, + "grad_norm": 0.3580092787742615, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 1630 + }, + { + "epoch": 0.1894859134475748, + "grad_norm": 0.38638558983802795, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 1631 + }, + { + "epoch": 0.18960209119953528, + "grad_norm": 0.39580628275871277, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 1632 + }, + { + "epoch": 0.1897182689514958, + "grad_norm": 0.3963046967983246, + "learning_rate": 0.0001, + "loss": 1.8863, + "step": 1633 + }, + { + "epoch": 0.18983444670345628, + "grad_norm": 0.33917540311813354, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 1634 + }, + { + "epoch": 0.1899506244554168, + "grad_norm": 0.4500591456890106, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 1635 + }, + { + "epoch": 0.1900668022073773, + "grad_norm": 0.3522135615348816, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 1636 + }, + { + "epoch": 0.19018297995933778, + "grad_norm": 0.3856748640537262, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 1637 + }, + { + "epoch": 0.1902991577112983, + "grad_norm": 0.4033121168613434, + "learning_rate": 0.0001, + "loss": 1.8365, + "step": 1638 + }, + { + "epoch": 0.19041533546325878, + "grad_norm": 0.3969012200832367, + "learning_rate": 0.0001, + "loss": 1.9015, + "step": 1639 + }, + { + "epoch": 0.19053151321521927, + "grad_norm": 0.3580697178840637, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 1640 + }, + { + "epoch": 0.1906476909671798, + "grad_norm": 0.37800854444503784, + "learning_rate": 0.0001, + "loss": 1.7347, + "step": 1641 + }, + { + "epoch": 0.19076386871914028, + "grad_norm": 0.3692120313644409, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 1642 + }, + { + "epoch": 0.1908800464711008, + "grad_norm": 0.380416601896286, + "learning_rate": 0.0001, + "loss": 1.851, + "step": 1643 + }, + { + "epoch": 0.19099622422306128, + "grad_norm": 0.3523741364479065, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 1644 + }, + { + "epoch": 0.19111240197502177, + "grad_norm": 0.338820219039917, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 1645 + }, + { + "epoch": 0.1912285797269823, + "grad_norm": 0.356183260679245, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 1646 + }, + { + "epoch": 0.19134475747894278, + "grad_norm": 0.34077268838882446, + "learning_rate": 0.0001, + "loss": 1.7581, + "step": 1647 + }, + { + "epoch": 0.1914609352309033, + "grad_norm": 0.3771745562553406, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 1648 + }, + { + "epoch": 0.19157711298286378, + "grad_norm": 0.3605990409851074, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 1649 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.3496057987213135, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 1650 + }, + { + "epoch": 0.1918094684867848, + "grad_norm": 0.39877089858055115, + "learning_rate": 0.0001, + "loss": 1.9872, + "step": 1651 + }, + { + "epoch": 0.19192564623874528, + "grad_norm": 0.3556186556816101, + "learning_rate": 0.0001, + "loss": 1.7769, + "step": 1652 + }, + { + "epoch": 0.19204182399070577, + "grad_norm": 0.3624957501888275, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 1653 + }, + { + "epoch": 0.19215800174266628, + "grad_norm": 0.3758990466594696, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 1654 + }, + { + "epoch": 0.19227417949462677, + "grad_norm": 0.38239556550979614, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 1655 + }, + { + "epoch": 0.1923903572465873, + "grad_norm": 0.3543239235877991, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 1656 + }, + { + "epoch": 0.19250653499854778, + "grad_norm": 0.3758254647254944, + "learning_rate": 0.0001, + "loss": 1.8641, + "step": 1657 + }, + { + "epoch": 0.19262271275050827, + "grad_norm": 0.40138089656829834, + "learning_rate": 0.0001, + "loss": 1.8514, + "step": 1658 + }, + { + "epoch": 0.19273889050246878, + "grad_norm": 0.36116254329681396, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 1659 + }, + { + "epoch": 0.19285506825442927, + "grad_norm": 0.34624597430229187, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 1660 + }, + { + "epoch": 0.19297124600638976, + "grad_norm": 0.3567349910736084, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 1661 + }, + { + "epoch": 0.19308742375835028, + "grad_norm": 0.35692065954208374, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 1662 + }, + { + "epoch": 0.19320360151031077, + "grad_norm": 0.32495301961898804, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 1663 + }, + { + "epoch": 0.19331977926227129, + "grad_norm": 0.3486839830875397, + "learning_rate": 0.0001, + "loss": 1.739, + "step": 1664 + }, + { + "epoch": 0.19343595701423177, + "grad_norm": 0.3624487817287445, + "learning_rate": 0.0001, + "loss": 1.783, + "step": 1665 + }, + { + "epoch": 0.19355213476619226, + "grad_norm": 0.34828513860702515, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 1666 + }, + { + "epoch": 0.19366831251815278, + "grad_norm": 0.339197039604187, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 1667 + }, + { + "epoch": 0.19378449027011327, + "grad_norm": 0.3725949227809906, + "learning_rate": 0.0001, + "loss": 1.7417, + "step": 1668 + }, + { + "epoch": 0.19390066802207379, + "grad_norm": 0.346892386674881, + "learning_rate": 0.0001, + "loss": 1.4863, + "step": 1669 + }, + { + "epoch": 0.19401684577403427, + "grad_norm": 0.36845695972442627, + "learning_rate": 0.0001, + "loss": 1.8, + "step": 1670 + }, + { + "epoch": 0.19413302352599476, + "grad_norm": 0.33988621830940247, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 1671 + }, + { + "epoch": 0.19424920127795528, + "grad_norm": 0.35175544023513794, + "learning_rate": 0.0001, + "loss": 1.7329, + "step": 1672 + }, + { + "epoch": 0.19436537902991577, + "grad_norm": 0.35789933800697327, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 1673 + }, + { + "epoch": 0.19448155678187626, + "grad_norm": 0.371448278427124, + "learning_rate": 0.0001, + "loss": 1.7318, + "step": 1674 + }, + { + "epoch": 0.19459773453383677, + "grad_norm": 0.3563764989376068, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 1675 + }, + { + "epoch": 0.19471391228579726, + "grad_norm": 0.38420358300209045, + "learning_rate": 0.0001, + "loss": 1.8371, + "step": 1676 + }, + { + "epoch": 0.19483009003775778, + "grad_norm": 0.3995816707611084, + "learning_rate": 0.0001, + "loss": 1.8055, + "step": 1677 + }, + { + "epoch": 0.19494626778971827, + "grad_norm": 0.3630034923553467, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 1678 + }, + { + "epoch": 0.19506244554167876, + "grad_norm": 0.3776208162307739, + "learning_rate": 0.0001, + "loss": 1.8491, + "step": 1679 + }, + { + "epoch": 0.19517862329363928, + "grad_norm": 0.3581395149230957, + "learning_rate": 0.0001, + "loss": 1.8839, + "step": 1680 + }, + { + "epoch": 0.19529480104559976, + "grad_norm": 0.3381625711917877, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 1681 + }, + { + "epoch": 0.19541097879756028, + "grad_norm": 0.3696902096271515, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 1682 + }, + { + "epoch": 0.19552715654952077, + "grad_norm": 0.3632832169532776, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 1683 + }, + { + "epoch": 0.19564333430148126, + "grad_norm": 0.37248799204826355, + "learning_rate": 0.0001, + "loss": 1.7679, + "step": 1684 + }, + { + "epoch": 0.19575951205344178, + "grad_norm": 0.354970782995224, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 1685 + }, + { + "epoch": 0.19587568980540226, + "grad_norm": 0.370175838470459, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 1686 + }, + { + "epoch": 0.19599186755736275, + "grad_norm": 0.38560378551483154, + "learning_rate": 0.0001, + "loss": 1.8188, + "step": 1687 + }, + { + "epoch": 0.19610804530932327, + "grad_norm": 0.3574475049972534, + "learning_rate": 0.0001, + "loss": 1.7771, + "step": 1688 + }, + { + "epoch": 0.19622422306128376, + "grad_norm": 0.39494848251342773, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 1689 + }, + { + "epoch": 0.19634040081324428, + "grad_norm": 0.36588966846466064, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 1690 + }, + { + "epoch": 0.19645657856520476, + "grad_norm": 0.3459327220916748, + "learning_rate": 0.0001, + "loss": 1.739, + "step": 1691 + }, + { + "epoch": 0.19657275631716525, + "grad_norm": 0.36302775144577026, + "learning_rate": 0.0001, + "loss": 1.741, + "step": 1692 + }, + { + "epoch": 0.19668893406912577, + "grad_norm": 0.3560570776462555, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 1693 + }, + { + "epoch": 0.19680511182108626, + "grad_norm": 0.35664597153663635, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1694 + }, + { + "epoch": 0.19692128957304675, + "grad_norm": 0.35501745343208313, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 1695 + }, + { + "epoch": 0.19703746732500727, + "grad_norm": 0.36302104592323303, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 1696 + }, + { + "epoch": 0.19715364507696775, + "grad_norm": 0.3904525637626648, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 1697 + }, + { + "epoch": 0.19726982282892827, + "grad_norm": 0.38676998019218445, + "learning_rate": 0.0001, + "loss": 1.8969, + "step": 1698 + }, + { + "epoch": 0.19738600058088876, + "grad_norm": 0.3899071514606476, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 1699 + }, + { + "epoch": 0.19750217833284925, + "grad_norm": 0.38243070244789124, + "learning_rate": 0.0001, + "loss": 1.9196, + "step": 1700 + }, + { + "epoch": 0.19761835608480977, + "grad_norm": 0.3663628101348877, + "learning_rate": 0.0001, + "loss": 1.9099, + "step": 1701 + }, + { + "epoch": 0.19773453383677025, + "grad_norm": 0.3713766634464264, + "learning_rate": 0.0001, + "loss": 1.9145, + "step": 1702 + }, + { + "epoch": 0.19785071158873077, + "grad_norm": 0.36897388100624084, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 1703 + }, + { + "epoch": 0.19796688934069126, + "grad_norm": 0.36905625462532043, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 1704 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 0.35968464612960815, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 1705 + }, + { + "epoch": 0.19819924484461227, + "grad_norm": 0.3755891025066376, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 1706 + }, + { + "epoch": 0.19831542259657275, + "grad_norm": 0.3964156210422516, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 1707 + }, + { + "epoch": 0.19843160034853324, + "grad_norm": 0.3733653724193573, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 1708 + }, + { + "epoch": 0.19854777810049376, + "grad_norm": 0.36199355125427246, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 1709 + }, + { + "epoch": 0.19866395585245425, + "grad_norm": 0.37445268034935, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 1710 + }, + { + "epoch": 0.19878013360441477, + "grad_norm": 0.3383803069591522, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 1711 + }, + { + "epoch": 0.19889631135637525, + "grad_norm": 0.3505041301250458, + "learning_rate": 0.0001, + "loss": 1.7243, + "step": 1712 + }, + { + "epoch": 0.19901248910833574, + "grad_norm": 0.3884884715080261, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 1713 + }, + { + "epoch": 0.19912866686029626, + "grad_norm": 0.3483599126338959, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 1714 + }, + { + "epoch": 0.19924484461225675, + "grad_norm": 0.36730775237083435, + "learning_rate": 0.0001, + "loss": 1.8406, + "step": 1715 + }, + { + "epoch": 0.19936102236421724, + "grad_norm": 0.3688085675239563, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 1716 + }, + { + "epoch": 0.19947720011617776, + "grad_norm": 0.3598758578300476, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 1717 + }, + { + "epoch": 0.19959337786813824, + "grad_norm": 0.3627162277698517, + "learning_rate": 0.0001, + "loss": 1.8198, + "step": 1718 + }, + { + "epoch": 0.19970955562009876, + "grad_norm": 0.35776904225349426, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 1719 + }, + { + "epoch": 0.19982573337205925, + "grad_norm": 0.3551950454711914, + "learning_rate": 0.0001, + "loss": 1.7584, + "step": 1720 + }, + { + "epoch": 0.19994191112401974, + "grad_norm": 0.3886015713214874, + "learning_rate": 0.0001, + "loss": 1.8489, + "step": 1721 + }, + { + "epoch": 0.20005808887598026, + "grad_norm": 0.396438330411911, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 1722 + }, + { + "epoch": 0.20017426662794074, + "grad_norm": 0.38339731097221375, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 1723 + }, + { + "epoch": 0.20029044437990126, + "grad_norm": 0.4310664236545563, + "learning_rate": 0.0001, + "loss": 1.9913, + "step": 1724 + }, + { + "epoch": 0.20040662213186175, + "grad_norm": 0.34265899658203125, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1725 + }, + { + "epoch": 0.20052279988382224, + "grad_norm": 0.33483079075813293, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 1726 + }, + { + "epoch": 0.20063897763578276, + "grad_norm": 0.34570086002349854, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 1727 + }, + { + "epoch": 0.20075515538774324, + "grad_norm": 0.36235955357551575, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 1728 + }, + { + "epoch": 0.20087133313970373, + "grad_norm": 0.3711889088153839, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 1729 + }, + { + "epoch": 0.20098751089166425, + "grad_norm": 0.34235769510269165, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 1730 + }, + { + "epoch": 0.20110368864362474, + "grad_norm": 0.35882294178009033, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 1731 + }, + { + "epoch": 0.20121986639558526, + "grad_norm": 0.3647457957267761, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 1732 + }, + { + "epoch": 0.20133604414754575, + "grad_norm": 0.35160166025161743, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 1733 + }, + { + "epoch": 0.20145222189950623, + "grad_norm": 0.38673707842826843, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 1734 + }, + { + "epoch": 0.20156839965146675, + "grad_norm": 0.3855576813220978, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 1735 + }, + { + "epoch": 0.20168457740342724, + "grad_norm": 0.34935352206230164, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 1736 + }, + { + "epoch": 0.20180075515538776, + "grad_norm": 0.3582732379436493, + "learning_rate": 0.0001, + "loss": 1.7167, + "step": 1737 + }, + { + "epoch": 0.20191693290734825, + "grad_norm": 0.35151124000549316, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 1738 + }, + { + "epoch": 0.20203311065930873, + "grad_norm": 0.3719716966152191, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 1739 + }, + { + "epoch": 0.20214928841126925, + "grad_norm": 0.35581451654434204, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 1740 + }, + { + "epoch": 0.20226546616322974, + "grad_norm": 0.3818107545375824, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 1741 + }, + { + "epoch": 0.20238164391519023, + "grad_norm": 0.3568393588066101, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 1742 + }, + { + "epoch": 0.20249782166715075, + "grad_norm": 0.38510867953300476, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 1743 + }, + { + "epoch": 0.20261399941911123, + "grad_norm": 0.36341622471809387, + "learning_rate": 0.0001, + "loss": 1.7387, + "step": 1744 + }, + { + "epoch": 0.20273017717107175, + "grad_norm": 0.38483190536499023, + "learning_rate": 0.0001, + "loss": 1.8348, + "step": 1745 + }, + { + "epoch": 0.20284635492303224, + "grad_norm": 0.3468552529811859, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 1746 + }, + { + "epoch": 0.20296253267499273, + "grad_norm": 0.388603150844574, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 1747 + }, + { + "epoch": 0.20307871042695325, + "grad_norm": 0.3520529866218567, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 1748 + }, + { + "epoch": 0.20319488817891374, + "grad_norm": 0.39672860503196716, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 1749 + }, + { + "epoch": 0.20331106593087422, + "grad_norm": 0.3811289668083191, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 1750 + }, + { + "epoch": 0.20342724368283474, + "grad_norm": 0.3765932321548462, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 1751 + }, + { + "epoch": 0.20354342143479523, + "grad_norm": 0.33500736951828003, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 1752 + }, + { + "epoch": 0.20365959918675575, + "grad_norm": 0.3680993318557739, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 1753 + }, + { + "epoch": 0.20377577693871624, + "grad_norm": 0.3892023265361786, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 1754 + }, + { + "epoch": 0.20389195469067672, + "grad_norm": 0.3450574576854706, + "learning_rate": 0.0001, + "loss": 1.7574, + "step": 1755 + }, + { + "epoch": 0.20400813244263724, + "grad_norm": 0.37310826778411865, + "learning_rate": 0.0001, + "loss": 1.8353, + "step": 1756 + }, + { + "epoch": 0.20412431019459773, + "grad_norm": 0.3621986210346222, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 1757 + }, + { + "epoch": 0.20424048794655825, + "grad_norm": 0.3675428628921509, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 1758 + }, + { + "epoch": 0.20435666569851874, + "grad_norm": 0.3777763843536377, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 1759 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.3660062849521637, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 1760 + }, + { + "epoch": 0.20458902120243974, + "grad_norm": 0.3744828402996063, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 1761 + }, + { + "epoch": 0.20470519895440023, + "grad_norm": 0.3762773275375366, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 1762 + }, + { + "epoch": 0.20482137670636072, + "grad_norm": 0.3429649770259857, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1763 + }, + { + "epoch": 0.20493755445832124, + "grad_norm": 0.35902631282806396, + "learning_rate": 0.0001, + "loss": 1.7535, + "step": 1764 + }, + { + "epoch": 0.20505373221028173, + "grad_norm": 0.37041351199150085, + "learning_rate": 0.0001, + "loss": 1.8098, + "step": 1765 + }, + { + "epoch": 0.20516990996224224, + "grad_norm": 0.3608233332633972, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 1766 + }, + { + "epoch": 0.20528608771420273, + "grad_norm": 0.39614373445510864, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 1767 + }, + { + "epoch": 0.20540226546616322, + "grad_norm": 0.3881773054599762, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 1768 + }, + { + "epoch": 0.20551844321812374, + "grad_norm": 0.3501490354537964, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 1769 + }, + { + "epoch": 0.20563462097008423, + "grad_norm": 0.36642521619796753, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 1770 + }, + { + "epoch": 0.20575079872204474, + "grad_norm": 0.3428248167037964, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 1771 + }, + { + "epoch": 0.20586697647400523, + "grad_norm": 0.39081087708473206, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 1772 + }, + { + "epoch": 0.20598315422596572, + "grad_norm": 0.3991013467311859, + "learning_rate": 0.0001, + "loss": 1.9912, + "step": 1773 + }, + { + "epoch": 0.20609933197792624, + "grad_norm": 0.3668970763683319, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 1774 + }, + { + "epoch": 0.20621550972988673, + "grad_norm": 0.36014431715011597, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 1775 + }, + { + "epoch": 0.20633168748184721, + "grad_norm": 0.38143759965896606, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 1776 + }, + { + "epoch": 0.20644786523380773, + "grad_norm": 0.382790207862854, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 1777 + }, + { + "epoch": 0.20656404298576822, + "grad_norm": 0.355749249458313, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 1778 + }, + { + "epoch": 0.20668022073772874, + "grad_norm": 0.3811320662498474, + "learning_rate": 0.0001, + "loss": 1.8527, + "step": 1779 + }, + { + "epoch": 0.20679639848968923, + "grad_norm": 0.3613418638706207, + "learning_rate": 0.0001, + "loss": 1.7679, + "step": 1780 + }, + { + "epoch": 0.20691257624164971, + "grad_norm": 0.36926668882369995, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 1781 + }, + { + "epoch": 0.20702875399361023, + "grad_norm": 0.3642374277114868, + "learning_rate": 0.0001, + "loss": 1.6741, + "step": 1782 + }, + { + "epoch": 0.20714493174557072, + "grad_norm": 0.4205605685710907, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 1783 + }, + { + "epoch": 0.2072611094975312, + "grad_norm": 0.3942674994468689, + "learning_rate": 0.0001, + "loss": 1.7935, + "step": 1784 + }, + { + "epoch": 0.20737728724949173, + "grad_norm": 0.3895415663719177, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 1785 + }, + { + "epoch": 0.20749346500145222, + "grad_norm": 0.3891303539276123, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 1786 + }, + { + "epoch": 0.20760964275341273, + "grad_norm": 0.3562510311603546, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 1787 + }, + { + "epoch": 0.20772582050537322, + "grad_norm": 0.35545283555984497, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 1788 + }, + { + "epoch": 0.2078419982573337, + "grad_norm": 0.40169668197631836, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 1789 + }, + { + "epoch": 0.20795817600929423, + "grad_norm": 0.36041274666786194, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 1790 + }, + { + "epoch": 0.20807435376125472, + "grad_norm": 0.36906883120536804, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 1791 + }, + { + "epoch": 0.20819053151321523, + "grad_norm": 0.35365477204322815, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 1792 + }, + { + "epoch": 0.20830670926517572, + "grad_norm": 0.3502259850502014, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 1793 + }, + { + "epoch": 0.2084228870171362, + "grad_norm": 0.3471246659755707, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 1794 + }, + { + "epoch": 0.20853906476909673, + "grad_norm": 0.36685308814048767, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 1795 + }, + { + "epoch": 0.20865524252105722, + "grad_norm": 0.40070536732673645, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 1796 + }, + { + "epoch": 0.2087714202730177, + "grad_norm": 0.35480430722236633, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 1797 + }, + { + "epoch": 0.20888759802497822, + "grad_norm": 0.35834869742393494, + "learning_rate": 0.0001, + "loss": 1.8237, + "step": 1798 + }, + { + "epoch": 0.2090037757769387, + "grad_norm": 0.521961510181427, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 1799 + }, + { + "epoch": 0.20911995352889923, + "grad_norm": 0.36445826292037964, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 1800 + }, + { + "epoch": 0.20923613128085972, + "grad_norm": 0.3719666302204132, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 1801 + }, + { + "epoch": 0.2093523090328202, + "grad_norm": 0.37383589148521423, + "learning_rate": 0.0001, + "loss": 1.8157, + "step": 1802 + }, + { + "epoch": 0.20946848678478072, + "grad_norm": 0.3597404956817627, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 1803 + }, + { + "epoch": 0.2095846645367412, + "grad_norm": 0.3522038161754608, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 1804 + }, + { + "epoch": 0.2097008422887017, + "grad_norm": 0.37592774629592896, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 1805 + }, + { + "epoch": 0.20981702004066222, + "grad_norm": 0.3839535117149353, + "learning_rate": 0.0001, + "loss": 1.7384, + "step": 1806 + }, + { + "epoch": 0.2099331977926227, + "grad_norm": 0.3627369999885559, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 1807 + }, + { + "epoch": 0.21004937554458322, + "grad_norm": 0.36218005418777466, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 1808 + }, + { + "epoch": 0.2101655532965437, + "grad_norm": 0.3566618263721466, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 1809 + }, + { + "epoch": 0.2102817310485042, + "grad_norm": 0.3671146631240845, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 1810 + }, + { + "epoch": 0.21039790880046472, + "grad_norm": 0.3826962113380432, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 1811 + }, + { + "epoch": 0.2105140865524252, + "grad_norm": 0.36169835925102234, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 1812 + }, + { + "epoch": 0.21063026430438572, + "grad_norm": 0.3707777261734009, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 1813 + }, + { + "epoch": 0.2107464420563462, + "grad_norm": 0.3656941056251526, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 1814 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 0.3701861500740051, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 1815 + }, + { + "epoch": 0.21097879756026722, + "grad_norm": 0.37783530354499817, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 1816 + }, + { + "epoch": 0.2110949753122277, + "grad_norm": 0.3402620255947113, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 1817 + }, + { + "epoch": 0.2112111530641882, + "grad_norm": 0.3584825098514557, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 1818 + }, + { + "epoch": 0.2113273308161487, + "grad_norm": 0.37935203313827515, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 1819 + }, + { + "epoch": 0.2114435085681092, + "grad_norm": 0.3971646726131439, + "learning_rate": 0.0001, + "loss": 1.9348, + "step": 1820 + }, + { + "epoch": 0.21155968632006972, + "grad_norm": 0.3793870210647583, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 1821 + }, + { + "epoch": 0.2116758640720302, + "grad_norm": 0.34898293018341064, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 1822 + }, + { + "epoch": 0.2117920418239907, + "grad_norm": 0.3546793758869171, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 1823 + }, + { + "epoch": 0.2119082195759512, + "grad_norm": 0.39553505182266235, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 1824 + }, + { + "epoch": 0.2120243973279117, + "grad_norm": 0.38580450415611267, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 1825 + }, + { + "epoch": 0.21214057507987222, + "grad_norm": 0.3958745300769806, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 1826 + }, + { + "epoch": 0.2122567528318327, + "grad_norm": 0.3790923058986664, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 1827 + }, + { + "epoch": 0.2123729305837932, + "grad_norm": 0.3835214376449585, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 1828 + }, + { + "epoch": 0.2124891083357537, + "grad_norm": 0.36152535676956177, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 1829 + }, + { + "epoch": 0.2126052860877142, + "grad_norm": 0.3254132866859436, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 1830 + }, + { + "epoch": 0.2127214638396747, + "grad_norm": 0.38130825757980347, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 1831 + }, + { + "epoch": 0.2128376415916352, + "grad_norm": 0.36673715710639954, + "learning_rate": 0.0001, + "loss": 1.7599, + "step": 1832 + }, + { + "epoch": 0.2129538193435957, + "grad_norm": 0.39289578795433044, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 1833 + }, + { + "epoch": 0.2130699970955562, + "grad_norm": 0.35290926694869995, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 1834 + }, + { + "epoch": 0.2131861748475167, + "grad_norm": 0.36013439297676086, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 1835 + }, + { + "epoch": 0.2133023525994772, + "grad_norm": 0.39623382687568665, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 1836 + }, + { + "epoch": 0.2134185303514377, + "grad_norm": 0.37049758434295654, + "learning_rate": 0.0001, + "loss": 1.8266, + "step": 1837 + }, + { + "epoch": 0.2135347081033982, + "grad_norm": 0.37480175495147705, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 1838 + }, + { + "epoch": 0.21365088585535869, + "grad_norm": 0.38570842146873474, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 1839 + }, + { + "epoch": 0.2137670636073192, + "grad_norm": 0.4038733243942261, + "learning_rate": 0.0001, + "loss": 1.7587, + "step": 1840 + }, + { + "epoch": 0.2138832413592797, + "grad_norm": 0.3932957351207733, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 1841 + }, + { + "epoch": 0.2139994191112402, + "grad_norm": 0.3737775981426239, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 1842 + }, + { + "epoch": 0.2141155968632007, + "grad_norm": 0.39853671193122864, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 1843 + }, + { + "epoch": 0.21423177461516119, + "grad_norm": 0.37162113189697266, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 1844 + }, + { + "epoch": 0.2143479523671217, + "grad_norm": 0.38657307624816895, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 1845 + }, + { + "epoch": 0.2144641301190822, + "grad_norm": 0.40344858169555664, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 1846 + }, + { + "epoch": 0.2145803078710427, + "grad_norm": 0.34257638454437256, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 1847 + }, + { + "epoch": 0.2146964856230032, + "grad_norm": 0.3880448043346405, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 1848 + }, + { + "epoch": 0.21481266337496369, + "grad_norm": 0.3583984971046448, + "learning_rate": 0.0001, + "loss": 1.596, + "step": 1849 + }, + { + "epoch": 0.2149288411269242, + "grad_norm": 0.36240407824516296, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 1850 + }, + { + "epoch": 0.2150450188788847, + "grad_norm": 0.3811880648136139, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 1851 + }, + { + "epoch": 0.21516119663084518, + "grad_norm": 0.4095620810985565, + "learning_rate": 0.0001, + "loss": 1.8439, + "step": 1852 + }, + { + "epoch": 0.2152773743828057, + "grad_norm": 0.43665236234664917, + "learning_rate": 0.0001, + "loss": 1.8788, + "step": 1853 + }, + { + "epoch": 0.2153935521347662, + "grad_norm": 0.37620803713798523, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 1854 + }, + { + "epoch": 0.2155097298867267, + "grad_norm": 0.3814584016799927, + "learning_rate": 0.0001, + "loss": 1.7707, + "step": 1855 + }, + { + "epoch": 0.2156259076386872, + "grad_norm": 0.349738210439682, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 1856 + }, + { + "epoch": 0.21574208539064768, + "grad_norm": 0.388741135597229, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 1857 + }, + { + "epoch": 0.2158582631426082, + "grad_norm": 0.37817415595054626, + "learning_rate": 0.0001, + "loss": 1.7183, + "step": 1858 + }, + { + "epoch": 0.2159744408945687, + "grad_norm": 0.3564184606075287, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 1859 + }, + { + "epoch": 0.2160906186465292, + "grad_norm": 0.3882390260696411, + "learning_rate": 0.0001, + "loss": 1.8215, + "step": 1860 + }, + { + "epoch": 0.2162067963984897, + "grad_norm": 0.3727805018424988, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 1861 + }, + { + "epoch": 0.21632297415045018, + "grad_norm": 0.3581012785434723, + "learning_rate": 0.0001, + "loss": 1.741, + "step": 1862 + }, + { + "epoch": 0.2164391519024107, + "grad_norm": 0.4040369391441345, + "learning_rate": 0.0001, + "loss": 1.8377, + "step": 1863 + }, + { + "epoch": 0.2165553296543712, + "grad_norm": 0.3850097060203552, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 1864 + }, + { + "epoch": 0.21667150740633168, + "grad_norm": 0.3745688796043396, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 1865 + }, + { + "epoch": 0.2167876851582922, + "grad_norm": 0.3721669018268585, + "learning_rate": 0.0001, + "loss": 1.7012, + "step": 1866 + }, + { + "epoch": 0.21690386291025268, + "grad_norm": 0.3597140610218048, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 1867 + }, + { + "epoch": 0.2170200406622132, + "grad_norm": 0.3266042470932007, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 1868 + }, + { + "epoch": 0.2171362184141737, + "grad_norm": 0.3769254982471466, + "learning_rate": 0.0001, + "loss": 1.9181, + "step": 1869 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.386996328830719, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 1870 + }, + { + "epoch": 0.2173685739180947, + "grad_norm": 0.3945530354976654, + "learning_rate": 0.0001, + "loss": 1.7364, + "step": 1871 + }, + { + "epoch": 0.21748475167005518, + "grad_norm": 0.39462509751319885, + "learning_rate": 0.0001, + "loss": 1.763, + "step": 1872 + }, + { + "epoch": 0.21760092942201567, + "grad_norm": 0.35269200801849365, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 1873 + }, + { + "epoch": 0.2177171071739762, + "grad_norm": 0.401889443397522, + "learning_rate": 0.0001, + "loss": 1.9107, + "step": 1874 + }, + { + "epoch": 0.21783328492593668, + "grad_norm": 0.3577077090740204, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 1875 + }, + { + "epoch": 0.2179494626778972, + "grad_norm": 0.3647770881652832, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 1876 + }, + { + "epoch": 0.21806564042985768, + "grad_norm": 0.37258070707321167, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 1877 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.38503706455230713, + "learning_rate": 0.0001, + "loss": 1.7757, + "step": 1878 + }, + { + "epoch": 0.2182979959337787, + "grad_norm": 0.3539654314517975, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 1879 + }, + { + "epoch": 0.21841417368573918, + "grad_norm": 0.3822212517261505, + "learning_rate": 0.0001, + "loss": 1.8839, + "step": 1880 + }, + { + "epoch": 0.2185303514376997, + "grad_norm": 0.34567397832870483, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 1881 + }, + { + "epoch": 0.21864652918966018, + "grad_norm": 0.359351247549057, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 1882 + }, + { + "epoch": 0.21876270694162067, + "grad_norm": 0.3686642348766327, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 1883 + }, + { + "epoch": 0.2188788846935812, + "grad_norm": 0.37803003191947937, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 1884 + }, + { + "epoch": 0.21899506244554168, + "grad_norm": 0.3631937503814697, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 1885 + }, + { + "epoch": 0.21911124019750217, + "grad_norm": 0.3699375092983246, + "learning_rate": 0.0001, + "loss": 1.7432, + "step": 1886 + }, + { + "epoch": 0.21922741794946268, + "grad_norm": 0.3758851885795593, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 1887 + }, + { + "epoch": 0.21934359570142317, + "grad_norm": 0.38159826397895813, + "learning_rate": 0.0001, + "loss": 1.6982, + "step": 1888 + }, + { + "epoch": 0.2194597734533837, + "grad_norm": 0.3970431983470917, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 1889 + }, + { + "epoch": 0.21957595120534418, + "grad_norm": 0.5288736820220947, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 1890 + }, + { + "epoch": 0.21969212895730467, + "grad_norm": 0.3814886212348938, + "learning_rate": 0.0001, + "loss": 1.7492, + "step": 1891 + }, + { + "epoch": 0.21980830670926518, + "grad_norm": 0.3462470471858978, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 1892 + }, + { + "epoch": 0.21992448446122567, + "grad_norm": 0.41686296463012695, + "learning_rate": 0.0001, + "loss": 1.7641, + "step": 1893 + }, + { + "epoch": 0.22004066221318616, + "grad_norm": 0.35188373923301697, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 1894 + }, + { + "epoch": 0.22015683996514668, + "grad_norm": 0.366418719291687, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 1895 + }, + { + "epoch": 0.22027301771710717, + "grad_norm": 0.3827008306980133, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 1896 + }, + { + "epoch": 0.22038919546906768, + "grad_norm": 0.38810524344444275, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 1897 + }, + { + "epoch": 0.22050537322102817, + "grad_norm": 0.3892178237438202, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 1898 + }, + { + "epoch": 0.22062155097298866, + "grad_norm": 0.34898847341537476, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 1899 + }, + { + "epoch": 0.22073772872494918, + "grad_norm": 0.38347798585891724, + "learning_rate": 0.0001, + "loss": 1.778, + "step": 1900 + }, + { + "epoch": 0.22085390647690967, + "grad_norm": 0.45999932289123535, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 1901 + }, + { + "epoch": 0.22097008422887018, + "grad_norm": 0.39396339654922485, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 1902 + }, + { + "epoch": 0.22108626198083067, + "grad_norm": 0.3777587413787842, + "learning_rate": 0.0001, + "loss": 1.885, + "step": 1903 + }, + { + "epoch": 0.22120243973279116, + "grad_norm": 0.38406991958618164, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 1904 + }, + { + "epoch": 0.22131861748475168, + "grad_norm": 0.4187374413013458, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 1905 + }, + { + "epoch": 0.22143479523671217, + "grad_norm": 0.35657206177711487, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 1906 + }, + { + "epoch": 0.22155097298867266, + "grad_norm": 0.3913048803806305, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 1907 + }, + { + "epoch": 0.22166715074063317, + "grad_norm": 0.38149452209472656, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 1908 + }, + { + "epoch": 0.22178332849259366, + "grad_norm": 0.3800641596317291, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 1909 + }, + { + "epoch": 0.22189950624455418, + "grad_norm": 0.3555182218551636, + "learning_rate": 0.0001, + "loss": 1.5213, + "step": 1910 + }, + { + "epoch": 0.22201568399651467, + "grad_norm": 0.3945232331752777, + "learning_rate": 0.0001, + "loss": 1.807, + "step": 1911 + }, + { + "epoch": 0.22213186174847516, + "grad_norm": 0.38593828678131104, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 1912 + }, + { + "epoch": 0.22224803950043567, + "grad_norm": 0.38784652948379517, + "learning_rate": 0.0001, + "loss": 1.7526, + "step": 1913 + }, + { + "epoch": 0.22236421725239616, + "grad_norm": 0.35396912693977356, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 1914 + }, + { + "epoch": 0.22248039500435668, + "grad_norm": 0.3825778365135193, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 1915 + }, + { + "epoch": 0.22259657275631717, + "grad_norm": 0.36573922634124756, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 1916 + }, + { + "epoch": 0.22271275050827766, + "grad_norm": 0.37928685545921326, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 1917 + }, + { + "epoch": 0.22282892826023817, + "grad_norm": 0.3511602580547333, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 1918 + }, + { + "epoch": 0.22294510601219866, + "grad_norm": 0.3809853494167328, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 1919 + }, + { + "epoch": 0.22306128376415915, + "grad_norm": 0.36619848012924194, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 1920 + }, + { + "epoch": 0.22317746151611967, + "grad_norm": 0.3719876706600189, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 1921 + }, + { + "epoch": 0.22329363926808016, + "grad_norm": 0.3887604773044586, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 1922 + }, + { + "epoch": 0.22340981702004067, + "grad_norm": 0.36279502511024475, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 1923 + }, + { + "epoch": 0.22352599477200116, + "grad_norm": 0.3789854645729065, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 1924 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 0.3797144293785095, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 1925 + }, + { + "epoch": 0.22375835027592217, + "grad_norm": 0.35952475666999817, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 1926 + }, + { + "epoch": 0.22387452802788266, + "grad_norm": 0.3811299800872803, + "learning_rate": 0.0001, + "loss": 1.9059, + "step": 1927 + }, + { + "epoch": 0.22399070577984315, + "grad_norm": 0.3837408721446991, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 1928 + }, + { + "epoch": 0.22410688353180366, + "grad_norm": 0.4026016891002655, + "learning_rate": 0.0001, + "loss": 1.8315, + "step": 1929 + }, + { + "epoch": 0.22422306128376415, + "grad_norm": 0.35901689529418945, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 1930 + }, + { + "epoch": 0.22433923903572467, + "grad_norm": 0.3816909193992615, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 1931 + }, + { + "epoch": 0.22445541678768516, + "grad_norm": 0.3439340889453888, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 1932 + }, + { + "epoch": 0.22457159453964565, + "grad_norm": 0.37772074341773987, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 1933 + }, + { + "epoch": 0.22468777229160616, + "grad_norm": 0.3728993833065033, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1934 + }, + { + "epoch": 0.22480395004356665, + "grad_norm": 0.3640214502811432, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 1935 + }, + { + "epoch": 0.22492012779552717, + "grad_norm": 0.37045779824256897, + "learning_rate": 0.0001, + "loss": 1.8218, + "step": 1936 + }, + { + "epoch": 0.22503630554748766, + "grad_norm": 0.3745948374271393, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 1937 + }, + { + "epoch": 0.22515248329944815, + "grad_norm": 0.36098146438598633, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 1938 + }, + { + "epoch": 0.22526866105140866, + "grad_norm": 0.3912782073020935, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 1939 + }, + { + "epoch": 0.22538483880336915, + "grad_norm": 0.35821422934532166, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 1940 + }, + { + "epoch": 0.22550101655532964, + "grad_norm": 0.3956640958786011, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 1941 + }, + { + "epoch": 0.22561719430729016, + "grad_norm": 0.3759971559047699, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 1942 + }, + { + "epoch": 0.22573337205925065, + "grad_norm": 0.42271122336387634, + "learning_rate": 0.0001, + "loss": 1.8635, + "step": 1943 + }, + { + "epoch": 0.22584954981121116, + "grad_norm": 0.350124329328537, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 1944 + }, + { + "epoch": 0.22596572756317165, + "grad_norm": 0.35673707723617554, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 1945 + }, + { + "epoch": 0.22608190531513214, + "grad_norm": 0.3845730721950531, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 1946 + }, + { + "epoch": 0.22619808306709266, + "grad_norm": 0.3668268620967865, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 1947 + }, + { + "epoch": 0.22631426081905315, + "grad_norm": 0.36193057894706726, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 1948 + }, + { + "epoch": 0.22643043857101366, + "grad_norm": 0.3776628077030182, + "learning_rate": 0.0001, + "loss": 1.7175, + "step": 1949 + }, + { + "epoch": 0.22654661632297415, + "grad_norm": 0.35988494753837585, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 1950 + }, + { + "epoch": 0.22666279407493464, + "grad_norm": 0.3582817614078522, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 1951 + }, + { + "epoch": 0.22677897182689516, + "grad_norm": 0.34842002391815186, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 1952 + }, + { + "epoch": 0.22689514957885565, + "grad_norm": 0.3792864978313446, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 1953 + }, + { + "epoch": 0.22701132733081614, + "grad_norm": 0.3827436566352844, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 1954 + }, + { + "epoch": 0.22712750508277665, + "grad_norm": 0.3784099817276001, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 1955 + }, + { + "epoch": 0.22724368283473714, + "grad_norm": 0.3828442096710205, + "learning_rate": 0.0001, + "loss": 1.7672, + "step": 1956 + }, + { + "epoch": 0.22735986058669766, + "grad_norm": 0.3599461019039154, + "learning_rate": 0.0001, + "loss": 1.7539, + "step": 1957 + }, + { + "epoch": 0.22747603833865815, + "grad_norm": 0.3842355012893677, + "learning_rate": 0.0001, + "loss": 1.8452, + "step": 1958 + }, + { + "epoch": 0.22759221609061864, + "grad_norm": 0.375945508480072, + "learning_rate": 0.0001, + "loss": 1.8502, + "step": 1959 + }, + { + "epoch": 0.22770839384257915, + "grad_norm": 0.4141533374786377, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 1960 + }, + { + "epoch": 0.22782457159453964, + "grad_norm": 0.37964141368865967, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 1961 + }, + { + "epoch": 0.22794074934650013, + "grad_norm": 0.39130285382270813, + "learning_rate": 0.0001, + "loss": 1.8114, + "step": 1962 + }, + { + "epoch": 0.22805692709846065, + "grad_norm": 0.3492740988731384, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 1963 + }, + { + "epoch": 0.22817310485042114, + "grad_norm": 0.36972182989120483, + "learning_rate": 0.0001, + "loss": 1.6917, + "step": 1964 + }, + { + "epoch": 0.22828928260238165, + "grad_norm": 0.3815283179283142, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 1965 + }, + { + "epoch": 0.22840546035434214, + "grad_norm": 0.3695911169052124, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 1966 + }, + { + "epoch": 0.22852163810630263, + "grad_norm": 0.3688741624355316, + "learning_rate": 0.0001, + "loss": 1.8406, + "step": 1967 + }, + { + "epoch": 0.22863781585826315, + "grad_norm": 0.3708171248435974, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 1968 + }, + { + "epoch": 0.22875399361022364, + "grad_norm": 0.406654417514801, + "learning_rate": 0.0001, + "loss": 1.8354, + "step": 1969 + }, + { + "epoch": 0.22887017136218415, + "grad_norm": 0.3633515536785126, + "learning_rate": 0.0001, + "loss": 1.6008, + "step": 1970 + }, + { + "epoch": 0.22898634911414464, + "grad_norm": 0.37781691551208496, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 1971 + }, + { + "epoch": 0.22910252686610513, + "grad_norm": 0.36972054839134216, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 1972 + }, + { + "epoch": 0.22921870461806565, + "grad_norm": 0.40130361914634705, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 1973 + }, + { + "epoch": 0.22933488237002614, + "grad_norm": 0.38525599241256714, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 1974 + }, + { + "epoch": 0.22945106012198663, + "grad_norm": 0.38195157051086426, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 1975 + }, + { + "epoch": 0.22956723787394714, + "grad_norm": 0.36674100160598755, + "learning_rate": 0.0001, + "loss": 1.5677, + "step": 1976 + }, + { + "epoch": 0.22968341562590763, + "grad_norm": 0.37068042159080505, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 1977 + }, + { + "epoch": 0.22979959337786815, + "grad_norm": 0.35614022612571716, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 1978 + }, + { + "epoch": 0.22991577112982864, + "grad_norm": 0.37935879826545715, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 1979 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.3637298047542572, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 1980 + }, + { + "epoch": 0.23014812663374964, + "grad_norm": 0.40035533905029297, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 1981 + }, + { + "epoch": 0.23026430438571013, + "grad_norm": 0.40497007966041565, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 1982 + }, + { + "epoch": 0.23038048213767062, + "grad_norm": 0.3771011233329773, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 1983 + }, + { + "epoch": 0.23049665988963114, + "grad_norm": 0.3938141465187073, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 1984 + }, + { + "epoch": 0.23061283764159163, + "grad_norm": 0.3939863443374634, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 1985 + }, + { + "epoch": 0.23072901539355214, + "grad_norm": 0.3473072052001953, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 1986 + }, + { + "epoch": 0.23084519314551263, + "grad_norm": 0.371954083442688, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 1987 + }, + { + "epoch": 0.23096137089747312, + "grad_norm": 0.38401806354522705, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 1988 + }, + { + "epoch": 0.23107754864943364, + "grad_norm": 0.36638298630714417, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 1989 + }, + { + "epoch": 0.23119372640139413, + "grad_norm": 0.365355521440506, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 1990 + }, + { + "epoch": 0.23130990415335465, + "grad_norm": 0.3541325330734253, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 1991 + }, + { + "epoch": 0.23142608190531513, + "grad_norm": 0.3768599033355713, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 1992 + }, + { + "epoch": 0.23154225965727562, + "grad_norm": 0.36274993419647217, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 1993 + }, + { + "epoch": 0.23165843740923614, + "grad_norm": 0.3988153636455536, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 1994 + }, + { + "epoch": 0.23177461516119663, + "grad_norm": 0.3925970494747162, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 1995 + }, + { + "epoch": 0.23189079291315712, + "grad_norm": 0.3741264045238495, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 1996 + }, + { + "epoch": 0.23200697066511763, + "grad_norm": 0.3511716425418854, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 1997 + }, + { + "epoch": 0.23212314841707812, + "grad_norm": 0.3565942645072937, + "learning_rate": 0.0001, + "loss": 1.7113, + "step": 1998 + }, + { + "epoch": 0.23223932616903864, + "grad_norm": 0.36350539326667786, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 1999 + }, + { + "epoch": 0.23235550392099913, + "grad_norm": 0.39565804600715637, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 2000 + }, + { + "epoch": 0.23247168167295962, + "grad_norm": 0.39192995429039, + "learning_rate": 0.0001, + "loss": 1.821, + "step": 2001 + }, + { + "epoch": 0.23258785942492013, + "grad_norm": 0.3590277433395386, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 2002 + }, + { + "epoch": 0.23270403717688062, + "grad_norm": 0.3515608310699463, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 2003 + }, + { + "epoch": 0.23282021492884114, + "grad_norm": 0.3733135759830475, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 2004 + }, + { + "epoch": 0.23293639268080163, + "grad_norm": 0.3820706903934479, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 2005 + }, + { + "epoch": 0.23305257043276212, + "grad_norm": 0.38101106882095337, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 2006 + }, + { + "epoch": 0.23316874818472264, + "grad_norm": 0.3717200756072998, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 2007 + }, + { + "epoch": 0.23328492593668312, + "grad_norm": 0.3890402913093567, + "learning_rate": 0.0001, + "loss": 1.8481, + "step": 2008 + }, + { + "epoch": 0.2334011036886436, + "grad_norm": 0.374593585729599, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 2009 + }, + { + "epoch": 0.23351728144060413, + "grad_norm": 0.37107715010643005, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 2010 + }, + { + "epoch": 0.23363345919256462, + "grad_norm": 0.37940695881843567, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 2011 + }, + { + "epoch": 0.23374963694452514, + "grad_norm": 0.351849228143692, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 2012 + }, + { + "epoch": 0.23386581469648562, + "grad_norm": 0.35233619809150696, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 2013 + }, + { + "epoch": 0.2339819924484461, + "grad_norm": 0.39142873883247375, + "learning_rate": 0.0001, + "loss": 1.8342, + "step": 2014 + }, + { + "epoch": 0.23409817020040663, + "grad_norm": 0.36893418431282043, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 2015 + }, + { + "epoch": 0.23421434795236712, + "grad_norm": 0.4079015851020813, + "learning_rate": 0.0001, + "loss": 1.9148, + "step": 2016 + }, + { + "epoch": 0.2343305257043276, + "grad_norm": 0.35830172896385193, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 2017 + }, + { + "epoch": 0.23444670345628812, + "grad_norm": 0.35987070202827454, + "learning_rate": 0.0001, + "loss": 1.5225, + "step": 2018 + }, + { + "epoch": 0.2345628812082486, + "grad_norm": 0.3772944211959839, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 2019 + }, + { + "epoch": 0.23467905896020913, + "grad_norm": 0.36652514338493347, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 2020 + }, + { + "epoch": 0.23479523671216962, + "grad_norm": 0.3576715886592865, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 2021 + }, + { + "epoch": 0.2349114144641301, + "grad_norm": 0.3853584825992584, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 2022 + }, + { + "epoch": 0.23502759221609063, + "grad_norm": 0.3581581115722656, + "learning_rate": 0.0001, + "loss": 1.7853, + "step": 2023 + }, + { + "epoch": 0.23514376996805111, + "grad_norm": 0.396012544631958, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 2024 + }, + { + "epoch": 0.23525994772001163, + "grad_norm": 0.37956202030181885, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 2025 + }, + { + "epoch": 0.23537612547197212, + "grad_norm": 0.377013623714447, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 2026 + }, + { + "epoch": 0.2354923032239326, + "grad_norm": 0.37704232335090637, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 2027 + }, + { + "epoch": 0.23560848097589313, + "grad_norm": 0.3992510437965393, + "learning_rate": 0.0001, + "loss": 2.0132, + "step": 2028 + }, + { + "epoch": 0.23572465872785361, + "grad_norm": 0.36997881531715393, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 2029 + }, + { + "epoch": 0.2358408364798141, + "grad_norm": 0.3613041043281555, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 2030 + }, + { + "epoch": 0.23595701423177462, + "grad_norm": 0.36582064628601074, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 2031 + }, + { + "epoch": 0.2360731919837351, + "grad_norm": 0.3395959436893463, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 2032 + }, + { + "epoch": 0.23618936973569563, + "grad_norm": 0.3602873384952545, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 2033 + }, + { + "epoch": 0.23630554748765611, + "grad_norm": 0.3407090902328491, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 2034 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 0.3737788200378418, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 2035 + }, + { + "epoch": 0.23653790299157712, + "grad_norm": 0.3733769953250885, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 2036 + }, + { + "epoch": 0.2366540807435376, + "grad_norm": 0.3718021512031555, + "learning_rate": 0.0001, + "loss": 1.8264, + "step": 2037 + }, + { + "epoch": 0.23677025849549813, + "grad_norm": 0.33534038066864014, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 2038 + }, + { + "epoch": 0.23688643624745861, + "grad_norm": 0.4014089107513428, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 2039 + }, + { + "epoch": 0.2370026139994191, + "grad_norm": 0.37306705117225647, + "learning_rate": 0.0001, + "loss": 1.8535, + "step": 2040 + }, + { + "epoch": 0.23711879175137962, + "grad_norm": 0.36598867177963257, + "learning_rate": 0.0001, + "loss": 1.7722, + "step": 2041 + }, + { + "epoch": 0.2372349695033401, + "grad_norm": 0.3649059236049652, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 2042 + }, + { + "epoch": 0.2373511472553006, + "grad_norm": 0.3907899558544159, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 2043 + }, + { + "epoch": 0.23746732500726112, + "grad_norm": 0.37756413221359253, + "learning_rate": 0.0001, + "loss": 1.7135, + "step": 2044 + }, + { + "epoch": 0.2375835027592216, + "grad_norm": 0.3651416599750519, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 2045 + }, + { + "epoch": 0.23769968051118212, + "grad_norm": 0.3615970015525818, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 2046 + }, + { + "epoch": 0.2378158582631426, + "grad_norm": 0.3712145686149597, + "learning_rate": 0.0001, + "loss": 1.7052, + "step": 2047 + }, + { + "epoch": 0.2379320360151031, + "grad_norm": 0.39952540397644043, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 2048 + }, + { + "epoch": 0.23804821376706362, + "grad_norm": 0.37439173460006714, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 2049 + }, + { + "epoch": 0.2381643915190241, + "grad_norm": 0.37691599130630493, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 2050 + }, + { + "epoch": 0.2382805692709846, + "grad_norm": 0.3760530352592468, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 2051 + }, + { + "epoch": 0.2383967470229451, + "grad_norm": 0.3865487575531006, + "learning_rate": 0.0001, + "loss": 1.9395, + "step": 2052 + }, + { + "epoch": 0.2385129247749056, + "grad_norm": 0.3658308982849121, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 2053 + }, + { + "epoch": 0.23862910252686612, + "grad_norm": 0.3813599646091461, + "learning_rate": 0.0001, + "loss": 1.7707, + "step": 2054 + }, + { + "epoch": 0.2387452802788266, + "grad_norm": 0.3815939128398895, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 2055 + }, + { + "epoch": 0.2388614580307871, + "grad_norm": 0.3719700872898102, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 2056 + }, + { + "epoch": 0.2389776357827476, + "grad_norm": 0.3591654896736145, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 2057 + }, + { + "epoch": 0.2390938135347081, + "grad_norm": 0.37177640199661255, + "learning_rate": 0.0001, + "loss": 1.8142, + "step": 2058 + }, + { + "epoch": 0.23920999128666862, + "grad_norm": 0.36674222350120544, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 2059 + }, + { + "epoch": 0.2393261690386291, + "grad_norm": 0.36910292506217957, + "learning_rate": 0.0001, + "loss": 1.8695, + "step": 2060 + }, + { + "epoch": 0.2394423467905896, + "grad_norm": 0.37248843908309937, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 2061 + }, + { + "epoch": 0.2395585245425501, + "grad_norm": 0.3622843027114868, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 2062 + }, + { + "epoch": 0.2396747022945106, + "grad_norm": 0.38794079422950745, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 2063 + }, + { + "epoch": 0.2397908800464711, + "grad_norm": 0.36130279302597046, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 2064 + }, + { + "epoch": 0.2399070577984316, + "grad_norm": 0.3953266441822052, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 2065 + }, + { + "epoch": 0.2400232355503921, + "grad_norm": 0.3774740397930145, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 2066 + }, + { + "epoch": 0.2401394133023526, + "grad_norm": 0.380696564912796, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 2067 + }, + { + "epoch": 0.2402555910543131, + "grad_norm": 0.3862208127975464, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 2068 + }, + { + "epoch": 0.2403717688062736, + "grad_norm": 0.34933963418006897, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 2069 + }, + { + "epoch": 0.2404879465582341, + "grad_norm": 0.37984904646873474, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 2070 + }, + { + "epoch": 0.2406041243101946, + "grad_norm": 0.3804422616958618, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 2071 + }, + { + "epoch": 0.24072030206215508, + "grad_norm": 0.3919108808040619, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 2072 + }, + { + "epoch": 0.2408364798141156, + "grad_norm": 0.4050542712211609, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 2073 + }, + { + "epoch": 0.2409526575660761, + "grad_norm": 0.378836989402771, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 2074 + }, + { + "epoch": 0.2410688353180366, + "grad_norm": 0.39815083146095276, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 2075 + }, + { + "epoch": 0.2411850130699971, + "grad_norm": 0.39791086316108704, + "learning_rate": 0.0001, + "loss": 1.8538, + "step": 2076 + }, + { + "epoch": 0.24130119082195758, + "grad_norm": 0.3830665349960327, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 2077 + }, + { + "epoch": 0.2414173685739181, + "grad_norm": 0.38453802466392517, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 2078 + }, + { + "epoch": 0.2415335463258786, + "grad_norm": 0.40375974774360657, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 2079 + }, + { + "epoch": 0.2416497240778391, + "grad_norm": 0.38834068179130554, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 2080 + }, + { + "epoch": 0.2417659018297996, + "grad_norm": 0.38506248593330383, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 2081 + }, + { + "epoch": 0.24188207958176008, + "grad_norm": 0.3646622896194458, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 2082 + }, + { + "epoch": 0.2419982573337206, + "grad_norm": 0.37850677967071533, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 2083 + }, + { + "epoch": 0.2421144350856811, + "grad_norm": 0.386091411113739, + "learning_rate": 0.0001, + "loss": 1.8543, + "step": 2084 + }, + { + "epoch": 0.24223061283764158, + "grad_norm": 0.38737788796424866, + "learning_rate": 0.0001, + "loss": 1.7008, + "step": 2085 + }, + { + "epoch": 0.2423467905896021, + "grad_norm": 0.37103384733200073, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 2086 + }, + { + "epoch": 0.24246296834156258, + "grad_norm": 0.34921854734420776, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 2087 + }, + { + "epoch": 0.2425791460935231, + "grad_norm": 0.3560514748096466, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 2088 + }, + { + "epoch": 0.2426953238454836, + "grad_norm": 0.38028186559677124, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 2089 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.40691468119621277, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 2090 + }, + { + "epoch": 0.2429276793494046, + "grad_norm": 0.3687189221382141, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 2091 + }, + { + "epoch": 0.24304385710136509, + "grad_norm": 0.35396918654441833, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 2092 + }, + { + "epoch": 0.2431600348533256, + "grad_norm": 0.39168643951416016, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 2093 + }, + { + "epoch": 0.2432762126052861, + "grad_norm": 0.4195053279399872, + "learning_rate": 0.0001, + "loss": 1.9219, + "step": 2094 + }, + { + "epoch": 0.24339239035724658, + "grad_norm": 0.3593606948852539, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 2095 + }, + { + "epoch": 0.2435085681092071, + "grad_norm": 0.35588037967681885, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 2096 + }, + { + "epoch": 0.24362474586116759, + "grad_norm": 0.3821358382701874, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 2097 + }, + { + "epoch": 0.24374092361312807, + "grad_norm": 0.36826246976852417, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 2098 + }, + { + "epoch": 0.2438571013650886, + "grad_norm": 0.3902747929096222, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 2099 + }, + { + "epoch": 0.24397327911704908, + "grad_norm": 0.3716658651828766, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 2100 + }, + { + "epoch": 0.2440894568690096, + "grad_norm": 0.3505365550518036, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 2101 + }, + { + "epoch": 0.24420563462097009, + "grad_norm": 0.36862871050834656, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 2102 + }, + { + "epoch": 0.24432181237293057, + "grad_norm": 0.35868167877197266, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 2103 + }, + { + "epoch": 0.2444379901248911, + "grad_norm": 0.3973603844642639, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 2104 + }, + { + "epoch": 0.24455416787685158, + "grad_norm": 0.37595170736312866, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 2105 + }, + { + "epoch": 0.24467034562881207, + "grad_norm": 0.3656224012374878, + "learning_rate": 0.0001, + "loss": 1.6813, + "step": 2106 + }, + { + "epoch": 0.24478652338077259, + "grad_norm": 0.39221543073654175, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 2107 + }, + { + "epoch": 0.24490270113273307, + "grad_norm": 0.36675718426704407, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 2108 + }, + { + "epoch": 0.2450188788846936, + "grad_norm": 0.3545796871185303, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 2109 + }, + { + "epoch": 0.24513505663665408, + "grad_norm": 0.40240517258644104, + "learning_rate": 0.0001, + "loss": 1.9461, + "step": 2110 + }, + { + "epoch": 0.24525123438861457, + "grad_norm": 0.38813287019729614, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 2111 + }, + { + "epoch": 0.2453674121405751, + "grad_norm": 0.37340447306632996, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 2112 + }, + { + "epoch": 0.24548358989253558, + "grad_norm": 0.37866276502609253, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 2113 + }, + { + "epoch": 0.2455997676444961, + "grad_norm": 0.3932071328163147, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 2114 + }, + { + "epoch": 0.24571594539645658, + "grad_norm": 0.37961530685424805, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 2115 + }, + { + "epoch": 0.24583212314841707, + "grad_norm": 0.38449040055274963, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 2116 + }, + { + "epoch": 0.2459483009003776, + "grad_norm": 0.39510199427604675, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 2117 + }, + { + "epoch": 0.24606447865233808, + "grad_norm": 0.40252119302749634, + "learning_rate": 0.0001, + "loss": 1.7532, + "step": 2118 + }, + { + "epoch": 0.24618065640429856, + "grad_norm": 0.3880206346511841, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 2119 + }, + { + "epoch": 0.24629683415625908, + "grad_norm": 0.3541505038738251, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 2120 + }, + { + "epoch": 0.24641301190821957, + "grad_norm": 0.3985120952129364, + "learning_rate": 0.0001, + "loss": 1.7176, + "step": 2121 + }, + { + "epoch": 0.2465291896601801, + "grad_norm": 0.39708301424980164, + "learning_rate": 0.0001, + "loss": 1.7669, + "step": 2122 + }, + { + "epoch": 0.24664536741214058, + "grad_norm": 0.3605371415615082, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 2123 + }, + { + "epoch": 0.24676154516410106, + "grad_norm": 0.38313308358192444, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 2124 + }, + { + "epoch": 0.24687772291606158, + "grad_norm": 0.4088183641433716, + "learning_rate": 0.0001, + "loss": 1.7498, + "step": 2125 + }, + { + "epoch": 0.24699390066802207, + "grad_norm": 0.3966735899448395, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 2126 + }, + { + "epoch": 0.24711007841998256, + "grad_norm": 0.3689868748188019, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 2127 + }, + { + "epoch": 0.24722625617194308, + "grad_norm": 0.41865813732147217, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 2128 + }, + { + "epoch": 0.24734243392390357, + "grad_norm": 0.39082497358322144, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 2129 + }, + { + "epoch": 0.24745861167586408, + "grad_norm": 0.40093672275543213, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 2130 + }, + { + "epoch": 0.24757478942782457, + "grad_norm": 0.37469765543937683, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 2131 + }, + { + "epoch": 0.24769096717978506, + "grad_norm": 0.3949383795261383, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 2132 + }, + { + "epoch": 0.24780714493174558, + "grad_norm": 0.3878518342971802, + "learning_rate": 0.0001, + "loss": 1.9247, + "step": 2133 + }, + { + "epoch": 0.24792332268370607, + "grad_norm": 0.38046330213546753, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 2134 + }, + { + "epoch": 0.24803950043566658, + "grad_norm": 0.34667521715164185, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 2135 + }, + { + "epoch": 0.24815567818762707, + "grad_norm": 0.376510888338089, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 2136 + }, + { + "epoch": 0.24827185593958756, + "grad_norm": 0.371574729681015, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 2137 + }, + { + "epoch": 0.24838803369154808, + "grad_norm": 0.4060184955596924, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 2138 + }, + { + "epoch": 0.24850421144350857, + "grad_norm": 0.4174020290374756, + "learning_rate": 0.0001, + "loss": 1.8037, + "step": 2139 + }, + { + "epoch": 0.24862038919546905, + "grad_norm": 0.3601975440979004, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 2140 + }, + { + "epoch": 0.24873656694742957, + "grad_norm": 0.3981928825378418, + "learning_rate": 0.0001, + "loss": 1.7354, + "step": 2141 + }, + { + "epoch": 0.24885274469939006, + "grad_norm": 0.3899874687194824, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 2142 + }, + { + "epoch": 0.24896892245135058, + "grad_norm": 0.364611953496933, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 2143 + }, + { + "epoch": 0.24908510020331107, + "grad_norm": 0.3837345540523529, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 2144 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 0.3657201826572418, + "learning_rate": 0.0001, + "loss": 1.7091, + "step": 2145 + }, + { + "epoch": 0.24931745570723207, + "grad_norm": 0.3665958642959595, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 2146 + }, + { + "epoch": 0.24943363345919256, + "grad_norm": 0.35987091064453125, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 2147 + }, + { + "epoch": 0.24954981121115308, + "grad_norm": 0.3838944733142853, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 2148 + }, + { + "epoch": 0.24966598896311357, + "grad_norm": 0.3626435101032257, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 2149 + }, + { + "epoch": 0.24978216671507406, + "grad_norm": 0.37998467683792114, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 2150 + }, + { + "epoch": 0.24989834446703457, + "grad_norm": 0.37865573167800903, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 2151 + }, + { + "epoch": 0.25001452221899506, + "grad_norm": 0.38255107402801514, + "learning_rate": 0.0001, + "loss": 1.7066, + "step": 2152 + }, + { + "epoch": 0.25013069997095555, + "grad_norm": 0.375410258769989, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 2153 + }, + { + "epoch": 0.25024687772291604, + "grad_norm": 0.3752192556858063, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 2154 + }, + { + "epoch": 0.2503630554748766, + "grad_norm": 0.3927246034145355, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 2155 + }, + { + "epoch": 0.2504792332268371, + "grad_norm": 0.38156911730766296, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 2156 + }, + { + "epoch": 0.25059541097879756, + "grad_norm": 0.3630543351173401, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 2157 + }, + { + "epoch": 0.25071158873075805, + "grad_norm": 0.38114479184150696, + "learning_rate": 0.0001, + "loss": 1.7344, + "step": 2158 + }, + { + "epoch": 0.25082776648271854, + "grad_norm": 0.3578778803348541, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 2159 + }, + { + "epoch": 0.2509439442346791, + "grad_norm": 0.3649848997592926, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 2160 + }, + { + "epoch": 0.2510601219866396, + "grad_norm": 0.39916592836380005, + "learning_rate": 0.0001, + "loss": 1.7755, + "step": 2161 + }, + { + "epoch": 0.25117629973860006, + "grad_norm": 0.34567490220069885, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 2162 + }, + { + "epoch": 0.25129247749056055, + "grad_norm": 0.3676474690437317, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 2163 + }, + { + "epoch": 0.25140865524252104, + "grad_norm": 0.3737073838710785, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 2164 + }, + { + "epoch": 0.2515248329944816, + "grad_norm": 0.3624831438064575, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 2165 + }, + { + "epoch": 0.2516410107464421, + "grad_norm": 0.4120437800884247, + "learning_rate": 0.0001, + "loss": 1.9489, + "step": 2166 + }, + { + "epoch": 0.25175718849840256, + "grad_norm": 0.3748963177204132, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 2167 + }, + { + "epoch": 0.25187336625036305, + "grad_norm": 0.38849174976348877, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 2168 + }, + { + "epoch": 0.25198954400232354, + "grad_norm": 0.3984411358833313, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 2169 + }, + { + "epoch": 0.25210572175428403, + "grad_norm": 0.40575408935546875, + "learning_rate": 0.0001, + "loss": 1.8617, + "step": 2170 + }, + { + "epoch": 0.2522218995062446, + "grad_norm": 0.3502597212791443, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 2171 + }, + { + "epoch": 0.25233807725820506, + "grad_norm": 0.36587563157081604, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 2172 + }, + { + "epoch": 0.25245425501016555, + "grad_norm": 0.395804226398468, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 2173 + }, + { + "epoch": 0.25257043276212604, + "grad_norm": 0.3610069751739502, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 2174 + }, + { + "epoch": 0.25268661051408653, + "grad_norm": 0.38481786847114563, + "learning_rate": 0.0001, + "loss": 1.7379, + "step": 2175 + }, + { + "epoch": 0.2528027882660471, + "grad_norm": 0.4121454656124115, + "learning_rate": 0.0001, + "loss": 1.7449, + "step": 2176 + }, + { + "epoch": 0.25291896601800756, + "grad_norm": 0.40824562311172485, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 2177 + }, + { + "epoch": 0.25303514376996805, + "grad_norm": 0.38010165095329285, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 2178 + }, + { + "epoch": 0.25315132152192854, + "grad_norm": 0.3835202753543854, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 2179 + }, + { + "epoch": 0.25326749927388903, + "grad_norm": 0.4025690257549286, + "learning_rate": 0.0001, + "loss": 1.7204, + "step": 2180 + }, + { + "epoch": 0.2533836770258496, + "grad_norm": 0.3862835466861725, + "learning_rate": 0.0001, + "loss": 1.8057, + "step": 2181 + }, + { + "epoch": 0.25349985477781006, + "grad_norm": 0.3741121292114258, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 2182 + }, + { + "epoch": 0.25361603252977055, + "grad_norm": 0.38978350162506104, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 2183 + }, + { + "epoch": 0.25373221028173104, + "grad_norm": 0.3566723167896271, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 2184 + }, + { + "epoch": 0.25384838803369153, + "grad_norm": 0.4036724269390106, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 2185 + }, + { + "epoch": 0.2539645657856521, + "grad_norm": 0.3604584336280823, + "learning_rate": 0.0001, + "loss": 1.6988, + "step": 2186 + }, + { + "epoch": 0.25408074353761256, + "grad_norm": 0.3696625232696533, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 2187 + }, + { + "epoch": 0.25419692128957305, + "grad_norm": 0.38797250390052795, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 2188 + }, + { + "epoch": 0.25431309904153354, + "grad_norm": 0.3951641321182251, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 2189 + }, + { + "epoch": 0.25442927679349403, + "grad_norm": 0.37504735589027405, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 2190 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 0.38757580518722534, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 2191 + }, + { + "epoch": 0.25466163229741506, + "grad_norm": 0.41315603256225586, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 2192 + }, + { + "epoch": 0.25477781004937555, + "grad_norm": 0.36828580498695374, + "learning_rate": 0.0001, + "loss": 1.7711, + "step": 2193 + }, + { + "epoch": 0.25489398780133604, + "grad_norm": 0.39044126868247986, + "learning_rate": 0.0001, + "loss": 1.7516, + "step": 2194 + }, + { + "epoch": 0.25501016555329653, + "grad_norm": 0.3624008893966675, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 2195 + }, + { + "epoch": 0.255126343305257, + "grad_norm": 0.35610583424568176, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 2196 + }, + { + "epoch": 0.25524252105721756, + "grad_norm": 0.3624838888645172, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 2197 + }, + { + "epoch": 0.25535869880917805, + "grad_norm": 0.3851155638694763, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 2198 + }, + { + "epoch": 0.25547487656113854, + "grad_norm": 0.3829360604286194, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 2199 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.3810238242149353, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 2200 + }, + { + "epoch": 0.2557072320650595, + "grad_norm": 0.384115993976593, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 2201 + }, + { + "epoch": 0.25582340981702006, + "grad_norm": 0.3733699321746826, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 2202 + }, + { + "epoch": 0.25593958756898055, + "grad_norm": 0.37444037199020386, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 2203 + }, + { + "epoch": 0.25605576532094104, + "grad_norm": 0.3865280747413635, + "learning_rate": 0.0001, + "loss": 1.7525, + "step": 2204 + }, + { + "epoch": 0.25617194307290153, + "grad_norm": 0.3753882944583893, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 2205 + }, + { + "epoch": 0.256288120824862, + "grad_norm": 0.4033168852329254, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 2206 + }, + { + "epoch": 0.25640429857682256, + "grad_norm": 0.4054439961910248, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 2207 + }, + { + "epoch": 0.25652047632878305, + "grad_norm": 0.3870498239994049, + "learning_rate": 0.0001, + "loss": 1.8224, + "step": 2208 + }, + { + "epoch": 0.25663665408074354, + "grad_norm": 0.3696228861808777, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 2209 + }, + { + "epoch": 0.25675283183270403, + "grad_norm": 0.36032870411872864, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 2210 + }, + { + "epoch": 0.2568690095846645, + "grad_norm": 0.3826732635498047, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 2211 + }, + { + "epoch": 0.256985187336625, + "grad_norm": 0.3756740391254425, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 2212 + }, + { + "epoch": 0.25710136508858555, + "grad_norm": 0.39152175188064575, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 2213 + }, + { + "epoch": 0.25721754284054604, + "grad_norm": 0.3959135413169861, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 2214 + }, + { + "epoch": 0.25733372059250653, + "grad_norm": 0.3905220627784729, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 2215 + }, + { + "epoch": 0.257449898344467, + "grad_norm": 0.3713771402835846, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 2216 + }, + { + "epoch": 0.2575660760964275, + "grad_norm": 0.39879244565963745, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 2217 + }, + { + "epoch": 0.25768225384838805, + "grad_norm": 0.3479825556278229, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 2218 + }, + { + "epoch": 0.25779843160034854, + "grad_norm": 0.38801833987236023, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 2219 + }, + { + "epoch": 0.25791460935230903, + "grad_norm": 0.37792959809303284, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 2220 + }, + { + "epoch": 0.2580307871042695, + "grad_norm": 0.3557312786579132, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 2221 + }, + { + "epoch": 0.25814696485623, + "grad_norm": 0.37623703479766846, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 2222 + }, + { + "epoch": 0.25826314260819055, + "grad_norm": 0.35475483536720276, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 2223 + }, + { + "epoch": 0.25837932036015104, + "grad_norm": 0.379874587059021, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 2224 + }, + { + "epoch": 0.25849549811211153, + "grad_norm": 0.3651340901851654, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 2225 + }, + { + "epoch": 0.258611675864072, + "grad_norm": 0.37844133377075195, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 2226 + }, + { + "epoch": 0.2587278536160325, + "grad_norm": 0.36854520440101624, + "learning_rate": 0.0001, + "loss": 1.799, + "step": 2227 + }, + { + "epoch": 0.25884403136799305, + "grad_norm": 0.4082055389881134, + "learning_rate": 0.0001, + "loss": 1.8261, + "step": 2228 + }, + { + "epoch": 0.25896020911995354, + "grad_norm": 0.3727450668811798, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 2229 + }, + { + "epoch": 0.25907638687191403, + "grad_norm": 0.38609281182289124, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 2230 + }, + { + "epoch": 0.2591925646238745, + "grad_norm": 0.4126780331134796, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 2231 + }, + { + "epoch": 0.259308742375835, + "grad_norm": 0.3675900399684906, + "learning_rate": 0.0001, + "loss": 1.7965, + "step": 2232 + }, + { + "epoch": 0.2594249201277955, + "grad_norm": 0.3680029511451721, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 2233 + }, + { + "epoch": 0.25954109787975604, + "grad_norm": 0.3985327482223511, + "learning_rate": 0.0001, + "loss": 1.7455, + "step": 2234 + }, + { + "epoch": 0.25965727563171653, + "grad_norm": 0.4041634798049927, + "learning_rate": 0.0001, + "loss": 1.9009, + "step": 2235 + }, + { + "epoch": 0.259773453383677, + "grad_norm": 0.3741016685962677, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 2236 + }, + { + "epoch": 0.2598896311356375, + "grad_norm": 0.38083234429359436, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 2237 + }, + { + "epoch": 0.260005808887598, + "grad_norm": 0.40651369094848633, + "learning_rate": 0.0001, + "loss": 1.9219, + "step": 2238 + }, + { + "epoch": 0.26012198663955854, + "grad_norm": 0.3514697849750519, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 2239 + }, + { + "epoch": 0.26023816439151903, + "grad_norm": 0.402826189994812, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 2240 + }, + { + "epoch": 0.2603543421434795, + "grad_norm": 0.3742333948612213, + "learning_rate": 0.0001, + "loss": 1.784, + "step": 2241 + }, + { + "epoch": 0.26047051989544, + "grad_norm": 0.3999182879924774, + "learning_rate": 0.0001, + "loss": 1.9366, + "step": 2242 + }, + { + "epoch": 0.2605866976474005, + "grad_norm": 0.3678368031978607, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 2243 + }, + { + "epoch": 0.26070287539936104, + "grad_norm": 0.36702072620391846, + "learning_rate": 0.0001, + "loss": 1.4785, + "step": 2244 + }, + { + "epoch": 0.26081905315132153, + "grad_norm": 0.3562362790107727, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 2245 + }, + { + "epoch": 0.260935230903282, + "grad_norm": 0.40297916531562805, + "learning_rate": 0.0001, + "loss": 1.7167, + "step": 2246 + }, + { + "epoch": 0.2610514086552425, + "grad_norm": 0.37424853444099426, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 2247 + }, + { + "epoch": 0.261167586407203, + "grad_norm": 0.36831429600715637, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 2248 + }, + { + "epoch": 0.26128376415916355, + "grad_norm": 0.38578173518180847, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 2249 + }, + { + "epoch": 0.26139994191112403, + "grad_norm": 0.40575075149536133, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 2250 + }, + { + "epoch": 0.2615161196630845, + "grad_norm": 0.3883078396320343, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 2251 + }, + { + "epoch": 0.261632297415045, + "grad_norm": 0.3869827687740326, + "learning_rate": 0.0001, + "loss": 1.7362, + "step": 2252 + }, + { + "epoch": 0.2617484751670055, + "grad_norm": 0.38627856969833374, + "learning_rate": 0.0001, + "loss": 1.751, + "step": 2253 + }, + { + "epoch": 0.26186465291896605, + "grad_norm": 0.3833690285682678, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 2254 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 0.39400362968444824, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 2255 + }, + { + "epoch": 0.262097008422887, + "grad_norm": 0.3659832775592804, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 2256 + }, + { + "epoch": 0.2622131861748475, + "grad_norm": 0.3808686137199402, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 2257 + }, + { + "epoch": 0.262329363926808, + "grad_norm": 0.36852577328681946, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 2258 + }, + { + "epoch": 0.2624455416787685, + "grad_norm": 0.37290260195732117, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 2259 + }, + { + "epoch": 0.26256171943072903, + "grad_norm": 0.4122512936592102, + "learning_rate": 0.0001, + "loss": 1.7609, + "step": 2260 + }, + { + "epoch": 0.2626778971826895, + "grad_norm": 0.3920998275279999, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 2261 + }, + { + "epoch": 0.26279407493465, + "grad_norm": 0.40477001667022705, + "learning_rate": 0.0001, + "loss": 1.8105, + "step": 2262 + }, + { + "epoch": 0.2629102526866105, + "grad_norm": 0.3666556179523468, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 2263 + }, + { + "epoch": 0.263026430438571, + "grad_norm": 0.3544174134731293, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 2264 + }, + { + "epoch": 0.26314260819053154, + "grad_norm": 0.3537236154079437, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 2265 + }, + { + "epoch": 0.263258785942492, + "grad_norm": 0.3871021866798401, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 2266 + }, + { + "epoch": 0.2633749636944525, + "grad_norm": 0.3693544566631317, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 2267 + }, + { + "epoch": 0.263491141446413, + "grad_norm": 0.3744482696056366, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 2268 + }, + { + "epoch": 0.2636073191983735, + "grad_norm": 0.38229045271873474, + "learning_rate": 0.0001, + "loss": 1.7494, + "step": 2269 + }, + { + "epoch": 0.26372349695033404, + "grad_norm": 0.38412609696388245, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 2270 + }, + { + "epoch": 0.2638396747022945, + "grad_norm": 0.3687461018562317, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 2271 + }, + { + "epoch": 0.263955852454255, + "grad_norm": 0.40106597542762756, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 2272 + }, + { + "epoch": 0.2640720302062155, + "grad_norm": 0.3707484304904938, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 2273 + }, + { + "epoch": 0.264188207958176, + "grad_norm": 0.3728141188621521, + "learning_rate": 0.0001, + "loss": 1.7414, + "step": 2274 + }, + { + "epoch": 0.26430438571013654, + "grad_norm": 0.3807673752307892, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 2275 + }, + { + "epoch": 0.264420563462097, + "grad_norm": 0.39177513122558594, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 2276 + }, + { + "epoch": 0.2645367412140575, + "grad_norm": 0.3902243971824646, + "learning_rate": 0.0001, + "loss": 1.7135, + "step": 2277 + }, + { + "epoch": 0.264652918966018, + "grad_norm": 0.36920416355133057, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 2278 + }, + { + "epoch": 0.2647690967179785, + "grad_norm": 0.37540876865386963, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 2279 + }, + { + "epoch": 0.264885274469939, + "grad_norm": 0.3670298159122467, + "learning_rate": 0.0001, + "loss": 1.8018, + "step": 2280 + }, + { + "epoch": 0.2650014522218995, + "grad_norm": 0.344098836183548, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 2281 + }, + { + "epoch": 0.26511762997386, + "grad_norm": 0.36059024930000305, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 2282 + }, + { + "epoch": 0.2652338077258205, + "grad_norm": 0.37826424837112427, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 2283 + }, + { + "epoch": 0.265349985477781, + "grad_norm": 0.40158766508102417, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 2284 + }, + { + "epoch": 0.2654661632297415, + "grad_norm": 0.37319740653038025, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 2285 + }, + { + "epoch": 0.265582340981702, + "grad_norm": 0.3656269311904907, + "learning_rate": 0.0001, + "loss": 1.6144, + "step": 2286 + }, + { + "epoch": 0.2656985187336625, + "grad_norm": 0.3666156232357025, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 2287 + }, + { + "epoch": 0.265814696485623, + "grad_norm": 0.3868063688278198, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 2288 + }, + { + "epoch": 0.2659308742375835, + "grad_norm": 0.36704185605049133, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 2289 + }, + { + "epoch": 0.266047051989544, + "grad_norm": 0.37085020542144775, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 2290 + }, + { + "epoch": 0.2661632297415045, + "grad_norm": 0.3715968430042267, + "learning_rate": 0.0001, + "loss": 1.8348, + "step": 2291 + }, + { + "epoch": 0.266279407493465, + "grad_norm": 0.3815777599811554, + "learning_rate": 0.0001, + "loss": 1.7571, + "step": 2292 + }, + { + "epoch": 0.2663955852454255, + "grad_norm": 0.3628351092338562, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 2293 + }, + { + "epoch": 0.266511762997386, + "grad_norm": 0.39187735319137573, + "learning_rate": 0.0001, + "loss": 1.7246, + "step": 2294 + }, + { + "epoch": 0.2666279407493465, + "grad_norm": 0.4006575644016266, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 2295 + }, + { + "epoch": 0.266744118501307, + "grad_norm": 0.3559897243976593, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 2296 + }, + { + "epoch": 0.2668602962532675, + "grad_norm": 0.3798125088214874, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 2297 + }, + { + "epoch": 0.266976474005228, + "grad_norm": 0.40822911262512207, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 2298 + }, + { + "epoch": 0.2670926517571885, + "grad_norm": 0.36172357201576233, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 2299 + }, + { + "epoch": 0.267208829509149, + "grad_norm": 0.3883090913295746, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 2300 + }, + { + "epoch": 0.26732500726110947, + "grad_norm": 0.3930845856666565, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 2301 + }, + { + "epoch": 0.26744118501307, + "grad_norm": 0.34050822257995605, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 2302 + }, + { + "epoch": 0.2675573627650305, + "grad_norm": 0.37255436182022095, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 2303 + }, + { + "epoch": 0.267673540516991, + "grad_norm": 0.383784681558609, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 2304 + }, + { + "epoch": 0.2677897182689515, + "grad_norm": 0.3867911100387573, + "learning_rate": 0.0001, + "loss": 1.855, + "step": 2305 + }, + { + "epoch": 0.26790589602091197, + "grad_norm": 0.40486153960227966, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 2306 + }, + { + "epoch": 0.2680220737728725, + "grad_norm": 0.42723652720451355, + "learning_rate": 0.0001, + "loss": 1.909, + "step": 2307 + }, + { + "epoch": 0.268138251524833, + "grad_norm": 0.3602335751056671, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 2308 + }, + { + "epoch": 0.2682544292767935, + "grad_norm": 0.3916855454444885, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 2309 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.3967686891555786, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 2310 + }, + { + "epoch": 0.26848678478071447, + "grad_norm": 0.37065213918685913, + "learning_rate": 0.0001, + "loss": 1.7548, + "step": 2311 + }, + { + "epoch": 0.268602962532675, + "grad_norm": 0.3448849618434906, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 2312 + }, + { + "epoch": 0.2687191402846355, + "grad_norm": 0.36089155077934265, + "learning_rate": 0.0001, + "loss": 1.845, + "step": 2313 + }, + { + "epoch": 0.268835318036596, + "grad_norm": 0.3895665109157562, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 2314 + }, + { + "epoch": 0.2689514957885565, + "grad_norm": 0.3936954736709595, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 2315 + }, + { + "epoch": 0.26906767354051697, + "grad_norm": 0.38149169087409973, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 2316 + }, + { + "epoch": 0.2691838512924775, + "grad_norm": 0.34843528270721436, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 2317 + }, + { + "epoch": 0.269300029044438, + "grad_norm": 0.3937288522720337, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 2318 + }, + { + "epoch": 0.2694162067963985, + "grad_norm": 0.431801438331604, + "learning_rate": 0.0001, + "loss": 1.7015, + "step": 2319 + }, + { + "epoch": 0.269532384548359, + "grad_norm": 0.3963301181793213, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 2320 + }, + { + "epoch": 0.26964856230031947, + "grad_norm": 0.39253467321395874, + "learning_rate": 0.0001, + "loss": 1.8633, + "step": 2321 + }, + { + "epoch": 0.26976474005227996, + "grad_norm": 0.3785027861595154, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 2322 + }, + { + "epoch": 0.2698809178042405, + "grad_norm": 0.4014844596385956, + "learning_rate": 0.0001, + "loss": 1.8149, + "step": 2323 + }, + { + "epoch": 0.269997095556201, + "grad_norm": 0.3676471710205078, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 2324 + }, + { + "epoch": 0.2701132733081615, + "grad_norm": 0.3606375455856323, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 2325 + }, + { + "epoch": 0.270229451060122, + "grad_norm": 0.3681615889072418, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 2326 + }, + { + "epoch": 0.27034562881208246, + "grad_norm": 0.3740110397338867, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 2327 + }, + { + "epoch": 0.270461806564043, + "grad_norm": 0.39461883902549744, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 2328 + }, + { + "epoch": 0.2705779843160035, + "grad_norm": 0.3810250759124756, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 2329 + }, + { + "epoch": 0.270694162067964, + "grad_norm": 0.39059239625930786, + "learning_rate": 0.0001, + "loss": 1.7721, + "step": 2330 + }, + { + "epoch": 0.2708103398199245, + "grad_norm": 0.40021392703056335, + "learning_rate": 0.0001, + "loss": 1.8333, + "step": 2331 + }, + { + "epoch": 0.27092651757188496, + "grad_norm": 0.42178237438201904, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 2332 + }, + { + "epoch": 0.2710426953238455, + "grad_norm": 0.3722633123397827, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 2333 + }, + { + "epoch": 0.271158873075806, + "grad_norm": 0.3866879940032959, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 2334 + }, + { + "epoch": 0.2712750508277665, + "grad_norm": 0.38638031482696533, + "learning_rate": 0.0001, + "loss": 1.7147, + "step": 2335 + }, + { + "epoch": 0.271391228579727, + "grad_norm": 0.37124332785606384, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 2336 + }, + { + "epoch": 0.27150740633168746, + "grad_norm": 0.39737075567245483, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 2337 + }, + { + "epoch": 0.271623584083648, + "grad_norm": 0.41190680861473083, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 2338 + }, + { + "epoch": 0.2717397618356085, + "grad_norm": 0.36484986543655396, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 2339 + }, + { + "epoch": 0.271855939587569, + "grad_norm": 0.38202008605003357, + "learning_rate": 0.0001, + "loss": 1.7296, + "step": 2340 + }, + { + "epoch": 0.2719721173395295, + "grad_norm": 0.3791213035583496, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 2341 + }, + { + "epoch": 0.27208829509148996, + "grad_norm": 0.3870936930179596, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 2342 + }, + { + "epoch": 0.2722044728434505, + "grad_norm": 0.3662189543247223, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 2343 + }, + { + "epoch": 0.272320650595411, + "grad_norm": 0.3746212422847748, + "learning_rate": 0.0001, + "loss": 1.743, + "step": 2344 + }, + { + "epoch": 0.2724368283473715, + "grad_norm": 0.3724815845489502, + "learning_rate": 0.0001, + "loss": 1.7269, + "step": 2345 + }, + { + "epoch": 0.272553006099332, + "grad_norm": 0.390982985496521, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 2346 + }, + { + "epoch": 0.27266918385129246, + "grad_norm": 0.38862210512161255, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 2347 + }, + { + "epoch": 0.27278536160325295, + "grad_norm": 0.3926958441734314, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 2348 + }, + { + "epoch": 0.2729015393552135, + "grad_norm": 0.3797786831855774, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 2349 + }, + { + "epoch": 0.273017717107174, + "grad_norm": 0.40166667103767395, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 2350 + }, + { + "epoch": 0.2731338948591345, + "grad_norm": 0.37161746621131897, + "learning_rate": 0.0001, + "loss": 1.7761, + "step": 2351 + }, + { + "epoch": 0.27325007261109496, + "grad_norm": 0.3659614324569702, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 2352 + }, + { + "epoch": 0.27336625036305545, + "grad_norm": 0.41991570591926575, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 2353 + }, + { + "epoch": 0.273482428115016, + "grad_norm": 0.41460558772087097, + "learning_rate": 0.0001, + "loss": 1.7779, + "step": 2354 + }, + { + "epoch": 0.2735986058669765, + "grad_norm": 0.3913847506046295, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 2355 + }, + { + "epoch": 0.273714783618937, + "grad_norm": 0.3612228035926819, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 2356 + }, + { + "epoch": 0.27383096137089746, + "grad_norm": 0.3820975422859192, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 2357 + }, + { + "epoch": 0.27394713912285795, + "grad_norm": 0.37913262844085693, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 2358 + }, + { + "epoch": 0.2740633168748185, + "grad_norm": 0.3759218454360962, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 2359 + }, + { + "epoch": 0.274179494626779, + "grad_norm": 0.3867158889770508, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 2360 + }, + { + "epoch": 0.2742956723787395, + "grad_norm": 0.36591243743896484, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 2361 + }, + { + "epoch": 0.27441185013069996, + "grad_norm": 0.37191227078437805, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 2362 + }, + { + "epoch": 0.27452802788266045, + "grad_norm": 0.40537264943122864, + "learning_rate": 0.0001, + "loss": 1.7831, + "step": 2363 + }, + { + "epoch": 0.274644205634621, + "grad_norm": 0.3994043171405792, + "learning_rate": 0.0001, + "loss": 1.7722, + "step": 2364 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 0.3657456040382385, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 2365 + }, + { + "epoch": 0.274876561138542, + "grad_norm": 0.3876878321170807, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 2366 + }, + { + "epoch": 0.27499273889050246, + "grad_norm": 0.3732263445854187, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 2367 + }, + { + "epoch": 0.27510891664246295, + "grad_norm": 0.3888086974620819, + "learning_rate": 0.0001, + "loss": 1.8368, + "step": 2368 + }, + { + "epoch": 0.27522509439442344, + "grad_norm": 0.3704921007156372, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 2369 + }, + { + "epoch": 0.275341272146384, + "grad_norm": 0.4058105945587158, + "learning_rate": 0.0001, + "loss": 1.907, + "step": 2370 + }, + { + "epoch": 0.2754574498983445, + "grad_norm": 0.35868632793426514, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 2371 + }, + { + "epoch": 0.27557362765030496, + "grad_norm": 0.3760453462600708, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 2372 + }, + { + "epoch": 0.27568980540226545, + "grad_norm": 0.43682530522346497, + "learning_rate": 0.0001, + "loss": 1.8795, + "step": 2373 + }, + { + "epoch": 0.27580598315422594, + "grad_norm": 0.4160868525505066, + "learning_rate": 0.0001, + "loss": 1.7954, + "step": 2374 + }, + { + "epoch": 0.2759221609061865, + "grad_norm": 0.3560098707675934, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 2375 + }, + { + "epoch": 0.276038338658147, + "grad_norm": 0.37041234970092773, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 2376 + }, + { + "epoch": 0.27615451641010746, + "grad_norm": 0.3486596345901489, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 2377 + }, + { + "epoch": 0.27627069416206795, + "grad_norm": 0.368912935256958, + "learning_rate": 0.0001, + "loss": 1.7511, + "step": 2378 + }, + { + "epoch": 0.27638687191402844, + "grad_norm": 0.41473448276519775, + "learning_rate": 0.0001, + "loss": 1.8201, + "step": 2379 + }, + { + "epoch": 0.276503049665989, + "grad_norm": 0.39010536670684814, + "learning_rate": 0.0001, + "loss": 1.8835, + "step": 2380 + }, + { + "epoch": 0.2766192274179495, + "grad_norm": 0.44715237617492676, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 2381 + }, + { + "epoch": 0.27673540516990996, + "grad_norm": 0.38639402389526367, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 2382 + }, + { + "epoch": 0.27685158292187045, + "grad_norm": 0.3690156638622284, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 2383 + }, + { + "epoch": 0.27696776067383094, + "grad_norm": 0.380719393491745, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 2384 + }, + { + "epoch": 0.2770839384257915, + "grad_norm": 0.37003418803215027, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 2385 + }, + { + "epoch": 0.277200116177752, + "grad_norm": 0.3689243197441101, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 2386 + }, + { + "epoch": 0.27731629392971247, + "grad_norm": 0.3837697505950928, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 2387 + }, + { + "epoch": 0.27743247168167295, + "grad_norm": 0.39931994676589966, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 2388 + }, + { + "epoch": 0.27754864943363344, + "grad_norm": 0.3684816062450409, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 2389 + }, + { + "epoch": 0.27766482718559393, + "grad_norm": 0.37523365020751953, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 2390 + }, + { + "epoch": 0.2777810049375545, + "grad_norm": 0.38715073466300964, + "learning_rate": 0.0001, + "loss": 1.8479, + "step": 2391 + }, + { + "epoch": 0.27789718268951497, + "grad_norm": 0.3796006441116333, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 2392 + }, + { + "epoch": 0.27801336044147545, + "grad_norm": 0.3867599070072174, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 2393 + }, + { + "epoch": 0.27812953819343594, + "grad_norm": 0.36347195506095886, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 2394 + }, + { + "epoch": 0.27824571594539643, + "grad_norm": 0.40452706813812256, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 2395 + }, + { + "epoch": 0.278361893697357, + "grad_norm": 0.37691494822502136, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 2396 + }, + { + "epoch": 0.27847807144931747, + "grad_norm": 0.3693540394306183, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 2397 + }, + { + "epoch": 0.27859424920127795, + "grad_norm": 0.3859393894672394, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 2398 + }, + { + "epoch": 0.27871042695323844, + "grad_norm": 0.36615633964538574, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 2399 + }, + { + "epoch": 0.27882660470519893, + "grad_norm": 0.4025718867778778, + "learning_rate": 0.0001, + "loss": 1.7692, + "step": 2400 + }, + { + "epoch": 0.2789427824571595, + "grad_norm": 0.35570523142814636, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 2401 + }, + { + "epoch": 0.27905896020911997, + "grad_norm": 0.4003136456012726, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 2402 + }, + { + "epoch": 0.27917513796108046, + "grad_norm": 0.38738471269607544, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 2403 + }, + { + "epoch": 0.27929131571304094, + "grad_norm": 0.38564881682395935, + "learning_rate": 0.0001, + "loss": 1.8121, + "step": 2404 + }, + { + "epoch": 0.27940749346500143, + "grad_norm": 0.4316978454589844, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 2405 + }, + { + "epoch": 0.279523671216962, + "grad_norm": 0.3853331208229065, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 2406 + }, + { + "epoch": 0.27963984896892247, + "grad_norm": 0.3825344443321228, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 2407 + }, + { + "epoch": 0.27975602672088296, + "grad_norm": 0.3726043999195099, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 2408 + }, + { + "epoch": 0.27987220447284344, + "grad_norm": 0.36002033948898315, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 2409 + }, + { + "epoch": 0.27998838222480393, + "grad_norm": 0.388629674911499, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 2410 + }, + { + "epoch": 0.2801045599767644, + "grad_norm": 0.3558464050292969, + "learning_rate": 0.0001, + "loss": 1.3596, + "step": 2411 + }, + { + "epoch": 0.28022073772872497, + "grad_norm": 0.36634495854377747, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 2412 + }, + { + "epoch": 0.28033691548068546, + "grad_norm": 0.36792120337486267, + "learning_rate": 0.0001, + "loss": 1.775, + "step": 2413 + }, + { + "epoch": 0.28045309323264594, + "grad_norm": 0.35753536224365234, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 2414 + }, + { + "epoch": 0.28056927098460643, + "grad_norm": 0.3687507212162018, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 2415 + }, + { + "epoch": 0.2806854487365669, + "grad_norm": 0.35439378023147583, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 2416 + }, + { + "epoch": 0.28080162648852747, + "grad_norm": 0.3909814655780792, + "learning_rate": 0.0001, + "loss": 1.843, + "step": 2417 + }, + { + "epoch": 0.28091780424048796, + "grad_norm": 0.4006040096282959, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 2418 + }, + { + "epoch": 0.28103398199244845, + "grad_norm": 0.3820417821407318, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 2419 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.41113728284835815, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 2420 + }, + { + "epoch": 0.2812663374963694, + "grad_norm": 0.37190499901771545, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 2421 + }, + { + "epoch": 0.28138251524832997, + "grad_norm": 0.3800449073314667, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 2422 + }, + { + "epoch": 0.28149869300029046, + "grad_norm": 0.3811475932598114, + "learning_rate": 0.0001, + "loss": 1.7483, + "step": 2423 + }, + { + "epoch": 0.28161487075225095, + "grad_norm": 0.3777383863925934, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 2424 + }, + { + "epoch": 0.28173104850421143, + "grad_norm": 0.3724311590194702, + "learning_rate": 0.0001, + "loss": 1.7839, + "step": 2425 + }, + { + "epoch": 0.2818472262561719, + "grad_norm": 0.38829484581947327, + "learning_rate": 0.0001, + "loss": 1.8328, + "step": 2426 + }, + { + "epoch": 0.28196340400813247, + "grad_norm": 0.4069208800792694, + "learning_rate": 0.0001, + "loss": 1.873, + "step": 2427 + }, + { + "epoch": 0.28207958176009296, + "grad_norm": 0.3797883093357086, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 2428 + }, + { + "epoch": 0.28219575951205345, + "grad_norm": 0.3935191333293915, + "learning_rate": 0.0001, + "loss": 1.8087, + "step": 2429 + }, + { + "epoch": 0.28231193726401393, + "grad_norm": 0.37056753039360046, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 2430 + }, + { + "epoch": 0.2824281150159744, + "grad_norm": 0.3883243799209595, + "learning_rate": 0.0001, + "loss": 1.7572, + "step": 2431 + }, + { + "epoch": 0.28254429276793497, + "grad_norm": 0.38896408677101135, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 2432 + }, + { + "epoch": 0.28266047051989546, + "grad_norm": 0.3767715096473694, + "learning_rate": 0.0001, + "loss": 1.7514, + "step": 2433 + }, + { + "epoch": 0.28277664827185595, + "grad_norm": 0.36977216601371765, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 2434 + }, + { + "epoch": 0.28289282602381643, + "grad_norm": 0.38601481914520264, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 2435 + }, + { + "epoch": 0.2830090037757769, + "grad_norm": 0.36568722128868103, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 2436 + }, + { + "epoch": 0.2831251815277374, + "grad_norm": 0.3678687512874603, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 2437 + }, + { + "epoch": 0.28324135927969796, + "grad_norm": 0.4139416813850403, + "learning_rate": 0.0001, + "loss": 1.7692, + "step": 2438 + }, + { + "epoch": 0.28335753703165845, + "grad_norm": 0.3862224519252777, + "learning_rate": 0.0001, + "loss": 1.7017, + "step": 2439 + }, + { + "epoch": 0.28347371478361894, + "grad_norm": 0.3898821771144867, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 2440 + }, + { + "epoch": 0.2835898925355794, + "grad_norm": 0.35055509209632874, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 2441 + }, + { + "epoch": 0.2837060702875399, + "grad_norm": 0.3779016137123108, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 2442 + }, + { + "epoch": 0.28382224803950046, + "grad_norm": 0.38517260551452637, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 2443 + }, + { + "epoch": 0.28393842579146095, + "grad_norm": 0.3903183043003082, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 2444 + }, + { + "epoch": 0.28405460354342144, + "grad_norm": 0.41626691818237305, + "learning_rate": 0.0001, + "loss": 1.8199, + "step": 2445 + }, + { + "epoch": 0.2841707812953819, + "grad_norm": 0.37726128101348877, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 2446 + }, + { + "epoch": 0.2842869590473424, + "grad_norm": 0.3679426908493042, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 2447 + }, + { + "epoch": 0.28440313679930296, + "grad_norm": 0.3660835921764374, + "learning_rate": 0.0001, + "loss": 1.7442, + "step": 2448 + }, + { + "epoch": 0.28451931455126345, + "grad_norm": 0.39392563700675964, + "learning_rate": 0.0001, + "loss": 1.7503, + "step": 2449 + }, + { + "epoch": 0.28463549230322394, + "grad_norm": 0.38993775844573975, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 2450 + }, + { + "epoch": 0.2847516700551844, + "grad_norm": 0.3619860112667084, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 2451 + }, + { + "epoch": 0.2848678478071449, + "grad_norm": 0.3857851028442383, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 2452 + }, + { + "epoch": 0.28498402555910546, + "grad_norm": 0.3716614544391632, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 2453 + }, + { + "epoch": 0.28510020331106595, + "grad_norm": 0.4253447949886322, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 2454 + }, + { + "epoch": 0.28521638106302644, + "grad_norm": 0.39747342467308044, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 2455 + }, + { + "epoch": 0.2853325588149869, + "grad_norm": 0.3874737322330475, + "learning_rate": 0.0001, + "loss": 1.763, + "step": 2456 + }, + { + "epoch": 0.2854487365669474, + "grad_norm": 0.37462908029556274, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 2457 + }, + { + "epoch": 0.2855649143189079, + "grad_norm": 0.4207324981689453, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 2458 + }, + { + "epoch": 0.28568109207086845, + "grad_norm": 0.3763648569583893, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 2459 + }, + { + "epoch": 0.28579726982282894, + "grad_norm": 0.40208232402801514, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 2460 + }, + { + "epoch": 0.2859134475747894, + "grad_norm": 0.3689945638179779, + "learning_rate": 0.0001, + "loss": 1.7245, + "step": 2461 + }, + { + "epoch": 0.2860296253267499, + "grad_norm": 0.3570806086063385, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 2462 + }, + { + "epoch": 0.2861458030787104, + "grad_norm": 0.39865806698799133, + "learning_rate": 0.0001, + "loss": 1.888, + "step": 2463 + }, + { + "epoch": 0.28626198083067095, + "grad_norm": 0.3934881389141083, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 2464 + }, + { + "epoch": 0.28637815858263144, + "grad_norm": 0.38317766785621643, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 2465 + }, + { + "epoch": 0.2864943363345919, + "grad_norm": 0.3968998193740845, + "learning_rate": 0.0001, + "loss": 1.7444, + "step": 2466 + }, + { + "epoch": 0.2866105140865524, + "grad_norm": 0.3885013461112976, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 2467 + }, + { + "epoch": 0.2867266918385129, + "grad_norm": 0.3675188720226288, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 2468 + }, + { + "epoch": 0.28684286959047345, + "grad_norm": 0.39261195063591003, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 2469 + }, + { + "epoch": 0.28695904734243394, + "grad_norm": 0.3737514615058899, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 2470 + }, + { + "epoch": 0.2870752250943944, + "grad_norm": 0.3958921730518341, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 2471 + }, + { + "epoch": 0.2871914028463549, + "grad_norm": 0.37590277194976807, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 2472 + }, + { + "epoch": 0.2873075805983154, + "grad_norm": 0.39498046040534973, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 2473 + }, + { + "epoch": 0.28742375835027595, + "grad_norm": 0.35599952936172485, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 2474 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 0.3650219440460205, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 2475 + }, + { + "epoch": 0.2876561138541969, + "grad_norm": 0.3618330955505371, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 2476 + }, + { + "epoch": 0.2877722916061574, + "grad_norm": 0.35137006640434265, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 2477 + }, + { + "epoch": 0.2878884693581179, + "grad_norm": 0.3683086931705475, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 2478 + }, + { + "epoch": 0.2880046471100784, + "grad_norm": 0.4060046374797821, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 2479 + }, + { + "epoch": 0.28812082486203894, + "grad_norm": 0.39657512307167053, + "learning_rate": 0.0001, + "loss": 1.7616, + "step": 2480 + }, + { + "epoch": 0.2882370026139994, + "grad_norm": 0.38853272795677185, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 2481 + }, + { + "epoch": 0.2883531803659599, + "grad_norm": 0.39028510451316833, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 2482 + }, + { + "epoch": 0.2884693581179204, + "grad_norm": 0.3892669379711151, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 2483 + }, + { + "epoch": 0.2885855358698809, + "grad_norm": 0.3654754161834717, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 2484 + }, + { + "epoch": 0.28870171362184144, + "grad_norm": 0.3528408110141754, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 2485 + }, + { + "epoch": 0.2888178913738019, + "grad_norm": 0.4061609208583832, + "learning_rate": 0.0001, + "loss": 1.9495, + "step": 2486 + }, + { + "epoch": 0.2889340691257624, + "grad_norm": 0.4418545663356781, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 2487 + }, + { + "epoch": 0.2890502468777229, + "grad_norm": 0.3858497142791748, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 2488 + }, + { + "epoch": 0.2891664246296834, + "grad_norm": 0.43995893001556396, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 2489 + }, + { + "epoch": 0.28928260238164394, + "grad_norm": 0.3793260157108307, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 2490 + }, + { + "epoch": 0.2893987801336044, + "grad_norm": 0.3741684556007385, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 2491 + }, + { + "epoch": 0.2895149578855649, + "grad_norm": 0.3923763632774353, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 2492 + }, + { + "epoch": 0.2896311356375254, + "grad_norm": 0.3590010702610016, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 2493 + }, + { + "epoch": 0.2897473133894859, + "grad_norm": 0.37135443091392517, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 2494 + }, + { + "epoch": 0.28986349114144644, + "grad_norm": 0.36070337891578674, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 2495 + }, + { + "epoch": 0.2899796688934069, + "grad_norm": 0.3781169652938843, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 2496 + }, + { + "epoch": 0.2900958466453674, + "grad_norm": 0.3880577087402344, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 2497 + }, + { + "epoch": 0.2902120243973279, + "grad_norm": 0.38813501596450806, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 2498 + }, + { + "epoch": 0.2903282021492884, + "grad_norm": 0.4048740267753601, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 2499 + }, + { + "epoch": 0.2904443799012489, + "grad_norm": 0.35958972573280334, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 2500 + }, + { + "epoch": 0.29056055765320943, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 2501 + }, + { + "epoch": 0.2906767354051699, + "grad_norm": 0.37877535820007324, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 2502 + }, + { + "epoch": 0.2907929131571304, + "grad_norm": 0.39159923791885376, + "learning_rate": 0.0001, + "loss": 1.8899, + "step": 2503 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.4107246994972229, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 2504 + }, + { + "epoch": 0.2910252686610514, + "grad_norm": 0.39638829231262207, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 2505 + }, + { + "epoch": 0.29114144641301193, + "grad_norm": 0.3886748254299164, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 2506 + }, + { + "epoch": 0.2912576241649724, + "grad_norm": 0.34036189317703247, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 2507 + }, + { + "epoch": 0.2913738019169329, + "grad_norm": 0.4072575867176056, + "learning_rate": 0.0001, + "loss": 1.7982, + "step": 2508 + }, + { + "epoch": 0.2914899796688934, + "grad_norm": 0.38684821128845215, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 2509 + }, + { + "epoch": 0.2916061574208539, + "grad_norm": 0.3691238462924957, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 2510 + }, + { + "epoch": 0.29172233517281443, + "grad_norm": 0.39206942915916443, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 2511 + }, + { + "epoch": 0.2918385129247749, + "grad_norm": 0.3901553452014923, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 2512 + }, + { + "epoch": 0.2919546906767354, + "grad_norm": 0.37073972821235657, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 2513 + }, + { + "epoch": 0.2920708684286959, + "grad_norm": 0.3903072476387024, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 2514 + }, + { + "epoch": 0.2921870461806564, + "grad_norm": 0.4193265438079834, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 2515 + }, + { + "epoch": 0.29230322393261693, + "grad_norm": 0.4069857895374298, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 2516 + }, + { + "epoch": 0.2924194016845774, + "grad_norm": 0.40134382247924805, + "learning_rate": 0.0001, + "loss": 1.8263, + "step": 2517 + }, + { + "epoch": 0.2925355794365379, + "grad_norm": 0.39630135893821716, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 2518 + }, + { + "epoch": 0.2926517571884984, + "grad_norm": 0.37301602959632874, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 2519 + }, + { + "epoch": 0.2927679349404589, + "grad_norm": 0.3810461759567261, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 2520 + }, + { + "epoch": 0.29288411269241943, + "grad_norm": 0.36505237221717834, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 2521 + }, + { + "epoch": 0.2930002904443799, + "grad_norm": 0.3974515199661255, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 2522 + }, + { + "epoch": 0.2931164681963404, + "grad_norm": 0.3713068664073944, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 2523 + }, + { + "epoch": 0.2932326459483009, + "grad_norm": 0.4444441497325897, + "learning_rate": 0.0001, + "loss": 1.9543, + "step": 2524 + }, + { + "epoch": 0.2933488237002614, + "grad_norm": 0.37668412923812866, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 2525 + }, + { + "epoch": 0.2934650014522219, + "grad_norm": 0.40829259157180786, + "learning_rate": 0.0001, + "loss": 1.8668, + "step": 2526 + }, + { + "epoch": 0.2935811792041824, + "grad_norm": 0.3872903287410736, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 2527 + }, + { + "epoch": 0.2936973569561429, + "grad_norm": 0.4001368284225464, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 2528 + }, + { + "epoch": 0.2938135347081034, + "grad_norm": 0.36894676089286804, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 2529 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.4085611402988434, + "learning_rate": 0.0001, + "loss": 1.8068, + "step": 2530 + }, + { + "epoch": 0.2940458902120244, + "grad_norm": 0.3942314386367798, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 2531 + }, + { + "epoch": 0.2941620679639849, + "grad_norm": 0.3652056157588959, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 2532 + }, + { + "epoch": 0.2942782457159454, + "grad_norm": 0.39049550890922546, + "learning_rate": 0.0001, + "loss": 1.7542, + "step": 2533 + }, + { + "epoch": 0.2943944234679059, + "grad_norm": 0.38500627875328064, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 2534 + }, + { + "epoch": 0.2945106012198664, + "grad_norm": 0.372928649187088, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 2535 + }, + { + "epoch": 0.2946267789718269, + "grad_norm": 0.4132663607597351, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 2536 + }, + { + "epoch": 0.2947429567237874, + "grad_norm": 0.41574445366859436, + "learning_rate": 0.0001, + "loss": 1.7713, + "step": 2537 + }, + { + "epoch": 0.2948591344757479, + "grad_norm": 0.3948863744735718, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 2538 + }, + { + "epoch": 0.2949753122277084, + "grad_norm": 0.3811478018760681, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 2539 + }, + { + "epoch": 0.2950914899796689, + "grad_norm": 0.3667415678501129, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 2540 + }, + { + "epoch": 0.2952076677316294, + "grad_norm": 0.41427987813949585, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 2541 + }, + { + "epoch": 0.2953238454835899, + "grad_norm": 0.4144713282585144, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 2542 + }, + { + "epoch": 0.2954400232355504, + "grad_norm": 0.3656145930290222, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 2543 + }, + { + "epoch": 0.2955562009875109, + "grad_norm": 0.3743351697921753, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 2544 + }, + { + "epoch": 0.2956723787394714, + "grad_norm": 0.4163815379142761, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 2545 + }, + { + "epoch": 0.2957885564914319, + "grad_norm": 0.3973884582519531, + "learning_rate": 0.0001, + "loss": 1.8309, + "step": 2546 + }, + { + "epoch": 0.29590473424339236, + "grad_norm": 0.4104848802089691, + "learning_rate": 0.0001, + "loss": 1.7723, + "step": 2547 + }, + { + "epoch": 0.2960209119953529, + "grad_norm": 0.38057518005371094, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 2548 + }, + { + "epoch": 0.2961370897473134, + "grad_norm": 0.3594435751438141, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 2549 + }, + { + "epoch": 0.2962532674992739, + "grad_norm": 0.3822750747203827, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 2550 + }, + { + "epoch": 0.2963694452512344, + "grad_norm": 0.4234026372432709, + "learning_rate": 0.0001, + "loss": 1.7732, + "step": 2551 + }, + { + "epoch": 0.29648562300319486, + "grad_norm": 0.3969663083553314, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 2552 + }, + { + "epoch": 0.2966018007551554, + "grad_norm": 0.36822569370269775, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 2553 + }, + { + "epoch": 0.2967179785071159, + "grad_norm": 0.4209086298942566, + "learning_rate": 0.0001, + "loss": 1.9934, + "step": 2554 + }, + { + "epoch": 0.2968341562590764, + "grad_norm": 0.42897096276283264, + "learning_rate": 0.0001, + "loss": 1.733, + "step": 2555 + }, + { + "epoch": 0.2969503340110369, + "grad_norm": 0.36142057180404663, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 2556 + }, + { + "epoch": 0.29706651176299737, + "grad_norm": 0.38266682624816895, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 2557 + }, + { + "epoch": 0.2971826895149579, + "grad_norm": 0.3493916094303131, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 2558 + }, + { + "epoch": 0.2972988672669184, + "grad_norm": 0.3982014060020447, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 2559 + }, + { + "epoch": 0.2974150450188789, + "grad_norm": 0.3709663450717926, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 2560 + }, + { + "epoch": 0.2975312227708394, + "grad_norm": 0.3823295831680298, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 2561 + }, + { + "epoch": 0.29764740052279987, + "grad_norm": 0.36725711822509766, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 2562 + }, + { + "epoch": 0.2977635782747604, + "grad_norm": 0.407084584236145, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 2563 + }, + { + "epoch": 0.2978797560267209, + "grad_norm": 0.3887426555156708, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 2564 + }, + { + "epoch": 0.2979959337786814, + "grad_norm": 0.3916597068309784, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 2565 + }, + { + "epoch": 0.2981121115306419, + "grad_norm": 0.3767833113670349, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 2566 + }, + { + "epoch": 0.29822828928260237, + "grad_norm": 0.3760228753089905, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 2567 + }, + { + "epoch": 0.29834446703456285, + "grad_norm": 0.3806806802749634, + "learning_rate": 0.0001, + "loss": 1.5708, + "step": 2568 + }, + { + "epoch": 0.2984606447865234, + "grad_norm": 0.38834068179130554, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 2569 + }, + { + "epoch": 0.2985768225384839, + "grad_norm": 0.4086969196796417, + "learning_rate": 0.0001, + "loss": 1.7382, + "step": 2570 + }, + { + "epoch": 0.2986930002904444, + "grad_norm": 0.3873027265071869, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 2571 + }, + { + "epoch": 0.29880917804240487, + "grad_norm": 0.38422465324401855, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 2572 + }, + { + "epoch": 0.29892535579436535, + "grad_norm": 0.3803096115589142, + "learning_rate": 0.0001, + "loss": 1.7387, + "step": 2573 + }, + { + "epoch": 0.2990415335463259, + "grad_norm": 0.41523873805999756, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 2574 + }, + { + "epoch": 0.2991577112982864, + "grad_norm": 0.4138016104698181, + "learning_rate": 0.0001, + "loss": 1.9156, + "step": 2575 + }, + { + "epoch": 0.2992738890502469, + "grad_norm": 0.3932843506336212, + "learning_rate": 0.0001, + "loss": 1.8139, + "step": 2576 + }, + { + "epoch": 0.29939006680220737, + "grad_norm": 0.37751099467277527, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 2577 + }, + { + "epoch": 0.29950624455416786, + "grad_norm": 0.37788084149360657, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 2578 + }, + { + "epoch": 0.2996224223061284, + "grad_norm": 0.381740927696228, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 2579 + }, + { + "epoch": 0.2997386000580889, + "grad_norm": 0.38412848114967346, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 2580 + }, + { + "epoch": 0.2998547778100494, + "grad_norm": 0.3746092617511749, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 2581 + }, + { + "epoch": 0.29997095556200987, + "grad_norm": 0.3701321482658386, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 2582 + }, + { + "epoch": 0.30008713331397036, + "grad_norm": 0.3820204436779022, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 2583 + }, + { + "epoch": 0.3002033110659309, + "grad_norm": 0.3748900592327118, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 2584 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 0.3851977288722992, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 2585 + }, + { + "epoch": 0.3004356665698519, + "grad_norm": 0.3978486955165863, + "learning_rate": 0.0001, + "loss": 1.8482, + "step": 2586 + }, + { + "epoch": 0.30055184432181237, + "grad_norm": 0.402482807636261, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 2587 + }, + { + "epoch": 0.30066802207377286, + "grad_norm": 0.41250428557395935, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 2588 + }, + { + "epoch": 0.30078419982573334, + "grad_norm": 0.4030408561229706, + "learning_rate": 0.0001, + "loss": 1.8394, + "step": 2589 + }, + { + "epoch": 0.3009003775776939, + "grad_norm": 0.3963468670845032, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 2590 + }, + { + "epoch": 0.3010165553296544, + "grad_norm": 0.37878894805908203, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 2591 + }, + { + "epoch": 0.30113273308161487, + "grad_norm": 0.3783760666847229, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 2592 + }, + { + "epoch": 0.30124891083357536, + "grad_norm": 0.3853001892566681, + "learning_rate": 0.0001, + "loss": 1.8408, + "step": 2593 + }, + { + "epoch": 0.30136508858553585, + "grad_norm": 0.37395623326301575, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 2594 + }, + { + "epoch": 0.3014812663374964, + "grad_norm": 0.3772829473018646, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 2595 + }, + { + "epoch": 0.3015974440894569, + "grad_norm": 0.3889354467391968, + "learning_rate": 0.0001, + "loss": 1.7094, + "step": 2596 + }, + { + "epoch": 0.30171362184141737, + "grad_norm": 0.4137793481349945, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 2597 + }, + { + "epoch": 0.30182979959337786, + "grad_norm": 0.37697646021842957, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 2598 + }, + { + "epoch": 0.30194597734533835, + "grad_norm": 0.3857576847076416, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 2599 + }, + { + "epoch": 0.3020621550972989, + "grad_norm": 0.41573137044906616, + "learning_rate": 0.0001, + "loss": 1.8468, + "step": 2600 + }, + { + "epoch": 0.3021783328492594, + "grad_norm": 0.3891277313232422, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 2601 + }, + { + "epoch": 0.30229451060121987, + "grad_norm": 0.3497909605503082, + "learning_rate": 0.0001, + "loss": 1.398, + "step": 2602 + }, + { + "epoch": 0.30241068835318036, + "grad_norm": 0.3606802523136139, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 2603 + }, + { + "epoch": 0.30252686610514085, + "grad_norm": 0.4066804349422455, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 2604 + }, + { + "epoch": 0.3026430438571014, + "grad_norm": 0.4184049367904663, + "learning_rate": 0.0001, + "loss": 1.7302, + "step": 2605 + }, + { + "epoch": 0.3027592216090619, + "grad_norm": 0.3844599723815918, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 2606 + }, + { + "epoch": 0.30287539936102237, + "grad_norm": 0.4283202290534973, + "learning_rate": 0.0001, + "loss": 1.8115, + "step": 2607 + }, + { + "epoch": 0.30299157711298286, + "grad_norm": 0.4154917895793915, + "learning_rate": 0.0001, + "loss": 1.7869, + "step": 2608 + }, + { + "epoch": 0.30310775486494335, + "grad_norm": 0.4059176445007324, + "learning_rate": 0.0001, + "loss": 1.8573, + "step": 2609 + }, + { + "epoch": 0.3032239326169039, + "grad_norm": 0.38833245635032654, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 2610 + }, + { + "epoch": 0.3033401103688644, + "grad_norm": 0.3859376013278961, + "learning_rate": 0.0001, + "loss": 1.7628, + "step": 2611 + }, + { + "epoch": 0.30345628812082487, + "grad_norm": 0.3899228572845459, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 2612 + }, + { + "epoch": 0.30357246587278536, + "grad_norm": 0.38399478793144226, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 2613 + }, + { + "epoch": 0.30368864362474585, + "grad_norm": 0.440913587808609, + "learning_rate": 0.0001, + "loss": 1.8794, + "step": 2614 + }, + { + "epoch": 0.30380482137670634, + "grad_norm": 0.3857106864452362, + "learning_rate": 0.0001, + "loss": 1.7513, + "step": 2615 + }, + { + "epoch": 0.3039209991286669, + "grad_norm": 0.3604831099510193, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 2616 + }, + { + "epoch": 0.30403717688062737, + "grad_norm": 0.3680822551250458, + "learning_rate": 0.0001, + "loss": 1.6749, + "step": 2617 + }, + { + "epoch": 0.30415335463258786, + "grad_norm": 0.3898998498916626, + "learning_rate": 0.0001, + "loss": 1.7768, + "step": 2618 + }, + { + "epoch": 0.30426953238454835, + "grad_norm": 0.3705196678638458, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 2619 + }, + { + "epoch": 0.30438571013650884, + "grad_norm": 0.3996119797229767, + "learning_rate": 0.0001, + "loss": 1.7945, + "step": 2620 + }, + { + "epoch": 0.3045018878884694, + "grad_norm": 0.391072541475296, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 2621 + }, + { + "epoch": 0.30461806564042987, + "grad_norm": 0.5729893445968628, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 2622 + }, + { + "epoch": 0.30473424339239036, + "grad_norm": 0.38251304626464844, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 2623 + }, + { + "epoch": 0.30485042114435085, + "grad_norm": 0.3925994634628296, + "learning_rate": 0.0001, + "loss": 1.7669, + "step": 2624 + }, + { + "epoch": 0.30496659889631134, + "grad_norm": 0.3850267827510834, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 2625 + }, + { + "epoch": 0.3050827766482719, + "grad_norm": 0.37147024273872375, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 2626 + }, + { + "epoch": 0.30519895440023237, + "grad_norm": 0.41201236844062805, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 2627 + }, + { + "epoch": 0.30531513215219286, + "grad_norm": 0.4249459505081177, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 2628 + }, + { + "epoch": 0.30543130990415335, + "grad_norm": 0.37525978684425354, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 2629 + }, + { + "epoch": 0.30554748765611384, + "grad_norm": 0.3670780658721924, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 2630 + }, + { + "epoch": 0.3056636654080744, + "grad_norm": 0.37397563457489014, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 2631 + }, + { + "epoch": 0.30577984316003487, + "grad_norm": 0.35987600684165955, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 2632 + }, + { + "epoch": 0.30589602091199536, + "grad_norm": 0.3674522340297699, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 2633 + }, + { + "epoch": 0.30601219866395585, + "grad_norm": 0.3551689684391022, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 2634 + }, + { + "epoch": 0.30612837641591634, + "grad_norm": 0.39974987506866455, + "learning_rate": 0.0001, + "loss": 1.7497, + "step": 2635 + }, + { + "epoch": 0.3062445541678768, + "grad_norm": 0.37489983439445496, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 2636 + }, + { + "epoch": 0.30636073191983737, + "grad_norm": 0.38141462206840515, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 2637 + }, + { + "epoch": 0.30647690967179786, + "grad_norm": 0.38110142946243286, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 2638 + }, + { + "epoch": 0.30659308742375835, + "grad_norm": 0.3580850064754486, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 2639 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.4292284846305847, + "learning_rate": 0.0001, + "loss": 1.7091, + "step": 2640 + }, + { + "epoch": 0.3068254429276793, + "grad_norm": 0.37014874815940857, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 2641 + }, + { + "epoch": 0.30694162067963987, + "grad_norm": 0.36894306540489197, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 2642 + }, + { + "epoch": 0.30705779843160036, + "grad_norm": 0.37392929196357727, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 2643 + }, + { + "epoch": 0.30717397618356085, + "grad_norm": 0.3983522653579712, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 2644 + }, + { + "epoch": 0.30729015393552134, + "grad_norm": 0.3379395604133606, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 2645 + }, + { + "epoch": 0.3074063316874818, + "grad_norm": 0.35767263174057007, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 2646 + }, + { + "epoch": 0.30752250943944237, + "grad_norm": 0.3817872405052185, + "learning_rate": 0.0001, + "loss": 1.7924, + "step": 2647 + }, + { + "epoch": 0.30763868719140286, + "grad_norm": 0.3904268741607666, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 2648 + }, + { + "epoch": 0.30775486494336335, + "grad_norm": 0.37173137068748474, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 2649 + }, + { + "epoch": 0.30787104269532384, + "grad_norm": 0.3717529773712158, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 2650 + }, + { + "epoch": 0.3079872204472843, + "grad_norm": 0.3810470998287201, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 2651 + }, + { + "epoch": 0.30810339819924487, + "grad_norm": 0.3781997263431549, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 2652 + }, + { + "epoch": 0.30821957595120536, + "grad_norm": 0.4403388798236847, + "learning_rate": 0.0001, + "loss": 1.7052, + "step": 2653 + }, + { + "epoch": 0.30833575370316585, + "grad_norm": 0.3832712769508362, + "learning_rate": 0.0001, + "loss": 1.7543, + "step": 2654 + }, + { + "epoch": 0.30845193145512634, + "grad_norm": 0.3549423813819885, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 2655 + }, + { + "epoch": 0.3085681092070868, + "grad_norm": 0.37209948897361755, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 2656 + }, + { + "epoch": 0.3086842869590473, + "grad_norm": 0.39987820386886597, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 2657 + }, + { + "epoch": 0.30880046471100786, + "grad_norm": 0.40743595361709595, + "learning_rate": 0.0001, + "loss": 1.8868, + "step": 2658 + }, + { + "epoch": 0.30891664246296835, + "grad_norm": 0.41529178619384766, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 2659 + }, + { + "epoch": 0.30903282021492884, + "grad_norm": 0.420540988445282, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 2660 + }, + { + "epoch": 0.3091489979668893, + "grad_norm": 0.38582152128219604, + "learning_rate": 0.0001, + "loss": 1.8365, + "step": 2661 + }, + { + "epoch": 0.3092651757188498, + "grad_norm": 0.382856547832489, + "learning_rate": 0.0001, + "loss": 1.7678, + "step": 2662 + }, + { + "epoch": 0.30938135347081036, + "grad_norm": 0.3981390595436096, + "learning_rate": 0.0001, + "loss": 1.7592, + "step": 2663 + }, + { + "epoch": 0.30949753122277085, + "grad_norm": 0.4064244031906128, + "learning_rate": 0.0001, + "loss": 1.783, + "step": 2664 + }, + { + "epoch": 0.30961370897473134, + "grad_norm": 0.3797922730445862, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 2665 + }, + { + "epoch": 0.3097298867266918, + "grad_norm": 0.366719514131546, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 2666 + }, + { + "epoch": 0.3098460644786523, + "grad_norm": 0.3816395103931427, + "learning_rate": 0.0001, + "loss": 1.7305, + "step": 2667 + }, + { + "epoch": 0.30996224223061286, + "grad_norm": 0.3834119141101837, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 2668 + }, + { + "epoch": 0.31007841998257335, + "grad_norm": 0.3797139525413513, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 2669 + }, + { + "epoch": 0.31019459773453384, + "grad_norm": 0.3854397237300873, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 2670 + }, + { + "epoch": 0.31031077548649433, + "grad_norm": 0.3634074330329895, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 2671 + }, + { + "epoch": 0.3104269532384548, + "grad_norm": 0.37497079372406006, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 2672 + }, + { + "epoch": 0.31054313099041536, + "grad_norm": 0.39349761605262756, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 2673 + }, + { + "epoch": 0.31065930874237585, + "grad_norm": 0.4020683765411377, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 2674 + }, + { + "epoch": 0.31077548649433634, + "grad_norm": 0.4030044376850128, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 2675 + }, + { + "epoch": 0.31089166424629683, + "grad_norm": 0.3698487877845764, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 2676 + }, + { + "epoch": 0.3110078419982573, + "grad_norm": 0.39510759711265564, + "learning_rate": 0.0001, + "loss": 1.4885, + "step": 2677 + }, + { + "epoch": 0.3111240197502178, + "grad_norm": 0.3986801207065582, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 2678 + }, + { + "epoch": 0.31124019750217835, + "grad_norm": 0.39601030945777893, + "learning_rate": 0.0001, + "loss": 1.7573, + "step": 2679 + }, + { + "epoch": 0.31135637525413884, + "grad_norm": 0.42814406752586365, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 2680 + }, + { + "epoch": 0.31147255300609933, + "grad_norm": 0.39685603976249695, + "learning_rate": 0.0001, + "loss": 1.886, + "step": 2681 + }, + { + "epoch": 0.3115887307580598, + "grad_norm": 0.4117811322212219, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 2682 + }, + { + "epoch": 0.3117049085100203, + "grad_norm": 0.40006887912750244, + "learning_rate": 0.0001, + "loss": 1.7719, + "step": 2683 + }, + { + "epoch": 0.31182108626198085, + "grad_norm": 0.4050239026546478, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 2684 + }, + { + "epoch": 0.31193726401394134, + "grad_norm": 0.38189923763275146, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 2685 + }, + { + "epoch": 0.31205344176590183, + "grad_norm": 0.4016052484512329, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 2686 + }, + { + "epoch": 0.3121696195178623, + "grad_norm": 0.37489327788352966, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 2687 + }, + { + "epoch": 0.3122857972698228, + "grad_norm": 0.3983065187931061, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 2688 + }, + { + "epoch": 0.31240197502178335, + "grad_norm": 0.40546756982803345, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 2689 + }, + { + "epoch": 0.31251815277374384, + "grad_norm": 0.37291330099105835, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 2690 + }, + { + "epoch": 0.31263433052570433, + "grad_norm": 0.3653269410133362, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 2691 + }, + { + "epoch": 0.3127505082776648, + "grad_norm": 0.3700310289859772, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 2692 + }, + { + "epoch": 0.3128666860296253, + "grad_norm": 0.38666659593582153, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 2693 + }, + { + "epoch": 0.31298286378158585, + "grad_norm": 0.3867506980895996, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 2694 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 0.38117802143096924, + "learning_rate": 0.0001, + "loss": 1.7902, + "step": 2695 + }, + { + "epoch": 0.31321521928550683, + "grad_norm": 0.403726726770401, + "learning_rate": 0.0001, + "loss": 1.8469, + "step": 2696 + }, + { + "epoch": 0.3133313970374673, + "grad_norm": 0.3811923861503601, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 2697 + }, + { + "epoch": 0.3134475747894278, + "grad_norm": 0.3940551280975342, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 2698 + }, + { + "epoch": 0.3135637525413883, + "grad_norm": 0.3885653614997864, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 2699 + }, + { + "epoch": 0.31367993029334884, + "grad_norm": 0.36595413088798523, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 2700 + }, + { + "epoch": 0.31379610804530933, + "grad_norm": 0.38449594378471375, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 2701 + }, + { + "epoch": 0.3139122857972698, + "grad_norm": 0.3768537640571594, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 2702 + }, + { + "epoch": 0.3140284635492303, + "grad_norm": 0.3723905384540558, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 2703 + }, + { + "epoch": 0.3141446413011908, + "grad_norm": 0.3800404667854309, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 2704 + }, + { + "epoch": 0.31426081905315134, + "grad_norm": 0.38999584317207336, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 2705 + }, + { + "epoch": 0.31437699680511183, + "grad_norm": 0.40568429231643677, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 2706 + }, + { + "epoch": 0.3144931745570723, + "grad_norm": 0.3945614993572235, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 2707 + }, + { + "epoch": 0.3146093523090328, + "grad_norm": 0.36772727966308594, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 2708 + }, + { + "epoch": 0.3147255300609933, + "grad_norm": 0.38640451431274414, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 2709 + }, + { + "epoch": 0.31484170781295384, + "grad_norm": 0.3605159819126129, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 2710 + }, + { + "epoch": 0.31495788556491433, + "grad_norm": 0.38972237706184387, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 2711 + }, + { + "epoch": 0.3150740633168748, + "grad_norm": 0.44629716873168945, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 2712 + }, + { + "epoch": 0.3151902410688353, + "grad_norm": 0.39029544591903687, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 2713 + }, + { + "epoch": 0.3153064188207958, + "grad_norm": 0.37057486176490784, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 2714 + }, + { + "epoch": 0.31542259657275634, + "grad_norm": 0.38961511850357056, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 2715 + }, + { + "epoch": 0.31553877432471683, + "grad_norm": 0.3788926899433136, + "learning_rate": 0.0001, + "loss": 1.5609, + "step": 2716 + }, + { + "epoch": 0.3156549520766773, + "grad_norm": 0.3727104961872101, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 2717 + }, + { + "epoch": 0.3157711298286378, + "grad_norm": 0.38792872428894043, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 2718 + }, + { + "epoch": 0.3158873075805983, + "grad_norm": 0.39093390107154846, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 2719 + }, + { + "epoch": 0.31600348533255884, + "grad_norm": 0.39481261372566223, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 2720 + }, + { + "epoch": 0.31611966308451933, + "grad_norm": 0.39427947998046875, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 2721 + }, + { + "epoch": 0.3162358408364798, + "grad_norm": 0.3655182719230652, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 2722 + }, + { + "epoch": 0.3163520185884403, + "grad_norm": 0.4002794921398163, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 2723 + }, + { + "epoch": 0.3164681963404008, + "grad_norm": 0.4090864360332489, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 2724 + }, + { + "epoch": 0.3165843740923613, + "grad_norm": 0.3690468370914459, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 2725 + }, + { + "epoch": 0.31670055184432183, + "grad_norm": 0.3945186734199524, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 2726 + }, + { + "epoch": 0.3168167295962823, + "grad_norm": 0.3898504972457886, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 2727 + }, + { + "epoch": 0.3169329073482428, + "grad_norm": 0.38657867908477783, + "learning_rate": 0.0001, + "loss": 1.8112, + "step": 2728 + }, + { + "epoch": 0.3170490851002033, + "grad_norm": 0.39374595880508423, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 2729 + }, + { + "epoch": 0.3171652628521638, + "grad_norm": 0.3979948163032532, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 2730 + }, + { + "epoch": 0.31728144060412433, + "grad_norm": 0.3945339620113373, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 2731 + }, + { + "epoch": 0.3173976183560848, + "grad_norm": 0.4169275760650635, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 2732 + }, + { + "epoch": 0.3175137961080453, + "grad_norm": 0.37716567516326904, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 2733 + }, + { + "epoch": 0.3176299738600058, + "grad_norm": 0.38198474049568176, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 2734 + }, + { + "epoch": 0.3177461516119663, + "grad_norm": 0.3758987486362457, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 2735 + }, + { + "epoch": 0.31786232936392683, + "grad_norm": 0.4033893644809723, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 2736 + }, + { + "epoch": 0.3179785071158873, + "grad_norm": 0.3776089549064636, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 2737 + }, + { + "epoch": 0.3180946848678478, + "grad_norm": 0.38801753520965576, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 2738 + }, + { + "epoch": 0.3182108626198083, + "grad_norm": 0.3814786970615387, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 2739 + }, + { + "epoch": 0.3183270403717688, + "grad_norm": 0.35588279366493225, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 2740 + }, + { + "epoch": 0.31844321812372933, + "grad_norm": 0.378668874502182, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 2741 + }, + { + "epoch": 0.3185593958756898, + "grad_norm": 0.4211976230144501, + "learning_rate": 0.0001, + "loss": 1.882, + "step": 2742 + }, + { + "epoch": 0.3186755736276503, + "grad_norm": 0.36686578392982483, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 2743 + }, + { + "epoch": 0.3187917513796108, + "grad_norm": 0.37785688042640686, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 2744 + }, + { + "epoch": 0.3189079291315713, + "grad_norm": 0.3932788372039795, + "learning_rate": 0.0001, + "loss": 1.7125, + "step": 2745 + }, + { + "epoch": 0.3190241068835318, + "grad_norm": 0.3749261200428009, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 2746 + }, + { + "epoch": 0.3191402846354923, + "grad_norm": 0.36720603704452515, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 2747 + }, + { + "epoch": 0.3192564623874528, + "grad_norm": 0.38770416378974915, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 2748 + }, + { + "epoch": 0.3193726401394133, + "grad_norm": 0.42165276408195496, + "learning_rate": 0.0001, + "loss": 1.8394, + "step": 2749 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.3547378480434418, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 2750 + }, + { + "epoch": 0.3196049956433343, + "grad_norm": 0.3825453519821167, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 2751 + }, + { + "epoch": 0.3197211733952948, + "grad_norm": 0.42200520634651184, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 2752 + }, + { + "epoch": 0.3198373511472553, + "grad_norm": 0.3694903552532196, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 2753 + }, + { + "epoch": 0.3199535288992158, + "grad_norm": 0.38652074337005615, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 2754 + }, + { + "epoch": 0.3200697066511763, + "grad_norm": 0.4044737219810486, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 2755 + }, + { + "epoch": 0.3201858844031368, + "grad_norm": 0.3883349597454071, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 2756 + }, + { + "epoch": 0.3203020621550973, + "grad_norm": 0.3901846408843994, + "learning_rate": 0.0001, + "loss": 1.8362, + "step": 2757 + }, + { + "epoch": 0.3204182399070578, + "grad_norm": 0.37919488549232483, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 2758 + }, + { + "epoch": 0.3205344176590183, + "grad_norm": 0.36535388231277466, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 2759 + }, + { + "epoch": 0.3206505954109788, + "grad_norm": 0.3907409608364105, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 2760 + }, + { + "epoch": 0.3207667731629393, + "grad_norm": 0.39692288637161255, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 2761 + }, + { + "epoch": 0.3208829509148998, + "grad_norm": 0.431797593832016, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 2762 + }, + { + "epoch": 0.3209991286668603, + "grad_norm": 0.38208791613578796, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 2763 + }, + { + "epoch": 0.3211153064188208, + "grad_norm": 0.40455830097198486, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 2764 + }, + { + "epoch": 0.3212314841707813, + "grad_norm": 0.3595045506954193, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 2765 + }, + { + "epoch": 0.3213476619227418, + "grad_norm": 0.42558008432388306, + "learning_rate": 0.0001, + "loss": 1.7101, + "step": 2766 + }, + { + "epoch": 0.32146383967470227, + "grad_norm": 0.39539337158203125, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 2767 + }, + { + "epoch": 0.3215800174266628, + "grad_norm": 0.3601813316345215, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 2768 + }, + { + "epoch": 0.3216961951786233, + "grad_norm": 0.3683129847049713, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 2769 + }, + { + "epoch": 0.3218123729305838, + "grad_norm": 0.4206804633140564, + "learning_rate": 0.0001, + "loss": 1.7475, + "step": 2770 + }, + { + "epoch": 0.3219285506825443, + "grad_norm": 0.3908296525478363, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 2771 + }, + { + "epoch": 0.32204472843450477, + "grad_norm": 0.38299795985221863, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 2772 + }, + { + "epoch": 0.3221609061864653, + "grad_norm": 0.41059038043022156, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 2773 + }, + { + "epoch": 0.3222770839384258, + "grad_norm": 0.3887813687324524, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 2774 + }, + { + "epoch": 0.3223932616903863, + "grad_norm": 0.36034008860588074, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 2775 + }, + { + "epoch": 0.3225094394423468, + "grad_norm": 0.3732610046863556, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 2776 + }, + { + "epoch": 0.32262561719430727, + "grad_norm": 0.40316662192344666, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 2777 + }, + { + "epoch": 0.3227417949462678, + "grad_norm": 0.40975120663642883, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 2778 + }, + { + "epoch": 0.3228579726982283, + "grad_norm": 0.38018473982810974, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 2779 + }, + { + "epoch": 0.3229741504501888, + "grad_norm": 0.3749701678752899, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 2780 + }, + { + "epoch": 0.3230903282021493, + "grad_norm": 0.37560439109802246, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 2781 + }, + { + "epoch": 0.32320650595410977, + "grad_norm": 0.3560850918292999, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 2782 + }, + { + "epoch": 0.3233226837060703, + "grad_norm": 0.41308125853538513, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 2783 + }, + { + "epoch": 0.3234388614580308, + "grad_norm": 0.35252368450164795, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 2784 + }, + { + "epoch": 0.3235550392099913, + "grad_norm": 0.3724053204059601, + "learning_rate": 0.0001, + "loss": 1.739, + "step": 2785 + }, + { + "epoch": 0.3236712169619518, + "grad_norm": 0.4030148684978485, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 2786 + }, + { + "epoch": 0.32378739471391227, + "grad_norm": 0.3721768260002136, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 2787 + }, + { + "epoch": 0.32390357246587276, + "grad_norm": 0.34921830892562866, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 2788 + }, + { + "epoch": 0.3240197502178333, + "grad_norm": 0.39488157629966736, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 2789 + }, + { + "epoch": 0.3241359279697938, + "grad_norm": 0.3864257037639618, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 2790 + }, + { + "epoch": 0.3242521057217543, + "grad_norm": 0.41633668541908264, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 2791 + }, + { + "epoch": 0.32436828347371477, + "grad_norm": 0.38308605551719666, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 2792 + }, + { + "epoch": 0.32448446122567526, + "grad_norm": 0.37461498379707336, + "learning_rate": 0.0001, + "loss": 1.7296, + "step": 2793 + }, + { + "epoch": 0.3246006389776358, + "grad_norm": 0.4059557020664215, + "learning_rate": 0.0001, + "loss": 1.8018, + "step": 2794 + }, + { + "epoch": 0.3247168167295963, + "grad_norm": 0.37268683314323425, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 2795 + }, + { + "epoch": 0.3248329944815568, + "grad_norm": 0.3826591372489929, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 2796 + }, + { + "epoch": 0.32494917223351727, + "grad_norm": 0.38750702142715454, + "learning_rate": 0.0001, + "loss": 1.7622, + "step": 2797 + }, + { + "epoch": 0.32506534998547776, + "grad_norm": 0.4062412679195404, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 2798 + }, + { + "epoch": 0.3251815277374383, + "grad_norm": 0.3924409747123718, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 2799 + }, + { + "epoch": 0.3252977054893988, + "grad_norm": 0.3657173216342926, + "learning_rate": 0.0001, + "loss": 1.5554, + "step": 2800 + }, + { + "epoch": 0.3254138832413593, + "grad_norm": 0.37579992413520813, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 2801 + }, + { + "epoch": 0.32553006099331977, + "grad_norm": 0.3894909620285034, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 2802 + }, + { + "epoch": 0.32564623874528026, + "grad_norm": 0.3836139440536499, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 2803 + }, + { + "epoch": 0.3257624164972408, + "grad_norm": 0.40788355469703674, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 2804 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 0.4235386848449707, + "learning_rate": 0.0001, + "loss": 1.8649, + "step": 2805 + }, + { + "epoch": 0.3259947720011618, + "grad_norm": 0.39928755164146423, + "learning_rate": 0.0001, + "loss": 1.7444, + "step": 2806 + }, + { + "epoch": 0.32611094975312227, + "grad_norm": 0.40659260749816895, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 2807 + }, + { + "epoch": 0.32622712750508276, + "grad_norm": 0.38179177045822144, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 2808 + }, + { + "epoch": 0.3263433052570433, + "grad_norm": 0.38274791836738586, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 2809 + }, + { + "epoch": 0.3264594830090038, + "grad_norm": 0.3890819549560547, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 2810 + }, + { + "epoch": 0.3265756607609643, + "grad_norm": 0.37747570872306824, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 2811 + }, + { + "epoch": 0.32669183851292477, + "grad_norm": 0.38250067830085754, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 2812 + }, + { + "epoch": 0.32680801626488526, + "grad_norm": 0.3761623799800873, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 2813 + }, + { + "epoch": 0.32692419401684575, + "grad_norm": 0.3789752721786499, + "learning_rate": 0.0001, + "loss": 1.718, + "step": 2814 + }, + { + "epoch": 0.3270403717688063, + "grad_norm": 0.3886089622974396, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 2815 + }, + { + "epoch": 0.3271565495207668, + "grad_norm": 0.3989008665084839, + "learning_rate": 0.0001, + "loss": 1.8128, + "step": 2816 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 0.4016900360584259, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 2817 + }, + { + "epoch": 0.32738890502468776, + "grad_norm": 0.41296061873435974, + "learning_rate": 0.0001, + "loss": 1.8053, + "step": 2818 + }, + { + "epoch": 0.32750508277664825, + "grad_norm": 0.3775239586830139, + "learning_rate": 0.0001, + "loss": 1.6908, + "step": 2819 + }, + { + "epoch": 0.3276212605286088, + "grad_norm": 0.4092549681663513, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 2820 + }, + { + "epoch": 0.3277374382805693, + "grad_norm": 0.3923521041870117, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 2821 + }, + { + "epoch": 0.32785361603252977, + "grad_norm": 0.3734949231147766, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 2822 + }, + { + "epoch": 0.32796979378449026, + "grad_norm": 0.3700851798057556, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 2823 + }, + { + "epoch": 0.32808597153645075, + "grad_norm": 0.3860493302345276, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 2824 + }, + { + "epoch": 0.3282021492884113, + "grad_norm": 0.4142455756664276, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 2825 + }, + { + "epoch": 0.3283183270403718, + "grad_norm": 0.3860050141811371, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 2826 + }, + { + "epoch": 0.32843450479233227, + "grad_norm": 0.39390870928764343, + "learning_rate": 0.0001, + "loss": 1.7529, + "step": 2827 + }, + { + "epoch": 0.32855068254429276, + "grad_norm": 0.3824588656425476, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 2828 + }, + { + "epoch": 0.32866686029625325, + "grad_norm": 0.3569580018520355, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 2829 + }, + { + "epoch": 0.3287830380482138, + "grad_norm": 0.3893423080444336, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 2830 + }, + { + "epoch": 0.3288992158001743, + "grad_norm": 0.3976670503616333, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 2831 + }, + { + "epoch": 0.32901539355213477, + "grad_norm": 0.38111555576324463, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 2832 + }, + { + "epoch": 0.32913157130409526, + "grad_norm": 0.38371211290359497, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 2833 + }, + { + "epoch": 0.32924774905605575, + "grad_norm": 0.4201413691043854, + "learning_rate": 0.0001, + "loss": 1.7543, + "step": 2834 + }, + { + "epoch": 0.32936392680801624, + "grad_norm": 0.386960506439209, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 2835 + }, + { + "epoch": 0.3294801045599768, + "grad_norm": 0.41088366508483887, + "learning_rate": 0.0001, + "loss": 1.9636, + "step": 2836 + }, + { + "epoch": 0.32959628231193727, + "grad_norm": 0.39831310510635376, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 2837 + }, + { + "epoch": 0.32971246006389776, + "grad_norm": 0.3723565638065338, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 2838 + }, + { + "epoch": 0.32982863781585825, + "grad_norm": 0.3974156975746155, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 2839 + }, + { + "epoch": 0.32994481556781874, + "grad_norm": 0.3748154640197754, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 2840 + }, + { + "epoch": 0.3300609933197793, + "grad_norm": 0.375637024641037, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 2841 + }, + { + "epoch": 0.3301771710717398, + "grad_norm": 0.397079199552536, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 2842 + }, + { + "epoch": 0.33029334882370026, + "grad_norm": 0.4001121520996094, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 2843 + }, + { + "epoch": 0.33040952657566075, + "grad_norm": 0.3832455277442932, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 2844 + }, + { + "epoch": 0.33052570432762124, + "grad_norm": 0.3800029456615448, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 2845 + }, + { + "epoch": 0.3306418820795818, + "grad_norm": 0.37468817830085754, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 2846 + }, + { + "epoch": 0.3307580598315423, + "grad_norm": 0.3698212802410126, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 2847 + }, + { + "epoch": 0.33087423758350276, + "grad_norm": 0.39800024032592773, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 2848 + }, + { + "epoch": 0.33099041533546325, + "grad_norm": 0.3913809657096863, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 2849 + }, + { + "epoch": 0.33110659308742374, + "grad_norm": 0.3854704201221466, + "learning_rate": 0.0001, + "loss": 1.696, + "step": 2850 + }, + { + "epoch": 0.3312227708393843, + "grad_norm": 0.36068427562713623, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 2851 + }, + { + "epoch": 0.3313389485913448, + "grad_norm": 0.4268893003463745, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 2852 + }, + { + "epoch": 0.33145512634330526, + "grad_norm": 0.3733718991279602, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 2853 + }, + { + "epoch": 0.33157130409526575, + "grad_norm": 0.36404022574424744, + "learning_rate": 0.0001, + "loss": 1.4211, + "step": 2854 + }, + { + "epoch": 0.33168748184722624, + "grad_norm": 0.4233626425266266, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 2855 + }, + { + "epoch": 0.33180365959918673, + "grad_norm": 0.48992058634757996, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 2856 + }, + { + "epoch": 0.3319198373511473, + "grad_norm": 0.4136912226676941, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 2857 + }, + { + "epoch": 0.33203601510310776, + "grad_norm": 0.36938685178756714, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 2858 + }, + { + "epoch": 0.33215219285506825, + "grad_norm": 0.36779820919036865, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 2859 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.36435508728027344, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 2860 + }, + { + "epoch": 0.33238454835898923, + "grad_norm": 0.38518238067626953, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 2861 + }, + { + "epoch": 0.3325007261109498, + "grad_norm": 0.419127881526947, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 2862 + }, + { + "epoch": 0.33261690386291026, + "grad_norm": 0.3840404152870178, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 2863 + }, + { + "epoch": 0.33273308161487075, + "grad_norm": 0.4244234263896942, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 2864 + }, + { + "epoch": 0.33284925936683124, + "grad_norm": 0.40115001797676086, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 2865 + }, + { + "epoch": 0.33296543711879173, + "grad_norm": 0.4338228702545166, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 2866 + }, + { + "epoch": 0.3330816148707523, + "grad_norm": 0.40613770484924316, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 2867 + }, + { + "epoch": 0.33319779262271276, + "grad_norm": 0.4123576283454895, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 2868 + }, + { + "epoch": 0.33331397037467325, + "grad_norm": 0.3951095938682556, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 2869 + }, + { + "epoch": 0.33343014812663374, + "grad_norm": 0.410762220621109, + "learning_rate": 0.0001, + "loss": 1.82, + "step": 2870 + }, + { + "epoch": 0.33354632587859423, + "grad_norm": 0.4156797528266907, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 2871 + }, + { + "epoch": 0.3336625036305548, + "grad_norm": 0.3592885434627533, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 2872 + }, + { + "epoch": 0.33377868138251526, + "grad_norm": 0.42720362544059753, + "learning_rate": 0.0001, + "loss": 1.918, + "step": 2873 + }, + { + "epoch": 0.33389485913447575, + "grad_norm": 0.40637707710266113, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 2874 + }, + { + "epoch": 0.33401103688643624, + "grad_norm": 0.39554738998413086, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 2875 + }, + { + "epoch": 0.33412721463839673, + "grad_norm": 0.37411895394325256, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 2876 + }, + { + "epoch": 0.3342433923903572, + "grad_norm": 0.38688522577285767, + "learning_rate": 0.0001, + "loss": 1.5817, + "step": 2877 + }, + { + "epoch": 0.33435957014231776, + "grad_norm": 0.39456331729888916, + "learning_rate": 0.0001, + "loss": 1.7141, + "step": 2878 + }, + { + "epoch": 0.33447574789427825, + "grad_norm": 0.37233471870422363, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 2879 + }, + { + "epoch": 0.33459192564623874, + "grad_norm": 0.3917320668697357, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 2880 + }, + { + "epoch": 0.33470810339819923, + "grad_norm": 0.3961198627948761, + "learning_rate": 0.0001, + "loss": 1.7296, + "step": 2881 + }, + { + "epoch": 0.3348242811501597, + "grad_norm": 0.4228503108024597, + "learning_rate": 0.0001, + "loss": 1.8795, + "step": 2882 + }, + { + "epoch": 0.33494045890212026, + "grad_norm": 0.3861311376094818, + "learning_rate": 0.0001, + "loss": 1.8777, + "step": 2883 + }, + { + "epoch": 0.33505663665408075, + "grad_norm": 0.3791095018386841, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 2884 + }, + { + "epoch": 0.33517281440604124, + "grad_norm": 0.45170632004737854, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 2885 + }, + { + "epoch": 0.33528899215800173, + "grad_norm": 0.4086534082889557, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 2886 + }, + { + "epoch": 0.3354051699099622, + "grad_norm": 0.4174976348876953, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 2887 + }, + { + "epoch": 0.33552134766192276, + "grad_norm": 0.39714473485946655, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 2888 + }, + { + "epoch": 0.33563752541388325, + "grad_norm": 0.39770638942718506, + "learning_rate": 0.0001, + "loss": 1.7405, + "step": 2889 + }, + { + "epoch": 0.33575370316584374, + "grad_norm": 0.4018489122390747, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 2890 + }, + { + "epoch": 0.33586988091780423, + "grad_norm": 0.3446825444698334, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 2891 + }, + { + "epoch": 0.3359860586697647, + "grad_norm": 0.41117849946022034, + "learning_rate": 0.0001, + "loss": 1.7171, + "step": 2892 + }, + { + "epoch": 0.33610223642172526, + "grad_norm": 0.4037335216999054, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 2893 + }, + { + "epoch": 0.33621841417368575, + "grad_norm": 0.3712430000305176, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 2894 + }, + { + "epoch": 0.33633459192564624, + "grad_norm": 0.4003915786743164, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 2895 + }, + { + "epoch": 0.33645076967760673, + "grad_norm": 0.43774011731147766, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 2896 + }, + { + "epoch": 0.3365669474295672, + "grad_norm": 0.43321821093559265, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 2897 + }, + { + "epoch": 0.33668312518152776, + "grad_norm": 0.37499547004699707, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 2898 + }, + { + "epoch": 0.33679930293348825, + "grad_norm": 0.37291446328163147, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 2899 + }, + { + "epoch": 0.33691548068544874, + "grad_norm": 0.44143107533454895, + "learning_rate": 0.0001, + "loss": 1.8552, + "step": 2900 + }, + { + "epoch": 0.33703165843740923, + "grad_norm": 0.3805929720401764, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 2901 + }, + { + "epoch": 0.3371478361893697, + "grad_norm": 0.3873468339443207, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 2902 + }, + { + "epoch": 0.3372640139413302, + "grad_norm": 0.3733051121234894, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 2903 + }, + { + "epoch": 0.33738019169329075, + "grad_norm": 0.40816688537597656, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 2904 + }, + { + "epoch": 0.33749636944525124, + "grad_norm": 0.3908594250679016, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 2905 + }, + { + "epoch": 0.33761254719721173, + "grad_norm": 0.40407541394233704, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 2906 + }, + { + "epoch": 0.3377287249491722, + "grad_norm": 0.4066741168498993, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 2907 + }, + { + "epoch": 0.3378449027011327, + "grad_norm": 0.3964504599571228, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 2908 + }, + { + "epoch": 0.33796108045309325, + "grad_norm": 0.37795713543891907, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 2909 + }, + { + "epoch": 0.33807725820505374, + "grad_norm": 0.37210172414779663, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 2910 + }, + { + "epoch": 0.33819343595701423, + "grad_norm": 0.3974775969982147, + "learning_rate": 0.0001, + "loss": 1.7075, + "step": 2911 + }, + { + "epoch": 0.3383096137089747, + "grad_norm": 0.37600451707839966, + "learning_rate": 0.0001, + "loss": 1.8123, + "step": 2912 + }, + { + "epoch": 0.3384257914609352, + "grad_norm": 0.3853381276130676, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 2913 + }, + { + "epoch": 0.33854196921289575, + "grad_norm": 0.38297778367996216, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 2914 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 0.392566978931427, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 2915 + }, + { + "epoch": 0.33877432471681673, + "grad_norm": 0.3868968188762665, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 2916 + }, + { + "epoch": 0.3388905024687772, + "grad_norm": 0.39296168088912964, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 2917 + }, + { + "epoch": 0.3390066802207377, + "grad_norm": 0.3615785837173462, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 2918 + }, + { + "epoch": 0.33912285797269826, + "grad_norm": 0.3897397220134735, + "learning_rate": 0.0001, + "loss": 1.7675, + "step": 2919 + }, + { + "epoch": 0.33923903572465874, + "grad_norm": 0.3959611654281616, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 2920 + }, + { + "epoch": 0.33935521347661923, + "grad_norm": 0.39301490783691406, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 2921 + }, + { + "epoch": 0.3394713912285797, + "grad_norm": 0.3942652940750122, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 2922 + }, + { + "epoch": 0.3395875689805402, + "grad_norm": 0.43564826250076294, + "learning_rate": 0.0001, + "loss": 1.8724, + "step": 2923 + }, + { + "epoch": 0.3397037467325007, + "grad_norm": 0.39441993832588196, + "learning_rate": 0.0001, + "loss": 1.7982, + "step": 2924 + }, + { + "epoch": 0.33981992448446124, + "grad_norm": 0.3842059373855591, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 2925 + }, + { + "epoch": 0.33993610223642173, + "grad_norm": 0.40640154480934143, + "learning_rate": 0.0001, + "loss": 1.8546, + "step": 2926 + }, + { + "epoch": 0.3400522799883822, + "grad_norm": 0.41045522689819336, + "learning_rate": 0.0001, + "loss": 1.8838, + "step": 2927 + }, + { + "epoch": 0.3401684577403427, + "grad_norm": 0.37923598289489746, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 2928 + }, + { + "epoch": 0.3402846354923032, + "grad_norm": 0.3806171715259552, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 2929 + }, + { + "epoch": 0.34040081324426374, + "grad_norm": 0.3735258877277374, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 2930 + }, + { + "epoch": 0.34051699099622423, + "grad_norm": 0.3817983865737915, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 2931 + }, + { + "epoch": 0.3406331687481847, + "grad_norm": 0.4227299690246582, + "learning_rate": 0.0001, + "loss": 1.78, + "step": 2932 + }, + { + "epoch": 0.3407493465001452, + "grad_norm": 0.4059399664402008, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 2933 + }, + { + "epoch": 0.3408655242521057, + "grad_norm": 0.3733903169631958, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 2934 + }, + { + "epoch": 0.34098170200406624, + "grad_norm": 0.3693414330482483, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 2935 + }, + { + "epoch": 0.34109787975602673, + "grad_norm": 0.3974681794643402, + "learning_rate": 0.0001, + "loss": 1.7623, + "step": 2936 + }, + { + "epoch": 0.3412140575079872, + "grad_norm": 0.40453028678894043, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 2937 + }, + { + "epoch": 0.3413302352599477, + "grad_norm": 0.3921975791454315, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 2938 + }, + { + "epoch": 0.3414464130119082, + "grad_norm": 0.39022400975227356, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 2939 + }, + { + "epoch": 0.34156259076386875, + "grad_norm": 0.39960476756095886, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 2940 + }, + { + "epoch": 0.34167876851582923, + "grad_norm": 0.40952742099761963, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 2941 + }, + { + "epoch": 0.3417949462677897, + "grad_norm": 0.3948822617530823, + "learning_rate": 0.0001, + "loss": 1.7191, + "step": 2942 + }, + { + "epoch": 0.3419111240197502, + "grad_norm": 0.4061760902404785, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 2943 + }, + { + "epoch": 0.3420273017717107, + "grad_norm": 0.3945620357990265, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 2944 + }, + { + "epoch": 0.3421434795236712, + "grad_norm": 0.4100196659564972, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 2945 + }, + { + "epoch": 0.34225965727563173, + "grad_norm": 0.3933731019496918, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 2946 + }, + { + "epoch": 0.3423758350275922, + "grad_norm": 0.3884199559688568, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 2947 + }, + { + "epoch": 0.3424920127795527, + "grad_norm": 0.386350154876709, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 2948 + }, + { + "epoch": 0.3426081905315132, + "grad_norm": 0.40476351976394653, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 2949 + }, + { + "epoch": 0.3427243682834737, + "grad_norm": 0.407321572303772, + "learning_rate": 0.0001, + "loss": 1.7674, + "step": 2950 + }, + { + "epoch": 0.34284054603543423, + "grad_norm": 0.40488916635513306, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 2951 + }, + { + "epoch": 0.3429567237873947, + "grad_norm": 0.4045671224594116, + "learning_rate": 0.0001, + "loss": 1.8208, + "step": 2952 + }, + { + "epoch": 0.3430729015393552, + "grad_norm": 0.4002356231212616, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 2953 + }, + { + "epoch": 0.3431890792913157, + "grad_norm": 0.3586932122707367, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 2954 + }, + { + "epoch": 0.3433052570432762, + "grad_norm": 0.370332807302475, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 2955 + }, + { + "epoch": 0.34342143479523674, + "grad_norm": 0.39355921745300293, + "learning_rate": 0.0001, + "loss": 1.8061, + "step": 2956 + }, + { + "epoch": 0.3435376125471972, + "grad_norm": 0.39952757954597473, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 2957 + }, + { + "epoch": 0.3436537902991577, + "grad_norm": 0.3770568370819092, + "learning_rate": 0.0001, + "loss": 1.724, + "step": 2958 + }, + { + "epoch": 0.3437699680511182, + "grad_norm": 0.4441458582878113, + "learning_rate": 0.0001, + "loss": 1.9858, + "step": 2959 + }, + { + "epoch": 0.3438861458030787, + "grad_norm": 0.38884633779525757, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 2960 + }, + { + "epoch": 0.34400232355503924, + "grad_norm": 0.41551780700683594, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 2961 + }, + { + "epoch": 0.3441185013069997, + "grad_norm": 0.3980303108692169, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 2962 + }, + { + "epoch": 0.3442346790589602, + "grad_norm": 0.3553750813007355, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 2963 + }, + { + "epoch": 0.3443508568109207, + "grad_norm": 0.38143518567085266, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 2964 + }, + { + "epoch": 0.3444670345628812, + "grad_norm": 0.40971437096595764, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 2965 + }, + { + "epoch": 0.3445832123148417, + "grad_norm": 0.35537075996398926, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 2966 + }, + { + "epoch": 0.3446993900668022, + "grad_norm": 0.38093459606170654, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 2967 + }, + { + "epoch": 0.3448155678187627, + "grad_norm": 0.34431418776512146, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 2968 + }, + { + "epoch": 0.3449317455707232, + "grad_norm": 0.427884042263031, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 2969 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.4321180582046509, + "learning_rate": 0.0001, + "loss": 1.8182, + "step": 2970 + }, + { + "epoch": 0.3451641010746442, + "grad_norm": 0.3925512731075287, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 2971 + }, + { + "epoch": 0.3452802788266047, + "grad_norm": 0.3812701106071472, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 2972 + }, + { + "epoch": 0.3453964565785652, + "grad_norm": 0.4410078227519989, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 2973 + }, + { + "epoch": 0.3455126343305257, + "grad_norm": 0.38449546694755554, + "learning_rate": 0.0001, + "loss": 1.7467, + "step": 2974 + }, + { + "epoch": 0.3456288120824862, + "grad_norm": 0.3961304724216461, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 2975 + }, + { + "epoch": 0.3457449898344467, + "grad_norm": 0.3921011686325073, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 2976 + }, + { + "epoch": 0.3458611675864072, + "grad_norm": 0.39690279960632324, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 2977 + }, + { + "epoch": 0.3459773453383677, + "grad_norm": 0.4186713397502899, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 2978 + }, + { + "epoch": 0.3460935230903282, + "grad_norm": 0.39231374859809875, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 2979 + }, + { + "epoch": 0.3462097008422887, + "grad_norm": 0.3903275430202484, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 2980 + }, + { + "epoch": 0.3463258785942492, + "grad_norm": 0.4018222689628601, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 2981 + }, + { + "epoch": 0.3464420563462097, + "grad_norm": 0.41252797842025757, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 2982 + }, + { + "epoch": 0.3465582340981702, + "grad_norm": 0.38088223338127136, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 2983 + }, + { + "epoch": 0.3466744118501307, + "grad_norm": 0.3844488859176636, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 2984 + }, + { + "epoch": 0.3467905896020912, + "grad_norm": 0.3818972408771515, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 2985 + }, + { + "epoch": 0.3469067673540517, + "grad_norm": 0.4173097014427185, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 2986 + }, + { + "epoch": 0.3470229451060122, + "grad_norm": 0.428475946187973, + "learning_rate": 0.0001, + "loss": 1.8248, + "step": 2987 + }, + { + "epoch": 0.3471391228579727, + "grad_norm": 0.40576884150505066, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 2988 + }, + { + "epoch": 0.3472553006099332, + "grad_norm": 0.4049152433872223, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 2989 + }, + { + "epoch": 0.3473714783618937, + "grad_norm": 0.3948923647403717, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 2990 + }, + { + "epoch": 0.3474876561138542, + "grad_norm": 0.3774530291557312, + "learning_rate": 0.0001, + "loss": 1.7484, + "step": 2991 + }, + { + "epoch": 0.34760383386581467, + "grad_norm": 0.38777539134025574, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 2992 + }, + { + "epoch": 0.3477200116177752, + "grad_norm": 0.38459205627441406, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 2993 + }, + { + "epoch": 0.3478361893697357, + "grad_norm": 0.44415146112442017, + "learning_rate": 0.0001, + "loss": 1.8133, + "step": 2994 + }, + { + "epoch": 0.3479523671216962, + "grad_norm": 0.3697360157966614, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 2995 + }, + { + "epoch": 0.3480685448736567, + "grad_norm": 0.4112284779548645, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 2996 + }, + { + "epoch": 0.34818472262561717, + "grad_norm": 0.42278242111206055, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 2997 + }, + { + "epoch": 0.3483009003775777, + "grad_norm": 0.3720592260360718, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 2998 + }, + { + "epoch": 0.3484170781295382, + "grad_norm": 0.37524881958961487, + "learning_rate": 0.0001, + "loss": 1.6982, + "step": 2999 + }, + { + "epoch": 0.3485332558814987, + "grad_norm": 0.38815271854400635, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 3000 + }, + { + "epoch": 0.3486494336334592, + "grad_norm": 0.4013916552066803, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 3001 + }, + { + "epoch": 0.34876561138541967, + "grad_norm": 0.3819652497768402, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 3002 + }, + { + "epoch": 0.3488817891373802, + "grad_norm": 0.37497755885124207, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 3003 + }, + { + "epoch": 0.3489979668893407, + "grad_norm": 0.3700786232948303, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 3004 + }, + { + "epoch": 0.3491141446413012, + "grad_norm": 0.40467193722724915, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 3005 + }, + { + "epoch": 0.3492303223932617, + "grad_norm": 0.3688233494758606, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 3006 + }, + { + "epoch": 0.34934650014522217, + "grad_norm": 0.39277154207229614, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 3007 + }, + { + "epoch": 0.3494626778971827, + "grad_norm": 0.3854866921901703, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 3008 + }, + { + "epoch": 0.3495788556491432, + "grad_norm": 0.4092784821987152, + "learning_rate": 0.0001, + "loss": 1.7302, + "step": 3009 + }, + { + "epoch": 0.3496950334011037, + "grad_norm": 0.3558938205242157, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 3010 + }, + { + "epoch": 0.3498112111530642, + "grad_norm": 0.4179665148258209, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 3011 + }, + { + "epoch": 0.34992738890502467, + "grad_norm": 0.38339027762413025, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 3012 + }, + { + "epoch": 0.35004356665698516, + "grad_norm": 0.36649930477142334, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 3013 + }, + { + "epoch": 0.3501597444089457, + "grad_norm": 0.4202296733856201, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 3014 + }, + { + "epoch": 0.3502759221609062, + "grad_norm": 0.4036094844341278, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 3015 + }, + { + "epoch": 0.3503920999128667, + "grad_norm": 0.3799827992916107, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 3016 + }, + { + "epoch": 0.3505082776648272, + "grad_norm": 0.40348902344703674, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 3017 + }, + { + "epoch": 0.35062445541678766, + "grad_norm": 0.38102301955223083, + "learning_rate": 0.0001, + "loss": 1.7244, + "step": 3018 + }, + { + "epoch": 0.3507406331687482, + "grad_norm": 0.3788668215274811, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 3019 + }, + { + "epoch": 0.3508568109207087, + "grad_norm": 0.38864848017692566, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 3020 + }, + { + "epoch": 0.3509729886726692, + "grad_norm": 0.37422239780426025, + "learning_rate": 0.0001, + "loss": 1.7779, + "step": 3021 + }, + { + "epoch": 0.3510891664246297, + "grad_norm": 0.37829017639160156, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 3022 + }, + { + "epoch": 0.35120534417659016, + "grad_norm": 0.39269503951072693, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 3023 + }, + { + "epoch": 0.3513215219285507, + "grad_norm": 0.381878137588501, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 3024 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 0.39613422751426697, + "learning_rate": 0.0001, + "loss": 1.8325, + "step": 3025 + }, + { + "epoch": 0.3515538774324717, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0001, + "loss": 1.7125, + "step": 3026 + }, + { + "epoch": 0.3516700551844322, + "grad_norm": 0.3661247789859772, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 3027 + }, + { + "epoch": 0.35178623293639266, + "grad_norm": 0.36878344416618347, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 3028 + }, + { + "epoch": 0.3519024106883532, + "grad_norm": 0.38732531666755676, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 3029 + }, + { + "epoch": 0.3520185884403137, + "grad_norm": 0.4050348103046417, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 3030 + }, + { + "epoch": 0.3521347661922742, + "grad_norm": 0.40039730072021484, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 3031 + }, + { + "epoch": 0.3522509439442347, + "grad_norm": 0.40154829621315, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 3032 + }, + { + "epoch": 0.35236712169619516, + "grad_norm": 0.37400686740875244, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 3033 + }, + { + "epoch": 0.35248329944815565, + "grad_norm": 0.3930714726448059, + "learning_rate": 0.0001, + "loss": 1.874, + "step": 3034 + }, + { + "epoch": 0.3525994772001162, + "grad_norm": 0.4327416718006134, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 3035 + }, + { + "epoch": 0.3527156549520767, + "grad_norm": 0.4135274887084961, + "learning_rate": 0.0001, + "loss": 1.9436, + "step": 3036 + }, + { + "epoch": 0.3528318327040372, + "grad_norm": 0.36766317486763, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 3037 + }, + { + "epoch": 0.35294801045599766, + "grad_norm": 0.3964691460132599, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 3038 + }, + { + "epoch": 0.35306418820795815, + "grad_norm": 0.4026515781879425, + "learning_rate": 0.0001, + "loss": 1.8446, + "step": 3039 + }, + { + "epoch": 0.3531803659599187, + "grad_norm": 0.38630566000938416, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 3040 + }, + { + "epoch": 0.3532965437118792, + "grad_norm": 0.38038820028305054, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 3041 + }, + { + "epoch": 0.3534127214638397, + "grad_norm": 0.379242867231369, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 3042 + }, + { + "epoch": 0.35352889921580016, + "grad_norm": 0.3936004340648651, + "learning_rate": 0.0001, + "loss": 1.8992, + "step": 3043 + }, + { + "epoch": 0.35364507696776065, + "grad_norm": 0.3659883737564087, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 3044 + }, + { + "epoch": 0.3537612547197212, + "grad_norm": 0.4306775629520416, + "learning_rate": 0.0001, + "loss": 1.8698, + "step": 3045 + }, + { + "epoch": 0.3538774324716817, + "grad_norm": 0.40540704131126404, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 3046 + }, + { + "epoch": 0.3539936102236422, + "grad_norm": 0.3898630440235138, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 3047 + }, + { + "epoch": 0.35410978797560266, + "grad_norm": 0.40471145510673523, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 3048 + }, + { + "epoch": 0.35422596572756315, + "grad_norm": 0.40043899416923523, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 3049 + }, + { + "epoch": 0.3543421434795237, + "grad_norm": 0.3994670808315277, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 3050 + }, + { + "epoch": 0.3544583212314842, + "grad_norm": 0.4123898446559906, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 3051 + }, + { + "epoch": 0.3545744989834447, + "grad_norm": 0.3823109567165375, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 3052 + }, + { + "epoch": 0.35469067673540516, + "grad_norm": 0.41799673438072205, + "learning_rate": 0.0001, + "loss": 1.7098, + "step": 3053 + }, + { + "epoch": 0.35480685448736565, + "grad_norm": 0.36214837431907654, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 3054 + }, + { + "epoch": 0.35492303223932614, + "grad_norm": 0.4229331910610199, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 3055 + }, + { + "epoch": 0.3550392099912867, + "grad_norm": 0.38769447803497314, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 3056 + }, + { + "epoch": 0.3551553877432472, + "grad_norm": 0.39255595207214355, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 3057 + }, + { + "epoch": 0.35527156549520766, + "grad_norm": 0.4204464852809906, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 3058 + }, + { + "epoch": 0.35538774324716815, + "grad_norm": 0.3666527271270752, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 3059 + }, + { + "epoch": 0.35550392099912864, + "grad_norm": 0.384753942489624, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 3060 + }, + { + "epoch": 0.3556200987510892, + "grad_norm": 0.3891599178314209, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 3061 + }, + { + "epoch": 0.3557362765030497, + "grad_norm": 0.37767064571380615, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 3062 + }, + { + "epoch": 0.35585245425501016, + "grad_norm": 0.40282127261161804, + "learning_rate": 0.0001, + "loss": 1.8185, + "step": 3063 + }, + { + "epoch": 0.35596863200697065, + "grad_norm": 0.3778342008590698, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 3064 + }, + { + "epoch": 0.35608480975893114, + "grad_norm": 0.37607836723327637, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 3065 + }, + { + "epoch": 0.3562009875108917, + "grad_norm": 0.3772094249725342, + "learning_rate": 0.0001, + "loss": 1.7671, + "step": 3066 + }, + { + "epoch": 0.3563171652628522, + "grad_norm": 0.4059067666530609, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 3067 + }, + { + "epoch": 0.35643334301481266, + "grad_norm": 0.39582398533821106, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 3068 + }, + { + "epoch": 0.35654952076677315, + "grad_norm": 0.3916482627391815, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 3069 + }, + { + "epoch": 0.35666569851873364, + "grad_norm": 0.41407474875450134, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 3070 + }, + { + "epoch": 0.3567818762706942, + "grad_norm": 0.41029298305511475, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 3071 + }, + { + "epoch": 0.3568980540226547, + "grad_norm": 0.4083024859428406, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 3072 + }, + { + "epoch": 0.35701423177461517, + "grad_norm": 0.44555673003196716, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 3073 + }, + { + "epoch": 0.35713040952657565, + "grad_norm": 0.37947797775268555, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 3074 + }, + { + "epoch": 0.35724658727853614, + "grad_norm": 0.3823384940624237, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 3075 + }, + { + "epoch": 0.3573627650304967, + "grad_norm": 0.3980938494205475, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 3076 + }, + { + "epoch": 0.3574789427824572, + "grad_norm": 0.39685919880867004, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 3077 + }, + { + "epoch": 0.35759512053441767, + "grad_norm": 0.4064877927303314, + "learning_rate": 0.0001, + "loss": 1.7869, + "step": 3078 + }, + { + "epoch": 0.35771129828637815, + "grad_norm": 0.41149288415908813, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 3079 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.36738306283950806, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 3080 + }, + { + "epoch": 0.35794365379029913, + "grad_norm": 0.393264502286911, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 3081 + }, + { + "epoch": 0.3580598315422597, + "grad_norm": 0.3866167366504669, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 3082 + }, + { + "epoch": 0.35817600929422017, + "grad_norm": 0.38712388277053833, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 3083 + }, + { + "epoch": 0.35829218704618065, + "grad_norm": 0.401869535446167, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 3084 + }, + { + "epoch": 0.35840836479814114, + "grad_norm": 0.3952792286872864, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 3085 + }, + { + "epoch": 0.35852454255010163, + "grad_norm": 0.36368271708488464, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 3086 + }, + { + "epoch": 0.3586407203020622, + "grad_norm": 0.39687228202819824, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 3087 + }, + { + "epoch": 0.35875689805402267, + "grad_norm": 0.41528600454330444, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 3088 + }, + { + "epoch": 0.35887307580598315, + "grad_norm": 0.420449435710907, + "learning_rate": 0.0001, + "loss": 1.7244, + "step": 3089 + }, + { + "epoch": 0.35898925355794364, + "grad_norm": 0.43798938393592834, + "learning_rate": 0.0001, + "loss": 1.8541, + "step": 3090 + }, + { + "epoch": 0.35910543130990413, + "grad_norm": 0.37088409066200256, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 3091 + }, + { + "epoch": 0.3592216090618647, + "grad_norm": 0.41478070616722107, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 3092 + }, + { + "epoch": 0.35933778681382517, + "grad_norm": 0.4017499089241028, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 3093 + }, + { + "epoch": 0.35945396456578566, + "grad_norm": 0.4266115128993988, + "learning_rate": 0.0001, + "loss": 1.7716, + "step": 3094 + }, + { + "epoch": 0.35957014231774614, + "grad_norm": 0.39603403210639954, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 3095 + }, + { + "epoch": 0.35968632006970663, + "grad_norm": 0.4058244824409485, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 3096 + }, + { + "epoch": 0.3598024978216672, + "grad_norm": 0.3646165728569031, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 3097 + }, + { + "epoch": 0.35991867557362767, + "grad_norm": 0.3995044529438019, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 3098 + }, + { + "epoch": 0.36003485332558816, + "grad_norm": 0.3803756535053253, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 3099 + }, + { + "epoch": 0.36015103107754864, + "grad_norm": 0.41200825572013855, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 3100 + }, + { + "epoch": 0.36026720882950913, + "grad_norm": 0.394388347864151, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 3101 + }, + { + "epoch": 0.3603833865814696, + "grad_norm": 0.3949473798274994, + "learning_rate": 0.0001, + "loss": 1.8397, + "step": 3102 + }, + { + "epoch": 0.36049956433343017, + "grad_norm": 0.41774412989616394, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 3103 + }, + { + "epoch": 0.36061574208539066, + "grad_norm": 0.3581482470035553, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 3104 + }, + { + "epoch": 0.36073191983735114, + "grad_norm": 0.41178280115127563, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 3105 + }, + { + "epoch": 0.36084809758931163, + "grad_norm": 0.4004935920238495, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 3106 + }, + { + "epoch": 0.3609642753412721, + "grad_norm": 0.403432697057724, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 3107 + }, + { + "epoch": 0.36108045309323267, + "grad_norm": 0.3860589861869812, + "learning_rate": 0.0001, + "loss": 1.691, + "step": 3108 + }, + { + "epoch": 0.36119663084519316, + "grad_norm": 0.37853455543518066, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 3109 + }, + { + "epoch": 0.36131280859715365, + "grad_norm": 0.40292003750801086, + "learning_rate": 0.0001, + "loss": 1.8344, + "step": 3110 + }, + { + "epoch": 0.36142898634911413, + "grad_norm": 0.3984355926513672, + "learning_rate": 0.0001, + "loss": 1.879, + "step": 3111 + }, + { + "epoch": 0.3615451641010746, + "grad_norm": 0.44067731499671936, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 3112 + }, + { + "epoch": 0.36166134185303517, + "grad_norm": 0.3945563733577728, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 3113 + }, + { + "epoch": 0.36177751960499566, + "grad_norm": 0.4175066649913788, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 3114 + }, + { + "epoch": 0.36189369735695615, + "grad_norm": 0.41401904821395874, + "learning_rate": 0.0001, + "loss": 1.7809, + "step": 3115 + }, + { + "epoch": 0.36200987510891663, + "grad_norm": 0.3950018882751465, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 3116 + }, + { + "epoch": 0.3621260528608771, + "grad_norm": 0.3951791524887085, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 3117 + }, + { + "epoch": 0.36224223061283767, + "grad_norm": 0.42133089900016785, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 3118 + }, + { + "epoch": 0.36235840836479816, + "grad_norm": 0.423442542552948, + "learning_rate": 0.0001, + "loss": 1.9073, + "step": 3119 + }, + { + "epoch": 0.36247458611675865, + "grad_norm": 0.4069424271583557, + "learning_rate": 0.0001, + "loss": 1.8604, + "step": 3120 + }, + { + "epoch": 0.36259076386871913, + "grad_norm": 0.4289453625679016, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 3121 + }, + { + "epoch": 0.3627069416206796, + "grad_norm": 0.36343902349472046, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 3122 + }, + { + "epoch": 0.3628231193726401, + "grad_norm": 0.45105743408203125, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 3123 + }, + { + "epoch": 0.36293929712460066, + "grad_norm": 0.3925178050994873, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 3124 + }, + { + "epoch": 0.36305547487656115, + "grad_norm": 0.4135838449001312, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 3125 + }, + { + "epoch": 0.36317165262852164, + "grad_norm": 0.36576542258262634, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 3126 + }, + { + "epoch": 0.3632878303804821, + "grad_norm": 0.4198266565799713, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 3127 + }, + { + "epoch": 0.3634040081324426, + "grad_norm": 0.3670734763145447, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 3128 + }, + { + "epoch": 0.36352018588440316, + "grad_norm": 0.36761337518692017, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 3129 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.40638405084609985, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 3130 + }, + { + "epoch": 0.36375254138832414, + "grad_norm": 0.38102391362190247, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 3131 + }, + { + "epoch": 0.3638687191402846, + "grad_norm": 0.39593052864074707, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 3132 + }, + { + "epoch": 0.3639848968922451, + "grad_norm": 0.4067305624485016, + "learning_rate": 0.0001, + "loss": 1.7925, + "step": 3133 + }, + { + "epoch": 0.36410107464420566, + "grad_norm": 0.3692638576030731, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 3134 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 0.40837207436561584, + "learning_rate": 0.0001, + "loss": 1.8112, + "step": 3135 + }, + { + "epoch": 0.36433343014812664, + "grad_norm": 0.41324061155319214, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 3136 + }, + { + "epoch": 0.3644496079000871, + "grad_norm": 0.35416892170906067, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 3137 + }, + { + "epoch": 0.3645657856520476, + "grad_norm": 0.44099685549736023, + "learning_rate": 0.0001, + "loss": 1.918, + "step": 3138 + }, + { + "epoch": 0.36468196340400816, + "grad_norm": 0.3850747346878052, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 3139 + }, + { + "epoch": 0.36479814115596865, + "grad_norm": 0.4012189209461212, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 3140 + }, + { + "epoch": 0.36491431890792914, + "grad_norm": 0.41232171654701233, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 3141 + }, + { + "epoch": 0.3650304966598896, + "grad_norm": 0.37721219658851624, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 3142 + }, + { + "epoch": 0.3651466744118501, + "grad_norm": 0.3969804346561432, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 3143 + }, + { + "epoch": 0.3652628521638106, + "grad_norm": 0.4444282054901123, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 3144 + }, + { + "epoch": 0.36537902991577115, + "grad_norm": 0.4145006239414215, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 3145 + }, + { + "epoch": 0.36549520766773164, + "grad_norm": 0.3919104337692261, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 3146 + }, + { + "epoch": 0.3656113854196921, + "grad_norm": 0.4502047002315521, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 3147 + }, + { + "epoch": 0.3657275631716526, + "grad_norm": 0.3652731776237488, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 3148 + }, + { + "epoch": 0.3658437409236131, + "grad_norm": 0.37962082028388977, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 3149 + }, + { + "epoch": 0.36595991867557365, + "grad_norm": 0.39860934019088745, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 3150 + }, + { + "epoch": 0.36607609642753414, + "grad_norm": 0.42549842596054077, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 3151 + }, + { + "epoch": 0.3661922741794946, + "grad_norm": 0.37746211886405945, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 3152 + }, + { + "epoch": 0.3663084519314551, + "grad_norm": 0.3802013397216797, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 3153 + }, + { + "epoch": 0.3664246296834156, + "grad_norm": 0.426530659198761, + "learning_rate": 0.0001, + "loss": 1.8369, + "step": 3154 + }, + { + "epoch": 0.36654080743537615, + "grad_norm": 0.4312988817691803, + "learning_rate": 0.0001, + "loss": 1.8648, + "step": 3155 + }, + { + "epoch": 0.36665698518733664, + "grad_norm": 0.41918662190437317, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 3156 + }, + { + "epoch": 0.3667731629392971, + "grad_norm": 0.3956180810928345, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 3157 + }, + { + "epoch": 0.3668893406912576, + "grad_norm": 0.3965786397457123, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 3158 + }, + { + "epoch": 0.3670055184432181, + "grad_norm": 0.38480865955352783, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 3159 + }, + { + "epoch": 0.36712169619517865, + "grad_norm": 0.3683672249317169, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 3160 + }, + { + "epoch": 0.36723787394713914, + "grad_norm": 0.3930373191833496, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 3161 + }, + { + "epoch": 0.3673540516990996, + "grad_norm": 0.4485227167606354, + "learning_rate": 0.0001, + "loss": 1.8452, + "step": 3162 + }, + { + "epoch": 0.3674702294510601, + "grad_norm": 0.3868573009967804, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 3163 + }, + { + "epoch": 0.3675864072030206, + "grad_norm": 0.35300782322883606, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 3164 + }, + { + "epoch": 0.36770258495498115, + "grad_norm": 0.38356491923332214, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 3165 + }, + { + "epoch": 0.36781876270694164, + "grad_norm": 0.4126032292842865, + "learning_rate": 0.0001, + "loss": 1.7307, + "step": 3166 + }, + { + "epoch": 0.3679349404589021, + "grad_norm": 0.3850405216217041, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 3167 + }, + { + "epoch": 0.3680511182108626, + "grad_norm": 0.40334662795066833, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 3168 + }, + { + "epoch": 0.3681672959628231, + "grad_norm": 0.3768133223056793, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 3169 + }, + { + "epoch": 0.3682834737147836, + "grad_norm": 0.43316081166267395, + "learning_rate": 0.0001, + "loss": 1.8014, + "step": 3170 + }, + { + "epoch": 0.36839965146674414, + "grad_norm": 0.39245787262916565, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 3171 + }, + { + "epoch": 0.3685158292187046, + "grad_norm": 0.42442598938941956, + "learning_rate": 0.0001, + "loss": 1.8705, + "step": 3172 + }, + { + "epoch": 0.3686320069706651, + "grad_norm": 0.384989857673645, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 3173 + }, + { + "epoch": 0.3687481847226256, + "grad_norm": 0.3639425039291382, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 3174 + }, + { + "epoch": 0.3688643624745861, + "grad_norm": 0.4113941490650177, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 3175 + }, + { + "epoch": 0.36898054022654664, + "grad_norm": 0.39703720808029175, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 3176 + }, + { + "epoch": 0.3690967179785071, + "grad_norm": 0.3902719020843506, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 3177 + }, + { + "epoch": 0.3692128957304676, + "grad_norm": 0.3856205344200134, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 3178 + }, + { + "epoch": 0.3693290734824281, + "grad_norm": 0.4505693018436432, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 3179 + }, + { + "epoch": 0.3694452512343886, + "grad_norm": 0.4288894534111023, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 3180 + }, + { + "epoch": 0.36956142898634914, + "grad_norm": 0.39302563667297363, + "learning_rate": 0.0001, + "loss": 1.6753, + "step": 3181 + }, + { + "epoch": 0.3696776067383096, + "grad_norm": 0.3827257454395294, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 3182 + }, + { + "epoch": 0.3697937844902701, + "grad_norm": 0.38766050338745117, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 3183 + }, + { + "epoch": 0.3699099622422306, + "grad_norm": 0.38151904940605164, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 3184 + }, + { + "epoch": 0.3700261399941911, + "grad_norm": 0.3981049656867981, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 3185 + }, + { + "epoch": 0.37014231774615164, + "grad_norm": 0.38091933727264404, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 3186 + }, + { + "epoch": 0.37025849549811213, + "grad_norm": 0.34910398721694946, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 3187 + }, + { + "epoch": 0.3703746732500726, + "grad_norm": 0.3882802128791809, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 3188 + }, + { + "epoch": 0.3704908510020331, + "grad_norm": 0.40641340613365173, + "learning_rate": 0.0001, + "loss": 1.8477, + "step": 3189 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.3742848038673401, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 3190 + }, + { + "epoch": 0.3707232065059541, + "grad_norm": 0.39867183566093445, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 3191 + }, + { + "epoch": 0.37083938425791463, + "grad_norm": 0.38178515434265137, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 3192 + }, + { + "epoch": 0.3709555620098751, + "grad_norm": 0.38367438316345215, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 3193 + }, + { + "epoch": 0.3710717397618356, + "grad_norm": 0.4279497563838959, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 3194 + }, + { + "epoch": 0.3711879175137961, + "grad_norm": 0.4051482379436493, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 3195 + }, + { + "epoch": 0.3713040952657566, + "grad_norm": 0.4077185392379761, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 3196 + }, + { + "epoch": 0.37142027301771713, + "grad_norm": 0.4203553795814514, + "learning_rate": 0.0001, + "loss": 1.8894, + "step": 3197 + }, + { + "epoch": 0.3715364507696776, + "grad_norm": 0.40069088339805603, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 3198 + }, + { + "epoch": 0.3716526285216381, + "grad_norm": 0.4221431016921997, + "learning_rate": 0.0001, + "loss": 1.847, + "step": 3199 + }, + { + "epoch": 0.3717688062735986, + "grad_norm": 0.4118482768535614, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 3200 + }, + { + "epoch": 0.3718849840255591, + "grad_norm": 0.39366352558135986, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 3201 + }, + { + "epoch": 0.37200116177751963, + "grad_norm": 0.38057029247283936, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 3202 + }, + { + "epoch": 0.3721173395294801, + "grad_norm": 0.41096463799476624, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 3203 + }, + { + "epoch": 0.3722335172814406, + "grad_norm": 0.39210009574890137, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 3204 + }, + { + "epoch": 0.3723496950334011, + "grad_norm": 0.404095858335495, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 3205 + }, + { + "epoch": 0.3724658727853616, + "grad_norm": 0.4096939265727997, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 3206 + }, + { + "epoch": 0.37258205053732213, + "grad_norm": 0.41257479786872864, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 3207 + }, + { + "epoch": 0.3726982282892826, + "grad_norm": 0.4033520221710205, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 3208 + }, + { + "epoch": 0.3728144060412431, + "grad_norm": 0.38651078939437866, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 3209 + }, + { + "epoch": 0.3729305837932036, + "grad_norm": 0.40973809361457825, + "learning_rate": 0.0001, + "loss": 1.822, + "step": 3210 + }, + { + "epoch": 0.3730467615451641, + "grad_norm": 0.40880417823791504, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 3211 + }, + { + "epoch": 0.3731629392971246, + "grad_norm": 0.4003618061542511, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 3212 + }, + { + "epoch": 0.3732791170490851, + "grad_norm": 0.38030943274497986, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 3213 + }, + { + "epoch": 0.3733952948010456, + "grad_norm": 0.4228413999080658, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 3214 + }, + { + "epoch": 0.3735114725530061, + "grad_norm": 0.3931327164173126, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 3215 + }, + { + "epoch": 0.3736276503049666, + "grad_norm": 0.39333489537239075, + "learning_rate": 0.0001, + "loss": 1.7909, + "step": 3216 + }, + { + "epoch": 0.3737438280569271, + "grad_norm": 0.4146861433982849, + "learning_rate": 0.0001, + "loss": 1.9366, + "step": 3217 + }, + { + "epoch": 0.3738600058088876, + "grad_norm": 0.3847891390323639, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 3218 + }, + { + "epoch": 0.3739761835608481, + "grad_norm": 0.3797701299190521, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 3219 + }, + { + "epoch": 0.3740923613128086, + "grad_norm": 0.38476166129112244, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 3220 + }, + { + "epoch": 0.3742085390647691, + "grad_norm": 0.39565321803092957, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 3221 + }, + { + "epoch": 0.3743247168167296, + "grad_norm": 0.38622844219207764, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 3222 + }, + { + "epoch": 0.3744408945686901, + "grad_norm": 0.39825373888015747, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 3223 + }, + { + "epoch": 0.3745570723206506, + "grad_norm": 0.35864919424057007, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 3224 + }, + { + "epoch": 0.3746732500726111, + "grad_norm": 0.40784189105033875, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 3225 + }, + { + "epoch": 0.3747894278245716, + "grad_norm": 0.41927802562713623, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 3226 + }, + { + "epoch": 0.3749056055765321, + "grad_norm": 0.44369855523109436, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 3227 + }, + { + "epoch": 0.3750217833284926, + "grad_norm": 0.36899685859680176, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 3228 + }, + { + "epoch": 0.3751379610804531, + "grad_norm": 0.3842264711856842, + "learning_rate": 0.0001, + "loss": 1.7682, + "step": 3229 + }, + { + "epoch": 0.3752541388324136, + "grad_norm": 0.41173362731933594, + "learning_rate": 0.0001, + "loss": 1.7494, + "step": 3230 + }, + { + "epoch": 0.3753703165843741, + "grad_norm": 0.37613826990127563, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 3231 + }, + { + "epoch": 0.3754864943363346, + "grad_norm": 0.4120997190475464, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 3232 + }, + { + "epoch": 0.37560267208829506, + "grad_norm": 0.4014083743095398, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 3233 + }, + { + "epoch": 0.3757188498402556, + "grad_norm": 0.38191670179367065, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 3234 + }, + { + "epoch": 0.3758350275922161, + "grad_norm": 0.3927380442619324, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 3235 + }, + { + "epoch": 0.3759512053441766, + "grad_norm": 0.39120250940322876, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 3236 + }, + { + "epoch": 0.3760673830961371, + "grad_norm": 0.36845624446868896, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 3237 + }, + { + "epoch": 0.37618356084809756, + "grad_norm": 0.39836472272872925, + "learning_rate": 0.0001, + "loss": 1.7282, + "step": 3238 + }, + { + "epoch": 0.3762997386000581, + "grad_norm": 0.3965427279472351, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 3239 + }, + { + "epoch": 0.3764159163520186, + "grad_norm": 0.4197937846183777, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 3240 + }, + { + "epoch": 0.3765320941039791, + "grad_norm": 0.42778176069259644, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 3241 + }, + { + "epoch": 0.3766482718559396, + "grad_norm": 0.3915843665599823, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 3242 + }, + { + "epoch": 0.37676444960790006, + "grad_norm": 0.401483416557312, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 3243 + }, + { + "epoch": 0.3768806273598606, + "grad_norm": 0.4040675163269043, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 3244 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 0.39297956228256226, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 3245 + }, + { + "epoch": 0.3771129828637816, + "grad_norm": 0.3960307240486145, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 3246 + }, + { + "epoch": 0.3772291606157421, + "grad_norm": 0.4098290205001831, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 3247 + }, + { + "epoch": 0.37734533836770257, + "grad_norm": 0.4199242889881134, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 3248 + }, + { + "epoch": 0.3774615161196631, + "grad_norm": 0.4626270830631256, + "learning_rate": 0.0001, + "loss": 1.9117, + "step": 3249 + }, + { + "epoch": 0.3775776938716236, + "grad_norm": 0.3856443464756012, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 3250 + }, + { + "epoch": 0.3776938716235841, + "grad_norm": 0.3848719000816345, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 3251 + }, + { + "epoch": 0.3778100493755446, + "grad_norm": 0.4036993384361267, + "learning_rate": 0.0001, + "loss": 1.8056, + "step": 3252 + }, + { + "epoch": 0.37792622712750507, + "grad_norm": 0.42733898758888245, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 3253 + }, + { + "epoch": 0.3780424048794656, + "grad_norm": 0.3956359922885895, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 3254 + }, + { + "epoch": 0.3781585826314261, + "grad_norm": 0.391928106546402, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 3255 + }, + { + "epoch": 0.3782747603833866, + "grad_norm": 0.4201536774635315, + "learning_rate": 0.0001, + "loss": 1.8002, + "step": 3256 + }, + { + "epoch": 0.3783909381353471, + "grad_norm": 0.41449272632598877, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 3257 + }, + { + "epoch": 0.37850711588730757, + "grad_norm": 0.4059070944786072, + "learning_rate": 0.0001, + "loss": 1.6892, + "step": 3258 + }, + { + "epoch": 0.37862329363926805, + "grad_norm": 0.40105417370796204, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 3259 + }, + { + "epoch": 0.3787394713912286, + "grad_norm": 0.4251152276992798, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 3260 + }, + { + "epoch": 0.3788556491431891, + "grad_norm": 0.3787403702735901, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 3261 + }, + { + "epoch": 0.3789718268951496, + "grad_norm": 0.39760053157806396, + "learning_rate": 0.0001, + "loss": 1.7866, + "step": 3262 + }, + { + "epoch": 0.37908800464711007, + "grad_norm": 0.3865306079387665, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 3263 + }, + { + "epoch": 0.37920418239907056, + "grad_norm": 0.40044572949409485, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 3264 + }, + { + "epoch": 0.3793203601510311, + "grad_norm": 0.36181673407554626, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 3265 + }, + { + "epoch": 0.3794365379029916, + "grad_norm": 0.4151827096939087, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 3266 + }, + { + "epoch": 0.3795527156549521, + "grad_norm": 0.3959139287471771, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 3267 + }, + { + "epoch": 0.37966889340691257, + "grad_norm": 0.40265771746635437, + "learning_rate": 0.0001, + "loss": 1.7255, + "step": 3268 + }, + { + "epoch": 0.37978507115887306, + "grad_norm": 0.39795050024986267, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 3269 + }, + { + "epoch": 0.3799012489108336, + "grad_norm": 0.38326337933540344, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 3270 + }, + { + "epoch": 0.3800174266627941, + "grad_norm": 0.4339217245578766, + "learning_rate": 0.0001, + "loss": 1.7723, + "step": 3271 + }, + { + "epoch": 0.3801336044147546, + "grad_norm": 0.391989141702652, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 3272 + }, + { + "epoch": 0.38024978216671507, + "grad_norm": 0.3686724901199341, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 3273 + }, + { + "epoch": 0.38036595991867556, + "grad_norm": 0.3889879584312439, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 3274 + }, + { + "epoch": 0.3804821376706361, + "grad_norm": 0.37887483835220337, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 3275 + }, + { + "epoch": 0.3805983154225966, + "grad_norm": 0.3940137028694153, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 3276 + }, + { + "epoch": 0.3807144931745571, + "grad_norm": 0.38315024971961975, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 3277 + }, + { + "epoch": 0.38083067092651757, + "grad_norm": 0.43694132566452026, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 3278 + }, + { + "epoch": 0.38094684867847806, + "grad_norm": 0.37439560890197754, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 3279 + }, + { + "epoch": 0.38106302643043855, + "grad_norm": 0.39639008045196533, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 3280 + }, + { + "epoch": 0.3811792041823991, + "grad_norm": 0.41483819484710693, + "learning_rate": 0.0001, + "loss": 1.7358, + "step": 3281 + }, + { + "epoch": 0.3812953819343596, + "grad_norm": 0.38614705204963684, + "learning_rate": 0.0001, + "loss": 1.884, + "step": 3282 + }, + { + "epoch": 0.38141155968632007, + "grad_norm": 0.4163734018802643, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 3283 + }, + { + "epoch": 0.38152773743828056, + "grad_norm": 0.3871447741985321, + "learning_rate": 0.0001, + "loss": 1.8271, + "step": 3284 + }, + { + "epoch": 0.38164391519024105, + "grad_norm": 0.4444115161895752, + "learning_rate": 0.0001, + "loss": 1.7751, + "step": 3285 + }, + { + "epoch": 0.3817600929422016, + "grad_norm": 0.3978256285190582, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 3286 + }, + { + "epoch": 0.3818762706941621, + "grad_norm": 0.37365588545799255, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 3287 + }, + { + "epoch": 0.38199244844612257, + "grad_norm": 0.39886194467544556, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 3288 + }, + { + "epoch": 0.38210862619808306, + "grad_norm": 0.4288283586502075, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 3289 + }, + { + "epoch": 0.38222480395004355, + "grad_norm": 0.40688732266426086, + "learning_rate": 0.0001, + "loss": 1.8548, + "step": 3290 + }, + { + "epoch": 0.3823409817020041, + "grad_norm": 0.3931783437728882, + "learning_rate": 0.0001, + "loss": 1.7435, + "step": 3291 + }, + { + "epoch": 0.3824571594539646, + "grad_norm": 0.41494035720825195, + "learning_rate": 0.0001, + "loss": 1.7333, + "step": 3292 + }, + { + "epoch": 0.38257333720592507, + "grad_norm": 0.43234965205192566, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 3293 + }, + { + "epoch": 0.38268951495788556, + "grad_norm": 0.3797743022441864, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 3294 + }, + { + "epoch": 0.38280569270984605, + "grad_norm": 0.39665845036506653, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 3295 + }, + { + "epoch": 0.3829218704618066, + "grad_norm": 0.430963933467865, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 3296 + }, + { + "epoch": 0.3830380482137671, + "grad_norm": 0.3795839250087738, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 3297 + }, + { + "epoch": 0.38315422596572757, + "grad_norm": 0.3861173987388611, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 3298 + }, + { + "epoch": 0.38327040371768806, + "grad_norm": 0.4656057357788086, + "learning_rate": 0.0001, + "loss": 1.7191, + "step": 3299 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.42121168971061707, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 3300 + }, + { + "epoch": 0.38350275922160904, + "grad_norm": 0.4215461313724518, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 3301 + }, + { + "epoch": 0.3836189369735696, + "grad_norm": 0.3778843581676483, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 3302 + }, + { + "epoch": 0.38373511472553007, + "grad_norm": 0.372529000043869, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 3303 + }, + { + "epoch": 0.38385129247749056, + "grad_norm": 0.3816990852355957, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 3304 + }, + { + "epoch": 0.38396747022945105, + "grad_norm": 0.3976283669471741, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 3305 + }, + { + "epoch": 0.38408364798141154, + "grad_norm": 0.38981184363365173, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 3306 + }, + { + "epoch": 0.3841998257333721, + "grad_norm": 0.4135308265686035, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 3307 + }, + { + "epoch": 0.38431600348533257, + "grad_norm": 0.39930155873298645, + "learning_rate": 0.0001, + "loss": 1.785, + "step": 3308 + }, + { + "epoch": 0.38443218123729306, + "grad_norm": 0.36703500151634216, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 3309 + }, + { + "epoch": 0.38454835898925355, + "grad_norm": 0.396085262298584, + "learning_rate": 0.0001, + "loss": 1.6961, + "step": 3310 + }, + { + "epoch": 0.38466453674121404, + "grad_norm": 0.4066247045993805, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 3311 + }, + { + "epoch": 0.3847807144931746, + "grad_norm": 0.4137192368507385, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 3312 + }, + { + "epoch": 0.38489689224513507, + "grad_norm": 0.44450923800468445, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 3313 + }, + { + "epoch": 0.38501306999709556, + "grad_norm": 0.4225256145000458, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 3314 + }, + { + "epoch": 0.38512924774905605, + "grad_norm": 0.37783291935920715, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 3315 + }, + { + "epoch": 0.38524542550101654, + "grad_norm": 0.3956874907016754, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 3316 + }, + { + "epoch": 0.3853616032529771, + "grad_norm": 0.39644670486450195, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 3317 + }, + { + "epoch": 0.38547778100493757, + "grad_norm": 0.4001430869102478, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 3318 + }, + { + "epoch": 0.38559395875689806, + "grad_norm": 0.42300862073898315, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 3319 + }, + { + "epoch": 0.38571013650885855, + "grad_norm": 0.4075738191604614, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 3320 + }, + { + "epoch": 0.38582631426081904, + "grad_norm": 0.40944305062294006, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 3321 + }, + { + "epoch": 0.3859424920127795, + "grad_norm": 0.41256004571914673, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 3322 + }, + { + "epoch": 0.38605866976474007, + "grad_norm": 0.45253586769104004, + "learning_rate": 0.0001, + "loss": 1.9485, + "step": 3323 + }, + { + "epoch": 0.38617484751670056, + "grad_norm": 0.3973802626132965, + "learning_rate": 0.0001, + "loss": 1.7537, + "step": 3324 + }, + { + "epoch": 0.38629102526866105, + "grad_norm": 0.3796943724155426, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 3325 + }, + { + "epoch": 0.38640720302062154, + "grad_norm": 0.38597339391708374, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 3326 + }, + { + "epoch": 0.386523380772582, + "grad_norm": 0.4044647514820099, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 3327 + }, + { + "epoch": 0.38663955852454257, + "grad_norm": 0.38887619972229004, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 3328 + }, + { + "epoch": 0.38675573627650306, + "grad_norm": 0.3996337354183197, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 3329 + }, + { + "epoch": 0.38687191402846355, + "grad_norm": 0.3903794288635254, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 3330 + }, + { + "epoch": 0.38698809178042404, + "grad_norm": 0.41847532987594604, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 3331 + }, + { + "epoch": 0.3871042695323845, + "grad_norm": 0.3963734805583954, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 3332 + }, + { + "epoch": 0.38722044728434507, + "grad_norm": 0.3818768858909607, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 3333 + }, + { + "epoch": 0.38733662503630556, + "grad_norm": 0.4042767286300659, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 3334 + }, + { + "epoch": 0.38745280278826605, + "grad_norm": 0.3764187693595886, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 3335 + }, + { + "epoch": 0.38756898054022654, + "grad_norm": 0.41739174723625183, + "learning_rate": 0.0001, + "loss": 1.771, + "step": 3336 + }, + { + "epoch": 0.387685158292187, + "grad_norm": 0.3916127383708954, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 3337 + }, + { + "epoch": 0.38780133604414757, + "grad_norm": 0.3758176863193512, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 3338 + }, + { + "epoch": 0.38791751379610806, + "grad_norm": 0.37665247917175293, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 3339 + }, + { + "epoch": 0.38803369154806855, + "grad_norm": 0.38842839002609253, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 3340 + }, + { + "epoch": 0.38814986930002904, + "grad_norm": 0.4044833183288574, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 3341 + }, + { + "epoch": 0.3882660470519895, + "grad_norm": 0.3983111083507538, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 3342 + }, + { + "epoch": 0.38838222480395007, + "grad_norm": 0.3763045072555542, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 3343 + }, + { + "epoch": 0.38849840255591056, + "grad_norm": 0.43065011501312256, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 3344 + }, + { + "epoch": 0.38861458030787105, + "grad_norm": 0.3971543312072754, + "learning_rate": 0.0001, + "loss": 1.7052, + "step": 3345 + }, + { + "epoch": 0.38873075805983154, + "grad_norm": 0.4392778277397156, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 3346 + }, + { + "epoch": 0.388846935811792, + "grad_norm": 0.41214969754219055, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 3347 + }, + { + "epoch": 0.3889631135637525, + "grad_norm": 0.3774077296257019, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 3348 + }, + { + "epoch": 0.38907929131571306, + "grad_norm": 0.4119464159011841, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 3349 + }, + { + "epoch": 0.38919546906767355, + "grad_norm": 0.4135492146015167, + "learning_rate": 0.0001, + "loss": 1.7139, + "step": 3350 + }, + { + "epoch": 0.38931164681963404, + "grad_norm": 0.41081520915031433, + "learning_rate": 0.0001, + "loss": 1.8803, + "step": 3351 + }, + { + "epoch": 0.3894278245715945, + "grad_norm": 0.4038920998573303, + "learning_rate": 0.0001, + "loss": 1.8064, + "step": 3352 + }, + { + "epoch": 0.389544002323555, + "grad_norm": 0.4117118716239929, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 3353 + }, + { + "epoch": 0.38966018007551556, + "grad_norm": 0.3965054452419281, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 3354 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 0.3699134886264801, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 3355 + }, + { + "epoch": 0.38989253557943654, + "grad_norm": 0.3890858590602875, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 3356 + }, + { + "epoch": 0.39000871333139703, + "grad_norm": 0.3924883008003235, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 3357 + }, + { + "epoch": 0.3901248910833575, + "grad_norm": 0.40697407722473145, + "learning_rate": 0.0001, + "loss": 1.743, + "step": 3358 + }, + { + "epoch": 0.39024106883531806, + "grad_norm": 0.3949110507965088, + "learning_rate": 0.0001, + "loss": 1.7615, + "step": 3359 + }, + { + "epoch": 0.39035724658727855, + "grad_norm": 0.4023362100124359, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 3360 + }, + { + "epoch": 0.39047342433923904, + "grad_norm": 0.3989759385585785, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 3361 + }, + { + "epoch": 0.39058960209119953, + "grad_norm": 0.3800547420978546, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 3362 + }, + { + "epoch": 0.39070577984316, + "grad_norm": 0.408544659614563, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 3363 + }, + { + "epoch": 0.39082195759512056, + "grad_norm": 0.41592341661453247, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 3364 + }, + { + "epoch": 0.39093813534708105, + "grad_norm": 0.4047764539718628, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 3365 + }, + { + "epoch": 0.39105431309904154, + "grad_norm": 0.3952164649963379, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 3366 + }, + { + "epoch": 0.39117049085100203, + "grad_norm": 0.39583536982536316, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 3367 + }, + { + "epoch": 0.3912866686029625, + "grad_norm": 0.40658921003341675, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 3368 + }, + { + "epoch": 0.391402846354923, + "grad_norm": 0.42151522636413574, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 3369 + }, + { + "epoch": 0.39151902410688355, + "grad_norm": 0.42344412207603455, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 3370 + }, + { + "epoch": 0.39163520185884404, + "grad_norm": 0.3720431625843048, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 3371 + }, + { + "epoch": 0.39175137961080453, + "grad_norm": 0.39411425590515137, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 3372 + }, + { + "epoch": 0.391867557362765, + "grad_norm": 0.3884545862674713, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 3373 + }, + { + "epoch": 0.3919837351147255, + "grad_norm": 0.4163500666618347, + "learning_rate": 0.0001, + "loss": 1.7583, + "step": 3374 + }, + { + "epoch": 0.39209991286668605, + "grad_norm": 0.38707318902015686, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 3375 + }, + { + "epoch": 0.39221609061864654, + "grad_norm": 0.4354010820388794, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 3376 + }, + { + "epoch": 0.39233226837060703, + "grad_norm": 0.39656829833984375, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 3377 + }, + { + "epoch": 0.3924484461225675, + "grad_norm": 0.39710286259651184, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 3378 + }, + { + "epoch": 0.392564623874528, + "grad_norm": 0.40320920944213867, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 3379 + }, + { + "epoch": 0.39268080162648855, + "grad_norm": 0.40849626064300537, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 3380 + }, + { + "epoch": 0.39279697937844904, + "grad_norm": 0.37604308128356934, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 3381 + }, + { + "epoch": 0.39291315713040953, + "grad_norm": 0.38765400648117065, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 3382 + }, + { + "epoch": 0.39302933488237, + "grad_norm": 0.38033145666122437, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 3383 + }, + { + "epoch": 0.3931455126343305, + "grad_norm": 0.39232638478279114, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 3384 + }, + { + "epoch": 0.39326169038629105, + "grad_norm": 0.41576313972473145, + "learning_rate": 0.0001, + "loss": 1.6984, + "step": 3385 + }, + { + "epoch": 0.39337786813825154, + "grad_norm": 0.40268710255622864, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 3386 + }, + { + "epoch": 0.39349404589021203, + "grad_norm": 0.3708171546459198, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 3387 + }, + { + "epoch": 0.3936102236421725, + "grad_norm": 0.3807376027107239, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 3388 + }, + { + "epoch": 0.393726401394133, + "grad_norm": 0.39858368039131165, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 3389 + }, + { + "epoch": 0.3938425791460935, + "grad_norm": 0.3780364692211151, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 3390 + }, + { + "epoch": 0.39395875689805404, + "grad_norm": 0.39685890078544617, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 3391 + }, + { + "epoch": 0.39407493465001453, + "grad_norm": 0.4291624128818512, + "learning_rate": 0.0001, + "loss": 1.8533, + "step": 3392 + }, + { + "epoch": 0.394191112401975, + "grad_norm": 0.366272509098053, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 3393 + }, + { + "epoch": 0.3943072901539355, + "grad_norm": 0.3910382091999054, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 3394 + }, + { + "epoch": 0.394423467905896, + "grad_norm": 0.4170825481414795, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 3395 + }, + { + "epoch": 0.39453964565785654, + "grad_norm": 0.3881252706050873, + "learning_rate": 0.0001, + "loss": 1.7354, + "step": 3396 + }, + { + "epoch": 0.39465582340981703, + "grad_norm": 0.3868289291858673, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 3397 + }, + { + "epoch": 0.3947720011617775, + "grad_norm": 0.38081884384155273, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 3398 + }, + { + "epoch": 0.394888178913738, + "grad_norm": 0.39945870637893677, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 3399 + }, + { + "epoch": 0.3950043566656985, + "grad_norm": 0.4090173840522766, + "learning_rate": 0.0001, + "loss": 1.7628, + "step": 3400 + }, + { + "epoch": 0.39512053441765904, + "grad_norm": 0.3833993971347809, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 3401 + }, + { + "epoch": 0.39523671216961953, + "grad_norm": 0.4201992154121399, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 3402 + }, + { + "epoch": 0.39535288992158, + "grad_norm": 0.4019543528556824, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 3403 + }, + { + "epoch": 0.3954690676735405, + "grad_norm": 0.3776283860206604, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 3404 + }, + { + "epoch": 0.395585245425501, + "grad_norm": 0.38334932923316956, + "learning_rate": 0.0001, + "loss": 1.3783, + "step": 3405 + }, + { + "epoch": 0.39570142317746154, + "grad_norm": 0.3982570767402649, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 3406 + }, + { + "epoch": 0.39581760092942203, + "grad_norm": 0.3854517340660095, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 3407 + }, + { + "epoch": 0.3959337786813825, + "grad_norm": 0.3867027461528778, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 3408 + }, + { + "epoch": 0.396049956433343, + "grad_norm": 0.39494284987449646, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 3409 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.40102261304855347, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 3410 + }, + { + "epoch": 0.396282311937264, + "grad_norm": 0.38136184215545654, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 3411 + }, + { + "epoch": 0.39639848968922453, + "grad_norm": 0.40026000142097473, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 3412 + }, + { + "epoch": 0.396514667441185, + "grad_norm": 0.4199739694595337, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 3413 + }, + { + "epoch": 0.3966308451931455, + "grad_norm": 0.4212833344936371, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 3414 + }, + { + "epoch": 0.396747022945106, + "grad_norm": 0.39813485741615295, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 3415 + }, + { + "epoch": 0.3968632006970665, + "grad_norm": 0.40672633051872253, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 3416 + }, + { + "epoch": 0.39697937844902703, + "grad_norm": 0.42593738436698914, + "learning_rate": 0.0001, + "loss": 1.8138, + "step": 3417 + }, + { + "epoch": 0.3970955562009875, + "grad_norm": 0.39474889636039734, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 3418 + }, + { + "epoch": 0.397211733952948, + "grad_norm": 0.4089927077293396, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 3419 + }, + { + "epoch": 0.3973279117049085, + "grad_norm": 0.3997913897037506, + "learning_rate": 0.0001, + "loss": 1.8294, + "step": 3420 + }, + { + "epoch": 0.397444089456869, + "grad_norm": 0.46322256326675415, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 3421 + }, + { + "epoch": 0.39756026720882953, + "grad_norm": 0.3604629337787628, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 3422 + }, + { + "epoch": 0.39767644496079, + "grad_norm": 0.3815920352935791, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 3423 + }, + { + "epoch": 0.3977926227127505, + "grad_norm": 0.37603339552879333, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 3424 + }, + { + "epoch": 0.397908800464711, + "grad_norm": 0.4085451364517212, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 3425 + }, + { + "epoch": 0.3980249782166715, + "grad_norm": 0.46148502826690674, + "learning_rate": 0.0001, + "loss": 1.9431, + "step": 3426 + }, + { + "epoch": 0.39814115596863203, + "grad_norm": 0.3642321527004242, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 3427 + }, + { + "epoch": 0.3982573337205925, + "grad_norm": 0.39448490738868713, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 3428 + }, + { + "epoch": 0.398373511472553, + "grad_norm": 0.37852832674980164, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 3429 + }, + { + "epoch": 0.3984896892245135, + "grad_norm": 0.44772058725357056, + "learning_rate": 0.0001, + "loss": 1.7785, + "step": 3430 + }, + { + "epoch": 0.398605866976474, + "grad_norm": 0.39221101999282837, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 3431 + }, + { + "epoch": 0.3987220447284345, + "grad_norm": 0.38119301199913025, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 3432 + }, + { + "epoch": 0.398838222480395, + "grad_norm": 0.4090498685836792, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 3433 + }, + { + "epoch": 0.3989544002323555, + "grad_norm": 0.37487149238586426, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 3434 + }, + { + "epoch": 0.399070577984316, + "grad_norm": 0.39408043026924133, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 3435 + }, + { + "epoch": 0.3991867557362765, + "grad_norm": 0.38384729623794556, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 3436 + }, + { + "epoch": 0.399302933488237, + "grad_norm": 0.4172252416610718, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 3437 + }, + { + "epoch": 0.3994191112401975, + "grad_norm": 0.3903944790363312, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 3438 + }, + { + "epoch": 0.399535288992158, + "grad_norm": 0.3812950849533081, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 3439 + }, + { + "epoch": 0.3996514667441185, + "grad_norm": 0.4021446704864502, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 3440 + }, + { + "epoch": 0.399767644496079, + "grad_norm": 0.3971196711063385, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 3441 + }, + { + "epoch": 0.3998838222480395, + "grad_norm": 0.40641874074935913, + "learning_rate": 0.0001, + "loss": 1.572, + "step": 3442 + }, + { + "epoch": 0.4, + "grad_norm": 0.41259273886680603, + "learning_rate": 0.0001, + "loss": 1.742, + "step": 3443 + }, + { + "epoch": 0.4001161777519605, + "grad_norm": 0.35140666365623474, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 3444 + }, + { + "epoch": 0.400232355503921, + "grad_norm": 0.3907198905944824, + "learning_rate": 0.0001, + "loss": 1.5896, + "step": 3445 + }, + { + "epoch": 0.4003485332558815, + "grad_norm": 0.41215452551841736, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 3446 + }, + { + "epoch": 0.400464711007842, + "grad_norm": 0.41600051522254944, + "learning_rate": 0.0001, + "loss": 1.9322, + "step": 3447 + }, + { + "epoch": 0.4005808887598025, + "grad_norm": 0.4017678201198578, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 3448 + }, + { + "epoch": 0.400697066511763, + "grad_norm": 0.4211594760417938, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 3449 + }, + { + "epoch": 0.4008132442637235, + "grad_norm": 0.4117183983325958, + "learning_rate": 0.0001, + "loss": 1.8139, + "step": 3450 + }, + { + "epoch": 0.400929422015684, + "grad_norm": 0.39731109142303467, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 3451 + }, + { + "epoch": 0.4010455997676445, + "grad_norm": 0.3891085088253021, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 3452 + }, + { + "epoch": 0.401161777519605, + "grad_norm": 0.3980685770511627, + "learning_rate": 0.0001, + "loss": 1.7527, + "step": 3453 + }, + { + "epoch": 0.4012779552715655, + "grad_norm": 0.43081504106521606, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 3454 + }, + { + "epoch": 0.401394133023526, + "grad_norm": 0.38269612193107605, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 3455 + }, + { + "epoch": 0.4015103107754865, + "grad_norm": 0.4062149226665497, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 3456 + }, + { + "epoch": 0.401626488527447, + "grad_norm": 0.38731974363327026, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 3457 + }, + { + "epoch": 0.40174266627940747, + "grad_norm": 0.4293443560600281, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 3458 + }, + { + "epoch": 0.401858844031368, + "grad_norm": 0.3938728868961334, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 3459 + }, + { + "epoch": 0.4019750217833285, + "grad_norm": 0.41334471106529236, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 3460 + }, + { + "epoch": 0.402091199535289, + "grad_norm": 0.39084866642951965, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 3461 + }, + { + "epoch": 0.4022073772872495, + "grad_norm": 0.4102165400981903, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 3462 + }, + { + "epoch": 0.40232355503920997, + "grad_norm": 0.4234514534473419, + "learning_rate": 0.0001, + "loss": 1.7546, + "step": 3463 + }, + { + "epoch": 0.4024397327911705, + "grad_norm": 0.38048434257507324, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 3464 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 0.39564260840415955, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 3465 + }, + { + "epoch": 0.4026720882950915, + "grad_norm": 0.40055975317955017, + "learning_rate": 0.0001, + "loss": 1.7346, + "step": 3466 + }, + { + "epoch": 0.402788266047052, + "grad_norm": 0.424216628074646, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 3467 + }, + { + "epoch": 0.40290444379901247, + "grad_norm": 0.38856959342956543, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 3468 + }, + { + "epoch": 0.403020621550973, + "grad_norm": 0.4526919424533844, + "learning_rate": 0.0001, + "loss": 1.7348, + "step": 3469 + }, + { + "epoch": 0.4031367993029335, + "grad_norm": 0.4121178686618805, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 3470 + }, + { + "epoch": 0.403252977054894, + "grad_norm": 0.41178447008132935, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 3471 + }, + { + "epoch": 0.4033691548068545, + "grad_norm": 0.3908507823944092, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 3472 + }, + { + "epoch": 0.40348533255881497, + "grad_norm": 0.4167093336582184, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 3473 + }, + { + "epoch": 0.4036015103107755, + "grad_norm": 0.3978476822376251, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 3474 + }, + { + "epoch": 0.403717688062736, + "grad_norm": 0.4014563262462616, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 3475 + }, + { + "epoch": 0.4038338658146965, + "grad_norm": 0.42038458585739136, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 3476 + }, + { + "epoch": 0.403950043566657, + "grad_norm": 0.40541571378707886, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 3477 + }, + { + "epoch": 0.40406622131861747, + "grad_norm": 0.39469483494758606, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 3478 + }, + { + "epoch": 0.40418239907057796, + "grad_norm": 0.38308000564575195, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 3479 + }, + { + "epoch": 0.4042985768225385, + "grad_norm": 0.3976595103740692, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 3480 + }, + { + "epoch": 0.404414754574499, + "grad_norm": 0.4284829795360565, + "learning_rate": 0.0001, + "loss": 1.7053, + "step": 3481 + }, + { + "epoch": 0.4045309323264595, + "grad_norm": 0.36719101667404175, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 3482 + }, + { + "epoch": 0.40464711007841997, + "grad_norm": 0.4026053845882416, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 3483 + }, + { + "epoch": 0.40476328783038046, + "grad_norm": 0.40550971031188965, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 3484 + }, + { + "epoch": 0.404879465582341, + "grad_norm": 0.3787073493003845, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 3485 + }, + { + "epoch": 0.4049956433343015, + "grad_norm": 0.38911107182502747, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 3486 + }, + { + "epoch": 0.405111821086262, + "grad_norm": 0.420773446559906, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 3487 + }, + { + "epoch": 0.40522799883822247, + "grad_norm": 0.40293148159980774, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 3488 + }, + { + "epoch": 0.40534417659018296, + "grad_norm": 0.40412238240242004, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 3489 + }, + { + "epoch": 0.4054603543421435, + "grad_norm": 0.42694681882858276, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 3490 + }, + { + "epoch": 0.405576532094104, + "grad_norm": 0.3828969895839691, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 3491 + }, + { + "epoch": 0.4056927098460645, + "grad_norm": 0.4304792582988739, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 3492 + }, + { + "epoch": 0.40580888759802497, + "grad_norm": 0.40881672501564026, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 3493 + }, + { + "epoch": 0.40592506534998546, + "grad_norm": 0.3886183202266693, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 3494 + }, + { + "epoch": 0.406041243101946, + "grad_norm": 0.3807865083217621, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 3495 + }, + { + "epoch": 0.4061574208539065, + "grad_norm": 0.4547371566295624, + "learning_rate": 0.0001, + "loss": 2.0104, + "step": 3496 + }, + { + "epoch": 0.406273598605867, + "grad_norm": 0.43122467398643494, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 3497 + }, + { + "epoch": 0.40638977635782747, + "grad_norm": 0.43641188740730286, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 3498 + }, + { + "epoch": 0.40650595410978796, + "grad_norm": 0.38105788826942444, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 3499 + }, + { + "epoch": 0.40662213186174845, + "grad_norm": 0.38278716802597046, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 0.406738309613709, + "grad_norm": 0.4314371645450592, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 3501 + }, + { + "epoch": 0.4068544873656695, + "grad_norm": 0.39707180857658386, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 3502 + }, + { + "epoch": 0.40697066511762997, + "grad_norm": 0.40061885118484497, + "learning_rate": 0.0001, + "loss": 1.7397, + "step": 3503 + }, + { + "epoch": 0.40708684286959046, + "grad_norm": 0.39981821179389954, + "learning_rate": 0.0001, + "loss": 1.7075, + "step": 3504 + }, + { + "epoch": 0.40720302062155095, + "grad_norm": 0.41869446635246277, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 3505 + }, + { + "epoch": 0.4073191983735115, + "grad_norm": 0.4269830584526062, + "learning_rate": 0.0001, + "loss": 1.7423, + "step": 3506 + }, + { + "epoch": 0.407435376125472, + "grad_norm": 0.44403141736984253, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 3507 + }, + { + "epoch": 0.40755155387743247, + "grad_norm": 0.4114527404308319, + "learning_rate": 0.0001, + "loss": 1.6789, + "step": 3508 + }, + { + "epoch": 0.40766773162939296, + "grad_norm": 0.40180087089538574, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 3509 + }, + { + "epoch": 0.40778390938135345, + "grad_norm": 0.4387998580932617, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 3510 + }, + { + "epoch": 0.407900087133314, + "grad_norm": 0.43841442465782166, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 3511 + }, + { + "epoch": 0.4080162648852745, + "grad_norm": 0.41502171754837036, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 3512 + }, + { + "epoch": 0.40813244263723497, + "grad_norm": 0.40264812111854553, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 3513 + }, + { + "epoch": 0.40824862038919546, + "grad_norm": 0.39349353313446045, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 3514 + }, + { + "epoch": 0.40836479814115595, + "grad_norm": 0.42213505506515503, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 3515 + }, + { + "epoch": 0.4084809758931165, + "grad_norm": 0.382960706949234, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 3516 + }, + { + "epoch": 0.408597153645077, + "grad_norm": 0.4227701723575592, + "learning_rate": 0.0001, + "loss": 1.7499, + "step": 3517 + }, + { + "epoch": 0.40871333139703747, + "grad_norm": 0.4224139451980591, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 3518 + }, + { + "epoch": 0.40882950914899796, + "grad_norm": 0.3891499638557434, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 3519 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.40314075350761414, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 3520 + }, + { + "epoch": 0.40906186465291894, + "grad_norm": 0.3946097791194916, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 3521 + }, + { + "epoch": 0.4091780424048795, + "grad_norm": 0.41073909401893616, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 3522 + }, + { + "epoch": 0.40929422015683997, + "grad_norm": 0.3862104117870331, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 3523 + }, + { + "epoch": 0.40941039790880046, + "grad_norm": 0.38965824246406555, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 3524 + }, + { + "epoch": 0.40952657566076095, + "grad_norm": 0.4076133370399475, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 3525 + }, + { + "epoch": 0.40964275341272144, + "grad_norm": 0.39782625436782837, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 3526 + }, + { + "epoch": 0.409758931164682, + "grad_norm": 0.4191223680973053, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 3527 + }, + { + "epoch": 0.40987510891664247, + "grad_norm": 0.41905683279037476, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 3528 + }, + { + "epoch": 0.40999128666860296, + "grad_norm": 0.47418078780174255, + "learning_rate": 0.0001, + "loss": 1.8641, + "step": 3529 + }, + { + "epoch": 0.41010746442056345, + "grad_norm": 0.4219752252101898, + "learning_rate": 0.0001, + "loss": 1.773, + "step": 3530 + }, + { + "epoch": 0.41022364217252394, + "grad_norm": 0.39003801345825195, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 3531 + }, + { + "epoch": 0.4103398199244845, + "grad_norm": 0.39231449365615845, + "learning_rate": 0.0001, + "loss": 1.7753, + "step": 3532 + }, + { + "epoch": 0.410455997676445, + "grad_norm": 0.39529526233673096, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 3533 + }, + { + "epoch": 0.41057217542840546, + "grad_norm": 0.4106643497943878, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 3534 + }, + { + "epoch": 0.41068835318036595, + "grad_norm": 0.3763425648212433, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 3535 + }, + { + "epoch": 0.41080453093232644, + "grad_norm": 0.4354191720485687, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 3536 + }, + { + "epoch": 0.410920708684287, + "grad_norm": 0.4091987609863281, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 3537 + }, + { + "epoch": 0.4110368864362475, + "grad_norm": 0.3786587417125702, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 3538 + }, + { + "epoch": 0.41115306418820796, + "grad_norm": 0.3957653343677521, + "learning_rate": 0.0001, + "loss": 1.6741, + "step": 3539 + }, + { + "epoch": 0.41126924194016845, + "grad_norm": 0.41153082251548767, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 3540 + }, + { + "epoch": 0.41138541969212894, + "grad_norm": 0.408596932888031, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 3541 + }, + { + "epoch": 0.4115015974440895, + "grad_norm": 0.4165953993797302, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 3542 + }, + { + "epoch": 0.41161777519605, + "grad_norm": 0.4159640371799469, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 3543 + }, + { + "epoch": 0.41173395294801046, + "grad_norm": 0.4020317494869232, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 3544 + }, + { + "epoch": 0.41185013069997095, + "grad_norm": 0.40720680356025696, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 3545 + }, + { + "epoch": 0.41196630845193144, + "grad_norm": 0.3976491689682007, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 3546 + }, + { + "epoch": 0.41208248620389193, + "grad_norm": 0.40788426995277405, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 3547 + }, + { + "epoch": 0.4121986639558525, + "grad_norm": 0.43293899297714233, + "learning_rate": 0.0001, + "loss": 1.7645, + "step": 3548 + }, + { + "epoch": 0.41231484170781296, + "grad_norm": 0.4064108729362488, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 3549 + }, + { + "epoch": 0.41243101945977345, + "grad_norm": 0.38206931948661804, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 3550 + }, + { + "epoch": 0.41254719721173394, + "grad_norm": 0.41803139448165894, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 3551 + }, + { + "epoch": 0.41266337496369443, + "grad_norm": 0.4098378121852875, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 3552 + }, + { + "epoch": 0.412779552715655, + "grad_norm": 0.40097564458847046, + "learning_rate": 0.0001, + "loss": 1.7684, + "step": 3553 + }, + { + "epoch": 0.41289573046761546, + "grad_norm": 0.4043814241886139, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 3554 + }, + { + "epoch": 0.41301190821957595, + "grad_norm": 0.41799452900886536, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 3555 + }, + { + "epoch": 0.41312808597153644, + "grad_norm": 0.41283276677131653, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 3556 + }, + { + "epoch": 0.41324426372349693, + "grad_norm": 0.3751561641693115, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 3557 + }, + { + "epoch": 0.4133604414754575, + "grad_norm": 0.4034547805786133, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 3558 + }, + { + "epoch": 0.41347661922741796, + "grad_norm": 0.3995891511440277, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 3559 + }, + { + "epoch": 0.41359279697937845, + "grad_norm": 0.4040803909301758, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 3560 + }, + { + "epoch": 0.41370897473133894, + "grad_norm": 0.3844342827796936, + "learning_rate": 0.0001, + "loss": 1.8967, + "step": 3561 + }, + { + "epoch": 0.41382515248329943, + "grad_norm": 0.41226911544799805, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 3562 + }, + { + "epoch": 0.41394133023526, + "grad_norm": 0.4063775837421417, + "learning_rate": 0.0001, + "loss": 1.7052, + "step": 3563 + }, + { + "epoch": 0.41405750798722046, + "grad_norm": 0.3665454089641571, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 3564 + }, + { + "epoch": 0.41417368573918095, + "grad_norm": 0.3731880784034729, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 3565 + }, + { + "epoch": 0.41428986349114144, + "grad_norm": 0.4028816223144531, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 3566 + }, + { + "epoch": 0.41440604124310193, + "grad_norm": 0.39572829008102417, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 3567 + }, + { + "epoch": 0.4145222189950624, + "grad_norm": 0.3968917727470398, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 3568 + }, + { + "epoch": 0.41463839674702296, + "grad_norm": 0.42275354266166687, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 3569 + }, + { + "epoch": 0.41475457449898345, + "grad_norm": 0.4133806526660919, + "learning_rate": 0.0001, + "loss": 1.7573, + "step": 3570 + }, + { + "epoch": 0.41487075225094394, + "grad_norm": 0.4026901125907898, + "learning_rate": 0.0001, + "loss": 1.7445, + "step": 3571 + }, + { + "epoch": 0.41498693000290443, + "grad_norm": 0.40076354146003723, + "learning_rate": 0.0001, + "loss": 1.6168, + "step": 3572 + }, + { + "epoch": 0.4151031077548649, + "grad_norm": 0.3952670991420746, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 3573 + }, + { + "epoch": 0.41521928550682546, + "grad_norm": 0.396936297416687, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 3574 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 0.38129517436027527, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 3575 + }, + { + "epoch": 0.41545164101074644, + "grad_norm": 0.42438849806785583, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 3576 + }, + { + "epoch": 0.41556781876270693, + "grad_norm": 0.41612252593040466, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 3577 + }, + { + "epoch": 0.4156839965146674, + "grad_norm": 0.4326874017715454, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 3578 + }, + { + "epoch": 0.41580017426662796, + "grad_norm": 0.4148086905479431, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 3579 + }, + { + "epoch": 0.41591635201858845, + "grad_norm": 0.4372271001338959, + "learning_rate": 0.0001, + "loss": 1.8974, + "step": 3580 + }, + { + "epoch": 0.41603252977054894, + "grad_norm": 0.42456239461898804, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 3581 + }, + { + "epoch": 0.41614870752250943, + "grad_norm": 0.3889367878437042, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 3582 + }, + { + "epoch": 0.4162648852744699, + "grad_norm": 0.4123779833316803, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 3583 + }, + { + "epoch": 0.41638106302643046, + "grad_norm": 0.45006388425827026, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 3584 + }, + { + "epoch": 0.41649724077839095, + "grad_norm": 0.3993321359157562, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 3585 + }, + { + "epoch": 0.41661341853035144, + "grad_norm": 0.39892643690109253, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 3586 + }, + { + "epoch": 0.41672959628231193, + "grad_norm": 0.4181758463382721, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 3587 + }, + { + "epoch": 0.4168457740342724, + "grad_norm": 0.3975769877433777, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 3588 + }, + { + "epoch": 0.4169619517862329, + "grad_norm": 0.3831265866756439, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 3589 + }, + { + "epoch": 0.41707812953819345, + "grad_norm": 0.41471192240715027, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 3590 + }, + { + "epoch": 0.41719430729015394, + "grad_norm": 0.3869475722312927, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 3591 + }, + { + "epoch": 0.41731048504211443, + "grad_norm": 0.37546539306640625, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 3592 + }, + { + "epoch": 0.4174266627940749, + "grad_norm": 0.38693931698799133, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 3593 + }, + { + "epoch": 0.4175428405460354, + "grad_norm": 0.40979790687561035, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 3594 + }, + { + "epoch": 0.41765901829799595, + "grad_norm": 0.4143114686012268, + "learning_rate": 0.0001, + "loss": 1.875, + "step": 3595 + }, + { + "epoch": 0.41777519604995644, + "grad_norm": 0.4477331340312958, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 3596 + }, + { + "epoch": 0.41789137380191693, + "grad_norm": 0.4350159168243408, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 3597 + }, + { + "epoch": 0.4180075515538774, + "grad_norm": 0.40103811025619507, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 3598 + }, + { + "epoch": 0.4181237293058379, + "grad_norm": 0.42773351073265076, + "learning_rate": 0.0001, + "loss": 1.8818, + "step": 3599 + }, + { + "epoch": 0.41823990705779845, + "grad_norm": 0.3894450068473816, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 3600 + }, + { + "epoch": 0.41835608480975894, + "grad_norm": 0.3698801100254059, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 3601 + }, + { + "epoch": 0.41847226256171943, + "grad_norm": 0.3938926160335541, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 3602 + }, + { + "epoch": 0.4185884403136799, + "grad_norm": 0.40313947200775146, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 3603 + }, + { + "epoch": 0.4187046180656404, + "grad_norm": 0.4046363830566406, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 3604 + }, + { + "epoch": 0.41882079581760095, + "grad_norm": 0.4111688435077667, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 3605 + }, + { + "epoch": 0.41893697356956144, + "grad_norm": 0.41641494631767273, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 3606 + }, + { + "epoch": 0.41905315132152193, + "grad_norm": 0.3970726728439331, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 3607 + }, + { + "epoch": 0.4191693290734824, + "grad_norm": 0.40690597891807556, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 3608 + }, + { + "epoch": 0.4192855068254429, + "grad_norm": 0.41410157084465027, + "learning_rate": 0.0001, + "loss": 1.794, + "step": 3609 + }, + { + "epoch": 0.4194016845774034, + "grad_norm": 0.38639214634895325, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 3610 + }, + { + "epoch": 0.41951786232936394, + "grad_norm": 0.3689410090446472, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 3611 + }, + { + "epoch": 0.41963404008132443, + "grad_norm": 0.40414369106292725, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 3612 + }, + { + "epoch": 0.4197502178332849, + "grad_norm": 0.38138678669929504, + "learning_rate": 0.0001, + "loss": 1.3491, + "step": 3613 + }, + { + "epoch": 0.4198663955852454, + "grad_norm": 0.37447166442871094, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 3614 + }, + { + "epoch": 0.4199825733372059, + "grad_norm": 0.40835610032081604, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 3615 + }, + { + "epoch": 0.42009875108916644, + "grad_norm": 0.4299129545688629, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 3616 + }, + { + "epoch": 0.42021492884112693, + "grad_norm": 0.3932313621044159, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 3617 + }, + { + "epoch": 0.4203311065930874, + "grad_norm": 0.3847653865814209, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 3618 + }, + { + "epoch": 0.4204472843450479, + "grad_norm": 0.435201495885849, + "learning_rate": 0.0001, + "loss": 1.8684, + "step": 3619 + }, + { + "epoch": 0.4205634620970084, + "grad_norm": 0.3746712803840637, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 3620 + }, + { + "epoch": 0.42067963984896894, + "grad_norm": 0.37663495540618896, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 3621 + }, + { + "epoch": 0.42079581760092943, + "grad_norm": 0.37208837270736694, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 3622 + }, + { + "epoch": 0.4209119953528899, + "grad_norm": 0.3866986334323883, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 3623 + }, + { + "epoch": 0.4210281731048504, + "grad_norm": 0.40129542350769043, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 3624 + }, + { + "epoch": 0.4211443508568109, + "grad_norm": 0.38859081268310547, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 3625 + }, + { + "epoch": 0.42126052860877145, + "grad_norm": 0.43616414070129395, + "learning_rate": 0.0001, + "loss": 1.7725, + "step": 3626 + }, + { + "epoch": 0.42137670636073193, + "grad_norm": 0.43202710151672363, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 3627 + }, + { + "epoch": 0.4214928841126924, + "grad_norm": 0.39463871717453003, + "learning_rate": 0.0001, + "loss": 1.4335, + "step": 3628 + }, + { + "epoch": 0.4216090618646529, + "grad_norm": 0.4237414002418518, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 3629 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.39005210995674133, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 3630 + }, + { + "epoch": 0.42184141736857395, + "grad_norm": 0.3883463144302368, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 3631 + }, + { + "epoch": 0.42195759512053443, + "grad_norm": 0.3896929621696472, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 3632 + }, + { + "epoch": 0.4220737728724949, + "grad_norm": 0.41953974962234497, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 3633 + }, + { + "epoch": 0.4221899506244554, + "grad_norm": 0.3946000039577484, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 3634 + }, + { + "epoch": 0.4223061283764159, + "grad_norm": 0.4022957384586334, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 3635 + }, + { + "epoch": 0.4224223061283764, + "grad_norm": 0.40276673436164856, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 3636 + }, + { + "epoch": 0.42253848388033693, + "grad_norm": 0.41087186336517334, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 3637 + }, + { + "epoch": 0.4226546616322974, + "grad_norm": 0.4109443426132202, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 3638 + }, + { + "epoch": 0.4227708393842579, + "grad_norm": 0.3984861671924591, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 3639 + }, + { + "epoch": 0.4228870171362184, + "grad_norm": 0.42480790615081787, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 3640 + }, + { + "epoch": 0.4230031948881789, + "grad_norm": 0.37020301818847656, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 3641 + }, + { + "epoch": 0.42311937264013944, + "grad_norm": 0.3878079950809479, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 3642 + }, + { + "epoch": 0.4232355503920999, + "grad_norm": 0.4088906943798065, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 3643 + }, + { + "epoch": 0.4233517281440604, + "grad_norm": 0.39940375089645386, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 3644 + }, + { + "epoch": 0.4234679058960209, + "grad_norm": 0.41945314407348633, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 3645 + }, + { + "epoch": 0.4235840836479814, + "grad_norm": 0.43986397981643677, + "learning_rate": 0.0001, + "loss": 1.7769, + "step": 3646 + }, + { + "epoch": 0.42370026139994194, + "grad_norm": 0.41708502173423767, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 3647 + }, + { + "epoch": 0.4238164391519024, + "grad_norm": 0.3953828513622284, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 3648 + }, + { + "epoch": 0.4239326169038629, + "grad_norm": 0.3876919746398926, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 3649 + }, + { + "epoch": 0.4240487946558234, + "grad_norm": 0.4110132157802582, + "learning_rate": 0.0001, + "loss": 1.8118, + "step": 3650 + }, + { + "epoch": 0.4241649724077839, + "grad_norm": 0.4448528587818146, + "learning_rate": 0.0001, + "loss": 1.8269, + "step": 3651 + }, + { + "epoch": 0.42428115015974444, + "grad_norm": 0.4393356740474701, + "learning_rate": 0.0001, + "loss": 1.8647, + "step": 3652 + }, + { + "epoch": 0.4243973279117049, + "grad_norm": 0.4252372086048126, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 3653 + }, + { + "epoch": 0.4245135056636654, + "grad_norm": 0.41888418793678284, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 3654 + }, + { + "epoch": 0.4246296834156259, + "grad_norm": 0.39156967401504517, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 3655 + }, + { + "epoch": 0.4247458611675864, + "grad_norm": 0.3865427076816559, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 3656 + }, + { + "epoch": 0.4248620389195469, + "grad_norm": 0.37357524037361145, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 3657 + }, + { + "epoch": 0.4249782166715074, + "grad_norm": 0.3771733045578003, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 3658 + }, + { + "epoch": 0.4250943944234679, + "grad_norm": 0.40311431884765625, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 3659 + }, + { + "epoch": 0.4252105721754284, + "grad_norm": 0.39886149764060974, + "learning_rate": 0.0001, + "loss": 1.8507, + "step": 3660 + }, + { + "epoch": 0.4253267499273889, + "grad_norm": 0.4184694290161133, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 3661 + }, + { + "epoch": 0.4254429276793494, + "grad_norm": 0.4278115928173065, + "learning_rate": 0.0001, + "loss": 1.8403, + "step": 3662 + }, + { + "epoch": 0.4255591054313099, + "grad_norm": 0.4227597117424011, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 3663 + }, + { + "epoch": 0.4256752831832704, + "grad_norm": 0.3717440068721771, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 3664 + }, + { + "epoch": 0.4257914609352309, + "grad_norm": 0.3993324935436249, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 3665 + }, + { + "epoch": 0.4259076386871914, + "grad_norm": 0.40730348229408264, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 3666 + }, + { + "epoch": 0.4260238164391519, + "grad_norm": 0.38585323095321655, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 3667 + }, + { + "epoch": 0.4261399941911124, + "grad_norm": 0.4284312129020691, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 3668 + }, + { + "epoch": 0.4262561719430729, + "grad_norm": 0.417501300573349, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 3669 + }, + { + "epoch": 0.4263723496950334, + "grad_norm": 0.3896951973438263, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 3670 + }, + { + "epoch": 0.4264885274469939, + "grad_norm": 0.4131647050380707, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 3671 + }, + { + "epoch": 0.4266047051989544, + "grad_norm": 0.36741092801094055, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 3672 + }, + { + "epoch": 0.4267208829509149, + "grad_norm": 0.41601473093032837, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 3673 + }, + { + "epoch": 0.4268370607028754, + "grad_norm": 0.41685929894447327, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 3674 + }, + { + "epoch": 0.4269532384548359, + "grad_norm": 0.41353273391723633, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 3675 + }, + { + "epoch": 0.4270694162067964, + "grad_norm": 0.40456423163414, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 3676 + }, + { + "epoch": 0.4271855939587569, + "grad_norm": 0.457220196723938, + "learning_rate": 0.0001, + "loss": 1.7856, + "step": 3677 + }, + { + "epoch": 0.42730177171071737, + "grad_norm": 0.4161781668663025, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 3678 + }, + { + "epoch": 0.4274179494626779, + "grad_norm": 0.40317872166633606, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 3679 + }, + { + "epoch": 0.4275341272146384, + "grad_norm": 0.4100804328918457, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 3680 + }, + { + "epoch": 0.4276503049665989, + "grad_norm": 0.40741920471191406, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 3681 + }, + { + "epoch": 0.4277664827185594, + "grad_norm": 0.3968175947666168, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 3682 + }, + { + "epoch": 0.42788266047051987, + "grad_norm": 0.3978244662284851, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 3683 + }, + { + "epoch": 0.4279988382224804, + "grad_norm": 0.39147892594337463, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 3684 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 0.410375714302063, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 3685 + }, + { + "epoch": 0.4282311937264014, + "grad_norm": 0.4207121729850769, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 3686 + }, + { + "epoch": 0.4283473714783619, + "grad_norm": 0.446114718914032, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 3687 + }, + { + "epoch": 0.42846354923032237, + "grad_norm": 0.429855614900589, + "learning_rate": 0.0001, + "loss": 1.8145, + "step": 3688 + }, + { + "epoch": 0.4285797269822829, + "grad_norm": 0.4075881838798523, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 3689 + }, + { + "epoch": 0.4286959047342434, + "grad_norm": 0.42212194204330444, + "learning_rate": 0.0001, + "loss": 1.8045, + "step": 3690 + }, + { + "epoch": 0.4288120824862039, + "grad_norm": 0.3913438022136688, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 3691 + }, + { + "epoch": 0.4289282602381644, + "grad_norm": 0.4033333659172058, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 3692 + }, + { + "epoch": 0.42904443799012487, + "grad_norm": 0.41124168038368225, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 3693 + }, + { + "epoch": 0.4291606157420854, + "grad_norm": 0.4082317352294922, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 3694 + }, + { + "epoch": 0.4292767934940459, + "grad_norm": 0.43243342638015747, + "learning_rate": 0.0001, + "loss": 1.8346, + "step": 3695 + }, + { + "epoch": 0.4293929712460064, + "grad_norm": 0.4029131829738617, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 3696 + }, + { + "epoch": 0.4295091489979669, + "grad_norm": 0.39119669795036316, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 3697 + }, + { + "epoch": 0.42962532674992737, + "grad_norm": 0.4251865744590759, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 3698 + }, + { + "epoch": 0.42974150450188786, + "grad_norm": 0.41602757573127747, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 3699 + }, + { + "epoch": 0.4298576822538484, + "grad_norm": 0.41048288345336914, + "learning_rate": 0.0001, + "loss": 1.7455, + "step": 3700 + }, + { + "epoch": 0.4299738600058089, + "grad_norm": 0.39157551527023315, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 3701 + }, + { + "epoch": 0.4300900377577694, + "grad_norm": 0.40497899055480957, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 3702 + }, + { + "epoch": 0.4302062155097299, + "grad_norm": 0.36876180768013, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 3703 + }, + { + "epoch": 0.43032239326169036, + "grad_norm": 0.39832401275634766, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 3704 + }, + { + "epoch": 0.4304385710136509, + "grad_norm": 0.40576431155204773, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 3705 + }, + { + "epoch": 0.4305547487656114, + "grad_norm": 0.41582056879997253, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 3706 + }, + { + "epoch": 0.4306709265175719, + "grad_norm": 0.37532898783683777, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 3707 + }, + { + "epoch": 0.4307871042695324, + "grad_norm": 0.4363482594490051, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 3708 + }, + { + "epoch": 0.43090328202149286, + "grad_norm": 0.38650989532470703, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 3709 + }, + { + "epoch": 0.4310194597734534, + "grad_norm": 0.4108797013759613, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 3710 + }, + { + "epoch": 0.4311356375254139, + "grad_norm": 0.3996245563030243, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 3711 + }, + { + "epoch": 0.4312518152773744, + "grad_norm": 0.40178465843200684, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 3712 + }, + { + "epoch": 0.4313679930293349, + "grad_norm": 0.38844192028045654, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 3713 + }, + { + "epoch": 0.43148417078129536, + "grad_norm": 0.38284438848495483, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 3714 + }, + { + "epoch": 0.4316003485332559, + "grad_norm": 0.4122409224510193, + "learning_rate": 0.0001, + "loss": 1.7502, + "step": 3715 + }, + { + "epoch": 0.4317165262852164, + "grad_norm": 0.4218509793281555, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 3716 + }, + { + "epoch": 0.4318327040371769, + "grad_norm": 0.4130255877971649, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 3717 + }, + { + "epoch": 0.4319488817891374, + "grad_norm": 0.4026492238044739, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 3718 + }, + { + "epoch": 0.43206505954109786, + "grad_norm": 0.4177059531211853, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 3719 + }, + { + "epoch": 0.4321812372930584, + "grad_norm": 0.4138438105583191, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 3720 + }, + { + "epoch": 0.4322974150450189, + "grad_norm": 0.4228832721710205, + "learning_rate": 0.0001, + "loss": 1.8092, + "step": 3721 + }, + { + "epoch": 0.4324135927969794, + "grad_norm": 0.4101725220680237, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 3722 + }, + { + "epoch": 0.4325297705489399, + "grad_norm": 0.38750308752059937, + "learning_rate": 0.0001, + "loss": 1.4348, + "step": 3723 + }, + { + "epoch": 0.43264594830090036, + "grad_norm": 0.43653422594070435, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 3724 + }, + { + "epoch": 0.43276212605286085, + "grad_norm": 0.39200592041015625, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 3725 + }, + { + "epoch": 0.4328783038048214, + "grad_norm": 0.40859901905059814, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 3726 + }, + { + "epoch": 0.4329944815567819, + "grad_norm": 0.41595181822776794, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 3727 + }, + { + "epoch": 0.4331106593087424, + "grad_norm": 0.391353577375412, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 3728 + }, + { + "epoch": 0.43322683706070286, + "grad_norm": 0.40748852491378784, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 3729 + }, + { + "epoch": 0.43334301481266335, + "grad_norm": 0.3781512975692749, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 3730 + }, + { + "epoch": 0.4334591925646239, + "grad_norm": 0.36677882075309753, + "learning_rate": 0.0001, + "loss": 1.3444, + "step": 3731 + }, + { + "epoch": 0.4335753703165844, + "grad_norm": 0.38588741421699524, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 3732 + }, + { + "epoch": 0.4336915480685449, + "grad_norm": 0.4158620834350586, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 3733 + }, + { + "epoch": 0.43380772582050536, + "grad_norm": 0.4044388234615326, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 3734 + }, + { + "epoch": 0.43392390357246585, + "grad_norm": 0.4197262227535248, + "learning_rate": 0.0001, + "loss": 1.7444, + "step": 3735 + }, + { + "epoch": 0.4340400813244264, + "grad_norm": 0.40574586391448975, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 3736 + }, + { + "epoch": 0.4341562590763869, + "grad_norm": 0.397217333316803, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 3737 + }, + { + "epoch": 0.4342724368283474, + "grad_norm": 0.4184318482875824, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 3738 + }, + { + "epoch": 0.43438861458030786, + "grad_norm": 0.3860454857349396, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 3739 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.41592180728912354, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 3740 + }, + { + "epoch": 0.4346209700842289, + "grad_norm": 0.3821113407611847, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 3741 + }, + { + "epoch": 0.4347371478361894, + "grad_norm": 0.4071682095527649, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 3742 + }, + { + "epoch": 0.4348533255881499, + "grad_norm": 0.41301366686820984, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 3743 + }, + { + "epoch": 0.43496950334011036, + "grad_norm": 0.4541078209877014, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 3744 + }, + { + "epoch": 0.43508568109207085, + "grad_norm": 0.4197753369808197, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 3745 + }, + { + "epoch": 0.43520185884403134, + "grad_norm": 0.40982601046562195, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 3746 + }, + { + "epoch": 0.4353180365959919, + "grad_norm": 0.439325213432312, + "learning_rate": 0.0001, + "loss": 1.8748, + "step": 3747 + }, + { + "epoch": 0.4354342143479524, + "grad_norm": 0.4326147139072418, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 3748 + }, + { + "epoch": 0.43555039209991286, + "grad_norm": 0.43868646025657654, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 3749 + }, + { + "epoch": 0.43566656985187335, + "grad_norm": 0.4201911389827728, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 3750 + }, + { + "epoch": 0.43578274760383384, + "grad_norm": 0.3979409337043762, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 3751 + }, + { + "epoch": 0.4358989253557944, + "grad_norm": 0.4207451045513153, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 3752 + }, + { + "epoch": 0.4360151031077549, + "grad_norm": 0.3865394592285156, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 3753 + }, + { + "epoch": 0.43613128085971536, + "grad_norm": 0.4036276340484619, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 3754 + }, + { + "epoch": 0.43624745861167585, + "grad_norm": 0.39419230818748474, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 3755 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.42213842272758484, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 3756 + }, + { + "epoch": 0.4364798141155969, + "grad_norm": 0.4202689528465271, + "learning_rate": 0.0001, + "loss": 1.7564, + "step": 3757 + }, + { + "epoch": 0.4365959918675574, + "grad_norm": 0.40122315287590027, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 3758 + }, + { + "epoch": 0.43671216961951786, + "grad_norm": 0.41420555114746094, + "learning_rate": 0.0001, + "loss": 1.7936, + "step": 3759 + }, + { + "epoch": 0.43682834737147835, + "grad_norm": 0.46364185214042664, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 3760 + }, + { + "epoch": 0.43694452512343884, + "grad_norm": 0.404472678899765, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 3761 + }, + { + "epoch": 0.4370607028753994, + "grad_norm": 0.4069894552230835, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 3762 + }, + { + "epoch": 0.4371768806273599, + "grad_norm": 0.41052114963531494, + "learning_rate": 0.0001, + "loss": 1.5779, + "step": 3763 + }, + { + "epoch": 0.43729305837932037, + "grad_norm": 0.41978520154953003, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 3764 + }, + { + "epoch": 0.43740923613128085, + "grad_norm": 0.45879876613616943, + "learning_rate": 0.0001, + "loss": 1.7743, + "step": 3765 + }, + { + "epoch": 0.43752541388324134, + "grad_norm": 0.42506903409957886, + "learning_rate": 0.0001, + "loss": 1.706, + "step": 3766 + }, + { + "epoch": 0.43764159163520183, + "grad_norm": 0.4226793944835663, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 3767 + }, + { + "epoch": 0.4377577693871624, + "grad_norm": 0.38549569249153137, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 3768 + }, + { + "epoch": 0.43787394713912287, + "grad_norm": 0.3775811791419983, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 3769 + }, + { + "epoch": 0.43799012489108335, + "grad_norm": 0.4261288046836853, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 3770 + }, + { + "epoch": 0.43810630264304384, + "grad_norm": 0.4037782847881317, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 3771 + }, + { + "epoch": 0.43822248039500433, + "grad_norm": 0.3799671530723572, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 3772 + }, + { + "epoch": 0.4383386581469649, + "grad_norm": 0.4238581955432892, + "learning_rate": 0.0001, + "loss": 1.7204, + "step": 3773 + }, + { + "epoch": 0.43845483589892537, + "grad_norm": 0.39733898639678955, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 3774 + }, + { + "epoch": 0.43857101365088585, + "grad_norm": 0.386123925447464, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 3775 + }, + { + "epoch": 0.43868719140284634, + "grad_norm": 0.42280828952789307, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 3776 + }, + { + "epoch": 0.43880336915480683, + "grad_norm": 0.4024019241333008, + "learning_rate": 0.0001, + "loss": 1.7998, + "step": 3777 + }, + { + "epoch": 0.4389195469067674, + "grad_norm": 0.36499494314193726, + "learning_rate": 0.0001, + "loss": 1.408, + "step": 3778 + }, + { + "epoch": 0.43903572465872787, + "grad_norm": 0.4545857906341553, + "learning_rate": 0.0001, + "loss": 1.9895, + "step": 3779 + }, + { + "epoch": 0.43915190241068836, + "grad_norm": 0.4315713047981262, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 3780 + }, + { + "epoch": 0.43926808016264884, + "grad_norm": 0.4134562015533447, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 3781 + }, + { + "epoch": 0.43938425791460933, + "grad_norm": 0.39927372336387634, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 3782 + }, + { + "epoch": 0.4395004356665699, + "grad_norm": 0.407842755317688, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 3783 + }, + { + "epoch": 0.43961661341853037, + "grad_norm": 0.4489937126636505, + "learning_rate": 0.0001, + "loss": 1.7954, + "step": 3784 + }, + { + "epoch": 0.43973279117049086, + "grad_norm": 0.41381195187568665, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 3785 + }, + { + "epoch": 0.43984896892245134, + "grad_norm": 0.4299207627773285, + "learning_rate": 0.0001, + "loss": 1.7795, + "step": 3786 + }, + { + "epoch": 0.43996514667441183, + "grad_norm": 0.4247708320617676, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 3787 + }, + { + "epoch": 0.4400813244263723, + "grad_norm": 0.45429563522338867, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 3788 + }, + { + "epoch": 0.44019750217833287, + "grad_norm": 0.39192894101142883, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 3789 + }, + { + "epoch": 0.44031367993029336, + "grad_norm": 0.4126262664794922, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 3790 + }, + { + "epoch": 0.44042985768225384, + "grad_norm": 0.3963555693626404, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 3791 + }, + { + "epoch": 0.44054603543421433, + "grad_norm": 0.3770107924938202, + "learning_rate": 0.0001, + "loss": 1.3031, + "step": 3792 + }, + { + "epoch": 0.4406622131861748, + "grad_norm": 0.40233081579208374, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 3793 + }, + { + "epoch": 0.44077839093813537, + "grad_norm": 0.4272160232067108, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 3794 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 0.4221998453140259, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 3795 + }, + { + "epoch": 0.44101074644205635, + "grad_norm": 0.3974875509738922, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 3796 + }, + { + "epoch": 0.44112692419401683, + "grad_norm": 0.462016224861145, + "learning_rate": 0.0001, + "loss": 1.8508, + "step": 3797 + }, + { + "epoch": 0.4412431019459773, + "grad_norm": 0.412485271692276, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 3798 + }, + { + "epoch": 0.44135927969793787, + "grad_norm": 0.40575462579727173, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 3799 + }, + { + "epoch": 0.44147545744989836, + "grad_norm": 0.393636018037796, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 3800 + }, + { + "epoch": 0.44159163520185885, + "grad_norm": 0.39681223034858704, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 3801 + }, + { + "epoch": 0.44170781295381933, + "grad_norm": 0.41064146161079407, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 3802 + }, + { + "epoch": 0.4418239907057798, + "grad_norm": 0.4206371605396271, + "learning_rate": 0.0001, + "loss": 1.7796, + "step": 3803 + }, + { + "epoch": 0.44194016845774037, + "grad_norm": 0.41671186685562134, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 3804 + }, + { + "epoch": 0.44205634620970086, + "grad_norm": 0.38577428460121155, + "learning_rate": 0.0001, + "loss": 1.5598, + "step": 3805 + }, + { + "epoch": 0.44217252396166135, + "grad_norm": 0.3924373388290405, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 3806 + }, + { + "epoch": 0.44228870171362183, + "grad_norm": 0.44141262769699097, + "learning_rate": 0.0001, + "loss": 1.7508, + "step": 3807 + }, + { + "epoch": 0.4424048794655823, + "grad_norm": 0.40393298864364624, + "learning_rate": 0.0001, + "loss": 1.7571, + "step": 3808 + }, + { + "epoch": 0.44252105721754287, + "grad_norm": 0.42056989669799805, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 3809 + }, + { + "epoch": 0.44263723496950336, + "grad_norm": 0.38610607385635376, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 3810 + }, + { + "epoch": 0.44275341272146385, + "grad_norm": 0.408483624458313, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 3811 + }, + { + "epoch": 0.44286959047342433, + "grad_norm": 0.3991870880126953, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 3812 + }, + { + "epoch": 0.4429857682253848, + "grad_norm": 0.4130445718765259, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 3813 + }, + { + "epoch": 0.4431019459773453, + "grad_norm": 0.4013391137123108, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 3814 + }, + { + "epoch": 0.44321812372930586, + "grad_norm": 0.4077419638633728, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 3815 + }, + { + "epoch": 0.44333430148126635, + "grad_norm": 0.40052545070648193, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 3816 + }, + { + "epoch": 0.44345047923322684, + "grad_norm": 0.38764074444770813, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 3817 + }, + { + "epoch": 0.4435666569851873, + "grad_norm": 0.41951900720596313, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 3818 + }, + { + "epoch": 0.4436828347371478, + "grad_norm": 0.42698657512664795, + "learning_rate": 0.0001, + "loss": 1.7343, + "step": 3819 + }, + { + "epoch": 0.44379901248910836, + "grad_norm": 0.42899012565612793, + "learning_rate": 0.0001, + "loss": 1.8282, + "step": 3820 + }, + { + "epoch": 0.44391519024106885, + "grad_norm": 0.3827013373374939, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 3821 + }, + { + "epoch": 0.44403136799302934, + "grad_norm": 0.42411962151527405, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 3822 + }, + { + "epoch": 0.4441475457449898, + "grad_norm": 0.4320610463619232, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 3823 + }, + { + "epoch": 0.4442637234969503, + "grad_norm": 0.4369167685508728, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 3824 + }, + { + "epoch": 0.44437990124891086, + "grad_norm": 0.3931221663951874, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 3825 + }, + { + "epoch": 0.44449607900087135, + "grad_norm": 0.4219224750995636, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 3826 + }, + { + "epoch": 0.44461225675283184, + "grad_norm": 0.4289875030517578, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 3827 + }, + { + "epoch": 0.4447284345047923, + "grad_norm": 0.3924271762371063, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 3828 + }, + { + "epoch": 0.4448446122567528, + "grad_norm": 0.3892977237701416, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 3829 + }, + { + "epoch": 0.44496079000871336, + "grad_norm": 0.4215613305568695, + "learning_rate": 0.0001, + "loss": 1.8097, + "step": 3830 + }, + { + "epoch": 0.44507696776067385, + "grad_norm": 0.38903629779815674, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 3831 + }, + { + "epoch": 0.44519314551263434, + "grad_norm": 0.3972281217575073, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 3832 + }, + { + "epoch": 0.4453093232645948, + "grad_norm": 0.43121159076690674, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 3833 + }, + { + "epoch": 0.4454255010165553, + "grad_norm": 0.3963190019130707, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 3834 + }, + { + "epoch": 0.4455416787685158, + "grad_norm": 0.4058324992656708, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 3835 + }, + { + "epoch": 0.44565785652047635, + "grad_norm": 0.38773831725120544, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 3836 + }, + { + "epoch": 0.44577403427243684, + "grad_norm": 0.4000050723552704, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 3837 + }, + { + "epoch": 0.4458902120243973, + "grad_norm": 0.4279811382293701, + "learning_rate": 0.0001, + "loss": 1.7576, + "step": 3838 + }, + { + "epoch": 0.4460063897763578, + "grad_norm": 0.3969043493270874, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 3839 + }, + { + "epoch": 0.4461225675283183, + "grad_norm": 0.3967093825340271, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 3840 + }, + { + "epoch": 0.44623874528027885, + "grad_norm": 0.4337485432624817, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 3841 + }, + { + "epoch": 0.44635492303223934, + "grad_norm": 0.4148968756198883, + "learning_rate": 0.0001, + "loss": 1.7227, + "step": 3842 + }, + { + "epoch": 0.4464711007841998, + "grad_norm": 0.4100302457809448, + "learning_rate": 0.0001, + "loss": 1.7214, + "step": 3843 + }, + { + "epoch": 0.4465872785361603, + "grad_norm": 0.430477112531662, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 3844 + }, + { + "epoch": 0.4467034562881208, + "grad_norm": 0.424699068069458, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 3845 + }, + { + "epoch": 0.44681963404008135, + "grad_norm": 0.39795196056365967, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 3846 + }, + { + "epoch": 0.44693581179204184, + "grad_norm": 0.4175790548324585, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 3847 + }, + { + "epoch": 0.4470519895440023, + "grad_norm": 0.3961549699306488, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 3848 + }, + { + "epoch": 0.4471681672959628, + "grad_norm": 0.3898450434207916, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 3849 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.41821977496147156, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 3850 + }, + { + "epoch": 0.44740052279988385, + "grad_norm": 0.38670074939727783, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 3851 + }, + { + "epoch": 0.44751670055184434, + "grad_norm": 0.4136195182800293, + "learning_rate": 0.0001, + "loss": 1.8563, + "step": 3852 + }, + { + "epoch": 0.4476328783038048, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 3853 + }, + { + "epoch": 0.4477490560557653, + "grad_norm": 0.41631799936294556, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 3854 + }, + { + "epoch": 0.4478652338077258, + "grad_norm": 0.443198561668396, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 3855 + }, + { + "epoch": 0.4479814115596863, + "grad_norm": 0.4108307659626007, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 3856 + }, + { + "epoch": 0.44809758931164684, + "grad_norm": 0.36928313970565796, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 3857 + }, + { + "epoch": 0.4482137670636073, + "grad_norm": 0.42408835887908936, + "learning_rate": 0.0001, + "loss": 1.909, + "step": 3858 + }, + { + "epoch": 0.4483299448155678, + "grad_norm": 0.4042268991470337, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 3859 + }, + { + "epoch": 0.4484461225675283, + "grad_norm": 0.40864208340644836, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 3860 + }, + { + "epoch": 0.4485623003194888, + "grad_norm": 0.38092851638793945, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 3861 + }, + { + "epoch": 0.44867847807144934, + "grad_norm": 0.3783656656742096, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 3862 + }, + { + "epoch": 0.4487946558234098, + "grad_norm": 0.3975265920162201, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 3863 + }, + { + "epoch": 0.4489108335753703, + "grad_norm": 0.42539462447166443, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 3864 + }, + { + "epoch": 0.4490270113273308, + "grad_norm": 0.4166495203971863, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 3865 + }, + { + "epoch": 0.4491431890792913, + "grad_norm": 0.39668262004852295, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 3866 + }, + { + "epoch": 0.44925936683125184, + "grad_norm": 0.4214429557323456, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 3867 + }, + { + "epoch": 0.4493755445832123, + "grad_norm": 0.43809008598327637, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 3868 + }, + { + "epoch": 0.4494917223351728, + "grad_norm": 0.42034274339675903, + "learning_rate": 0.0001, + "loss": 1.794, + "step": 3869 + }, + { + "epoch": 0.4496079000871333, + "grad_norm": 0.3921016454696655, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 3870 + }, + { + "epoch": 0.4497240778390938, + "grad_norm": 0.4140496253967285, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 3871 + }, + { + "epoch": 0.44984025559105434, + "grad_norm": 0.43890661001205444, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 3872 + }, + { + "epoch": 0.4499564333430148, + "grad_norm": 0.40606656670570374, + "learning_rate": 0.0001, + "loss": 1.5126, + "step": 3873 + }, + { + "epoch": 0.4500726110949753, + "grad_norm": 0.45156994462013245, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 3874 + }, + { + "epoch": 0.4501887888469358, + "grad_norm": 0.4368317425251007, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 3875 + }, + { + "epoch": 0.4503049665988963, + "grad_norm": 0.41252076625823975, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 3876 + }, + { + "epoch": 0.4504211443508568, + "grad_norm": 0.46063879132270813, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 3877 + }, + { + "epoch": 0.45053732210281733, + "grad_norm": 0.4769091308116913, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 3878 + }, + { + "epoch": 0.4506534998547778, + "grad_norm": 0.3981684446334839, + "learning_rate": 0.0001, + "loss": 1.7009, + "step": 3879 + }, + { + "epoch": 0.4507696776067383, + "grad_norm": 0.4136994779109955, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 3880 + }, + { + "epoch": 0.4508858553586988, + "grad_norm": 0.394010454416275, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 3881 + }, + { + "epoch": 0.4510020331106593, + "grad_norm": 0.3855630159378052, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 3882 + }, + { + "epoch": 0.45111821086261983, + "grad_norm": 0.41295209527015686, + "learning_rate": 0.0001, + "loss": 1.7063, + "step": 3883 + }, + { + "epoch": 0.4512343886145803, + "grad_norm": 0.36102786660194397, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 3884 + }, + { + "epoch": 0.4513505663665408, + "grad_norm": 0.42346733808517456, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 3885 + }, + { + "epoch": 0.4514667441185013, + "grad_norm": 0.40333321690559387, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 3886 + }, + { + "epoch": 0.4515829218704618, + "grad_norm": 0.38405728340148926, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 3887 + }, + { + "epoch": 0.45169909962242233, + "grad_norm": 0.4108724594116211, + "learning_rate": 0.0001, + "loss": 1.6753, + "step": 3888 + }, + { + "epoch": 0.4518152773743828, + "grad_norm": 0.42868340015411377, + "learning_rate": 0.0001, + "loss": 1.7265, + "step": 3889 + }, + { + "epoch": 0.4519314551263433, + "grad_norm": 0.4091024100780487, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 3890 + }, + { + "epoch": 0.4520476328783038, + "grad_norm": 0.4288122355937958, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 3891 + }, + { + "epoch": 0.4521638106302643, + "grad_norm": 0.42493435740470886, + "learning_rate": 0.0001, + "loss": 1.8673, + "step": 3892 + }, + { + "epoch": 0.45227998838222483, + "grad_norm": 0.3990475833415985, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 3893 + }, + { + "epoch": 0.4523961661341853, + "grad_norm": 0.39818140864372253, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 3894 + }, + { + "epoch": 0.4525123438861458, + "grad_norm": 0.39904457330703735, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 3895 + }, + { + "epoch": 0.4526285216381063, + "grad_norm": 0.41773471236228943, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 3896 + }, + { + "epoch": 0.4527446993900668, + "grad_norm": 0.42894163727760315, + "learning_rate": 0.0001, + "loss": 1.805, + "step": 3897 + }, + { + "epoch": 0.45286087714202733, + "grad_norm": 0.4194585084915161, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 3898 + }, + { + "epoch": 0.4529770548939878, + "grad_norm": 0.40379759669303894, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 3899 + }, + { + "epoch": 0.4530932326459483, + "grad_norm": 0.4571918845176697, + "learning_rate": 0.0001, + "loss": 1.8804, + "step": 3900 + }, + { + "epoch": 0.4532094103979088, + "grad_norm": 0.3897266685962677, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 3901 + }, + { + "epoch": 0.4533255881498693, + "grad_norm": 0.4049232304096222, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 3902 + }, + { + "epoch": 0.4534417659018298, + "grad_norm": 0.43378233909606934, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 3903 + }, + { + "epoch": 0.4535579436537903, + "grad_norm": 0.43286213278770447, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 3904 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 0.41819891333580017, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 3905 + }, + { + "epoch": 0.4537902991577113, + "grad_norm": 0.4100865125656128, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 3906 + }, + { + "epoch": 0.4539064769096718, + "grad_norm": 0.3872487545013428, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 3907 + }, + { + "epoch": 0.4540226546616323, + "grad_norm": 0.4244650602340698, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 3908 + }, + { + "epoch": 0.4541388324135928, + "grad_norm": 0.38467180728912354, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 3909 + }, + { + "epoch": 0.4542550101655533, + "grad_norm": 0.3707321286201477, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 3910 + }, + { + "epoch": 0.4543711879175138, + "grad_norm": 0.412838876247406, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 3911 + }, + { + "epoch": 0.4544873656694743, + "grad_norm": 0.4340794086456299, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 3912 + }, + { + "epoch": 0.4546035434214348, + "grad_norm": 0.38097426295280457, + "learning_rate": 0.0001, + "loss": 1.4161, + "step": 3913 + }, + { + "epoch": 0.4547197211733953, + "grad_norm": 0.418896347284317, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 3914 + }, + { + "epoch": 0.4548358989253558, + "grad_norm": 0.427630752325058, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 3915 + }, + { + "epoch": 0.4549520766773163, + "grad_norm": 0.4016280174255371, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 3916 + }, + { + "epoch": 0.4550682544292768, + "grad_norm": 0.46618133783340454, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 3917 + }, + { + "epoch": 0.4551844321812373, + "grad_norm": 0.40906664729118347, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 3918 + }, + { + "epoch": 0.4553006099331978, + "grad_norm": 0.40268227458000183, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 3919 + }, + { + "epoch": 0.4554167876851583, + "grad_norm": 0.408345490694046, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 3920 + }, + { + "epoch": 0.4555329654371188, + "grad_norm": 0.39653435349464417, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 3921 + }, + { + "epoch": 0.4556491431890793, + "grad_norm": 0.39751750230789185, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 3922 + }, + { + "epoch": 0.4557653209410398, + "grad_norm": 0.3973672688007355, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 3923 + }, + { + "epoch": 0.45588149869300026, + "grad_norm": 0.43432149291038513, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 3924 + }, + { + "epoch": 0.4559976764449608, + "grad_norm": 0.43509483337402344, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 3925 + }, + { + "epoch": 0.4561138541969213, + "grad_norm": 0.407279908657074, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 3926 + }, + { + "epoch": 0.4562300319488818, + "grad_norm": 0.40425196290016174, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 3927 + }, + { + "epoch": 0.4563462097008423, + "grad_norm": 0.4185945689678192, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 3928 + }, + { + "epoch": 0.45646238745280276, + "grad_norm": 0.39828792214393616, + "learning_rate": 0.0001, + "loss": 1.7497, + "step": 3929 + }, + { + "epoch": 0.4565785652047633, + "grad_norm": 0.40944311022758484, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 3930 + }, + { + "epoch": 0.4566947429567238, + "grad_norm": 0.3925982117652893, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 3931 + }, + { + "epoch": 0.4568109207086843, + "grad_norm": 0.413461834192276, + "learning_rate": 0.0001, + "loss": 1.7878, + "step": 3932 + }, + { + "epoch": 0.4569270984606448, + "grad_norm": 0.4017687737941742, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 3933 + }, + { + "epoch": 0.45704327621260527, + "grad_norm": 0.3962691128253937, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 3934 + }, + { + "epoch": 0.4571594539645658, + "grad_norm": 0.4170648753643036, + "learning_rate": 0.0001, + "loss": 1.6969, + "step": 3935 + }, + { + "epoch": 0.4572756317165263, + "grad_norm": 0.4325627386569977, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 3936 + }, + { + "epoch": 0.4573918094684868, + "grad_norm": 0.40576669573783875, + "learning_rate": 0.0001, + "loss": 1.5092, + "step": 3937 + }, + { + "epoch": 0.4575079872204473, + "grad_norm": 0.40341272950172424, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 3938 + }, + { + "epoch": 0.45762416497240777, + "grad_norm": 0.3955957293510437, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 3939 + }, + { + "epoch": 0.4577403427243683, + "grad_norm": 0.3895145356655121, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 3940 + }, + { + "epoch": 0.4578565204763288, + "grad_norm": 0.4147211015224457, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 3941 + }, + { + "epoch": 0.4579726982282893, + "grad_norm": 0.39672261476516724, + "learning_rate": 0.0001, + "loss": 1.5186, + "step": 3942 + }, + { + "epoch": 0.4580888759802498, + "grad_norm": 0.4300733506679535, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 3943 + }, + { + "epoch": 0.45820505373221027, + "grad_norm": 0.4062694013118744, + "learning_rate": 0.0001, + "loss": 1.8041, + "step": 3944 + }, + { + "epoch": 0.45832123148417075, + "grad_norm": 0.4132828414440155, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 3945 + }, + { + "epoch": 0.4584374092361313, + "grad_norm": 0.42365893721580505, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 3946 + }, + { + "epoch": 0.4585535869880918, + "grad_norm": 0.3917977511882782, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 3947 + }, + { + "epoch": 0.4586697647400523, + "grad_norm": 0.4075486361980438, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 3948 + }, + { + "epoch": 0.45878594249201277, + "grad_norm": 0.41571909189224243, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 3949 + }, + { + "epoch": 0.45890212024397325, + "grad_norm": 0.44082123041152954, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 3950 + }, + { + "epoch": 0.4590182979959338, + "grad_norm": 0.40916866064071655, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 3951 + }, + { + "epoch": 0.4591344757478943, + "grad_norm": 0.3818245530128479, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 3952 + }, + { + "epoch": 0.4592506534998548, + "grad_norm": 0.4646684527397156, + "learning_rate": 0.0001, + "loss": 1.8009, + "step": 3953 + }, + { + "epoch": 0.45936683125181527, + "grad_norm": 0.4245597720146179, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 3954 + }, + { + "epoch": 0.45948300900377576, + "grad_norm": 0.42128875851631165, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 3955 + }, + { + "epoch": 0.4595991867557363, + "grad_norm": 0.41281989216804504, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 3956 + }, + { + "epoch": 0.4597153645076968, + "grad_norm": 0.4056452810764313, + "learning_rate": 0.0001, + "loss": 1.6749, + "step": 3957 + }, + { + "epoch": 0.4598315422596573, + "grad_norm": 0.4163060188293457, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 3958 + }, + { + "epoch": 0.45994772001161777, + "grad_norm": 0.4370429813861847, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 3959 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.4308282136917114, + "learning_rate": 0.0001, + "loss": 1.7271, + "step": 3960 + }, + { + "epoch": 0.4601800755155388, + "grad_norm": 0.42895281314849854, + "learning_rate": 0.0001, + "loss": 1.9235, + "step": 3961 + }, + { + "epoch": 0.4602962532674993, + "grad_norm": 0.3952750563621521, + "learning_rate": 0.0001, + "loss": 1.7455, + "step": 3962 + }, + { + "epoch": 0.4604124310194598, + "grad_norm": 0.4076629877090454, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 3963 + }, + { + "epoch": 0.46052860877142027, + "grad_norm": 0.41401106119155884, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 3964 + }, + { + "epoch": 0.46064478652338076, + "grad_norm": 0.39911845326423645, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 3965 + }, + { + "epoch": 0.46076096427534124, + "grad_norm": 0.3996291160583496, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 3966 + }, + { + "epoch": 0.4608771420273018, + "grad_norm": 0.4146043360233307, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 3967 + }, + { + "epoch": 0.4609933197792623, + "grad_norm": 0.4056430459022522, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 3968 + }, + { + "epoch": 0.46110949753122277, + "grad_norm": 0.3950080871582031, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 3969 + }, + { + "epoch": 0.46122567528318326, + "grad_norm": 0.3902670443058014, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 3970 + }, + { + "epoch": 0.46134185303514375, + "grad_norm": 0.408426970243454, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 3971 + }, + { + "epoch": 0.4614580307871043, + "grad_norm": 0.42066359519958496, + "learning_rate": 0.0001, + "loss": 1.8631, + "step": 3972 + }, + { + "epoch": 0.4615742085390648, + "grad_norm": 0.4041798412799835, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 3973 + }, + { + "epoch": 0.46169038629102527, + "grad_norm": 0.4431467056274414, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 3974 + }, + { + "epoch": 0.46180656404298576, + "grad_norm": 0.3934585154056549, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 3975 + }, + { + "epoch": 0.46192274179494625, + "grad_norm": 0.40387487411499023, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 3976 + }, + { + "epoch": 0.4620389195469068, + "grad_norm": 0.39581024646759033, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 3977 + }, + { + "epoch": 0.4621550972988673, + "grad_norm": 0.43566346168518066, + "learning_rate": 0.0001, + "loss": 1.7688, + "step": 3978 + }, + { + "epoch": 0.46227127505082777, + "grad_norm": 0.399808794260025, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 3979 + }, + { + "epoch": 0.46238745280278826, + "grad_norm": 0.42738527059555054, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 3980 + }, + { + "epoch": 0.46250363055474875, + "grad_norm": 0.3896215260028839, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 3981 + }, + { + "epoch": 0.4626198083067093, + "grad_norm": 0.41741159558296204, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 3982 + }, + { + "epoch": 0.4627359860586698, + "grad_norm": 0.40396156907081604, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 3983 + }, + { + "epoch": 0.46285216381063027, + "grad_norm": 0.38009852170944214, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 3984 + }, + { + "epoch": 0.46296834156259076, + "grad_norm": 0.43098655343055725, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 3985 + }, + { + "epoch": 0.46308451931455125, + "grad_norm": 0.461058109998703, + "learning_rate": 0.0001, + "loss": 1.6804, + "step": 3986 + }, + { + "epoch": 0.4632006970665118, + "grad_norm": 0.45243731141090393, + "learning_rate": 0.0001, + "loss": 1.9556, + "step": 3987 + }, + { + "epoch": 0.4633168748184723, + "grad_norm": 0.41540998220443726, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 3988 + }, + { + "epoch": 0.46343305257043277, + "grad_norm": 0.4656381905078888, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 3989 + }, + { + "epoch": 0.46354923032239326, + "grad_norm": 0.4463013708591461, + "learning_rate": 0.0001, + "loss": 1.7644, + "step": 3990 + }, + { + "epoch": 0.46366540807435375, + "grad_norm": 0.40116289258003235, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 3991 + }, + { + "epoch": 0.46378158582631424, + "grad_norm": 0.41295450925827026, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 3992 + }, + { + "epoch": 0.4638977635782748, + "grad_norm": 0.43854475021362305, + "learning_rate": 0.0001, + "loss": 1.8506, + "step": 3993 + }, + { + "epoch": 0.46401394133023527, + "grad_norm": 0.39046186208724976, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 3994 + }, + { + "epoch": 0.46413011908219576, + "grad_norm": 0.4226478338241577, + "learning_rate": 0.0001, + "loss": 1.7984, + "step": 3995 + }, + { + "epoch": 0.46424629683415625, + "grad_norm": 0.40615102648735046, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 3996 + }, + { + "epoch": 0.46436247458611674, + "grad_norm": 0.41897422075271606, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 3997 + }, + { + "epoch": 0.4644786523380773, + "grad_norm": 0.3868965804576874, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 3998 + }, + { + "epoch": 0.46459483009003777, + "grad_norm": 0.40913960337638855, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 3999 + }, + { + "epoch": 0.46471100784199826, + "grad_norm": 0.44357261061668396, + "learning_rate": 0.0001, + "loss": 1.5254, + "step": 4000 + }, + { + "epoch": 0.46482718559395875, + "grad_norm": 0.42295897006988525, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 4001 + }, + { + "epoch": 0.46494336334591924, + "grad_norm": 0.38419994711875916, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 4002 + }, + { + "epoch": 0.4650595410978798, + "grad_norm": 0.4167155623435974, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 4003 + }, + { + "epoch": 0.46517571884984027, + "grad_norm": 0.394581139087677, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 4004 + }, + { + "epoch": 0.46529189660180076, + "grad_norm": 0.4233587384223938, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 4005 + }, + { + "epoch": 0.46540807435376125, + "grad_norm": 0.3914411962032318, + "learning_rate": 0.0001, + "loss": 1.5213, + "step": 4006 + }, + { + "epoch": 0.46552425210572174, + "grad_norm": 0.4535432457923889, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 4007 + }, + { + "epoch": 0.4656404298576823, + "grad_norm": 0.39255577325820923, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 4008 + }, + { + "epoch": 0.46575660760964277, + "grad_norm": 0.38806408643722534, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 4009 + }, + { + "epoch": 0.46587278536160326, + "grad_norm": 0.4313128888607025, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 4010 + }, + { + "epoch": 0.46598896311356375, + "grad_norm": 0.39540693163871765, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 4011 + }, + { + "epoch": 0.46610514086552424, + "grad_norm": 0.40360215306282043, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 4012 + }, + { + "epoch": 0.4662213186174847, + "grad_norm": 0.4235546886920929, + "learning_rate": 0.0001, + "loss": 1.7763, + "step": 4013 + }, + { + "epoch": 0.46633749636944527, + "grad_norm": 0.4328942596912384, + "learning_rate": 0.0001, + "loss": 1.8291, + "step": 4014 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 0.40199771523475647, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 4015 + }, + { + "epoch": 0.46656985187336625, + "grad_norm": 0.3917251527309418, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 4016 + }, + { + "epoch": 0.46668602962532674, + "grad_norm": 0.4347023665904999, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 4017 + }, + { + "epoch": 0.4668022073772872, + "grad_norm": 0.41650915145874023, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 4018 + }, + { + "epoch": 0.46691838512924777, + "grad_norm": 0.39249107241630554, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 4019 + }, + { + "epoch": 0.46703456288120826, + "grad_norm": 0.42523854970932007, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 4020 + }, + { + "epoch": 0.46715074063316875, + "grad_norm": 0.385466068983078, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 4021 + }, + { + "epoch": 0.46726691838512924, + "grad_norm": 0.39372915029525757, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 4022 + }, + { + "epoch": 0.4673830961370897, + "grad_norm": 0.41617611050605774, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 4023 + }, + { + "epoch": 0.46749927388905027, + "grad_norm": 0.4050281047821045, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 4024 + }, + { + "epoch": 0.46761545164101076, + "grad_norm": 0.41811496019363403, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 4025 + }, + { + "epoch": 0.46773162939297125, + "grad_norm": 0.4445625841617584, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 4026 + }, + { + "epoch": 0.46784780714493174, + "grad_norm": 0.3947649598121643, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 4027 + }, + { + "epoch": 0.4679639848968922, + "grad_norm": 0.39127346873283386, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 4028 + }, + { + "epoch": 0.46808016264885277, + "grad_norm": 0.4156622588634491, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 4029 + }, + { + "epoch": 0.46819634040081326, + "grad_norm": 0.40657055377960205, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 4030 + }, + { + "epoch": 0.46831251815277375, + "grad_norm": 0.4071350693702698, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 4031 + }, + { + "epoch": 0.46842869590473424, + "grad_norm": 0.41567331552505493, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 4032 + }, + { + "epoch": 0.4685448736566947, + "grad_norm": 0.4297296106815338, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 4033 + }, + { + "epoch": 0.4686610514086552, + "grad_norm": 0.46219146251678467, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 4034 + }, + { + "epoch": 0.46877722916061576, + "grad_norm": 0.38711977005004883, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 4035 + }, + { + "epoch": 0.46889340691257625, + "grad_norm": 0.39784133434295654, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 4036 + }, + { + "epoch": 0.46900958466453674, + "grad_norm": 0.4138612747192383, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 4037 + }, + { + "epoch": 0.4691257624164972, + "grad_norm": 0.42039382457733154, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 4038 + }, + { + "epoch": 0.4692419401684577, + "grad_norm": 0.40157628059387207, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 4039 + }, + { + "epoch": 0.46935811792041826, + "grad_norm": 0.40044525265693665, + "learning_rate": 0.0001, + "loss": 1.7727, + "step": 4040 + }, + { + "epoch": 0.46947429567237875, + "grad_norm": 0.4081622064113617, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 4041 + }, + { + "epoch": 0.46959047342433924, + "grad_norm": 0.40018370747566223, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 4042 + }, + { + "epoch": 0.4697066511762997, + "grad_norm": 0.3724619746208191, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 4043 + }, + { + "epoch": 0.4698228289282602, + "grad_norm": 0.386092871427536, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 4044 + }, + { + "epoch": 0.46993900668022076, + "grad_norm": 0.44518908858299255, + "learning_rate": 0.0001, + "loss": 1.8437, + "step": 4045 + }, + { + "epoch": 0.47005518443218125, + "grad_norm": 0.4211956560611725, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 4046 + }, + { + "epoch": 0.47017136218414174, + "grad_norm": 0.4604097306728363, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 4047 + }, + { + "epoch": 0.47028753993610223, + "grad_norm": 0.4125956892967224, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 4048 + }, + { + "epoch": 0.4704037176880627, + "grad_norm": 0.46292823553085327, + "learning_rate": 0.0001, + "loss": 1.877, + "step": 4049 + }, + { + "epoch": 0.47051989544002326, + "grad_norm": 0.4355732202529907, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 4050 + }, + { + "epoch": 0.47063607319198375, + "grad_norm": 0.4154967665672302, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 4051 + }, + { + "epoch": 0.47075225094394424, + "grad_norm": 0.4346443712711334, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 4052 + }, + { + "epoch": 0.47086842869590473, + "grad_norm": 0.41704896092414856, + "learning_rate": 0.0001, + "loss": 1.8386, + "step": 4053 + }, + { + "epoch": 0.4709846064478652, + "grad_norm": 0.4196739196777344, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 4054 + }, + { + "epoch": 0.4711007841998257, + "grad_norm": 0.43008390069007874, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 4055 + }, + { + "epoch": 0.47121696195178625, + "grad_norm": 0.37963777780532837, + "learning_rate": 0.0001, + "loss": 1.4642, + "step": 4056 + }, + { + "epoch": 0.47133313970374674, + "grad_norm": 0.40089571475982666, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 4057 + }, + { + "epoch": 0.47144931745570723, + "grad_norm": 0.4169794023036957, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 4058 + }, + { + "epoch": 0.4715654952076677, + "grad_norm": 0.4090938866138458, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 4059 + }, + { + "epoch": 0.4716816729596282, + "grad_norm": 0.4060511589050293, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 4060 + }, + { + "epoch": 0.47179785071158875, + "grad_norm": 0.4162571132183075, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 4061 + }, + { + "epoch": 0.47191402846354924, + "grad_norm": 0.39749133586883545, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 4062 + }, + { + "epoch": 0.47203020621550973, + "grad_norm": 0.4047934412956238, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 4063 + }, + { + "epoch": 0.4721463839674702, + "grad_norm": 0.436172217130661, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 4064 + }, + { + "epoch": 0.4722625617194307, + "grad_norm": 0.4056571125984192, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 4065 + }, + { + "epoch": 0.47237873947139125, + "grad_norm": 0.4182429909706116, + "learning_rate": 0.0001, + "loss": 1.7614, + "step": 4066 + }, + { + "epoch": 0.47249491722335174, + "grad_norm": 0.3990522623062134, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 4067 + }, + { + "epoch": 0.47261109497531223, + "grad_norm": 0.4048585593700409, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 4068 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 0.3930312693119049, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 4069 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.4067966639995575, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 4070 + }, + { + "epoch": 0.47295962823119375, + "grad_norm": 0.4243100881576538, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 4071 + }, + { + "epoch": 0.47307580598315424, + "grad_norm": 0.4516909718513489, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 4072 + }, + { + "epoch": 0.47319198373511473, + "grad_norm": 0.4081507921218872, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 4073 + }, + { + "epoch": 0.4733081614870752, + "grad_norm": 0.43128862977027893, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 4074 + }, + { + "epoch": 0.4734243392390357, + "grad_norm": 0.43954023718833923, + "learning_rate": 0.0001, + "loss": 1.8691, + "step": 4075 + }, + { + "epoch": 0.47354051699099625, + "grad_norm": 0.422748863697052, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 4076 + }, + { + "epoch": 0.47365669474295674, + "grad_norm": 0.40614694356918335, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 4077 + }, + { + "epoch": 0.47377287249491723, + "grad_norm": 0.4623175263404846, + "learning_rate": 0.0001, + "loss": 1.8329, + "step": 4078 + }, + { + "epoch": 0.4738890502468777, + "grad_norm": 0.44638627767562866, + "learning_rate": 0.0001, + "loss": 1.9259, + "step": 4079 + }, + { + "epoch": 0.4740052279988382, + "grad_norm": 0.4093332588672638, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 4080 + }, + { + "epoch": 0.4741214057507987, + "grad_norm": 0.4484642446041107, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 4081 + }, + { + "epoch": 0.47423758350275924, + "grad_norm": 0.4261915981769562, + "learning_rate": 0.0001, + "loss": 1.7479, + "step": 4082 + }, + { + "epoch": 0.47435376125471973, + "grad_norm": 0.3942796587944031, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 4083 + }, + { + "epoch": 0.4744699390066802, + "grad_norm": 0.4067613184452057, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 4084 + }, + { + "epoch": 0.4745861167586407, + "grad_norm": 0.42234787344932556, + "learning_rate": 0.0001, + "loss": 1.6912, + "step": 4085 + }, + { + "epoch": 0.4747022945106012, + "grad_norm": 0.38540762662887573, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 4086 + }, + { + "epoch": 0.47481847226256174, + "grad_norm": 0.386514276266098, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 4087 + }, + { + "epoch": 0.47493465001452223, + "grad_norm": 0.4075110852718353, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 4088 + }, + { + "epoch": 0.4750508277664827, + "grad_norm": 0.39759472012519836, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 4089 + }, + { + "epoch": 0.4751670055184432, + "grad_norm": 0.3987995386123657, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 4090 + }, + { + "epoch": 0.4752831832704037, + "grad_norm": 0.3973774313926697, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 4091 + }, + { + "epoch": 0.47539936102236424, + "grad_norm": 0.4317288398742676, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 4092 + }, + { + "epoch": 0.47551553877432473, + "grad_norm": 0.4416712522506714, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 4093 + }, + { + "epoch": 0.4756317165262852, + "grad_norm": 0.43052783608436584, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 4094 + }, + { + "epoch": 0.4757478942782457, + "grad_norm": 0.4163714647293091, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 4095 + }, + { + "epoch": 0.4758640720302062, + "grad_norm": 0.41067805886268616, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 4096 + }, + { + "epoch": 0.47598024978216674, + "grad_norm": 0.4482041001319885, + "learning_rate": 0.0001, + "loss": 1.8553, + "step": 4097 + }, + { + "epoch": 0.47609642753412723, + "grad_norm": 0.4115692973136902, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 4098 + }, + { + "epoch": 0.4762126052860877, + "grad_norm": 0.4347507953643799, + "learning_rate": 0.0001, + "loss": 1.7567, + "step": 4099 + }, + { + "epoch": 0.4763287830380482, + "grad_norm": 0.409196674823761, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 4100 + }, + { + "epoch": 0.4764449607900087, + "grad_norm": 0.46702349185943604, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 4101 + }, + { + "epoch": 0.4765611385419692, + "grad_norm": 0.3931210935115814, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 4102 + }, + { + "epoch": 0.47667731629392973, + "grad_norm": 0.42548468708992004, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 4103 + }, + { + "epoch": 0.4767934940458902, + "grad_norm": 0.690558910369873, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 4104 + }, + { + "epoch": 0.4769096717978507, + "grad_norm": 0.41380855441093445, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 4105 + }, + { + "epoch": 0.4770258495498112, + "grad_norm": 0.37842079997062683, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 4106 + }, + { + "epoch": 0.4771420273017717, + "grad_norm": 0.4153887629508972, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 4107 + }, + { + "epoch": 0.47725820505373223, + "grad_norm": 0.408482164144516, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 4108 + }, + { + "epoch": 0.4773743828056927, + "grad_norm": 0.4116607904434204, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 4109 + }, + { + "epoch": 0.4774905605576532, + "grad_norm": 0.3892625570297241, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 4110 + }, + { + "epoch": 0.4776067383096137, + "grad_norm": 0.41425132751464844, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 4111 + }, + { + "epoch": 0.4777229160615742, + "grad_norm": 0.4124647080898285, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 4112 + }, + { + "epoch": 0.47783909381353473, + "grad_norm": 0.43545451760292053, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 4113 + }, + { + "epoch": 0.4779552715654952, + "grad_norm": 0.4274523854255676, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 4114 + }, + { + "epoch": 0.4780714493174557, + "grad_norm": 0.41416046023368835, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 4115 + }, + { + "epoch": 0.4781876270694162, + "grad_norm": 0.4206220805644989, + "learning_rate": 0.0001, + "loss": 1.7191, + "step": 4116 + }, + { + "epoch": 0.4783038048213767, + "grad_norm": 0.4083358645439148, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 4117 + }, + { + "epoch": 0.47841998257333723, + "grad_norm": 0.4127695560455322, + "learning_rate": 0.0001, + "loss": 1.8423, + "step": 4118 + }, + { + "epoch": 0.4785361603252977, + "grad_norm": 0.441363662481308, + "learning_rate": 0.0001, + "loss": 1.8589, + "step": 4119 + }, + { + "epoch": 0.4786523380772582, + "grad_norm": 0.41235068440437317, + "learning_rate": 0.0001, + "loss": 1.7405, + "step": 4120 + }, + { + "epoch": 0.4787685158292187, + "grad_norm": 0.41113796830177307, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 4121 + }, + { + "epoch": 0.4788846935811792, + "grad_norm": 0.44025418162345886, + "learning_rate": 0.0001, + "loss": 1.7407, + "step": 4122 + }, + { + "epoch": 0.4790008713331397, + "grad_norm": 0.3890633285045624, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 4123 + }, + { + "epoch": 0.4791170490851002, + "grad_norm": 0.40119051933288574, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 4124 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 0.4198724329471588, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 4125 + }, + { + "epoch": 0.4793494045890212, + "grad_norm": 0.4038214385509491, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 4126 + }, + { + "epoch": 0.4794655823409817, + "grad_norm": 0.4278080463409424, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 4127 + }, + { + "epoch": 0.4795817600929422, + "grad_norm": 0.41188672184944153, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 4128 + }, + { + "epoch": 0.4796979378449027, + "grad_norm": 0.43129464983940125, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 4129 + }, + { + "epoch": 0.4798141155968632, + "grad_norm": 0.4218551814556122, + "learning_rate": 0.0001, + "loss": 1.742, + "step": 4130 + }, + { + "epoch": 0.4799302933488237, + "grad_norm": 0.37498828768730164, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 4131 + }, + { + "epoch": 0.4800464711007842, + "grad_norm": 0.4059140682220459, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 4132 + }, + { + "epoch": 0.4801626488527447, + "grad_norm": 0.4361055791378021, + "learning_rate": 0.0001, + "loss": 1.9007, + "step": 4133 + }, + { + "epoch": 0.4802788266047052, + "grad_norm": 0.40483996272087097, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 4134 + }, + { + "epoch": 0.4803950043566657, + "grad_norm": 0.4151374101638794, + "learning_rate": 0.0001, + "loss": 1.8005, + "step": 4135 + }, + { + "epoch": 0.4805111821086262, + "grad_norm": 0.4022291302680969, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 4136 + }, + { + "epoch": 0.4806273598605867, + "grad_norm": 0.40195754170417786, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 4137 + }, + { + "epoch": 0.4807435376125472, + "grad_norm": 0.41277408599853516, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 4138 + }, + { + "epoch": 0.4808597153645077, + "grad_norm": 0.4332813024520874, + "learning_rate": 0.0001, + "loss": 1.9132, + "step": 4139 + }, + { + "epoch": 0.4809758931164682, + "grad_norm": 0.3997774124145508, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 4140 + }, + { + "epoch": 0.4810920708684287, + "grad_norm": 0.4188762605190277, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 4141 + }, + { + "epoch": 0.4812082486203892, + "grad_norm": 0.40828222036361694, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 4142 + }, + { + "epoch": 0.4813244263723497, + "grad_norm": 0.4262183904647827, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 4143 + }, + { + "epoch": 0.48144060412431017, + "grad_norm": 0.4175308346748352, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 4144 + }, + { + "epoch": 0.4815567818762707, + "grad_norm": 0.40894895792007446, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 4145 + }, + { + "epoch": 0.4816729596282312, + "grad_norm": 0.4432525038719177, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 4146 + }, + { + "epoch": 0.4817891373801917, + "grad_norm": 0.412423700094223, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 4147 + }, + { + "epoch": 0.4819053151321522, + "grad_norm": 0.3926151990890503, + "learning_rate": 0.0001, + "loss": 1.697, + "step": 4148 + }, + { + "epoch": 0.48202149288411267, + "grad_norm": 0.4282263517379761, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 4149 + }, + { + "epoch": 0.4821376706360732, + "grad_norm": 0.3978181481361389, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 4150 + }, + { + "epoch": 0.4822538483880337, + "grad_norm": 0.39814430475234985, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 4151 + }, + { + "epoch": 0.4823700261399942, + "grad_norm": 0.40920761227607727, + "learning_rate": 0.0001, + "loss": 1.7559, + "step": 4152 + }, + { + "epoch": 0.4824862038919547, + "grad_norm": 0.43666741251945496, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 4153 + }, + { + "epoch": 0.48260238164391517, + "grad_norm": 0.449294775724411, + "learning_rate": 0.0001, + "loss": 1.7839, + "step": 4154 + }, + { + "epoch": 0.4827185593958757, + "grad_norm": 0.4310729205608368, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 4155 + }, + { + "epoch": 0.4828347371478362, + "grad_norm": 0.4184435307979584, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 4156 + }, + { + "epoch": 0.4829509148997967, + "grad_norm": 0.4592064321041107, + "learning_rate": 0.0001, + "loss": 1.853, + "step": 4157 + }, + { + "epoch": 0.4830670926517572, + "grad_norm": 0.41902077198028564, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 4158 + }, + { + "epoch": 0.48318327040371767, + "grad_norm": 0.40385156869888306, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 4159 + }, + { + "epoch": 0.4832994481556782, + "grad_norm": 0.4063291549682617, + "learning_rate": 0.0001, + "loss": 1.444, + "step": 4160 + }, + { + "epoch": 0.4834156259076387, + "grad_norm": 0.4716438055038452, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 4161 + }, + { + "epoch": 0.4835318036595992, + "grad_norm": 0.3783859312534332, + "learning_rate": 0.0001, + "loss": 1.3083, + "step": 4162 + }, + { + "epoch": 0.4836479814115597, + "grad_norm": 0.4354921281337738, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 4163 + }, + { + "epoch": 0.48376415916352017, + "grad_norm": 0.4501488506793976, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 4164 + }, + { + "epoch": 0.48388033691548066, + "grad_norm": 0.42613235116004944, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 4165 + }, + { + "epoch": 0.4839965146674412, + "grad_norm": 0.41294065117836, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 4166 + }, + { + "epoch": 0.4841126924194017, + "grad_norm": 0.399277001619339, + "learning_rate": 0.0001, + "loss": 1.4283, + "step": 4167 + }, + { + "epoch": 0.4842288701713622, + "grad_norm": 0.4134821891784668, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 4168 + }, + { + "epoch": 0.48434504792332267, + "grad_norm": 0.4123948812484741, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 4169 + }, + { + "epoch": 0.48446122567528316, + "grad_norm": 0.4469812214374542, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 4170 + }, + { + "epoch": 0.4845774034272437, + "grad_norm": 0.4323446452617645, + "learning_rate": 0.0001, + "loss": 1.761, + "step": 4171 + }, + { + "epoch": 0.4846935811792042, + "grad_norm": 0.4458593428134918, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 4172 + }, + { + "epoch": 0.4848097589311647, + "grad_norm": 0.4241911470890045, + "learning_rate": 0.0001, + "loss": 1.7529, + "step": 4173 + }, + { + "epoch": 0.48492593668312517, + "grad_norm": 0.43590041995048523, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 4174 + }, + { + "epoch": 0.48504211443508566, + "grad_norm": 0.4520244002342224, + "learning_rate": 0.0001, + "loss": 1.764, + "step": 4175 + }, + { + "epoch": 0.4851582921870462, + "grad_norm": 0.40647611021995544, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 4176 + }, + { + "epoch": 0.4852744699390067, + "grad_norm": 0.4175068438053131, + "learning_rate": 0.0001, + "loss": 1.7583, + "step": 4177 + }, + { + "epoch": 0.4853906476909672, + "grad_norm": 0.4129279553890228, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 4178 + }, + { + "epoch": 0.48550682544292767, + "grad_norm": 0.42088520526885986, + "learning_rate": 0.0001, + "loss": 1.756, + "step": 4179 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.4570264220237732, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 4180 + }, + { + "epoch": 0.4857391809468487, + "grad_norm": 0.4151148200035095, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 4181 + }, + { + "epoch": 0.4858553586988092, + "grad_norm": 0.4177875518798828, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 4182 + }, + { + "epoch": 0.4859715364507697, + "grad_norm": 0.4347212016582489, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 4183 + }, + { + "epoch": 0.48608771420273017, + "grad_norm": 0.406599760055542, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 4184 + }, + { + "epoch": 0.48620389195469066, + "grad_norm": 0.4066656529903412, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 4185 + }, + { + "epoch": 0.4863200697066512, + "grad_norm": 0.4207620918750763, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 4186 + }, + { + "epoch": 0.4864362474586117, + "grad_norm": 0.43291810154914856, + "learning_rate": 0.0001, + "loss": 1.7537, + "step": 4187 + }, + { + "epoch": 0.4865524252105722, + "grad_norm": 0.44666996598243713, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 4188 + }, + { + "epoch": 0.48666860296253267, + "grad_norm": 0.394218772649765, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 4189 + }, + { + "epoch": 0.48678478071449316, + "grad_norm": 0.41351521015167236, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 4190 + }, + { + "epoch": 0.48690095846645365, + "grad_norm": 0.412653386592865, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 4191 + }, + { + "epoch": 0.4870171362184142, + "grad_norm": 0.4062814712524414, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 4192 + }, + { + "epoch": 0.4871333139703747, + "grad_norm": 0.41630494594573975, + "learning_rate": 0.0001, + "loss": 1.7655, + "step": 4193 + }, + { + "epoch": 0.48724949172233517, + "grad_norm": 0.41663774847984314, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 4194 + }, + { + "epoch": 0.48736566947429566, + "grad_norm": 0.42713698744773865, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 4195 + }, + { + "epoch": 0.48748184722625615, + "grad_norm": 0.43906137347221375, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 4196 + }, + { + "epoch": 0.4875980249782167, + "grad_norm": 0.3749426305294037, + "learning_rate": 0.0001, + "loss": 1.6356, + "step": 4197 + }, + { + "epoch": 0.4877142027301772, + "grad_norm": 0.5166771411895752, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 4198 + }, + { + "epoch": 0.48783038048213767, + "grad_norm": 0.42134204506874084, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 4199 + }, + { + "epoch": 0.48794655823409816, + "grad_norm": 0.4114912152290344, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 4200 + }, + { + "epoch": 0.48806273598605865, + "grad_norm": 0.4079913794994354, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 4201 + }, + { + "epoch": 0.4881789137380192, + "grad_norm": 0.4527676999568939, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 4202 + }, + { + "epoch": 0.4882950914899797, + "grad_norm": 0.4323418438434601, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 4203 + }, + { + "epoch": 0.48841126924194017, + "grad_norm": 0.43545210361480713, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 4204 + }, + { + "epoch": 0.48852744699390066, + "grad_norm": 0.4371805191040039, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 4205 + }, + { + "epoch": 0.48864362474586115, + "grad_norm": 0.42473122477531433, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 4206 + }, + { + "epoch": 0.4887598024978217, + "grad_norm": 0.42178717255592346, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 4207 + }, + { + "epoch": 0.4888759802497822, + "grad_norm": 0.39883777499198914, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 4208 + }, + { + "epoch": 0.48899215800174267, + "grad_norm": 0.39283865690231323, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 4209 + }, + { + "epoch": 0.48910833575370316, + "grad_norm": 0.390767902135849, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 4210 + }, + { + "epoch": 0.48922451350566365, + "grad_norm": 0.40290915966033936, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 4211 + }, + { + "epoch": 0.48934069125762414, + "grad_norm": 0.3867753744125366, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 4212 + }, + { + "epoch": 0.4894568690095847, + "grad_norm": 0.40270060300827026, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 4213 + }, + { + "epoch": 0.48957304676154517, + "grad_norm": 0.40884700417518616, + "learning_rate": 0.0001, + "loss": 1.5464, + "step": 4214 + }, + { + "epoch": 0.48968922451350566, + "grad_norm": 0.4496394395828247, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 4215 + }, + { + "epoch": 0.48980540226546615, + "grad_norm": 0.4228356182575226, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 4216 + }, + { + "epoch": 0.48992158001742664, + "grad_norm": 0.3831014633178711, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 4217 + }, + { + "epoch": 0.4900377577693872, + "grad_norm": 0.4408629834651947, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 4218 + }, + { + "epoch": 0.4901539355213477, + "grad_norm": 0.4310610890388489, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 4219 + }, + { + "epoch": 0.49027011327330816, + "grad_norm": 0.4541778266429901, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 4220 + }, + { + "epoch": 0.49038629102526865, + "grad_norm": 0.4351007342338562, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 4221 + }, + { + "epoch": 0.49050246877722914, + "grad_norm": 0.4092995226383209, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 4222 + }, + { + "epoch": 0.4906186465291897, + "grad_norm": 0.41761350631713867, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 4223 + }, + { + "epoch": 0.4907348242811502, + "grad_norm": 0.42286214232444763, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 4224 + }, + { + "epoch": 0.49085100203311066, + "grad_norm": 0.44822847843170166, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 4225 + }, + { + "epoch": 0.49096717978507115, + "grad_norm": 0.40160536766052246, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 4226 + }, + { + "epoch": 0.49108335753703164, + "grad_norm": 0.39959850907325745, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 4227 + }, + { + "epoch": 0.4911995352889922, + "grad_norm": 0.4233490228652954, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 4228 + }, + { + "epoch": 0.4913157130409527, + "grad_norm": 0.41371530294418335, + "learning_rate": 0.0001, + "loss": 1.7971, + "step": 4229 + }, + { + "epoch": 0.49143189079291316, + "grad_norm": 0.4132305085659027, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 4230 + }, + { + "epoch": 0.49154806854487365, + "grad_norm": 0.4752264618873596, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 4231 + }, + { + "epoch": 0.49166424629683414, + "grad_norm": 0.37759026885032654, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 4232 + }, + { + "epoch": 0.49178042404879463, + "grad_norm": 0.423034131526947, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 4233 + }, + { + "epoch": 0.4918966018007552, + "grad_norm": 0.42618465423583984, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 4234 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 0.4361681044101715, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 4235 + }, + { + "epoch": 0.49212895730467615, + "grad_norm": 0.4297569692134857, + "learning_rate": 0.0001, + "loss": 1.8216, + "step": 4236 + }, + { + "epoch": 0.49224513505663664, + "grad_norm": 0.4081512987613678, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 4237 + }, + { + "epoch": 0.49236131280859713, + "grad_norm": 0.4123179614543915, + "learning_rate": 0.0001, + "loss": 1.7767, + "step": 4238 + }, + { + "epoch": 0.4924774905605577, + "grad_norm": 0.42507731914520264, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 4239 + }, + { + "epoch": 0.49259366831251816, + "grad_norm": 0.4431770145893097, + "learning_rate": 0.0001, + "loss": 1.753, + "step": 4240 + }, + { + "epoch": 0.49270984606447865, + "grad_norm": 0.4066873788833618, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 4241 + }, + { + "epoch": 0.49282602381643914, + "grad_norm": 0.39505207538604736, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 4242 + }, + { + "epoch": 0.49294220156839963, + "grad_norm": 0.3953791856765747, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 4243 + }, + { + "epoch": 0.4930583793203602, + "grad_norm": 0.4070848822593689, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 4244 + }, + { + "epoch": 0.49317455707232066, + "grad_norm": 0.3953931927680969, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 4245 + }, + { + "epoch": 0.49329073482428115, + "grad_norm": 0.41489389538764954, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 4246 + }, + { + "epoch": 0.49340691257624164, + "grad_norm": 0.381734699010849, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 4247 + }, + { + "epoch": 0.49352309032820213, + "grad_norm": 0.4128166139125824, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 4248 + }, + { + "epoch": 0.4936392680801627, + "grad_norm": 0.4323165714740753, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 4249 + }, + { + "epoch": 0.49375544583212316, + "grad_norm": 0.4313991069793701, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 4250 + }, + { + "epoch": 0.49387162358408365, + "grad_norm": 0.41360074281692505, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 4251 + }, + { + "epoch": 0.49398780133604414, + "grad_norm": 0.41541731357574463, + "learning_rate": 0.0001, + "loss": 1.7381, + "step": 4252 + }, + { + "epoch": 0.49410397908800463, + "grad_norm": 0.3932528495788574, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 4253 + }, + { + "epoch": 0.4942201568399651, + "grad_norm": 0.41322755813598633, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 4254 + }, + { + "epoch": 0.49433633459192566, + "grad_norm": 0.39592114090919495, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 4255 + }, + { + "epoch": 0.49445251234388615, + "grad_norm": 0.4194466173648834, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 4256 + }, + { + "epoch": 0.49456869009584664, + "grad_norm": 0.4130099415779114, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 4257 + }, + { + "epoch": 0.49468486784780713, + "grad_norm": 0.39909616112709045, + "learning_rate": 0.0001, + "loss": 1.3854, + "step": 4258 + }, + { + "epoch": 0.4948010455997676, + "grad_norm": 0.4121648371219635, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 4259 + }, + { + "epoch": 0.49491722335172816, + "grad_norm": 0.4459930956363678, + "learning_rate": 0.0001, + "loss": 1.8308, + "step": 4260 + }, + { + "epoch": 0.49503340110368865, + "grad_norm": 0.42174288630485535, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 4261 + }, + { + "epoch": 0.49514957885564914, + "grad_norm": 0.4145379364490509, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 4262 + }, + { + "epoch": 0.49526575660760963, + "grad_norm": 0.40851959586143494, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 4263 + }, + { + "epoch": 0.4953819343595701, + "grad_norm": 0.42193418741226196, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 4264 + }, + { + "epoch": 0.49549811211153066, + "grad_norm": 0.4060492515563965, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 4265 + }, + { + "epoch": 0.49561428986349115, + "grad_norm": 0.3899538815021515, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 4266 + }, + { + "epoch": 0.49573046761545164, + "grad_norm": 0.42852500081062317, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 4267 + }, + { + "epoch": 0.49584664536741213, + "grad_norm": 0.41384875774383545, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 4268 + }, + { + "epoch": 0.4959628231193726, + "grad_norm": 0.41500815749168396, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 4269 + }, + { + "epoch": 0.49607900087133316, + "grad_norm": 0.40487122535705566, + "learning_rate": 0.0001, + "loss": 1.5961, + "step": 4270 + }, + { + "epoch": 0.49619517862329365, + "grad_norm": 0.4144662022590637, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 4271 + }, + { + "epoch": 0.49631135637525414, + "grad_norm": 0.40673938393592834, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 4272 + }, + { + "epoch": 0.49642753412721463, + "grad_norm": 0.4252079725265503, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 4273 + }, + { + "epoch": 0.4965437118791751, + "grad_norm": 0.4067811071872711, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 4274 + }, + { + "epoch": 0.49665988963113566, + "grad_norm": 0.4027066230773926, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 4275 + }, + { + "epoch": 0.49677606738309615, + "grad_norm": 0.4006246030330658, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 4276 + }, + { + "epoch": 0.49689224513505664, + "grad_norm": 0.42645466327667236, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 4277 + }, + { + "epoch": 0.49700842288701713, + "grad_norm": 0.3970547616481781, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 4278 + }, + { + "epoch": 0.4971246006389776, + "grad_norm": 0.43710431456565857, + "learning_rate": 0.0001, + "loss": 1.8507, + "step": 4279 + }, + { + "epoch": 0.4972407783909381, + "grad_norm": 0.4387757182121277, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 4280 + }, + { + "epoch": 0.49735695614289865, + "grad_norm": 0.3968219757080078, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 4281 + }, + { + "epoch": 0.49747313389485914, + "grad_norm": 0.42177560925483704, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 4282 + }, + { + "epoch": 0.49758931164681963, + "grad_norm": 0.46521419286727905, + "learning_rate": 0.0001, + "loss": 1.8792, + "step": 4283 + }, + { + "epoch": 0.4977054893987801, + "grad_norm": 0.45769378542900085, + "learning_rate": 0.0001, + "loss": 1.9134, + "step": 4284 + }, + { + "epoch": 0.4978216671507406, + "grad_norm": 0.415170282125473, + "learning_rate": 0.0001, + "loss": 1.691, + "step": 4285 + }, + { + "epoch": 0.49793784490270115, + "grad_norm": 0.4076900780200958, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 4286 + }, + { + "epoch": 0.49805402265466164, + "grad_norm": 0.42825424671173096, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 4287 + }, + { + "epoch": 0.49817020040662213, + "grad_norm": 0.39369601011276245, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 4288 + }, + { + "epoch": 0.4982863781585826, + "grad_norm": 0.415863037109375, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 4289 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.4149499833583832, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 4290 + }, + { + "epoch": 0.49851873366250365, + "grad_norm": 0.4311138391494751, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 4291 + }, + { + "epoch": 0.49863491141446414, + "grad_norm": 0.4341351389884949, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 4292 + }, + { + "epoch": 0.49875108916642463, + "grad_norm": 0.43341371417045593, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 4293 + }, + { + "epoch": 0.4988672669183851, + "grad_norm": 0.4445037543773651, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 4294 + }, + { + "epoch": 0.4989834446703456, + "grad_norm": 0.4031905233860016, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 4295 + }, + { + "epoch": 0.49909962242230616, + "grad_norm": 0.3780546188354492, + "learning_rate": 0.0001, + "loss": 1.3956, + "step": 4296 + }, + { + "epoch": 0.49921580017426664, + "grad_norm": 0.4312121868133545, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 4297 + }, + { + "epoch": 0.49933197792622713, + "grad_norm": 0.42564424872398376, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 4298 + }, + { + "epoch": 0.4994481556781876, + "grad_norm": 0.41907092928886414, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 4299 + }, + { + "epoch": 0.4995643334301481, + "grad_norm": 0.44234657287597656, + "learning_rate": 0.0001, + "loss": 1.8374, + "step": 4300 + }, + { + "epoch": 0.4996805111821086, + "grad_norm": 0.43759170174598694, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 4301 + }, + { + "epoch": 0.49979668893406914, + "grad_norm": 0.4182339012622833, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 4302 + }, + { + "epoch": 0.49991286668602963, + "grad_norm": 0.4217366874217987, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 4303 + }, + { + "epoch": 0.5000290444379901, + "grad_norm": 0.4132644236087799, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 4304 + }, + { + "epoch": 0.5001452221899506, + "grad_norm": 0.4324856996536255, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 4305 + }, + { + "epoch": 0.5002613999419111, + "grad_norm": 0.4039803445339203, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 4306 + }, + { + "epoch": 0.5003775776938716, + "grad_norm": 0.4950650930404663, + "learning_rate": 0.0001, + "loss": 1.8132, + "step": 4307 + }, + { + "epoch": 0.5004937554458321, + "grad_norm": 0.4059164524078369, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 4308 + }, + { + "epoch": 0.5006099331977927, + "grad_norm": 0.4175871014595032, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 4309 + }, + { + "epoch": 0.5007261109497532, + "grad_norm": 0.40468019247055054, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 4310 + }, + { + "epoch": 0.5008422887017137, + "grad_norm": 0.45055514574050903, + "learning_rate": 0.0001, + "loss": 1.8287, + "step": 4311 + }, + { + "epoch": 0.5009584664536741, + "grad_norm": 0.4032573997974396, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 4312 + }, + { + "epoch": 0.5010746442056346, + "grad_norm": 0.4106045067310333, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 4313 + }, + { + "epoch": 0.5011908219575951, + "grad_norm": 0.4448246955871582, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 4314 + }, + { + "epoch": 0.5013069997095556, + "grad_norm": 0.45284712314605713, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 4315 + }, + { + "epoch": 0.5014231774615161, + "grad_norm": 0.4348433017730713, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 4316 + }, + { + "epoch": 0.5015393552134766, + "grad_norm": 0.4291326701641083, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 4317 + }, + { + "epoch": 0.5016555329654371, + "grad_norm": 0.42308685183525085, + "learning_rate": 0.0001, + "loss": 1.7336, + "step": 4318 + }, + { + "epoch": 0.5017717107173976, + "grad_norm": 0.4419673681259155, + "learning_rate": 0.0001, + "loss": 1.8071, + "step": 4319 + }, + { + "epoch": 0.5018878884693582, + "grad_norm": 0.4059154987335205, + "learning_rate": 0.0001, + "loss": 1.7697, + "step": 4320 + }, + { + "epoch": 0.5020040662213187, + "grad_norm": 0.44748127460479736, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 4321 + }, + { + "epoch": 0.5021202439732791, + "grad_norm": 0.4353194832801819, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 4322 + }, + { + "epoch": 0.5022364217252396, + "grad_norm": 0.43615394830703735, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 4323 + }, + { + "epoch": 0.5023525994772001, + "grad_norm": 0.4044416546821594, + "learning_rate": 0.0001, + "loss": 1.4778, + "step": 4324 + }, + { + "epoch": 0.5024687772291606, + "grad_norm": 0.4381449222564697, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 4325 + }, + { + "epoch": 0.5025849549811211, + "grad_norm": 0.41832980513572693, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 4326 + }, + { + "epoch": 0.5027011327330816, + "grad_norm": 0.41191017627716064, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 4327 + }, + { + "epoch": 0.5028173104850421, + "grad_norm": 0.4307502508163452, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 4328 + }, + { + "epoch": 0.5029334882370026, + "grad_norm": 0.39051318168640137, + "learning_rate": 0.0001, + "loss": 1.4484, + "step": 4329 + }, + { + "epoch": 0.5030496659889632, + "grad_norm": 0.4072178602218628, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 4330 + }, + { + "epoch": 0.5031658437409237, + "grad_norm": 0.4136503040790558, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 4331 + }, + { + "epoch": 0.5032820214928841, + "grad_norm": 0.4073225259780884, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 4332 + }, + { + "epoch": 0.5033981992448446, + "grad_norm": 0.45762255787849426, + "learning_rate": 0.0001, + "loss": 1.8642, + "step": 4333 + }, + { + "epoch": 0.5035143769968051, + "grad_norm": 0.3969933092594147, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 4334 + }, + { + "epoch": 0.5036305547487656, + "grad_norm": 0.4575958251953125, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 4335 + }, + { + "epoch": 0.5037467325007261, + "grad_norm": 0.4271569550037384, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 4336 + }, + { + "epoch": 0.5038629102526866, + "grad_norm": 0.41905540227890015, + "learning_rate": 0.0001, + "loss": 1.4688, + "step": 4337 + }, + { + "epoch": 0.5039790880046471, + "grad_norm": 0.4295593202114105, + "learning_rate": 0.0001, + "loss": 1.7429, + "step": 4338 + }, + { + "epoch": 0.5040952657566076, + "grad_norm": 0.4510742425918579, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 4339 + }, + { + "epoch": 0.5042114435085681, + "grad_norm": 0.38910943269729614, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 4340 + }, + { + "epoch": 0.5043276212605287, + "grad_norm": 0.43599119782447815, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 4341 + }, + { + "epoch": 0.5044437990124891, + "grad_norm": 0.4247783124446869, + "learning_rate": 0.0001, + "loss": 1.7167, + "step": 4342 + }, + { + "epoch": 0.5045599767644496, + "grad_norm": 0.41217729449272156, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 4343 + }, + { + "epoch": 0.5046761545164101, + "grad_norm": 0.4111458361148834, + "learning_rate": 0.0001, + "loss": 1.7107, + "step": 4344 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 0.44936275482177734, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 4345 + }, + { + "epoch": 0.5049085100203311, + "grad_norm": 0.39809176325798035, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 4346 + }, + { + "epoch": 0.5050246877722916, + "grad_norm": 0.45946526527404785, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 4347 + }, + { + "epoch": 0.5051408655242521, + "grad_norm": 0.4194871485233307, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 4348 + }, + { + "epoch": 0.5052570432762126, + "grad_norm": 0.4171569049358368, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 4349 + }, + { + "epoch": 0.5053732210281731, + "grad_norm": 0.4225428104400635, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 4350 + }, + { + "epoch": 0.5054893987801337, + "grad_norm": 0.410523921251297, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 4351 + }, + { + "epoch": 0.5056055765320941, + "grad_norm": 0.4116719663143158, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 4352 + }, + { + "epoch": 0.5057217542840546, + "grad_norm": 0.405364453792572, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 4353 + }, + { + "epoch": 0.5058379320360151, + "grad_norm": 0.38687193393707275, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 4354 + }, + { + "epoch": 0.5059541097879756, + "grad_norm": 0.4160784184932709, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 4355 + }, + { + "epoch": 0.5060702875399361, + "grad_norm": 0.4402058720588684, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 4356 + }, + { + "epoch": 0.5061864652918966, + "grad_norm": 0.4149058759212494, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 4357 + }, + { + "epoch": 0.5063026430438571, + "grad_norm": 0.39352482557296753, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 4358 + }, + { + "epoch": 0.5064188207958176, + "grad_norm": 0.3932999074459076, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 4359 + }, + { + "epoch": 0.5065349985477781, + "grad_norm": 0.41762053966522217, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 4360 + }, + { + "epoch": 0.5066511762997385, + "grad_norm": 0.39525556564331055, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 4361 + }, + { + "epoch": 0.5067673540516991, + "grad_norm": 0.41103050112724304, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 4362 + }, + { + "epoch": 0.5068835318036596, + "grad_norm": 0.4429750144481659, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 4363 + }, + { + "epoch": 0.5069997095556201, + "grad_norm": 0.42924562096595764, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 4364 + }, + { + "epoch": 0.5071158873075806, + "grad_norm": 0.4067068099975586, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 4365 + }, + { + "epoch": 0.5072320650595411, + "grad_norm": 0.42301809787750244, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 4366 + }, + { + "epoch": 0.5073482428115016, + "grad_norm": 0.4366927146911621, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 4367 + }, + { + "epoch": 0.5074644205634621, + "grad_norm": 0.48699912428855896, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 4368 + }, + { + "epoch": 0.5075805983154226, + "grad_norm": 0.4013606309890747, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 4369 + }, + { + "epoch": 0.5076967760673831, + "grad_norm": 0.42257198691368103, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 4370 + }, + { + "epoch": 0.5078129538193435, + "grad_norm": 0.41376587748527527, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 4371 + }, + { + "epoch": 0.5079291315713041, + "grad_norm": 0.42543458938598633, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 4372 + }, + { + "epoch": 0.5080453093232646, + "grad_norm": 0.40931135416030884, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 4373 + }, + { + "epoch": 0.5081614870752251, + "grad_norm": 0.3892346918582916, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 4374 + }, + { + "epoch": 0.5082776648271856, + "grad_norm": 0.3831053674221039, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 4375 + }, + { + "epoch": 0.5083938425791461, + "grad_norm": 0.3641822338104248, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 4376 + }, + { + "epoch": 0.5085100203311066, + "grad_norm": 0.40960368514060974, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 4377 + }, + { + "epoch": 0.5086261980830671, + "grad_norm": 0.42281374335289, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 4378 + }, + { + "epoch": 0.5087423758350276, + "grad_norm": 0.4280136227607727, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 4379 + }, + { + "epoch": 0.5088585535869881, + "grad_norm": 0.4118686020374298, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 4380 + }, + { + "epoch": 0.5089747313389485, + "grad_norm": 0.45106491446495056, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 4381 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 0.3965935707092285, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 4382 + }, + { + "epoch": 0.5092070868428696, + "grad_norm": 0.3852759003639221, + "learning_rate": 0.0001, + "loss": 1.3832, + "step": 4383 + }, + { + "epoch": 0.5093232645948301, + "grad_norm": 0.4219793677330017, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 4384 + }, + { + "epoch": 0.5094394423467906, + "grad_norm": 0.4372718632221222, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 4385 + }, + { + "epoch": 0.5095556200987511, + "grad_norm": 0.40098169445991516, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 4386 + }, + { + "epoch": 0.5096717978507116, + "grad_norm": 0.4264896810054779, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 4387 + }, + { + "epoch": 0.5097879756026721, + "grad_norm": 0.4068213105201721, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 4388 + }, + { + "epoch": 0.5099041533546326, + "grad_norm": 0.4088718593120575, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 4389 + }, + { + "epoch": 0.5100203311065931, + "grad_norm": 0.42276012897491455, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 4390 + }, + { + "epoch": 0.5101365088585536, + "grad_norm": 0.4298572540283203, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 4391 + }, + { + "epoch": 0.510252686610514, + "grad_norm": 0.4012244939804077, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 4392 + }, + { + "epoch": 0.5103688643624746, + "grad_norm": 0.41358691453933716, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 4393 + }, + { + "epoch": 0.5104850421144351, + "grad_norm": 0.41776931285858154, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 4394 + }, + { + "epoch": 0.5106012198663956, + "grad_norm": 0.4451960325241089, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 4395 + }, + { + "epoch": 0.5107173976183561, + "grad_norm": 0.43334871530532837, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 4396 + }, + { + "epoch": 0.5108335753703166, + "grad_norm": 0.3872377276420593, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 4397 + }, + { + "epoch": 0.5109497531222771, + "grad_norm": 0.3914475440979004, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 4398 + }, + { + "epoch": 0.5110659308742376, + "grad_norm": 0.447561115026474, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 4399 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.45441824197769165, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 4400 + }, + { + "epoch": 0.5112982863781586, + "grad_norm": 0.3847355544567108, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 4401 + }, + { + "epoch": 0.511414464130119, + "grad_norm": 0.41067802906036377, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 4402 + }, + { + "epoch": 0.5115306418820795, + "grad_norm": 0.4514882564544678, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 4403 + }, + { + "epoch": 0.5116468196340401, + "grad_norm": 0.4275973439216614, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 4404 + }, + { + "epoch": 0.5117629973860006, + "grad_norm": 0.41547560691833496, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 4405 + }, + { + "epoch": 0.5118791751379611, + "grad_norm": 0.40632376074790955, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 4406 + }, + { + "epoch": 0.5119953528899216, + "grad_norm": 0.4488579034805298, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 4407 + }, + { + "epoch": 0.5121115306418821, + "grad_norm": 0.42902886867523193, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 4408 + }, + { + "epoch": 0.5122277083938426, + "grad_norm": 0.42067989706993103, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 4409 + }, + { + "epoch": 0.5123438861458031, + "grad_norm": 0.4122752845287323, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 4410 + }, + { + "epoch": 0.5124600638977636, + "grad_norm": 0.443730890750885, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 4411 + }, + { + "epoch": 0.512576241649724, + "grad_norm": 0.43061313033103943, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 4412 + }, + { + "epoch": 0.5126924194016845, + "grad_norm": 0.42542704939842224, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 4413 + }, + { + "epoch": 0.5128085971536451, + "grad_norm": 0.41993504762649536, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 4414 + }, + { + "epoch": 0.5129247749056056, + "grad_norm": 0.43750661611557007, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 4415 + }, + { + "epoch": 0.5130409526575661, + "grad_norm": 0.4423817992210388, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 4416 + }, + { + "epoch": 0.5131571304095266, + "grad_norm": 0.44078147411346436, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 4417 + }, + { + "epoch": 0.5132733081614871, + "grad_norm": 0.41512519121170044, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 4418 + }, + { + "epoch": 0.5133894859134476, + "grad_norm": 0.4335169792175293, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 4419 + }, + { + "epoch": 0.5135056636654081, + "grad_norm": 0.39206641912460327, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 4420 + }, + { + "epoch": 0.5136218414173686, + "grad_norm": 0.4225231111049652, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 4421 + }, + { + "epoch": 0.513738019169329, + "grad_norm": 0.4060937762260437, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 4422 + }, + { + "epoch": 0.5138541969212895, + "grad_norm": 0.4321762025356293, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 4423 + }, + { + "epoch": 0.51397037467325, + "grad_norm": 0.4687374234199524, + "learning_rate": 0.0001, + "loss": 1.7355, + "step": 4424 + }, + { + "epoch": 0.5140865524252106, + "grad_norm": 0.4362178146839142, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 4425 + }, + { + "epoch": 0.5142027301771711, + "grad_norm": 0.42870602011680603, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 4426 + }, + { + "epoch": 0.5143189079291316, + "grad_norm": 0.4190438985824585, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 4427 + }, + { + "epoch": 0.5144350856810921, + "grad_norm": 0.43848124146461487, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 4428 + }, + { + "epoch": 0.5145512634330526, + "grad_norm": 4.445634365081787, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 4429 + }, + { + "epoch": 0.5146674411850131, + "grad_norm": 0.43199974298477173, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 4430 + }, + { + "epoch": 0.5147836189369736, + "grad_norm": 0.39670631289482117, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 4431 + }, + { + "epoch": 0.514899796688934, + "grad_norm": 0.4087604880332947, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 4432 + }, + { + "epoch": 0.5150159744408945, + "grad_norm": 0.4322669804096222, + "learning_rate": 0.0001, + "loss": 1.7526, + "step": 4433 + }, + { + "epoch": 0.515132152192855, + "grad_norm": 0.40673530101776123, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 4434 + }, + { + "epoch": 0.5152483299448156, + "grad_norm": 0.4231918454170227, + "learning_rate": 0.0001, + "loss": 1.7449, + "step": 4435 + }, + { + "epoch": 0.5153645076967761, + "grad_norm": 0.42139682173728943, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 4436 + }, + { + "epoch": 0.5154806854487366, + "grad_norm": 0.4375002384185791, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 4437 + }, + { + "epoch": 0.5155968632006971, + "grad_norm": 0.41837188601493835, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 4438 + }, + { + "epoch": 0.5157130409526576, + "grad_norm": 0.36996695399284363, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 4439 + }, + { + "epoch": 0.5158292187046181, + "grad_norm": 0.428469181060791, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 4440 + }, + { + "epoch": 0.5159453964565786, + "grad_norm": 0.3917810320854187, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 4441 + }, + { + "epoch": 0.516061574208539, + "grad_norm": 0.41308334469795227, + "learning_rate": 0.0001, + "loss": 1.845, + "step": 4442 + }, + { + "epoch": 0.5161777519604995, + "grad_norm": 0.41742420196533203, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 4443 + }, + { + "epoch": 0.51629392971246, + "grad_norm": 0.42206430435180664, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 4444 + }, + { + "epoch": 0.5164101074644205, + "grad_norm": 0.41111844778060913, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 4445 + }, + { + "epoch": 0.5165262852163811, + "grad_norm": 0.42969468235969543, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 4446 + }, + { + "epoch": 0.5166424629683416, + "grad_norm": 0.4495762288570404, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 4447 + }, + { + "epoch": 0.5167586407203021, + "grad_norm": 0.41785508394241333, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 4448 + }, + { + "epoch": 0.5168748184722626, + "grad_norm": 0.4101484715938568, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 4449 + }, + { + "epoch": 0.5169909962242231, + "grad_norm": 0.422985315322876, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 4450 + }, + { + "epoch": 0.5171071739761836, + "grad_norm": 0.45259571075439453, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 4451 + }, + { + "epoch": 0.517223351728144, + "grad_norm": 0.3940257728099823, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 4452 + }, + { + "epoch": 0.5173395294801045, + "grad_norm": 0.41753649711608887, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 4453 + }, + { + "epoch": 0.517455707232065, + "grad_norm": 0.4366135895252228, + "learning_rate": 0.0001, + "loss": 1.7993, + "step": 4454 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 0.43286582827568054, + "learning_rate": 0.0001, + "loss": 1.7569, + "step": 4455 + }, + { + "epoch": 0.5176880627359861, + "grad_norm": 0.4226556420326233, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 4456 + }, + { + "epoch": 0.5178042404879466, + "grad_norm": 0.42732787132263184, + "learning_rate": 0.0001, + "loss": 1.7325, + "step": 4457 + }, + { + "epoch": 0.5179204182399071, + "grad_norm": 0.4511219561100006, + "learning_rate": 0.0001, + "loss": 1.6626, + "step": 4458 + }, + { + "epoch": 0.5180365959918676, + "grad_norm": 0.418562114238739, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 4459 + }, + { + "epoch": 0.5181527737438281, + "grad_norm": 0.4227406978607178, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 4460 + }, + { + "epoch": 0.5182689514957886, + "grad_norm": 0.4748642146587372, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 4461 + }, + { + "epoch": 0.518385129247749, + "grad_norm": 0.4098576605319977, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 4462 + }, + { + "epoch": 0.5185013069997095, + "grad_norm": 0.3687531054019928, + "learning_rate": 0.0001, + "loss": 1.4375, + "step": 4463 + }, + { + "epoch": 0.51861748475167, + "grad_norm": 0.43809184432029724, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 4464 + }, + { + "epoch": 0.5187336625036305, + "grad_norm": 0.4355543851852417, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 4465 + }, + { + "epoch": 0.518849840255591, + "grad_norm": 0.40263766050338745, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 4466 + }, + { + "epoch": 0.5189660180075516, + "grad_norm": 0.40483129024505615, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 4467 + }, + { + "epoch": 0.5190821957595121, + "grad_norm": 0.4438629150390625, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 4468 + }, + { + "epoch": 0.5191983735114726, + "grad_norm": 0.417834997177124, + "learning_rate": 0.0001, + "loss": 1.5252, + "step": 4469 + }, + { + "epoch": 0.5193145512634331, + "grad_norm": 0.4064665138721466, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 4470 + }, + { + "epoch": 0.5194307290153936, + "grad_norm": 0.4300839900970459, + "learning_rate": 0.0001, + "loss": 1.7744, + "step": 4471 + }, + { + "epoch": 0.519546906767354, + "grad_norm": 0.43153315782546997, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 4472 + }, + { + "epoch": 0.5196630845193145, + "grad_norm": 0.40210986137390137, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 4473 + }, + { + "epoch": 0.519779262271275, + "grad_norm": 0.4261165261268616, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 4474 + }, + { + "epoch": 0.5198954400232355, + "grad_norm": 0.40189406275749207, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 4475 + }, + { + "epoch": 0.520011617775196, + "grad_norm": 0.40366873145103455, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 4476 + }, + { + "epoch": 0.5201277955271566, + "grad_norm": 0.39482223987579346, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 4477 + }, + { + "epoch": 0.5202439732791171, + "grad_norm": 0.4213715195655823, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 4478 + }, + { + "epoch": 0.5203601510310776, + "grad_norm": 0.412153959274292, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 4479 + }, + { + "epoch": 0.5204763287830381, + "grad_norm": 0.4511539041996002, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 4480 + }, + { + "epoch": 0.5205925065349986, + "grad_norm": 0.4023229777812958, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 4481 + }, + { + "epoch": 0.520708684286959, + "grad_norm": 0.410552054643631, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 4482 + }, + { + "epoch": 0.5208248620389195, + "grad_norm": 0.4153655767440796, + "learning_rate": 0.0001, + "loss": 1.7299, + "step": 4483 + }, + { + "epoch": 0.52094103979088, + "grad_norm": 0.4262380599975586, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 4484 + }, + { + "epoch": 0.5210572175428405, + "grad_norm": 0.38934075832366943, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 4485 + }, + { + "epoch": 0.521173395294801, + "grad_norm": 0.417797327041626, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 4486 + }, + { + "epoch": 0.5212895730467616, + "grad_norm": 0.45602330565452576, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 4487 + }, + { + "epoch": 0.5214057507987221, + "grad_norm": 0.3888321816921234, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 4488 + }, + { + "epoch": 0.5215219285506826, + "grad_norm": 0.4114174246788025, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 4489 + }, + { + "epoch": 0.5216381063026431, + "grad_norm": 0.43008658289909363, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 4490 + }, + { + "epoch": 0.5217542840546036, + "grad_norm": 0.40249064564704895, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 4491 + }, + { + "epoch": 0.521870461806564, + "grad_norm": 0.39665487408638, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 4492 + }, + { + "epoch": 0.5219866395585245, + "grad_norm": 0.46716028451919556, + "learning_rate": 0.0001, + "loss": 1.8122, + "step": 4493 + }, + { + "epoch": 0.522102817310485, + "grad_norm": 0.4550071954727173, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 4494 + }, + { + "epoch": 0.5222189950624455, + "grad_norm": 0.40407729148864746, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 4495 + }, + { + "epoch": 0.522335172814406, + "grad_norm": 0.4077194035053253, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 4496 + }, + { + "epoch": 0.5224513505663665, + "grad_norm": 0.4374626576900482, + "learning_rate": 0.0001, + "loss": 1.8369, + "step": 4497 + }, + { + "epoch": 0.5225675283183271, + "grad_norm": 0.44101259112358093, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 4498 + }, + { + "epoch": 0.5226837060702876, + "grad_norm": 0.45485249161720276, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 4499 + }, + { + "epoch": 0.5227998838222481, + "grad_norm": 0.4245244860649109, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 4500 + }, + { + "epoch": 0.5229160615742086, + "grad_norm": 0.4580254554748535, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 4501 + }, + { + "epoch": 0.523032239326169, + "grad_norm": 0.4223475754261017, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 4502 + }, + { + "epoch": 0.5231484170781295, + "grad_norm": 0.42708471417427063, + "learning_rate": 0.0001, + "loss": 1.8193, + "step": 4503 + }, + { + "epoch": 0.52326459483009, + "grad_norm": 0.42466166615486145, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 4504 + }, + { + "epoch": 0.5233807725820505, + "grad_norm": 0.4411070942878723, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 4505 + }, + { + "epoch": 0.523496950334011, + "grad_norm": 0.4360845983028412, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 4506 + }, + { + "epoch": 0.5236131280859715, + "grad_norm": 0.4071287512779236, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 4507 + }, + { + "epoch": 0.5237293058379321, + "grad_norm": 0.4129199981689453, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 4508 + }, + { + "epoch": 0.5238454835898926, + "grad_norm": 0.40559443831443787, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 4509 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.4351811707019806, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 4510 + }, + { + "epoch": 0.5240778390938136, + "grad_norm": 0.4436653256416321, + "learning_rate": 0.0001, + "loss": 1.8253, + "step": 4511 + }, + { + "epoch": 0.524194016845774, + "grad_norm": 0.4075913727283478, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 4512 + }, + { + "epoch": 0.5243101945977345, + "grad_norm": 0.4155612885951996, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 4513 + }, + { + "epoch": 0.524426372349695, + "grad_norm": 0.4131620526313782, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 4514 + }, + { + "epoch": 0.5245425501016555, + "grad_norm": 0.4026408791542053, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 4515 + }, + { + "epoch": 0.524658727853616, + "grad_norm": 0.42349475622177124, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 4516 + }, + { + "epoch": 0.5247749056055765, + "grad_norm": 0.4265604615211487, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 4517 + }, + { + "epoch": 0.524891083357537, + "grad_norm": 0.4295273423194885, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 4518 + }, + { + "epoch": 0.5250072611094976, + "grad_norm": 0.42609089612960815, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 4519 + }, + { + "epoch": 0.5251234388614581, + "grad_norm": 0.42496544122695923, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 4520 + }, + { + "epoch": 0.5252396166134186, + "grad_norm": 0.4378863573074341, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 4521 + }, + { + "epoch": 0.525355794365379, + "grad_norm": 0.43438613414764404, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 4522 + }, + { + "epoch": 0.5254719721173395, + "grad_norm": 0.4248746931552887, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 4523 + }, + { + "epoch": 0.5255881498693, + "grad_norm": 0.43905171751976013, + "learning_rate": 0.0001, + "loss": 1.7656, + "step": 4524 + }, + { + "epoch": 0.5257043276212605, + "grad_norm": 0.4113710820674896, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 4525 + }, + { + "epoch": 0.525820505373221, + "grad_norm": 0.4203665256500244, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 4526 + }, + { + "epoch": 0.5259366831251815, + "grad_norm": 0.4476137161254883, + "learning_rate": 0.0001, + "loss": 1.7966, + "step": 4527 + }, + { + "epoch": 0.526052860877142, + "grad_norm": 0.400177925825119, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 4528 + }, + { + "epoch": 0.5261690386291026, + "grad_norm": 0.4333209991455078, + "learning_rate": 0.0001, + "loss": 1.8233, + "step": 4529 + }, + { + "epoch": 0.5262852163810631, + "grad_norm": 0.43465256690979004, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 4530 + }, + { + "epoch": 0.5264013941330236, + "grad_norm": 0.4152390658855438, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 4531 + }, + { + "epoch": 0.526517571884984, + "grad_norm": 0.42579424381256104, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 4532 + }, + { + "epoch": 0.5266337496369445, + "grad_norm": 0.4379642903804779, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 4533 + }, + { + "epoch": 0.526749927388905, + "grad_norm": 0.4103189706802368, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 4534 + }, + { + "epoch": 0.5268661051408655, + "grad_norm": 0.45309609174728394, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 4535 + }, + { + "epoch": 0.526982282892826, + "grad_norm": 0.42796722054481506, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 4536 + }, + { + "epoch": 0.5270984606447865, + "grad_norm": 0.41641005873680115, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 4537 + }, + { + "epoch": 0.527214638396747, + "grad_norm": 0.4048060178756714, + "learning_rate": 0.0001, + "loss": 1.5817, + "step": 4538 + }, + { + "epoch": 0.5273308161487075, + "grad_norm": 0.41168227791786194, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 4539 + }, + { + "epoch": 0.5274469939006681, + "grad_norm": 0.45252159237861633, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 4540 + }, + { + "epoch": 0.5275631716526286, + "grad_norm": 0.4120829999446869, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 4541 + }, + { + "epoch": 0.527679349404589, + "grad_norm": 0.44721588492393494, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 4542 + }, + { + "epoch": 0.5277955271565495, + "grad_norm": 0.43271827697753906, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 4543 + }, + { + "epoch": 0.52791170490851, + "grad_norm": 0.42740026116371155, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 4544 + }, + { + "epoch": 0.5280278826604705, + "grad_norm": 0.3889728784561157, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 4545 + }, + { + "epoch": 0.528144060412431, + "grad_norm": 0.4437465965747833, + "learning_rate": 0.0001, + "loss": 1.6956, + "step": 4546 + }, + { + "epoch": 0.5282602381643915, + "grad_norm": 0.39472874999046326, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 4547 + }, + { + "epoch": 0.528376415916352, + "grad_norm": 0.4104737937450409, + "learning_rate": 0.0001, + "loss": 1.8144, + "step": 4548 + }, + { + "epoch": 0.5284925936683125, + "grad_norm": 0.4385277330875397, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 4549 + }, + { + "epoch": 0.5286087714202731, + "grad_norm": 0.3903822600841522, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 4550 + }, + { + "epoch": 0.5287249491722336, + "grad_norm": 0.4265904128551483, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 4551 + }, + { + "epoch": 0.528841126924194, + "grad_norm": 0.4125080704689026, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 4552 + }, + { + "epoch": 0.5289573046761545, + "grad_norm": 0.42072439193725586, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 4553 + }, + { + "epoch": 0.529073482428115, + "grad_norm": 0.41168302297592163, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 4554 + }, + { + "epoch": 0.5291896601800755, + "grad_norm": 0.39374563097953796, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 4555 + }, + { + "epoch": 0.529305837932036, + "grad_norm": 0.43736445903778076, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 4556 + }, + { + "epoch": 0.5294220156839965, + "grad_norm": 0.456046462059021, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 4557 + }, + { + "epoch": 0.529538193435957, + "grad_norm": 0.407843679189682, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 4558 + }, + { + "epoch": 0.5296543711879175, + "grad_norm": 0.4293687045574188, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 4559 + }, + { + "epoch": 0.529770548939878, + "grad_norm": 0.42823317646980286, + "learning_rate": 0.0001, + "loss": 1.8056, + "step": 4560 + }, + { + "epoch": 0.5298867266918386, + "grad_norm": 0.41989070177078247, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 4561 + }, + { + "epoch": 0.530002904443799, + "grad_norm": 0.41696497797966003, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 4562 + }, + { + "epoch": 0.5301190821957595, + "grad_norm": 0.3901718854904175, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 4563 + }, + { + "epoch": 0.53023525994772, + "grad_norm": 0.42461147904396057, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 4564 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 0.4265443682670593, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 4565 + }, + { + "epoch": 0.530467615451641, + "grad_norm": 0.4418391287326813, + "learning_rate": 0.0001, + "loss": 1.7592, + "step": 4566 + }, + { + "epoch": 0.5305837932036015, + "grad_norm": 0.4429027736186981, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 4567 + }, + { + "epoch": 0.530699970955562, + "grad_norm": 0.4483877420425415, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 4568 + }, + { + "epoch": 0.5308161487075225, + "grad_norm": 0.4467184245586395, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 4569 + }, + { + "epoch": 0.530932326459483, + "grad_norm": 0.4503554403781891, + "learning_rate": 0.0001, + "loss": 1.8263, + "step": 4570 + }, + { + "epoch": 0.5310485042114436, + "grad_norm": 0.40501412749290466, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 4571 + }, + { + "epoch": 0.531164681963404, + "grad_norm": 0.41153189539909363, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 4572 + }, + { + "epoch": 0.5312808597153645, + "grad_norm": 0.41947031021118164, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 4573 + }, + { + "epoch": 0.531397037467325, + "grad_norm": 0.4129175841808319, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 4574 + }, + { + "epoch": 0.5315132152192855, + "grad_norm": 0.44129297137260437, + "learning_rate": 0.0001, + "loss": 1.693, + "step": 4575 + }, + { + "epoch": 0.531629392971246, + "grad_norm": 0.41922008991241455, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 4576 + }, + { + "epoch": 0.5317455707232065, + "grad_norm": 0.41607466340065, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 4577 + }, + { + "epoch": 0.531861748475167, + "grad_norm": 0.42458733916282654, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 4578 + }, + { + "epoch": 0.5319779262271275, + "grad_norm": 0.43366140127182007, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 4579 + }, + { + "epoch": 0.532094103979088, + "grad_norm": 0.4278888702392578, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 4580 + }, + { + "epoch": 0.5322102817310485, + "grad_norm": 0.4110510051250458, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 4581 + }, + { + "epoch": 0.532326459483009, + "grad_norm": 0.4136632978916168, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 4582 + }, + { + "epoch": 0.5324426372349695, + "grad_norm": 0.3931570053100586, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 4583 + }, + { + "epoch": 0.53255881498693, + "grad_norm": 0.41531267762184143, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 4584 + }, + { + "epoch": 0.5326749927388905, + "grad_norm": 0.4423351585865021, + "learning_rate": 0.0001, + "loss": 1.847, + "step": 4585 + }, + { + "epoch": 0.532791170490851, + "grad_norm": 0.401030570268631, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 4586 + }, + { + "epoch": 0.5329073482428115, + "grad_norm": 0.42098504304885864, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 4587 + }, + { + "epoch": 0.533023525994772, + "grad_norm": 0.4085943102836609, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 4588 + }, + { + "epoch": 0.5331397037467325, + "grad_norm": 0.39442306756973267, + "learning_rate": 0.0001, + "loss": 1.4053, + "step": 4589 + }, + { + "epoch": 0.533255881498693, + "grad_norm": 0.4020472466945648, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 4590 + }, + { + "epoch": 0.5333720592506535, + "grad_norm": 0.41288527846336365, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 4591 + }, + { + "epoch": 0.533488237002614, + "grad_norm": 0.4358307719230652, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 4592 + }, + { + "epoch": 0.5336044147545745, + "grad_norm": 0.4035166800022125, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 4593 + }, + { + "epoch": 0.533720592506535, + "grad_norm": 0.420097291469574, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 4594 + }, + { + "epoch": 0.5338367702584955, + "grad_norm": 0.47331398725509644, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 4595 + }, + { + "epoch": 0.533952948010456, + "grad_norm": 0.42985647916793823, + "learning_rate": 0.0001, + "loss": 1.7268, + "step": 4596 + }, + { + "epoch": 0.5340691257624165, + "grad_norm": 0.3984420895576477, + "learning_rate": 0.0001, + "loss": 1.4863, + "step": 4597 + }, + { + "epoch": 0.534185303514377, + "grad_norm": 0.41712093353271484, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 4598 + }, + { + "epoch": 0.5343014812663375, + "grad_norm": 0.42095884680747986, + "learning_rate": 0.0001, + "loss": 1.7377, + "step": 4599 + }, + { + "epoch": 0.534417659018298, + "grad_norm": 0.4140506386756897, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 4600 + }, + { + "epoch": 0.5345338367702585, + "grad_norm": 0.39841312170028687, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 4601 + }, + { + "epoch": 0.5346500145222189, + "grad_norm": 0.46106958389282227, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 4602 + }, + { + "epoch": 0.5347661922741795, + "grad_norm": 0.4085831344127655, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 4603 + }, + { + "epoch": 0.53488237002614, + "grad_norm": 0.49497655034065247, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 4604 + }, + { + "epoch": 0.5349985477781005, + "grad_norm": 0.43607422709465027, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 4605 + }, + { + "epoch": 0.535114725530061, + "grad_norm": 0.4607904255390167, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 4606 + }, + { + "epoch": 0.5352309032820215, + "grad_norm": 0.4455506205558777, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 4607 + }, + { + "epoch": 0.535347081033982, + "grad_norm": 0.39066752791404724, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 4608 + }, + { + "epoch": 0.5354632587859425, + "grad_norm": 0.4268709421157837, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 4609 + }, + { + "epoch": 0.535579436537903, + "grad_norm": 0.40803661942481995, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 4610 + }, + { + "epoch": 0.5356956142898635, + "grad_norm": 0.4266864061355591, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 4611 + }, + { + "epoch": 0.5358117920418239, + "grad_norm": 0.4282558262348175, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 4612 + }, + { + "epoch": 0.5359279697937845, + "grad_norm": 0.4355599880218506, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 4613 + }, + { + "epoch": 0.536044147545745, + "grad_norm": 0.41130515933036804, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 4614 + }, + { + "epoch": 0.5361603252977055, + "grad_norm": 0.4203796982765198, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 4615 + }, + { + "epoch": 0.536276503049666, + "grad_norm": 0.4113680124282837, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 4616 + }, + { + "epoch": 0.5363926808016265, + "grad_norm": 0.4167897701263428, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 4617 + }, + { + "epoch": 0.536508858553587, + "grad_norm": 0.39767855405807495, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 4618 + }, + { + "epoch": 0.5366250363055475, + "grad_norm": 0.42861250042915344, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 4619 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.4207315742969513, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 4620 + }, + { + "epoch": 0.5368573918094685, + "grad_norm": 0.41089701652526855, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 4621 + }, + { + "epoch": 0.5369735695614289, + "grad_norm": 0.42728808522224426, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 4622 + }, + { + "epoch": 0.5370897473133894, + "grad_norm": 0.4472593069076538, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 4623 + }, + { + "epoch": 0.53720592506535, + "grad_norm": 0.41069328784942627, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 4624 + }, + { + "epoch": 0.5373221028173105, + "grad_norm": 0.4025200605392456, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 4625 + }, + { + "epoch": 0.537438280569271, + "grad_norm": 0.4271898567676544, + "learning_rate": 0.0001, + "loss": 1.8612, + "step": 4626 + }, + { + "epoch": 0.5375544583212315, + "grad_norm": 0.40859630703926086, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 4627 + }, + { + "epoch": 0.537670636073192, + "grad_norm": 0.390266090631485, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 4628 + }, + { + "epoch": 0.5377868138251525, + "grad_norm": 0.44839224219322205, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 4629 + }, + { + "epoch": 0.537902991577113, + "grad_norm": 0.4331499934196472, + "learning_rate": 0.0001, + "loss": 1.748, + "step": 4630 + }, + { + "epoch": 0.5380191693290735, + "grad_norm": 0.41848304867744446, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 4631 + }, + { + "epoch": 0.5381353470810339, + "grad_norm": 0.4118647873401642, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 4632 + }, + { + "epoch": 0.5382515248329944, + "grad_norm": 0.4422479569911957, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 4633 + }, + { + "epoch": 0.538367702584955, + "grad_norm": 0.4162086844444275, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 4634 + }, + { + "epoch": 0.5384838803369155, + "grad_norm": 0.4036068618297577, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 4635 + }, + { + "epoch": 0.538600058088876, + "grad_norm": 0.4048601984977722, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 4636 + }, + { + "epoch": 0.5387162358408365, + "grad_norm": 0.4456162750720978, + "learning_rate": 0.0001, + "loss": 1.7916, + "step": 4637 + }, + { + "epoch": 0.538832413592797, + "grad_norm": 0.44490668177604675, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 4638 + }, + { + "epoch": 0.5389485913447575, + "grad_norm": 0.42898234724998474, + "learning_rate": 0.0001, + "loss": 1.6908, + "step": 4639 + }, + { + "epoch": 0.539064769096718, + "grad_norm": 0.41526684165000916, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 4640 + }, + { + "epoch": 0.5391809468486785, + "grad_norm": 0.391984224319458, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 4641 + }, + { + "epoch": 0.5392971246006389, + "grad_norm": 0.4175387918949127, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 4642 + }, + { + "epoch": 0.5394133023525994, + "grad_norm": 0.4540769159793854, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 4643 + }, + { + "epoch": 0.5395294801045599, + "grad_norm": 0.43631550669670105, + "learning_rate": 0.0001, + "loss": 1.578, + "step": 4644 + }, + { + "epoch": 0.5396456578565205, + "grad_norm": 0.459330290555954, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 4645 + }, + { + "epoch": 0.539761835608481, + "grad_norm": 0.41860130429267883, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 4646 + }, + { + "epoch": 0.5398780133604415, + "grad_norm": 0.5068102478981018, + "learning_rate": 0.0001, + "loss": 1.8484, + "step": 4647 + }, + { + "epoch": 0.539994191112402, + "grad_norm": 0.4381267726421356, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 4648 + }, + { + "epoch": 0.5401103688643625, + "grad_norm": 0.426881343126297, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 4649 + }, + { + "epoch": 0.540226546616323, + "grad_norm": 0.44243374466896057, + "learning_rate": 0.0001, + "loss": 1.8272, + "step": 4650 + }, + { + "epoch": 0.5403427243682835, + "grad_norm": 0.4320388734340668, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 4651 + }, + { + "epoch": 0.540458902120244, + "grad_norm": 0.418743759393692, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 4652 + }, + { + "epoch": 0.5405750798722044, + "grad_norm": 0.4069058299064636, + "learning_rate": 0.0001, + "loss": 1.7696, + "step": 4653 + }, + { + "epoch": 0.5406912576241649, + "grad_norm": 0.4170093536376953, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 4654 + }, + { + "epoch": 0.5408074353761255, + "grad_norm": 0.4032455384731293, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 4655 + }, + { + "epoch": 0.540923613128086, + "grad_norm": 0.3982067108154297, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 4656 + }, + { + "epoch": 0.5410397908800465, + "grad_norm": 0.4389408528804779, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 4657 + }, + { + "epoch": 0.541155968632007, + "grad_norm": 0.41216394305229187, + "learning_rate": 0.0001, + "loss": 1.724, + "step": 4658 + }, + { + "epoch": 0.5412721463839675, + "grad_norm": 0.4423290491104126, + "learning_rate": 0.0001, + "loss": 1.8955, + "step": 4659 + }, + { + "epoch": 0.541388324135928, + "grad_norm": 0.45288944244384766, + "learning_rate": 0.0001, + "loss": 1.5609, + "step": 4660 + }, + { + "epoch": 0.5415045018878885, + "grad_norm": 0.43445900082588196, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 4661 + }, + { + "epoch": 0.541620679639849, + "grad_norm": 0.4164069890975952, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 4662 + }, + { + "epoch": 0.5417368573918094, + "grad_norm": 0.4147018492221832, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 4663 + }, + { + "epoch": 0.5418530351437699, + "grad_norm": 0.4525691568851471, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 4664 + }, + { + "epoch": 0.5419692128957304, + "grad_norm": 0.4286227822303772, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 4665 + }, + { + "epoch": 0.542085390647691, + "grad_norm": 0.4411110281944275, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 4666 + }, + { + "epoch": 0.5422015683996515, + "grad_norm": 0.3831879794597626, + "learning_rate": 0.0001, + "loss": 1.3289, + "step": 4667 + }, + { + "epoch": 0.542317746151612, + "grad_norm": 0.42957553267478943, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 4668 + }, + { + "epoch": 0.5424339239035725, + "grad_norm": 0.43326959013938904, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 4669 + }, + { + "epoch": 0.542550101655533, + "grad_norm": 0.42409175634384155, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 4670 + }, + { + "epoch": 0.5426662794074935, + "grad_norm": 0.42008155584335327, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 4671 + }, + { + "epoch": 0.542782457159454, + "grad_norm": 0.41546356678009033, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 4672 + }, + { + "epoch": 0.5428986349114144, + "grad_norm": 0.41974398493766785, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 4673 + }, + { + "epoch": 0.5430148126633749, + "grad_norm": 0.39772695302963257, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 4674 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 0.43353089690208435, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 4675 + }, + { + "epoch": 0.543247168167296, + "grad_norm": 0.4182477295398712, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 4676 + }, + { + "epoch": 0.5433633459192565, + "grad_norm": 0.4173682928085327, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 4677 + }, + { + "epoch": 0.543479523671217, + "grad_norm": 0.44657060503959656, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 4678 + }, + { + "epoch": 0.5435957014231775, + "grad_norm": 0.43546125292778015, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 4679 + }, + { + "epoch": 0.543711879175138, + "grad_norm": 0.45045846700668335, + "learning_rate": 0.0001, + "loss": 1.6601, + "step": 4680 + }, + { + "epoch": 0.5438280569270985, + "grad_norm": 0.4383883774280548, + "learning_rate": 0.0001, + "loss": 1.5701, + "step": 4681 + }, + { + "epoch": 0.543944234679059, + "grad_norm": 0.40852198004722595, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 4682 + }, + { + "epoch": 0.5440604124310194, + "grad_norm": 0.4357270896434784, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 4683 + }, + { + "epoch": 0.5441765901829799, + "grad_norm": 0.4512038826942444, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 4684 + }, + { + "epoch": 0.5442927679349404, + "grad_norm": 0.41331276297569275, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 4685 + }, + { + "epoch": 0.544408945686901, + "grad_norm": 0.4491327702999115, + "learning_rate": 0.0001, + "loss": 1.7954, + "step": 4686 + }, + { + "epoch": 0.5445251234388615, + "grad_norm": 0.4428486227989197, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 4687 + }, + { + "epoch": 0.544641301190822, + "grad_norm": 0.416236937046051, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 4688 + }, + { + "epoch": 0.5447574789427825, + "grad_norm": 0.40862324833869934, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 4689 + }, + { + "epoch": 0.544873656694743, + "grad_norm": 0.43137800693511963, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 4690 + }, + { + "epoch": 0.5449898344467035, + "grad_norm": 0.4132535755634308, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 4691 + }, + { + "epoch": 0.545106012198664, + "grad_norm": 0.5874444842338562, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 4692 + }, + { + "epoch": 0.5452221899506244, + "grad_norm": 0.379760205745697, + "learning_rate": 0.0001, + "loss": 1.3516, + "step": 4693 + }, + { + "epoch": 0.5453383677025849, + "grad_norm": 0.4385640025138855, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 4694 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.4262996315956116, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 4695 + }, + { + "epoch": 0.5455707232065059, + "grad_norm": 0.4239009916782379, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 4696 + }, + { + "epoch": 0.5456869009584665, + "grad_norm": 0.40220028162002563, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 4697 + }, + { + "epoch": 0.545803078710427, + "grad_norm": 0.3948020339012146, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 4698 + }, + { + "epoch": 0.5459192564623875, + "grad_norm": 0.3882817029953003, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 4699 + }, + { + "epoch": 0.546035434214348, + "grad_norm": 0.4421759843826294, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 4700 + }, + { + "epoch": 0.5461516119663085, + "grad_norm": 0.4043938219547272, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 4701 + }, + { + "epoch": 0.546267789718269, + "grad_norm": 0.42511290311813354, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 4702 + }, + { + "epoch": 0.5463839674702294, + "grad_norm": 0.4274735748767853, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 4703 + }, + { + "epoch": 0.5465001452221899, + "grad_norm": 0.4572598934173584, + "learning_rate": 0.0001, + "loss": 1.754, + "step": 4704 + }, + { + "epoch": 0.5466163229741504, + "grad_norm": 0.41152650117874146, + "learning_rate": 0.0001, + "loss": 1.3119, + "step": 4705 + }, + { + "epoch": 0.5467325007261109, + "grad_norm": 0.4448837637901306, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 4706 + }, + { + "epoch": 0.5468486784780715, + "grad_norm": 0.4235488176345825, + "learning_rate": 0.0001, + "loss": 1.7473, + "step": 4707 + }, + { + "epoch": 0.546964856230032, + "grad_norm": 0.4105146527290344, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 4708 + }, + { + "epoch": 0.5470810339819925, + "grad_norm": 0.4490967094898224, + "learning_rate": 0.0001, + "loss": 1.7984, + "step": 4709 + }, + { + "epoch": 0.547197211733953, + "grad_norm": 0.4733203947544098, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 4710 + }, + { + "epoch": 0.5473133894859135, + "grad_norm": 0.4103139340877533, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 4711 + }, + { + "epoch": 0.547429567237874, + "grad_norm": 0.41262155771255493, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 4712 + }, + { + "epoch": 0.5475457449898344, + "grad_norm": 0.41517654061317444, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 4713 + }, + { + "epoch": 0.5476619227417949, + "grad_norm": 0.43527457118034363, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 4714 + }, + { + "epoch": 0.5477781004937554, + "grad_norm": 0.3753294348716736, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 4715 + }, + { + "epoch": 0.5478942782457159, + "grad_norm": 0.4435529112815857, + "learning_rate": 0.0001, + "loss": 1.7554, + "step": 4716 + }, + { + "epoch": 0.5480104559976764, + "grad_norm": 0.39679768681526184, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 4717 + }, + { + "epoch": 0.548126633749637, + "grad_norm": 0.421624094247818, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 4718 + }, + { + "epoch": 0.5482428115015975, + "grad_norm": 0.42037492990493774, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 4719 + }, + { + "epoch": 0.548358989253558, + "grad_norm": 0.44528070092201233, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 4720 + }, + { + "epoch": 0.5484751670055185, + "grad_norm": 0.41840189695358276, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 4721 + }, + { + "epoch": 0.548591344757479, + "grad_norm": 0.3980635404586792, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 4722 + }, + { + "epoch": 0.5487075225094394, + "grad_norm": 0.42704808712005615, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 4723 + }, + { + "epoch": 0.5488237002613999, + "grad_norm": 0.4258357584476471, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 4724 + }, + { + "epoch": 0.5489398780133604, + "grad_norm": 0.4512523114681244, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 4725 + }, + { + "epoch": 0.5490560557653209, + "grad_norm": 0.4495634138584137, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 4726 + }, + { + "epoch": 0.5491722335172814, + "grad_norm": 0.44394296407699585, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 4727 + }, + { + "epoch": 0.549288411269242, + "grad_norm": 0.4451744258403778, + "learning_rate": 0.0001, + "loss": 1.8485, + "step": 4728 + }, + { + "epoch": 0.5494045890212025, + "grad_norm": 0.42420530319213867, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 4729 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.4318549931049347, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 4730 + }, + { + "epoch": 0.5496369445251235, + "grad_norm": 0.3942951560020447, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 4731 + }, + { + "epoch": 0.549753122277084, + "grad_norm": 0.42226502299308777, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 4732 + }, + { + "epoch": 0.5498693000290444, + "grad_norm": 0.45085522532463074, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 4733 + }, + { + "epoch": 0.5499854777810049, + "grad_norm": 0.40263989567756653, + "learning_rate": 0.0001, + "loss": 1.6947, + "step": 4734 + }, + { + "epoch": 0.5501016555329654, + "grad_norm": 0.4361323416233063, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 4735 + }, + { + "epoch": 0.5502178332849259, + "grad_norm": 0.4477275311946869, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 4736 + }, + { + "epoch": 0.5503340110368864, + "grad_norm": 0.490419864654541, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 4737 + }, + { + "epoch": 0.5504501887888469, + "grad_norm": 0.4405496418476105, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 4738 + }, + { + "epoch": 0.5505663665408075, + "grad_norm": 0.38646984100341797, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 4739 + }, + { + "epoch": 0.550682544292768, + "grad_norm": 0.40412217378616333, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 4740 + }, + { + "epoch": 0.5507987220447285, + "grad_norm": 0.41205301880836487, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 4741 + }, + { + "epoch": 0.550914899796689, + "grad_norm": 0.4445815086364746, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 4742 + }, + { + "epoch": 0.5510310775486494, + "grad_norm": 0.3821341395378113, + "learning_rate": 0.0001, + "loss": 1.364, + "step": 4743 + }, + { + "epoch": 0.5511472553006099, + "grad_norm": 0.4401394724845886, + "learning_rate": 0.0001, + "loss": 1.8219, + "step": 4744 + }, + { + "epoch": 0.5512634330525704, + "grad_norm": 0.4259290397167206, + "learning_rate": 0.0001, + "loss": 1.7113, + "step": 4745 + }, + { + "epoch": 0.5513796108045309, + "grad_norm": 0.4446122348308563, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 4746 + }, + { + "epoch": 0.5514957885564914, + "grad_norm": 0.4578036367893219, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 4747 + }, + { + "epoch": 0.5516119663084519, + "grad_norm": 0.42073720693588257, + "learning_rate": 0.0001, + "loss": 1.692, + "step": 4748 + }, + { + "epoch": 0.5517281440604125, + "grad_norm": 0.4655468761920929, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 4749 + }, + { + "epoch": 0.551844321812373, + "grad_norm": 0.41886764764785767, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 4750 + }, + { + "epoch": 0.5519604995643335, + "grad_norm": 0.42605897784233093, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 4751 + }, + { + "epoch": 0.552076677316294, + "grad_norm": 0.43297725915908813, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 4752 + }, + { + "epoch": 0.5521928550682544, + "grad_norm": 0.44777053594589233, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 4753 + }, + { + "epoch": 0.5523090328202149, + "grad_norm": 0.43533116579055786, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 4754 + }, + { + "epoch": 0.5524252105721754, + "grad_norm": 0.4219701290130615, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 4755 + }, + { + "epoch": 0.5525413883241359, + "grad_norm": 0.42743462324142456, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 4756 + }, + { + "epoch": 0.5526575660760964, + "grad_norm": 0.4393671452999115, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 4757 + }, + { + "epoch": 0.5527737438280569, + "grad_norm": 0.43937399983406067, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 4758 + }, + { + "epoch": 0.5528899215800174, + "grad_norm": 0.41498544812202454, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 4759 + }, + { + "epoch": 0.553006099331978, + "grad_norm": 0.3907436430454254, + "learning_rate": 0.0001, + "loss": 1.4194, + "step": 4760 + }, + { + "epoch": 0.5531222770839385, + "grad_norm": 0.4304429292678833, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 4761 + }, + { + "epoch": 0.553238454835899, + "grad_norm": 0.41753122210502625, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 4762 + }, + { + "epoch": 0.5533546325878594, + "grad_norm": 0.4009332060813904, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 4763 + }, + { + "epoch": 0.5534708103398199, + "grad_norm": 0.41789358854293823, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 4764 + }, + { + "epoch": 0.5535869880917804, + "grad_norm": 0.42972517013549805, + "learning_rate": 0.0001, + "loss": 1.7597, + "step": 4765 + }, + { + "epoch": 0.5537031658437409, + "grad_norm": 0.4440890848636627, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 4766 + }, + { + "epoch": 0.5538193435957014, + "grad_norm": 0.43731603026390076, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 4767 + }, + { + "epoch": 0.5539355213476619, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 4768 + }, + { + "epoch": 0.5540516990996224, + "grad_norm": 0.43468907475471497, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 4769 + }, + { + "epoch": 0.554167876851583, + "grad_norm": 0.44447562098503113, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 4770 + }, + { + "epoch": 0.5542840546035435, + "grad_norm": 0.4318912625312805, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 4771 + }, + { + "epoch": 0.554400232355504, + "grad_norm": 0.4088803827762604, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 4772 + }, + { + "epoch": 0.5545164101074644, + "grad_norm": 0.45096081495285034, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 4773 + }, + { + "epoch": 0.5546325878594249, + "grad_norm": 0.45685335993766785, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 4774 + }, + { + "epoch": 0.5547487656113854, + "grad_norm": 0.4274502396583557, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 4775 + }, + { + "epoch": 0.5548649433633459, + "grad_norm": 0.3890495300292969, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 4776 + }, + { + "epoch": 0.5549811211153064, + "grad_norm": 0.4626457095146179, + "learning_rate": 0.0001, + "loss": 1.5708, + "step": 4777 + }, + { + "epoch": 0.5550972988672669, + "grad_norm": 0.42439907789230347, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 4778 + }, + { + "epoch": 0.5552134766192274, + "grad_norm": 0.45186173915863037, + "learning_rate": 0.0001, + "loss": 1.7971, + "step": 4779 + }, + { + "epoch": 0.5553296543711879, + "grad_norm": 0.4208846688270569, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 4780 + }, + { + "epoch": 0.5554458321231485, + "grad_norm": 0.43783605098724365, + "learning_rate": 0.0001, + "loss": 1.8354, + "step": 4781 + }, + { + "epoch": 0.555562009875109, + "grad_norm": 0.4037763178348541, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 4782 + }, + { + "epoch": 0.5556781876270694, + "grad_norm": 0.41722458600997925, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 4783 + }, + { + "epoch": 0.5557943653790299, + "grad_norm": 0.4464002549648285, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 4784 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 0.4327322840690613, + "learning_rate": 0.0001, + "loss": 1.829, + "step": 4785 + }, + { + "epoch": 0.5560267208829509, + "grad_norm": 0.4065304100513458, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 4786 + }, + { + "epoch": 0.5561428986349114, + "grad_norm": 0.4391983449459076, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 4787 + }, + { + "epoch": 0.5562590763868719, + "grad_norm": 0.42863357067108154, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 4788 + }, + { + "epoch": 0.5563752541388324, + "grad_norm": 0.40624338388442993, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 4789 + }, + { + "epoch": 0.5564914318907929, + "grad_norm": 0.4419528543949127, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 4790 + }, + { + "epoch": 0.5566076096427535, + "grad_norm": 0.470945805311203, + "learning_rate": 0.0001, + "loss": 1.8979, + "step": 4791 + }, + { + "epoch": 0.556723787394714, + "grad_norm": 0.4236770272254944, + "learning_rate": 0.0001, + "loss": 1.8773, + "step": 4792 + }, + { + "epoch": 0.5568399651466744, + "grad_norm": 0.426725834608078, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 4793 + }, + { + "epoch": 0.5569561428986349, + "grad_norm": 0.4224294126033783, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 4794 + }, + { + "epoch": 0.5570723206505954, + "grad_norm": 0.413329541683197, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 4795 + }, + { + "epoch": 0.5571884984025559, + "grad_norm": 0.43398913741111755, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 4796 + }, + { + "epoch": 0.5573046761545164, + "grad_norm": 0.4381754696369171, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 4797 + }, + { + "epoch": 0.5574208539064769, + "grad_norm": 0.44960010051727295, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 4798 + }, + { + "epoch": 0.5575370316584374, + "grad_norm": 0.4516027271747589, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 4799 + }, + { + "epoch": 0.5576532094103979, + "grad_norm": 0.4343845844268799, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 4800 + }, + { + "epoch": 0.5577693871623584, + "grad_norm": 0.436753511428833, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 4801 + }, + { + "epoch": 0.557885564914319, + "grad_norm": 0.4462500512599945, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 4802 + }, + { + "epoch": 0.5580017426662794, + "grad_norm": 0.396566778421402, + "learning_rate": 0.0001, + "loss": 1.3311, + "step": 4803 + }, + { + "epoch": 0.5581179204182399, + "grad_norm": 0.5002578496932983, + "learning_rate": 0.0001, + "loss": 1.8922, + "step": 4804 + }, + { + "epoch": 0.5582340981702004, + "grad_norm": 0.43090498447418213, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 4805 + }, + { + "epoch": 0.5583502759221609, + "grad_norm": 0.4082920253276825, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 4806 + }, + { + "epoch": 0.5584664536741214, + "grad_norm": 0.43145760893821716, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 4807 + }, + { + "epoch": 0.5585826314260819, + "grad_norm": 0.3939366638660431, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 4808 + }, + { + "epoch": 0.5586988091780424, + "grad_norm": 0.45775458216667175, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 4809 + }, + { + "epoch": 0.5588149869300029, + "grad_norm": 0.4209611117839813, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 4810 + }, + { + "epoch": 0.5589311646819634, + "grad_norm": 0.4282268285751343, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 4811 + }, + { + "epoch": 0.559047342433924, + "grad_norm": 0.4249371290206909, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 4812 + }, + { + "epoch": 0.5591635201858844, + "grad_norm": 0.4493142068386078, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 4813 + }, + { + "epoch": 0.5592796979378449, + "grad_norm": 0.43762892484664917, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 4814 + }, + { + "epoch": 0.5593958756898054, + "grad_norm": 0.4867606461048126, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 4815 + }, + { + "epoch": 0.5595120534417659, + "grad_norm": 0.4017328917980194, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 4816 + }, + { + "epoch": 0.5596282311937264, + "grad_norm": 0.4285069704055786, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 4817 + }, + { + "epoch": 0.5597444089456869, + "grad_norm": 0.4313235580921173, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 4818 + }, + { + "epoch": 0.5598605866976474, + "grad_norm": 0.3793412744998932, + "learning_rate": 0.0001, + "loss": 1.404, + "step": 4819 + }, + { + "epoch": 0.5599767644496079, + "grad_norm": 0.4507150948047638, + "learning_rate": 0.0001, + "loss": 1.8942, + "step": 4820 + }, + { + "epoch": 0.5600929422015684, + "grad_norm": 0.41890159249305725, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 4821 + }, + { + "epoch": 0.5602091199535288, + "grad_norm": 0.4463735818862915, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 4822 + }, + { + "epoch": 0.5603252977054894, + "grad_norm": 0.42564481496810913, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 4823 + }, + { + "epoch": 0.5604414754574499, + "grad_norm": 0.4581405222415924, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 4824 + }, + { + "epoch": 0.5605576532094104, + "grad_norm": 0.39707350730895996, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 4825 + }, + { + "epoch": 0.5606738309613709, + "grad_norm": 0.43881773948669434, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 4826 + }, + { + "epoch": 0.5607900087133314, + "grad_norm": 0.4427170157432556, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 4827 + }, + { + "epoch": 0.5609061864652919, + "grad_norm": 0.41310209035873413, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 4828 + }, + { + "epoch": 0.5610223642172524, + "grad_norm": 0.4155448079109192, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 4829 + }, + { + "epoch": 0.5611385419692129, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 4830 + }, + { + "epoch": 0.5612547197211734, + "grad_norm": 0.39938148856163025, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 4831 + }, + { + "epoch": 0.5613708974731338, + "grad_norm": 0.4226715862751007, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 4832 + }, + { + "epoch": 0.5614870752250944, + "grad_norm": 0.4088357985019684, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 4833 + }, + { + "epoch": 0.5616032529770549, + "grad_norm": 0.41541755199432373, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 4834 + }, + { + "epoch": 0.5617194307290154, + "grad_norm": 0.3974235951900482, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 4835 + }, + { + "epoch": 0.5618356084809759, + "grad_norm": 0.42991939187049866, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 4836 + }, + { + "epoch": 0.5619517862329364, + "grad_norm": 0.43215852975845337, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 4837 + }, + { + "epoch": 0.5620679639848969, + "grad_norm": 0.42073243856430054, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 4838 + }, + { + "epoch": 0.5621841417368574, + "grad_norm": 0.40232616662979126, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 4839 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.4098654091358185, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 4840 + }, + { + "epoch": 0.5624164972407784, + "grad_norm": 0.44964614510536194, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 4841 + }, + { + "epoch": 0.5625326749927388, + "grad_norm": 0.45787712931632996, + "learning_rate": 0.0001, + "loss": 1.6967, + "step": 4842 + }, + { + "epoch": 0.5626488527446993, + "grad_norm": 0.41997238993644714, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 4843 + }, + { + "epoch": 0.5627650304966599, + "grad_norm": 0.4027771055698395, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 4844 + }, + { + "epoch": 0.5628812082486204, + "grad_norm": 0.3900171220302582, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 4845 + }, + { + "epoch": 0.5629973860005809, + "grad_norm": 0.4267227053642273, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 4846 + }, + { + "epoch": 0.5631135637525414, + "grad_norm": 0.42500099539756775, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 4847 + }, + { + "epoch": 0.5632297415045019, + "grad_norm": 0.40846553444862366, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 4848 + }, + { + "epoch": 0.5633459192564624, + "grad_norm": 0.43166103959083557, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 4849 + }, + { + "epoch": 0.5634620970084229, + "grad_norm": 0.41474416851997375, + "learning_rate": 0.0001, + "loss": 1.7393, + "step": 4850 + }, + { + "epoch": 0.5635782747603834, + "grad_norm": 0.4628075361251831, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 4851 + }, + { + "epoch": 0.5636944525123438, + "grad_norm": 0.48346492648124695, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 4852 + }, + { + "epoch": 0.5638106302643043, + "grad_norm": 0.4645657241344452, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 4853 + }, + { + "epoch": 0.5639268080162649, + "grad_norm": 0.4429479241371155, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 4854 + }, + { + "epoch": 0.5640429857682254, + "grad_norm": 0.4967527389526367, + "learning_rate": 0.0001, + "loss": 1.8657, + "step": 4855 + }, + { + "epoch": 0.5641591635201859, + "grad_norm": 0.45798930525779724, + "learning_rate": 0.0001, + "loss": 1.6908, + "step": 4856 + }, + { + "epoch": 0.5642753412721464, + "grad_norm": 0.4299752116203308, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 4857 + }, + { + "epoch": 0.5643915190241069, + "grad_norm": 0.4171035885810852, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 4858 + }, + { + "epoch": 0.5645076967760674, + "grad_norm": 0.45656847953796387, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 4859 + }, + { + "epoch": 0.5646238745280279, + "grad_norm": 0.4058765172958374, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 4860 + }, + { + "epoch": 0.5647400522799884, + "grad_norm": 0.44879722595214844, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 4861 + }, + { + "epoch": 0.5648562300319488, + "grad_norm": 0.4731845259666443, + "learning_rate": 0.0001, + "loss": 1.713, + "step": 4862 + }, + { + "epoch": 0.5649724077839093, + "grad_norm": 0.42394986748695374, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 4863 + }, + { + "epoch": 0.5650885855358699, + "grad_norm": 0.4450361728668213, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 4864 + }, + { + "epoch": 0.5652047632878304, + "grad_norm": 0.3994229733943939, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 4865 + }, + { + "epoch": 0.5653209410397909, + "grad_norm": 0.40167900919914246, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 4866 + }, + { + "epoch": 0.5654371187917514, + "grad_norm": 0.409967839717865, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 4867 + }, + { + "epoch": 0.5655532965437119, + "grad_norm": 0.4286467432975769, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 4868 + }, + { + "epoch": 0.5656694742956724, + "grad_norm": 0.4050009250640869, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 4869 + }, + { + "epoch": 0.5657856520476329, + "grad_norm": 0.45172083377838135, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 4870 + }, + { + "epoch": 0.5659018297995934, + "grad_norm": 0.45170027017593384, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 4871 + }, + { + "epoch": 0.5660180075515538, + "grad_norm": 0.43326306343078613, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 4872 + }, + { + "epoch": 0.5661341853035143, + "grad_norm": 0.4260367453098297, + "learning_rate": 0.0001, + "loss": 1.7098, + "step": 4873 + }, + { + "epoch": 0.5662503630554748, + "grad_norm": 0.4030110836029053, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 4874 + }, + { + "epoch": 0.5663665408074354, + "grad_norm": 0.4576740860939026, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 4875 + }, + { + "epoch": 0.5664827185593959, + "grad_norm": 0.4269350469112396, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 4876 + }, + { + "epoch": 0.5665988963113564, + "grad_norm": 0.3865914046764374, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 4877 + }, + { + "epoch": 0.5667150740633169, + "grad_norm": 0.4161985218524933, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 4878 + }, + { + "epoch": 0.5668312518152774, + "grad_norm": 0.4495078921318054, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 4879 + }, + { + "epoch": 0.5669474295672379, + "grad_norm": 0.40627461671829224, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 4880 + }, + { + "epoch": 0.5670636073191984, + "grad_norm": 0.3978639543056488, + "learning_rate": 0.0001, + "loss": 1.5106, + "step": 4881 + }, + { + "epoch": 0.5671797850711588, + "grad_norm": 0.4051436185836792, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 4882 + }, + { + "epoch": 0.5672959628231193, + "grad_norm": 0.44826844334602356, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 4883 + }, + { + "epoch": 0.5674121405750798, + "grad_norm": 0.3976137340068817, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 4884 + }, + { + "epoch": 0.5675283183270404, + "grad_norm": 0.39482155442237854, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 4885 + }, + { + "epoch": 0.5676444960790009, + "grad_norm": 0.43112707138061523, + "learning_rate": 0.0001, + "loss": 1.697, + "step": 4886 + }, + { + "epoch": 0.5677606738309614, + "grad_norm": 0.45522767305374146, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 4887 + }, + { + "epoch": 0.5678768515829219, + "grad_norm": 0.425699919462204, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 4888 + }, + { + "epoch": 0.5679930293348824, + "grad_norm": 0.4110868573188782, + "learning_rate": 0.0001, + "loss": 1.735, + "step": 4889 + }, + { + "epoch": 0.5681092070868429, + "grad_norm": 0.4198397994041443, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 4890 + }, + { + "epoch": 0.5682253848388034, + "grad_norm": 0.41459405422210693, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 4891 + }, + { + "epoch": 0.5683415625907638, + "grad_norm": 0.4587966501712799, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 4892 + }, + { + "epoch": 0.5684577403427243, + "grad_norm": 0.4369155466556549, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 4893 + }, + { + "epoch": 0.5685739180946848, + "grad_norm": 0.4800203740596771, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 4894 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 0.4436158835887909, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 4895 + }, + { + "epoch": 0.5688062735986059, + "grad_norm": 0.4184040129184723, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 4896 + }, + { + "epoch": 0.5689224513505664, + "grad_norm": 0.3854430913925171, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 4897 + }, + { + "epoch": 0.5690386291025269, + "grad_norm": 0.4409714639186859, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 4898 + }, + { + "epoch": 0.5691548068544874, + "grad_norm": 0.4433159828186035, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 4899 + }, + { + "epoch": 0.5692709846064479, + "grad_norm": 0.4146018326282501, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 4900 + }, + { + "epoch": 0.5693871623584084, + "grad_norm": 0.45392921566963196, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 4901 + }, + { + "epoch": 0.5695033401103688, + "grad_norm": 0.42398884892463684, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 4902 + }, + { + "epoch": 0.5696195178623293, + "grad_norm": 0.478370726108551, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 4903 + }, + { + "epoch": 0.5697356956142898, + "grad_norm": 0.4152291417121887, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 4904 + }, + { + "epoch": 0.5698518733662503, + "grad_norm": 0.4159678816795349, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 4905 + }, + { + "epoch": 0.5699680511182109, + "grad_norm": 0.40162405371665955, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 4906 + }, + { + "epoch": 0.5700842288701714, + "grad_norm": 0.44275495409965515, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 4907 + }, + { + "epoch": 0.5702004066221319, + "grad_norm": 0.43401822447776794, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 4908 + }, + { + "epoch": 0.5703165843740924, + "grad_norm": 0.40605849027633667, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 4909 + }, + { + "epoch": 0.5704327621260529, + "grad_norm": 0.4131391644477844, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 4910 + }, + { + "epoch": 0.5705489398780134, + "grad_norm": 0.4179399311542511, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 4911 + }, + { + "epoch": 0.5706651176299739, + "grad_norm": 0.4298674464225769, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 4912 + }, + { + "epoch": 0.5707812953819343, + "grad_norm": 0.4349467158317566, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 4913 + }, + { + "epoch": 0.5708974731338948, + "grad_norm": 0.4512692987918854, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 4914 + }, + { + "epoch": 0.5710136508858553, + "grad_norm": 0.45529913902282715, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 4915 + }, + { + "epoch": 0.5711298286378158, + "grad_norm": 0.4262561798095703, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 4916 + }, + { + "epoch": 0.5712460063897764, + "grad_norm": 0.40773990750312805, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 4917 + }, + { + "epoch": 0.5713621841417369, + "grad_norm": 0.43575015664100647, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 4918 + }, + { + "epoch": 0.5714783618936974, + "grad_norm": 0.4189346730709076, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 4919 + }, + { + "epoch": 0.5715945396456579, + "grad_norm": 0.4791758954524994, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 4920 + }, + { + "epoch": 0.5717107173976184, + "grad_norm": 0.42903390526771545, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 4921 + }, + { + "epoch": 0.5718268951495789, + "grad_norm": 0.4235144555568695, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 4922 + }, + { + "epoch": 0.5719430729015393, + "grad_norm": 0.4023747742176056, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 4923 + }, + { + "epoch": 0.5720592506534998, + "grad_norm": 0.44118964672088623, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 4924 + }, + { + "epoch": 0.5721754284054603, + "grad_norm": 0.4274747669696808, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 4925 + }, + { + "epoch": 0.5722916061574208, + "grad_norm": 0.41285374760627747, + "learning_rate": 0.0001, + "loss": 1.4082, + "step": 4926 + }, + { + "epoch": 0.5724077839093814, + "grad_norm": 0.4146759510040283, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 4927 + }, + { + "epoch": 0.5725239616613419, + "grad_norm": 0.41832008957862854, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 4928 + }, + { + "epoch": 0.5726401394133024, + "grad_norm": 0.4418119192123413, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 4929 + }, + { + "epoch": 0.5727563171652629, + "grad_norm": 0.3945986032485962, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 4930 + }, + { + "epoch": 0.5728724949172234, + "grad_norm": 0.44057559967041016, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 4931 + }, + { + "epoch": 0.5729886726691839, + "grad_norm": 0.4348897933959961, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 4932 + }, + { + "epoch": 0.5731048504211443, + "grad_norm": 0.431241512298584, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 4933 + }, + { + "epoch": 0.5732210281731048, + "grad_norm": 0.4173518419265747, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 4934 + }, + { + "epoch": 0.5733372059250653, + "grad_norm": 0.4776740074157715, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 4935 + }, + { + "epoch": 0.5734533836770258, + "grad_norm": 0.449663907289505, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 4936 + }, + { + "epoch": 0.5735695614289863, + "grad_norm": 0.4311505854129791, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 4937 + }, + { + "epoch": 0.5736857391809469, + "grad_norm": 0.4156731963157654, + "learning_rate": 0.0001, + "loss": 1.8089, + "step": 4938 + }, + { + "epoch": 0.5738019169329074, + "grad_norm": 0.4211094379425049, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 4939 + }, + { + "epoch": 0.5739180946848679, + "grad_norm": 0.4334612488746643, + "learning_rate": 0.0001, + "loss": 1.6352, + "step": 4940 + }, + { + "epoch": 0.5740342724368284, + "grad_norm": 0.43656647205352783, + "learning_rate": 0.0001, + "loss": 1.7283, + "step": 4941 + }, + { + "epoch": 0.5741504501887889, + "grad_norm": 0.3772907555103302, + "learning_rate": 0.0001, + "loss": 1.4156, + "step": 4942 + }, + { + "epoch": 0.5742666279407493, + "grad_norm": 0.4248020350933075, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 4943 + }, + { + "epoch": 0.5743828056927098, + "grad_norm": 0.4416458010673523, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 4944 + }, + { + "epoch": 0.5744989834446703, + "grad_norm": 0.43282851576805115, + "learning_rate": 0.0001, + "loss": 1.7414, + "step": 4945 + }, + { + "epoch": 0.5746151611966308, + "grad_norm": 0.4210634231567383, + "learning_rate": 0.0001, + "loss": 1.4285, + "step": 4946 + }, + { + "epoch": 0.5747313389485913, + "grad_norm": 0.4776458740234375, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 4947 + }, + { + "epoch": 0.5748475167005519, + "grad_norm": 0.4351522624492645, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 4948 + }, + { + "epoch": 0.5749636944525124, + "grad_norm": 0.4363928437232971, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 4949 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.4042133390903473, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 4950 + }, + { + "epoch": 0.5751960499564334, + "grad_norm": 0.4090398848056793, + "learning_rate": 0.0001, + "loss": 1.3931, + "step": 4951 + }, + { + "epoch": 0.5753122277083939, + "grad_norm": 0.4363382160663605, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 4952 + }, + { + "epoch": 0.5754284054603543, + "grad_norm": 0.43779557943344116, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 4953 + }, + { + "epoch": 0.5755445832123148, + "grad_norm": 0.4067518413066864, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 4954 + }, + { + "epoch": 0.5756607609642753, + "grad_norm": 0.45167723298072815, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 4955 + }, + { + "epoch": 0.5757769387162358, + "grad_norm": 0.4339980185031891, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 4956 + }, + { + "epoch": 0.5758931164681963, + "grad_norm": 0.41764840483665466, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 4957 + }, + { + "epoch": 0.5760092942201568, + "grad_norm": 0.4416648745536804, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 4958 + }, + { + "epoch": 0.5761254719721174, + "grad_norm": 0.42781156301498413, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 4959 + }, + { + "epoch": 0.5762416497240779, + "grad_norm": 0.44320225715637207, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 4960 + }, + { + "epoch": 0.5763578274760384, + "grad_norm": 0.3853711485862732, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 4961 + }, + { + "epoch": 0.5764740052279989, + "grad_norm": 0.3990086317062378, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 4962 + }, + { + "epoch": 0.5765901829799593, + "grad_norm": 0.41801634430885315, + "learning_rate": 0.0001, + "loss": 1.418, + "step": 4963 + }, + { + "epoch": 0.5767063607319198, + "grad_norm": 0.4076370596885681, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 4964 + }, + { + "epoch": 0.5768225384838803, + "grad_norm": 0.4171597361564636, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 4965 + }, + { + "epoch": 0.5769387162358408, + "grad_norm": 0.4075530171394348, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 4966 + }, + { + "epoch": 0.5770548939878013, + "grad_norm": 0.4294959008693695, + "learning_rate": 0.0001, + "loss": 1.7531, + "step": 4967 + }, + { + "epoch": 0.5771710717397618, + "grad_norm": 0.4060986340045929, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 4968 + }, + { + "epoch": 0.5772872494917224, + "grad_norm": 0.43765029311180115, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 4969 + }, + { + "epoch": 0.5774034272436829, + "grad_norm": 0.4338558316230774, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 4970 + }, + { + "epoch": 0.5775196049956434, + "grad_norm": 0.42239588499069214, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 4971 + }, + { + "epoch": 0.5776357827476039, + "grad_norm": 0.45466890931129456, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 4972 + }, + { + "epoch": 0.5777519604995643, + "grad_norm": 0.43936869502067566, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 4973 + }, + { + "epoch": 0.5778681382515248, + "grad_norm": 0.4495198130607605, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 4974 + }, + { + "epoch": 0.5779843160034853, + "grad_norm": 0.41554975509643555, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 4975 + }, + { + "epoch": 0.5781004937554458, + "grad_norm": 0.3993794620037079, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 4976 + }, + { + "epoch": 0.5782166715074063, + "grad_norm": 0.4077496826648712, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 4977 + }, + { + "epoch": 0.5783328492593668, + "grad_norm": 0.45379477739334106, + "learning_rate": 0.0001, + "loss": 1.7378, + "step": 4978 + }, + { + "epoch": 0.5784490270113273, + "grad_norm": 0.4157100319862366, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 4979 + }, + { + "epoch": 0.5785652047632879, + "grad_norm": 0.4228839874267578, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 4980 + }, + { + "epoch": 0.5786813825152484, + "grad_norm": 0.43401455879211426, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 4981 + }, + { + "epoch": 0.5787975602672089, + "grad_norm": 0.43419334292411804, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 4982 + }, + { + "epoch": 0.5789137380191693, + "grad_norm": 0.4071895182132721, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 4983 + }, + { + "epoch": 0.5790299157711298, + "grad_norm": 0.3975391983985901, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 4984 + }, + { + "epoch": 0.5791460935230903, + "grad_norm": 0.4305202066898346, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 4985 + }, + { + "epoch": 0.5792622712750508, + "grad_norm": 0.47176504135131836, + "learning_rate": 0.0001, + "loss": 1.8323, + "step": 4986 + }, + { + "epoch": 0.5793784490270113, + "grad_norm": 0.45612940192222595, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 4987 + }, + { + "epoch": 0.5794946267789718, + "grad_norm": 0.4044199585914612, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 4988 + }, + { + "epoch": 0.5796108045309323, + "grad_norm": 0.42781221866607666, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 4989 + }, + { + "epoch": 0.5797269822828929, + "grad_norm": 0.4110262989997864, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 4990 + }, + { + "epoch": 0.5798431600348534, + "grad_norm": 0.43411797285079956, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 4991 + }, + { + "epoch": 0.5799593377868139, + "grad_norm": 0.4334775507450104, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 4992 + }, + { + "epoch": 0.5800755155387743, + "grad_norm": 0.40531855821609497, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 4993 + }, + { + "epoch": 0.5801916932907348, + "grad_norm": 0.41528868675231934, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 4994 + }, + { + "epoch": 0.5803078710426953, + "grad_norm": 0.44848141074180603, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 4995 + }, + { + "epoch": 0.5804240487946558, + "grad_norm": 0.4627237021923065, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 4996 + }, + { + "epoch": 0.5805402265466163, + "grad_norm": 0.4355679154396057, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 4997 + }, + { + "epoch": 0.5806564042985768, + "grad_norm": 0.4059557318687439, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 4998 + }, + { + "epoch": 0.5807725820505373, + "grad_norm": 0.4451749324798584, + "learning_rate": 0.0001, + "loss": 1.8021, + "step": 4999 + }, + { + "epoch": 0.5808887598024978, + "grad_norm": 0.47867774963378906, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 5000 + }, + { + "epoch": 0.5810049375544584, + "grad_norm": 0.4432399570941925, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 5001 + }, + { + "epoch": 0.5811211153064189, + "grad_norm": 0.40032443404197693, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 5002 + }, + { + "epoch": 0.5812372930583793, + "grad_norm": 0.4400497376918793, + "learning_rate": 0.0001, + "loss": 1.7494, + "step": 5003 + }, + { + "epoch": 0.5813534708103398, + "grad_norm": 0.410427451133728, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 5004 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 0.3894704580307007, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 5005 + }, + { + "epoch": 0.5815858263142608, + "grad_norm": 0.4130994975566864, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 5006 + }, + { + "epoch": 0.5817020040662213, + "grad_norm": 0.4316181242465973, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 5007 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.43336644768714905, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 5008 + }, + { + "epoch": 0.5819343595701423, + "grad_norm": 0.41586264967918396, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 5009 + }, + { + "epoch": 0.5820505373221028, + "grad_norm": 0.398639053106308, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 5010 + }, + { + "epoch": 0.5821667150740634, + "grad_norm": 0.42765846848487854, + "learning_rate": 0.0001, + "loss": 1.7781, + "step": 5011 + }, + { + "epoch": 0.5822828928260239, + "grad_norm": 0.420610249042511, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 5012 + }, + { + "epoch": 0.5823990705779843, + "grad_norm": 0.43733763694763184, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 5013 + }, + { + "epoch": 0.5825152483299448, + "grad_norm": 0.46470460295677185, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 5014 + }, + { + "epoch": 0.5826314260819053, + "grad_norm": 0.43464747071266174, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 5015 + }, + { + "epoch": 0.5827476038338658, + "grad_norm": 0.42087873816490173, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 5016 + }, + { + "epoch": 0.5828637815858263, + "grad_norm": 0.4221756160259247, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 5017 + }, + { + "epoch": 0.5829799593377868, + "grad_norm": 0.4236156940460205, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 5018 + }, + { + "epoch": 0.5830961370897473, + "grad_norm": 0.39433997869491577, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 5019 + }, + { + "epoch": 0.5832123148417078, + "grad_norm": 0.4237521290779114, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 5020 + }, + { + "epoch": 0.5833284925936683, + "grad_norm": 0.44210362434387207, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 5021 + }, + { + "epoch": 0.5834446703456289, + "grad_norm": 0.4477832317352295, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 5022 + }, + { + "epoch": 0.5835608480975893, + "grad_norm": 0.39492443203926086, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 5023 + }, + { + "epoch": 0.5836770258495498, + "grad_norm": 0.42422086000442505, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 5024 + }, + { + "epoch": 0.5837932036015103, + "grad_norm": 0.4569999575614929, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 5025 + }, + { + "epoch": 0.5839093813534708, + "grad_norm": 0.38789770007133484, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 5026 + }, + { + "epoch": 0.5840255591054313, + "grad_norm": 0.4335310161113739, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 5027 + }, + { + "epoch": 0.5841417368573918, + "grad_norm": 0.4649960398674011, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 5028 + }, + { + "epoch": 0.5842579146093523, + "grad_norm": 0.41210901737213135, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 5029 + }, + { + "epoch": 0.5843740923613128, + "grad_norm": 0.4519018232822418, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 5030 + }, + { + "epoch": 0.5844902701132733, + "grad_norm": 0.45829200744628906, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 5031 + }, + { + "epoch": 0.5846064478652339, + "grad_norm": 0.4412051737308502, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 5032 + }, + { + "epoch": 0.5847226256171943, + "grad_norm": 0.4298493266105652, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 5033 + }, + { + "epoch": 0.5848388033691548, + "grad_norm": 0.47564446926116943, + "learning_rate": 0.0001, + "loss": 1.7355, + "step": 5034 + }, + { + "epoch": 0.5849549811211153, + "grad_norm": 0.4598512053489685, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 5035 + }, + { + "epoch": 0.5850711588730758, + "grad_norm": 0.4496704041957855, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 5036 + }, + { + "epoch": 0.5851873366250363, + "grad_norm": 0.4410554766654968, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 5037 + }, + { + "epoch": 0.5853035143769968, + "grad_norm": 0.4468580484390259, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 5038 + }, + { + "epoch": 0.5854196921289573, + "grad_norm": 0.41189512610435486, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 5039 + }, + { + "epoch": 0.5855358698809178, + "grad_norm": 0.38999536633491516, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 5040 + }, + { + "epoch": 0.5856520476328783, + "grad_norm": 0.40551015734672546, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 5041 + }, + { + "epoch": 0.5857682253848389, + "grad_norm": 0.4540075957775116, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 5042 + }, + { + "epoch": 0.5858844031367993, + "grad_norm": 0.4358082115650177, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 5043 + }, + { + "epoch": 0.5860005808887598, + "grad_norm": 0.42553818225860596, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 5044 + }, + { + "epoch": 0.5861167586407203, + "grad_norm": 0.4158061444759369, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 5045 + }, + { + "epoch": 0.5862329363926808, + "grad_norm": 0.4467496871948242, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 5046 + }, + { + "epoch": 0.5863491141446413, + "grad_norm": 0.4339059293270111, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 5047 + }, + { + "epoch": 0.5864652918966018, + "grad_norm": 0.4463905096054077, + "learning_rate": 0.0001, + "loss": 1.755, + "step": 5048 + }, + { + "epoch": 0.5865814696485623, + "grad_norm": 0.4336758553981781, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 5049 + }, + { + "epoch": 0.5866976474005228, + "grad_norm": 0.3911759555339813, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 5050 + }, + { + "epoch": 0.5868138251524833, + "grad_norm": 0.43406540155410767, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 5051 + }, + { + "epoch": 0.5869300029044437, + "grad_norm": 0.40602606534957886, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 5052 + }, + { + "epoch": 0.5870461806564043, + "grad_norm": 0.40257728099823, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 5053 + }, + { + "epoch": 0.5871623584083648, + "grad_norm": 0.42640700936317444, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 5054 + }, + { + "epoch": 0.5872785361603253, + "grad_norm": 0.4607278108596802, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 5055 + }, + { + "epoch": 0.5873947139122858, + "grad_norm": 0.456226110458374, + "learning_rate": 0.0001, + "loss": 1.7054, + "step": 5056 + }, + { + "epoch": 0.5875108916642463, + "grad_norm": 0.44643908739089966, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 5057 + }, + { + "epoch": 0.5876270694162068, + "grad_norm": 0.4707536995410919, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 5058 + }, + { + "epoch": 0.5877432471681673, + "grad_norm": 0.437148779630661, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 5059 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.4139115810394287, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 5060 + }, + { + "epoch": 0.5879756026720883, + "grad_norm": 0.44146662950515747, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 5061 + }, + { + "epoch": 0.5880917804240487, + "grad_norm": 0.41695713996887207, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 5062 + }, + { + "epoch": 0.5882079581760093, + "grad_norm": 0.45756304264068604, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 5063 + }, + { + "epoch": 0.5883241359279698, + "grad_norm": 0.4567687213420868, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 5064 + }, + { + "epoch": 0.5884403136799303, + "grad_norm": 0.43768954277038574, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 5065 + }, + { + "epoch": 0.5885564914318908, + "grad_norm": 0.4621203541755676, + "learning_rate": 0.0001, + "loss": 1.7541, + "step": 5066 + }, + { + "epoch": 0.5886726691838513, + "grad_norm": 0.420998215675354, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 5067 + }, + { + "epoch": 0.5887888469358118, + "grad_norm": 0.45057764649391174, + "learning_rate": 0.0001, + "loss": 1.763, + "step": 5068 + }, + { + "epoch": 0.5889050246877723, + "grad_norm": 0.47149190306663513, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 5069 + }, + { + "epoch": 0.5890212024397328, + "grad_norm": 0.4899585545063019, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 5070 + }, + { + "epoch": 0.5891373801916933, + "grad_norm": 0.4497217833995819, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 5071 + }, + { + "epoch": 0.5892535579436537, + "grad_norm": 0.47766372561454773, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 5072 + }, + { + "epoch": 0.5893697356956142, + "grad_norm": 0.4019678831100464, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 5073 + }, + { + "epoch": 0.5894859134475748, + "grad_norm": 0.40686938166618347, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 5074 + }, + { + "epoch": 0.5896020911995353, + "grad_norm": 0.417871356010437, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 5075 + }, + { + "epoch": 0.5897182689514958, + "grad_norm": 0.4329603612422943, + "learning_rate": 0.0001, + "loss": 1.7662, + "step": 5076 + }, + { + "epoch": 0.5898344467034563, + "grad_norm": 0.41233643889427185, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 5077 + }, + { + "epoch": 0.5899506244554168, + "grad_norm": 0.4218975603580475, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 5078 + }, + { + "epoch": 0.5900668022073773, + "grad_norm": 0.4207758605480194, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 5079 + }, + { + "epoch": 0.5901829799593378, + "grad_norm": 0.4465838670730591, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 5080 + }, + { + "epoch": 0.5902991577112983, + "grad_norm": 0.43389689922332764, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 5081 + }, + { + "epoch": 0.5904153354632588, + "grad_norm": 0.4294486343860626, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 5082 + }, + { + "epoch": 0.5905315132152192, + "grad_norm": 0.45105913281440735, + "learning_rate": 0.0001, + "loss": 1.7492, + "step": 5083 + }, + { + "epoch": 0.5906476909671798, + "grad_norm": 0.3978818655014038, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 5084 + }, + { + "epoch": 0.5907638687191403, + "grad_norm": 0.44959375262260437, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 5085 + }, + { + "epoch": 0.5908800464711008, + "grad_norm": 0.40922439098358154, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 5086 + }, + { + "epoch": 0.5909962242230613, + "grad_norm": 0.41393378376960754, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 5087 + }, + { + "epoch": 0.5911124019750218, + "grad_norm": 0.4593432545661926, + "learning_rate": 0.0001, + "loss": 1.691, + "step": 5088 + }, + { + "epoch": 0.5912285797269823, + "grad_norm": 0.4352717995643616, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 5089 + }, + { + "epoch": 0.5913447574789428, + "grad_norm": 0.4431297779083252, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 5090 + }, + { + "epoch": 0.5914609352309033, + "grad_norm": 0.4279361069202423, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 5091 + }, + { + "epoch": 0.5915771129828638, + "grad_norm": 0.4590848386287689, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 5092 + }, + { + "epoch": 0.5916932907348242, + "grad_norm": 0.4331355094909668, + "learning_rate": 0.0001, + "loss": 1.4664, + "step": 5093 + }, + { + "epoch": 0.5918094684867847, + "grad_norm": 0.4613255262374878, + "learning_rate": 0.0001, + "loss": 1.8481, + "step": 5094 + }, + { + "epoch": 0.5919256462387453, + "grad_norm": 0.46412867307662964, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 5095 + }, + { + "epoch": 0.5920418239907058, + "grad_norm": 0.4639713764190674, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 5096 + }, + { + "epoch": 0.5921580017426663, + "grad_norm": 0.48971736431121826, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 5097 + }, + { + "epoch": 0.5922741794946268, + "grad_norm": 0.48425406217575073, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 5098 + }, + { + "epoch": 0.5923903572465873, + "grad_norm": 0.45319536328315735, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 5099 + }, + { + "epoch": 0.5925065349985478, + "grad_norm": 0.436058908700943, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 5100 + }, + { + "epoch": 0.5926227127505083, + "grad_norm": 0.4272605776786804, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 5101 + }, + { + "epoch": 0.5927388905024688, + "grad_norm": 0.4012027978897095, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 5102 + }, + { + "epoch": 0.5928550682544292, + "grad_norm": 0.42485886812210083, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 5103 + }, + { + "epoch": 0.5929712460063897, + "grad_norm": 0.45090624690055847, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 5104 + }, + { + "epoch": 0.5930874237583503, + "grad_norm": 0.45741748809814453, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 5105 + }, + { + "epoch": 0.5932036015103108, + "grad_norm": 0.48514753580093384, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 5106 + }, + { + "epoch": 0.5933197792622713, + "grad_norm": 0.41915470361709595, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 5107 + }, + { + "epoch": 0.5934359570142318, + "grad_norm": 0.3943340480327606, + "learning_rate": 0.0001, + "loss": 1.4051, + "step": 5108 + }, + { + "epoch": 0.5935521347661923, + "grad_norm": 0.4319881796836853, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 5109 + }, + { + "epoch": 0.5936683125181528, + "grad_norm": 0.41222912073135376, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 5110 + }, + { + "epoch": 0.5937844902701133, + "grad_norm": 0.4626607894897461, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 5111 + }, + { + "epoch": 0.5939006680220738, + "grad_norm": 0.44612380862236023, + "learning_rate": 0.0001, + "loss": 1.7116, + "step": 5112 + }, + { + "epoch": 0.5940168457740342, + "grad_norm": 0.44148531556129456, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 5113 + }, + { + "epoch": 0.5941330235259947, + "grad_norm": 0.4143647253513336, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 5114 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 0.4281814992427826, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 5115 + }, + { + "epoch": 0.5943653790299158, + "grad_norm": 0.44174307584762573, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 5116 + }, + { + "epoch": 0.5944815567818763, + "grad_norm": 0.4306361675262451, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 5117 + }, + { + "epoch": 0.5945977345338368, + "grad_norm": 0.46144962310791016, + "learning_rate": 0.0001, + "loss": 1.8373, + "step": 5118 + }, + { + "epoch": 0.5947139122857973, + "grad_norm": 0.43401268124580383, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 5119 + }, + { + "epoch": 0.5948300900377578, + "grad_norm": 0.4165130853652954, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 5120 + }, + { + "epoch": 0.5949462677897183, + "grad_norm": 0.46252015233039856, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 5121 + }, + { + "epoch": 0.5950624455416788, + "grad_norm": 0.40529441833496094, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 5122 + }, + { + "epoch": 0.5951786232936392, + "grad_norm": 0.44284263253211975, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 5123 + }, + { + "epoch": 0.5952948010455997, + "grad_norm": 0.4453653395175934, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 5124 + }, + { + "epoch": 0.5954109787975602, + "grad_norm": 0.4209020137786865, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 5125 + }, + { + "epoch": 0.5955271565495208, + "grad_norm": 0.4327007234096527, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 5126 + }, + { + "epoch": 0.5956433343014813, + "grad_norm": 0.4445003271102905, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 5127 + }, + { + "epoch": 0.5957595120534418, + "grad_norm": 0.40100163221359253, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 5128 + }, + { + "epoch": 0.5958756898054023, + "grad_norm": 0.4560073912143707, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 5129 + }, + { + "epoch": 0.5959918675573628, + "grad_norm": 0.42099544405937195, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 5130 + }, + { + "epoch": 0.5961080453093233, + "grad_norm": 0.40899744629859924, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 5131 + }, + { + "epoch": 0.5962242230612838, + "grad_norm": 0.3647748827934265, + "learning_rate": 0.0001, + "loss": 1.3157, + "step": 5132 + }, + { + "epoch": 0.5963404008132442, + "grad_norm": 0.43523362278938293, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 5133 + }, + { + "epoch": 0.5964565785652047, + "grad_norm": 0.40236619114875793, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 5134 + }, + { + "epoch": 0.5965727563171652, + "grad_norm": 0.43228352069854736, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 5135 + }, + { + "epoch": 0.5966889340691257, + "grad_norm": 0.41964632272720337, + "learning_rate": 0.0001, + "loss": 1.4214, + "step": 5136 + }, + { + "epoch": 0.5968051118210863, + "grad_norm": 0.5177493691444397, + "learning_rate": 0.0001, + "loss": 1.7107, + "step": 5137 + }, + { + "epoch": 0.5969212895730468, + "grad_norm": 0.40368756651878357, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 5138 + }, + { + "epoch": 0.5970374673250073, + "grad_norm": 0.49018216133117676, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 5139 + }, + { + "epoch": 0.5971536450769678, + "grad_norm": 0.4471641480922699, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 5140 + }, + { + "epoch": 0.5972698228289283, + "grad_norm": 0.4672026038169861, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 5141 + }, + { + "epoch": 0.5973860005808888, + "grad_norm": 0.49104636907577515, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 5142 + }, + { + "epoch": 0.5975021783328492, + "grad_norm": 0.4686024487018585, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 5143 + }, + { + "epoch": 0.5976183560848097, + "grad_norm": 0.41973164677619934, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 5144 + }, + { + "epoch": 0.5977345338367702, + "grad_norm": 0.43972551822662354, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 5145 + }, + { + "epoch": 0.5978507115887307, + "grad_norm": 0.4550989270210266, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 5146 + }, + { + "epoch": 0.5979668893406913, + "grad_norm": 0.452958345413208, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 5147 + }, + { + "epoch": 0.5980830670926518, + "grad_norm": 0.4228680431842804, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 5148 + }, + { + "epoch": 0.5981992448446123, + "grad_norm": 0.4337772727012634, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 5149 + }, + { + "epoch": 0.5983154225965728, + "grad_norm": 0.3951295018196106, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 5150 + }, + { + "epoch": 0.5984316003485333, + "grad_norm": 0.4213055670261383, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 5151 + }, + { + "epoch": 0.5985477781004938, + "grad_norm": 0.4383963644504547, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 5152 + }, + { + "epoch": 0.5986639558524542, + "grad_norm": 0.43566179275512695, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 5153 + }, + { + "epoch": 0.5987801336044147, + "grad_norm": 0.4275076985359192, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 5154 + }, + { + "epoch": 0.5988963113563752, + "grad_norm": 0.4178537130355835, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 5155 + }, + { + "epoch": 0.5990124891083357, + "grad_norm": 0.4059988558292389, + "learning_rate": 0.0001, + "loss": 1.6947, + "step": 5156 + }, + { + "epoch": 0.5991286668602962, + "grad_norm": 0.44386255741119385, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 5157 + }, + { + "epoch": 0.5992448446122568, + "grad_norm": 0.430132657289505, + "learning_rate": 0.0001, + "loss": 1.7439, + "step": 5158 + }, + { + "epoch": 0.5993610223642173, + "grad_norm": 0.4542747735977173, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 5159 + }, + { + "epoch": 0.5994772001161778, + "grad_norm": 0.4200115203857422, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 5160 + }, + { + "epoch": 0.5995933778681383, + "grad_norm": 0.4746837615966797, + "learning_rate": 0.0001, + "loss": 1.758, + "step": 5161 + }, + { + "epoch": 0.5997095556200988, + "grad_norm": 0.45345309376716614, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 5162 + }, + { + "epoch": 0.5998257333720592, + "grad_norm": 0.4250603914260864, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 5163 + }, + { + "epoch": 0.5999419111240197, + "grad_norm": 0.4513823688030243, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 5164 + }, + { + "epoch": 0.6000580888759802, + "grad_norm": 0.4406159520149231, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 5165 + }, + { + "epoch": 0.6001742666279407, + "grad_norm": 0.45702022314071655, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 5166 + }, + { + "epoch": 0.6002904443799012, + "grad_norm": 0.4588814079761505, + "learning_rate": 0.0001, + "loss": 1.7822, + "step": 5167 + }, + { + "epoch": 0.6004066221318618, + "grad_norm": 0.4281061291694641, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 5168 + }, + { + "epoch": 0.6005227998838223, + "grad_norm": 0.42194947600364685, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 5169 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.43042877316474915, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 5170 + }, + { + "epoch": 0.6007551553877433, + "grad_norm": 0.40578198432922363, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 5171 + }, + { + "epoch": 0.6008713331397038, + "grad_norm": 0.41954731941223145, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 5172 + }, + { + "epoch": 0.6009875108916642, + "grad_norm": 0.4439033567905426, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 5173 + }, + { + "epoch": 0.6011036886436247, + "grad_norm": 0.41968825459480286, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 5174 + }, + { + "epoch": 0.6012198663955852, + "grad_norm": 0.480570524930954, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 5175 + }, + { + "epoch": 0.6013360441475457, + "grad_norm": 0.45514610409736633, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 5176 + }, + { + "epoch": 0.6014522218995062, + "grad_norm": 0.4680652618408203, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 5177 + }, + { + "epoch": 0.6015683996514667, + "grad_norm": 0.47438210248947144, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 5178 + }, + { + "epoch": 0.6016845774034273, + "grad_norm": 0.41302818059921265, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 5179 + }, + { + "epoch": 0.6018007551553878, + "grad_norm": 0.4282575845718384, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 5180 + }, + { + "epoch": 0.6019169329073483, + "grad_norm": 0.4566362202167511, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 5181 + }, + { + "epoch": 0.6020331106593088, + "grad_norm": 0.42509856820106506, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 5182 + }, + { + "epoch": 0.6021492884112692, + "grad_norm": 0.4111645817756653, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 5183 + }, + { + "epoch": 0.6022654661632297, + "grad_norm": 0.45297473669052124, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 5184 + }, + { + "epoch": 0.6023816439151902, + "grad_norm": 0.4520341157913208, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 5185 + }, + { + "epoch": 0.6024978216671507, + "grad_norm": 0.4603181481361389, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 5186 + }, + { + "epoch": 0.6026139994191112, + "grad_norm": 0.4459368884563446, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 5187 + }, + { + "epoch": 0.6027301771710717, + "grad_norm": 0.42652809619903564, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 5188 + }, + { + "epoch": 0.6028463549230323, + "grad_norm": 0.41247275471687317, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 5189 + }, + { + "epoch": 0.6029625326749928, + "grad_norm": 0.48231571912765503, + "learning_rate": 0.0001, + "loss": 1.8699, + "step": 5190 + }, + { + "epoch": 0.6030787104269533, + "grad_norm": 0.43310195207595825, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 5191 + }, + { + "epoch": 0.6031948881789138, + "grad_norm": 0.4534681737422943, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 5192 + }, + { + "epoch": 0.6033110659308742, + "grad_norm": 0.41649046540260315, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 5193 + }, + { + "epoch": 0.6034272436828347, + "grad_norm": 0.4298163950443268, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 5194 + }, + { + "epoch": 0.6035434214347952, + "grad_norm": 0.43806689977645874, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 5195 + }, + { + "epoch": 0.6036595991867557, + "grad_norm": 0.4382534325122833, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 5196 + }, + { + "epoch": 0.6037757769387162, + "grad_norm": 0.44435277581214905, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 5197 + }, + { + "epoch": 0.6038919546906767, + "grad_norm": 0.42443951964378357, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 5198 + }, + { + "epoch": 0.6040081324426372, + "grad_norm": 0.43603894114494324, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 5199 + }, + { + "epoch": 0.6041243101945978, + "grad_norm": 0.43126562237739563, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 5200 + }, + { + "epoch": 0.6042404879465583, + "grad_norm": 0.42838260531425476, + "learning_rate": 0.0001, + "loss": 1.7171, + "step": 5201 + }, + { + "epoch": 0.6043566656985188, + "grad_norm": 0.43312254548072815, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 5202 + }, + { + "epoch": 0.6044728434504792, + "grad_norm": 0.4641507863998413, + "learning_rate": 0.0001, + "loss": 1.7897, + "step": 5203 + }, + { + "epoch": 0.6045890212024397, + "grad_norm": 0.4522157311439514, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 5204 + }, + { + "epoch": 0.6047051989544002, + "grad_norm": 0.4437788128852844, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 5205 + }, + { + "epoch": 0.6048213767063607, + "grad_norm": 0.40446487069129944, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 5206 + }, + { + "epoch": 0.6049375544583212, + "grad_norm": 0.47073522210121155, + "learning_rate": 0.0001, + "loss": 1.92, + "step": 5207 + }, + { + "epoch": 0.6050537322102817, + "grad_norm": 0.4400515556335449, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 5208 + }, + { + "epoch": 0.6051699099622422, + "grad_norm": 0.41592809557914734, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 5209 + }, + { + "epoch": 0.6052860877142028, + "grad_norm": 0.44804421067237854, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 5210 + }, + { + "epoch": 0.6054022654661633, + "grad_norm": 0.4076915979385376, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 5211 + }, + { + "epoch": 0.6055184432181238, + "grad_norm": 0.42822811007499695, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 5212 + }, + { + "epoch": 0.6056346209700842, + "grad_norm": 0.40919166803359985, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 5213 + }, + { + "epoch": 0.6057507987220447, + "grad_norm": 0.4067077040672302, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 5214 + }, + { + "epoch": 0.6058669764740052, + "grad_norm": 0.44207772612571716, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 5215 + }, + { + "epoch": 0.6059831542259657, + "grad_norm": 0.4533270001411438, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 5216 + }, + { + "epoch": 0.6060993319779262, + "grad_norm": 0.4846879839897156, + "learning_rate": 0.0001, + "loss": 1.8105, + "step": 5217 + }, + { + "epoch": 0.6062155097298867, + "grad_norm": 0.48439884185791016, + "learning_rate": 0.0001, + "loss": 1.8875, + "step": 5218 + }, + { + "epoch": 0.6063316874818472, + "grad_norm": 0.418059766292572, + "learning_rate": 0.0001, + "loss": 1.4664, + "step": 5219 + }, + { + "epoch": 0.6064478652338078, + "grad_norm": 0.4088701903820038, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 5220 + }, + { + "epoch": 0.6065640429857683, + "grad_norm": 0.4230385720729828, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 5221 + }, + { + "epoch": 0.6066802207377288, + "grad_norm": 0.4391116797924042, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 5222 + }, + { + "epoch": 0.6067963984896892, + "grad_norm": 0.44503310322761536, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 5223 + }, + { + "epoch": 0.6069125762416497, + "grad_norm": 0.4621829390525818, + "learning_rate": 0.0001, + "loss": 1.743, + "step": 5224 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 0.4562551975250244, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 5225 + }, + { + "epoch": 0.6071449317455707, + "grad_norm": 0.4189031720161438, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 5226 + }, + { + "epoch": 0.6072611094975312, + "grad_norm": 0.45642954111099243, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 5227 + }, + { + "epoch": 0.6073772872494917, + "grad_norm": 0.4381450116634369, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 5228 + }, + { + "epoch": 0.6074934650014522, + "grad_norm": 0.4203230142593384, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 5229 + }, + { + "epoch": 0.6076096427534127, + "grad_norm": 0.41312652826309204, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 5230 + }, + { + "epoch": 0.6077258205053733, + "grad_norm": 0.46580955386161804, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 5231 + }, + { + "epoch": 0.6078419982573338, + "grad_norm": 0.42524558305740356, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 5232 + }, + { + "epoch": 0.6079581760092942, + "grad_norm": 0.43027788400650024, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 5233 + }, + { + "epoch": 0.6080743537612547, + "grad_norm": 0.4252438545227051, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 5234 + }, + { + "epoch": 0.6081905315132152, + "grad_norm": 0.4131147861480713, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 5235 + }, + { + "epoch": 0.6083067092651757, + "grad_norm": 0.48660963773727417, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 5236 + }, + { + "epoch": 0.6084228870171362, + "grad_norm": 0.43840891122817993, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 5237 + }, + { + "epoch": 0.6085390647690967, + "grad_norm": 0.48248180747032166, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 5238 + }, + { + "epoch": 0.6086552425210572, + "grad_norm": 0.45849937200546265, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 5239 + }, + { + "epoch": 0.6087714202730177, + "grad_norm": 0.43766871094703674, + "learning_rate": 0.0001, + "loss": 1.741, + "step": 5240 + }, + { + "epoch": 0.6088875980249783, + "grad_norm": 0.41059789061546326, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 5241 + }, + { + "epoch": 0.6090037757769388, + "grad_norm": 0.40786266326904297, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 5242 + }, + { + "epoch": 0.6091199535288992, + "grad_norm": 0.4507090747356415, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 5243 + }, + { + "epoch": 0.6092361312808597, + "grad_norm": 0.45171281695365906, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 5244 + }, + { + "epoch": 0.6093523090328202, + "grad_norm": 0.4520759582519531, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 5245 + }, + { + "epoch": 0.6094684867847807, + "grad_norm": 0.4407515823841095, + "learning_rate": 0.0001, + "loss": 1.4963, + "step": 5246 + }, + { + "epoch": 0.6095846645367412, + "grad_norm": 0.49019014835357666, + "learning_rate": 0.0001, + "loss": 1.7029, + "step": 5247 + }, + { + "epoch": 0.6097008422887017, + "grad_norm": 0.4346254765987396, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 5248 + }, + { + "epoch": 0.6098170200406622, + "grad_norm": 0.43381422758102417, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 5249 + }, + { + "epoch": 0.6099331977926227, + "grad_norm": 0.43752437829971313, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 5250 + }, + { + "epoch": 0.6100493755445832, + "grad_norm": 0.4531850814819336, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 5251 + }, + { + "epoch": 0.6101655532965438, + "grad_norm": 0.4308491051197052, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 5252 + }, + { + "epoch": 0.6102817310485043, + "grad_norm": 0.45477020740509033, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 5253 + }, + { + "epoch": 0.6103979088004647, + "grad_norm": 0.42559128999710083, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 5254 + }, + { + "epoch": 0.6105140865524252, + "grad_norm": 0.45015910267829895, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 5255 + }, + { + "epoch": 0.6106302643043857, + "grad_norm": 0.44628676772117615, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 5256 + }, + { + "epoch": 0.6107464420563462, + "grad_norm": 0.4328848421573639, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 5257 + }, + { + "epoch": 0.6108626198083067, + "grad_norm": 0.42768430709838867, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 5258 + }, + { + "epoch": 0.6109787975602672, + "grad_norm": 0.4138183891773224, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 5259 + }, + { + "epoch": 0.6110949753122277, + "grad_norm": 0.40088340640068054, + "learning_rate": 0.0001, + "loss": 1.2582, + "step": 5260 + }, + { + "epoch": 0.6112111530641882, + "grad_norm": 0.47067928314208984, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 5261 + }, + { + "epoch": 0.6113273308161488, + "grad_norm": 0.4110230803489685, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 5262 + }, + { + "epoch": 0.6114435085681093, + "grad_norm": 0.4310549795627594, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 5263 + }, + { + "epoch": 0.6115596863200697, + "grad_norm": 0.4675642251968384, + "learning_rate": 0.0001, + "loss": 1.7689, + "step": 5264 + }, + { + "epoch": 0.6116758640720302, + "grad_norm": 0.4433719217777252, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 5265 + }, + { + "epoch": 0.6117920418239907, + "grad_norm": 0.4305296242237091, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 5266 + }, + { + "epoch": 0.6119082195759512, + "grad_norm": 0.42353424429893494, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 5267 + }, + { + "epoch": 0.6120243973279117, + "grad_norm": 0.43402886390686035, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 5268 + }, + { + "epoch": 0.6121405750798722, + "grad_norm": 0.41518083214759827, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 5269 + }, + { + "epoch": 0.6122567528318327, + "grad_norm": 0.42695778608322144, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 5270 + }, + { + "epoch": 0.6123729305837932, + "grad_norm": 0.41428840160369873, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 5271 + }, + { + "epoch": 0.6124891083357537, + "grad_norm": 0.42342090606689453, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 5272 + }, + { + "epoch": 0.6126052860877143, + "grad_norm": 0.479927122592926, + "learning_rate": 0.0001, + "loss": 1.838, + "step": 5273 + }, + { + "epoch": 0.6127214638396747, + "grad_norm": 0.4496026039123535, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 5274 + }, + { + "epoch": 0.6128376415916352, + "grad_norm": 0.4338063895702362, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 5275 + }, + { + "epoch": 0.6129538193435957, + "grad_norm": 0.4147947132587433, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 5276 + }, + { + "epoch": 0.6130699970955562, + "grad_norm": 0.4108210802078247, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 5277 + }, + { + "epoch": 0.6131861748475167, + "grad_norm": 0.40176138281822205, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 5278 + }, + { + "epoch": 0.6133023525994772, + "grad_norm": 0.44862088561058044, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 5279 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.4072551429271698, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 5280 + }, + { + "epoch": 0.6135347081033982, + "grad_norm": 0.47543540596961975, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 5281 + }, + { + "epoch": 0.6136508858553587, + "grad_norm": 0.4115746021270752, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 5282 + }, + { + "epoch": 0.6137670636073193, + "grad_norm": 0.42444461584091187, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 5283 + }, + { + "epoch": 0.6138832413592797, + "grad_norm": 0.4719880223274231, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 5284 + }, + { + "epoch": 0.6139994191112402, + "grad_norm": 0.4645368158817291, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 5285 + }, + { + "epoch": 0.6141155968632007, + "grad_norm": 0.4200696349143982, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 5286 + }, + { + "epoch": 0.6142317746151612, + "grad_norm": 0.439284086227417, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 5287 + }, + { + "epoch": 0.6143479523671217, + "grad_norm": 0.42954742908477783, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 5288 + }, + { + "epoch": 0.6144641301190822, + "grad_norm": 0.4379776120185852, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 5289 + }, + { + "epoch": 0.6145803078710427, + "grad_norm": 0.4193497896194458, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 5290 + }, + { + "epoch": 0.6146964856230032, + "grad_norm": 0.43286967277526855, + "learning_rate": 0.0001, + "loss": 1.7413, + "step": 5291 + }, + { + "epoch": 0.6148126633749637, + "grad_norm": 0.43035081028938293, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 5292 + }, + { + "epoch": 0.6149288411269241, + "grad_norm": 0.43507617712020874, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 5293 + }, + { + "epoch": 0.6150450188788847, + "grad_norm": 0.3959396779537201, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 5294 + }, + { + "epoch": 0.6151611966308452, + "grad_norm": 0.45225653052330017, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 5295 + }, + { + "epoch": 0.6152773743828057, + "grad_norm": 0.40767356753349304, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 5296 + }, + { + "epoch": 0.6153935521347662, + "grad_norm": 0.4382137954235077, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 5297 + }, + { + "epoch": 0.6155097298867267, + "grad_norm": 0.44092151522636414, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 5298 + }, + { + "epoch": 0.6156259076386872, + "grad_norm": 0.46326959133148193, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 5299 + }, + { + "epoch": 0.6157420853906477, + "grad_norm": 0.4669477343559265, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 5300 + }, + { + "epoch": 0.6158582631426082, + "grad_norm": 0.42557385563850403, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 5301 + }, + { + "epoch": 0.6159744408945687, + "grad_norm": 0.4492860436439514, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 5302 + }, + { + "epoch": 0.6160906186465291, + "grad_norm": 0.43879732489585876, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 5303 + }, + { + "epoch": 0.6162067963984897, + "grad_norm": 0.43372800946235657, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 5304 + }, + { + "epoch": 0.6163229741504502, + "grad_norm": 0.43956896662712097, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 5305 + }, + { + "epoch": 0.6164391519024107, + "grad_norm": 0.44320589303970337, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 5306 + }, + { + "epoch": 0.6165553296543712, + "grad_norm": 0.44610658288002014, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 5307 + }, + { + "epoch": 0.6166715074063317, + "grad_norm": 0.4208918511867523, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 5308 + }, + { + "epoch": 0.6167876851582922, + "grad_norm": 0.43007153272628784, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 5309 + }, + { + "epoch": 0.6169038629102527, + "grad_norm": 0.4086022973060608, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 5310 + }, + { + "epoch": 0.6170200406622132, + "grad_norm": 0.4730951189994812, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 5311 + }, + { + "epoch": 0.6171362184141737, + "grad_norm": 0.41164976358413696, + "learning_rate": 0.0001, + "loss": 1.3929, + "step": 5312 + }, + { + "epoch": 0.6172523961661341, + "grad_norm": 0.43444958329200745, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 5313 + }, + { + "epoch": 0.6173685739180946, + "grad_norm": 0.4134158790111542, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 5314 + }, + { + "epoch": 0.6174847516700552, + "grad_norm": 0.42741191387176514, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 5315 + }, + { + "epoch": 0.6176009294220157, + "grad_norm": 0.42607542872428894, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 5316 + }, + { + "epoch": 0.6177171071739762, + "grad_norm": 0.450953871011734, + "learning_rate": 0.0001, + "loss": 1.7549, + "step": 5317 + }, + { + "epoch": 0.6178332849259367, + "grad_norm": 0.4400138556957245, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 5318 + }, + { + "epoch": 0.6179494626778972, + "grad_norm": 0.4217263460159302, + "learning_rate": 0.0001, + "loss": 1.6168, + "step": 5319 + }, + { + "epoch": 0.6180656404298577, + "grad_norm": 0.4471561908721924, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 5320 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 0.45624133944511414, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 5321 + }, + { + "epoch": 0.6182979959337787, + "grad_norm": 0.4240606129169464, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 5322 + }, + { + "epoch": 0.6184141736857391, + "grad_norm": 0.41618114709854126, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 5323 + }, + { + "epoch": 0.6185303514376996, + "grad_norm": 0.4757783114910126, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 5324 + }, + { + "epoch": 0.6186465291896602, + "grad_norm": 0.45356112718582153, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 5325 + }, + { + "epoch": 0.6187627069416207, + "grad_norm": 0.42001453042030334, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 5326 + }, + { + "epoch": 0.6188788846935812, + "grad_norm": 0.4233144521713257, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 5327 + }, + { + "epoch": 0.6189950624455417, + "grad_norm": 0.451232373714447, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 5328 + }, + { + "epoch": 0.6191112401975022, + "grad_norm": 0.4210103750228882, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 5329 + }, + { + "epoch": 0.6192274179494627, + "grad_norm": 0.43561357259750366, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 5330 + }, + { + "epoch": 0.6193435957014232, + "grad_norm": 0.43155673146247864, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 5331 + }, + { + "epoch": 0.6194597734533837, + "grad_norm": 0.45339235663414, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 5332 + }, + { + "epoch": 0.6195759512053441, + "grad_norm": 0.5200868248939514, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 5333 + }, + { + "epoch": 0.6196921289573046, + "grad_norm": 0.45999056100845337, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 5334 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 0.46080732345581055, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 5335 + }, + { + "epoch": 0.6199244844612257, + "grad_norm": 0.4588463008403778, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 5336 + }, + { + "epoch": 0.6200406622131862, + "grad_norm": 0.4329404830932617, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 5337 + }, + { + "epoch": 0.6201568399651467, + "grad_norm": 0.42099177837371826, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 5338 + }, + { + "epoch": 0.6202730177171072, + "grad_norm": 0.4456626772880554, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 5339 + }, + { + "epoch": 0.6203891954690677, + "grad_norm": 0.45711061358451843, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 5340 + }, + { + "epoch": 0.6205053732210282, + "grad_norm": 0.4267137944698334, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 5341 + }, + { + "epoch": 0.6206215509729887, + "grad_norm": 0.438827782869339, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 5342 + }, + { + "epoch": 0.6207377287249491, + "grad_norm": 0.43408849835395813, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 5343 + }, + { + "epoch": 0.6208539064769096, + "grad_norm": 0.5134710669517517, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 5344 + }, + { + "epoch": 0.6209700842288701, + "grad_norm": 0.47032633423805237, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 5345 + }, + { + "epoch": 0.6210862619808307, + "grad_norm": 0.47240790724754333, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 5346 + }, + { + "epoch": 0.6212024397327912, + "grad_norm": 0.46300870180130005, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 5347 + }, + { + "epoch": 0.6213186174847517, + "grad_norm": 0.4472779333591461, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 5348 + }, + { + "epoch": 0.6214347952367122, + "grad_norm": 0.45526379346847534, + "learning_rate": 0.0001, + "loss": 1.676, + "step": 5349 + }, + { + "epoch": 0.6215509729886727, + "grad_norm": 0.4606863260269165, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 5350 + }, + { + "epoch": 0.6216671507406332, + "grad_norm": 0.4486534297466278, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 5351 + }, + { + "epoch": 0.6217833284925937, + "grad_norm": 0.5138721466064453, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 5352 + }, + { + "epoch": 0.6218995062445541, + "grad_norm": 0.43069109320640564, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 5353 + }, + { + "epoch": 0.6220156839965146, + "grad_norm": 0.4056498408317566, + "learning_rate": 0.0001, + "loss": 1.5213, + "step": 5354 + }, + { + "epoch": 0.6221318617484751, + "grad_norm": 0.4548470079898834, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 5355 + }, + { + "epoch": 0.6222480395004356, + "grad_norm": 0.4468959867954254, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 5356 + }, + { + "epoch": 0.6223642172523962, + "grad_norm": 0.4434147775173187, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 5357 + }, + { + "epoch": 0.6224803950043567, + "grad_norm": 0.4365995526313782, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 5358 + }, + { + "epoch": 0.6225965727563172, + "grad_norm": 0.42630013823509216, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 5359 + }, + { + "epoch": 0.6227127505082777, + "grad_norm": 0.4268414378166199, + "learning_rate": 0.0001, + "loss": 1.727, + "step": 5360 + }, + { + "epoch": 0.6228289282602382, + "grad_norm": 0.4438273012638092, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 5361 + }, + { + "epoch": 0.6229451060121987, + "grad_norm": 0.4021027088165283, + "learning_rate": 0.0001, + "loss": 1.3959, + "step": 5362 + }, + { + "epoch": 0.6230612837641591, + "grad_norm": 0.40190452337265015, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 5363 + }, + { + "epoch": 0.6231774615161196, + "grad_norm": 0.45445725321769714, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 5364 + }, + { + "epoch": 0.6232936392680801, + "grad_norm": 0.41883429884910583, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 5365 + }, + { + "epoch": 0.6234098170200406, + "grad_norm": 0.4443962574005127, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 5366 + }, + { + "epoch": 0.6235259947720012, + "grad_norm": 0.4436537027359009, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 5367 + }, + { + "epoch": 0.6236421725239617, + "grad_norm": 0.4414960741996765, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 5368 + }, + { + "epoch": 0.6237583502759222, + "grad_norm": 0.43961644172668457, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 5369 + }, + { + "epoch": 0.6238745280278827, + "grad_norm": 0.42468035221099854, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 5370 + }, + { + "epoch": 0.6239907057798432, + "grad_norm": 0.40939033031463623, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 5371 + }, + { + "epoch": 0.6241068835318037, + "grad_norm": 0.43494606018066406, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 5372 + }, + { + "epoch": 0.6242230612837641, + "grad_norm": 0.4493979513645172, + "learning_rate": 0.0001, + "loss": 1.7368, + "step": 5373 + }, + { + "epoch": 0.6243392390357246, + "grad_norm": 0.3963182270526886, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 5374 + }, + { + "epoch": 0.6244554167876851, + "grad_norm": 0.4189557731151581, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 5375 + }, + { + "epoch": 0.6245715945396456, + "grad_norm": 0.449720174074173, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 5376 + }, + { + "epoch": 0.6246877722916061, + "grad_norm": 0.4135220944881439, + "learning_rate": 0.0001, + "loss": 1.6108, + "step": 5377 + }, + { + "epoch": 0.6248039500435667, + "grad_norm": 0.47096794843673706, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 5378 + }, + { + "epoch": 0.6249201277955272, + "grad_norm": 0.4510813057422638, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 5379 + }, + { + "epoch": 0.6250363055474877, + "grad_norm": 0.43997013568878174, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 5380 + }, + { + "epoch": 0.6251524832994482, + "grad_norm": 0.4731866419315338, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 5381 + }, + { + "epoch": 0.6252686610514087, + "grad_norm": 0.44134780764579773, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 5382 + }, + { + "epoch": 0.6253848388033691, + "grad_norm": 0.5102000832557678, + "learning_rate": 0.0001, + "loss": 1.7645, + "step": 5383 + }, + { + "epoch": 0.6255010165553296, + "grad_norm": 0.43674057722091675, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 5384 + }, + { + "epoch": 0.6256171943072901, + "grad_norm": 0.47883304953575134, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 5385 + }, + { + "epoch": 0.6257333720592506, + "grad_norm": 0.43253079056739807, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 5386 + }, + { + "epoch": 0.6258495498112111, + "grad_norm": 0.41759157180786133, + "learning_rate": 0.0001, + "loss": 1.6168, + "step": 5387 + }, + { + "epoch": 0.6259657275631717, + "grad_norm": 0.421507865190506, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 5388 + }, + { + "epoch": 0.6260819053151322, + "grad_norm": 0.4465677738189697, + "learning_rate": 0.0001, + "loss": 1.8354, + "step": 5389 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.4306974709033966, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 5390 + }, + { + "epoch": 0.6263142608190532, + "grad_norm": 0.4195205271244049, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 5391 + }, + { + "epoch": 0.6264304385710137, + "grad_norm": 0.44183290004730225, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 5392 + }, + { + "epoch": 0.6265466163229741, + "grad_norm": 0.44616198539733887, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 5393 + }, + { + "epoch": 0.6266627940749346, + "grad_norm": 0.44162172079086304, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 5394 + }, + { + "epoch": 0.6267789718268951, + "grad_norm": 0.46100786328315735, + "learning_rate": 0.0001, + "loss": 1.8678, + "step": 5395 + }, + { + "epoch": 0.6268951495788556, + "grad_norm": 0.4163057208061218, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 5396 + }, + { + "epoch": 0.6270113273308161, + "grad_norm": 0.4567055404186249, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 5397 + }, + { + "epoch": 0.6271275050827766, + "grad_norm": 0.4351733326911926, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 5398 + }, + { + "epoch": 0.6272436828347372, + "grad_norm": 0.4696790277957916, + "learning_rate": 0.0001, + "loss": 1.773, + "step": 5399 + }, + { + "epoch": 0.6273598605866977, + "grad_norm": 0.4156184792518616, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 5400 + }, + { + "epoch": 0.6274760383386582, + "grad_norm": 0.450946182012558, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 5401 + }, + { + "epoch": 0.6275922160906187, + "grad_norm": 0.48097074031829834, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 5402 + }, + { + "epoch": 0.6277083938425791, + "grad_norm": 0.4515106976032257, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 5403 + }, + { + "epoch": 0.6278245715945396, + "grad_norm": 0.4210405647754669, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 5404 + }, + { + "epoch": 0.6279407493465001, + "grad_norm": 0.42682531476020813, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 5405 + }, + { + "epoch": 0.6280569270984606, + "grad_norm": 0.4539082944393158, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 5406 + }, + { + "epoch": 0.6281731048504211, + "grad_norm": 0.43941760063171387, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 5407 + }, + { + "epoch": 0.6282892826023816, + "grad_norm": 0.4232398271560669, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 5408 + }, + { + "epoch": 0.6284054603543422, + "grad_norm": 0.4403201639652252, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 5409 + }, + { + "epoch": 0.6285216381063027, + "grad_norm": 0.45847928524017334, + "learning_rate": 0.0001, + "loss": 1.7491, + "step": 5410 + }, + { + "epoch": 0.6286378158582632, + "grad_norm": 0.4543238878250122, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 5411 + }, + { + "epoch": 0.6287539936102237, + "grad_norm": 0.4506990611553192, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 5412 + }, + { + "epoch": 0.6288701713621841, + "grad_norm": 0.38618630170822144, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 5413 + }, + { + "epoch": 0.6289863491141446, + "grad_norm": 0.48193633556365967, + "learning_rate": 0.0001, + "loss": 1.8388, + "step": 5414 + }, + { + "epoch": 0.6291025268661051, + "grad_norm": 0.45501387119293213, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 5415 + }, + { + "epoch": 0.6292187046180656, + "grad_norm": 0.43010449409484863, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 5416 + }, + { + "epoch": 0.6293348823700261, + "grad_norm": 0.42179417610168457, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 5417 + }, + { + "epoch": 0.6294510601219866, + "grad_norm": 0.4156195819377899, + "learning_rate": 0.0001, + "loss": 1.3938, + "step": 5418 + }, + { + "epoch": 0.6295672378739472, + "grad_norm": 0.4323188364505768, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 5419 + }, + { + "epoch": 0.6296834156259077, + "grad_norm": 0.4095346927642822, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 5420 + }, + { + "epoch": 0.6297995933778682, + "grad_norm": 0.4441397190093994, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 5421 + }, + { + "epoch": 0.6299157711298287, + "grad_norm": 0.4608078896999359, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 5422 + }, + { + "epoch": 0.6300319488817892, + "grad_norm": 0.4394521415233612, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 5423 + }, + { + "epoch": 0.6301481266337496, + "grad_norm": 0.4374762773513794, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 5424 + }, + { + "epoch": 0.6302643043857101, + "grad_norm": 0.49020811915397644, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 5425 + }, + { + "epoch": 0.6303804821376706, + "grad_norm": 0.45695215463638306, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 5426 + }, + { + "epoch": 0.6304966598896311, + "grad_norm": 0.4443933367729187, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 5427 + }, + { + "epoch": 0.6306128376415916, + "grad_norm": 0.47112321853637695, + "learning_rate": 0.0001, + "loss": 1.8038, + "step": 5428 + }, + { + "epoch": 0.6307290153935521, + "grad_norm": 0.43735185265541077, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 5429 + }, + { + "epoch": 0.6308451931455127, + "grad_norm": 0.42006251215934753, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 5430 + }, + { + "epoch": 0.6309613708974732, + "grad_norm": 0.4126074016094208, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 5431 + }, + { + "epoch": 0.6310775486494337, + "grad_norm": 0.4392840564250946, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 5432 + }, + { + "epoch": 0.6311937264013942, + "grad_norm": 0.4268054962158203, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 5433 + }, + { + "epoch": 0.6313099041533546, + "grad_norm": 0.4118890166282654, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 5434 + }, + { + "epoch": 0.6314260819053151, + "grad_norm": 0.4566260874271393, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 5435 + }, + { + "epoch": 0.6315422596572756, + "grad_norm": 0.4307934045791626, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 5436 + }, + { + "epoch": 0.6316584374092361, + "grad_norm": 0.4302327632904053, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 5437 + }, + { + "epoch": 0.6317746151611966, + "grad_norm": 0.4081536531448364, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 5438 + }, + { + "epoch": 0.6318907929131571, + "grad_norm": 0.43301841616630554, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 5439 + }, + { + "epoch": 0.6320069706651177, + "grad_norm": 0.4393984377384186, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 5440 + }, + { + "epoch": 0.6321231484170782, + "grad_norm": 0.4503820240497589, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 5441 + }, + { + "epoch": 0.6322393261690387, + "grad_norm": 0.4837632179260254, + "learning_rate": 0.0001, + "loss": 1.7753, + "step": 5442 + }, + { + "epoch": 0.6323555039209992, + "grad_norm": 0.4605303108692169, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 5443 + }, + { + "epoch": 0.6324716816729596, + "grad_norm": 0.4571518003940582, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 5444 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 0.4223042130470276, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 5445 + }, + { + "epoch": 0.6327040371768806, + "grad_norm": 0.4364268183708191, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 5446 + }, + { + "epoch": 0.6328202149288411, + "grad_norm": 0.43388858437538147, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 5447 + }, + { + "epoch": 0.6329363926808016, + "grad_norm": 0.4458199143409729, + "learning_rate": 0.0001, + "loss": 1.7514, + "step": 5448 + }, + { + "epoch": 0.6330525704327621, + "grad_norm": 0.43075671792030334, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 5449 + }, + { + "epoch": 0.6331687481847226, + "grad_norm": 0.42168936133384705, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 5450 + }, + { + "epoch": 0.6332849259366832, + "grad_norm": 0.41019555926322937, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 5451 + }, + { + "epoch": 0.6334011036886437, + "grad_norm": 0.4191059470176697, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 5452 + }, + { + "epoch": 0.6335172814406042, + "grad_norm": 0.4607682526111603, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 5453 + }, + { + "epoch": 0.6336334591925646, + "grad_norm": 0.4602360725402832, + "learning_rate": 0.0001, + "loss": 1.7269, + "step": 5454 + }, + { + "epoch": 0.6337496369445251, + "grad_norm": 0.4206385910511017, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 5455 + }, + { + "epoch": 0.6338658146964856, + "grad_norm": 0.427676796913147, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 5456 + }, + { + "epoch": 0.6339819924484461, + "grad_norm": 0.47075173258781433, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 5457 + }, + { + "epoch": 0.6340981702004066, + "grad_norm": 0.45573490858078003, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 5458 + }, + { + "epoch": 0.6342143479523671, + "grad_norm": 0.43033167719841003, + "learning_rate": 0.0001, + "loss": 1.8298, + "step": 5459 + }, + { + "epoch": 0.6343305257043276, + "grad_norm": 0.429253488779068, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 5460 + }, + { + "epoch": 0.6344467034562882, + "grad_norm": 0.4369482696056366, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 5461 + }, + { + "epoch": 0.6345628812082487, + "grad_norm": 0.4227111041545868, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 5462 + }, + { + "epoch": 0.6346790589602092, + "grad_norm": 0.42353329062461853, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 5463 + }, + { + "epoch": 0.6347952367121696, + "grad_norm": 0.44350820779800415, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 5464 + }, + { + "epoch": 0.6349114144641301, + "grad_norm": 0.42905986309051514, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 5465 + }, + { + "epoch": 0.6350275922160906, + "grad_norm": 0.4360484480857849, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 5466 + }, + { + "epoch": 0.6351437699680511, + "grad_norm": 0.4489285349845886, + "learning_rate": 0.0001, + "loss": 1.7598, + "step": 5467 + }, + { + "epoch": 0.6352599477200116, + "grad_norm": 0.43288835883140564, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 5468 + }, + { + "epoch": 0.6353761254719721, + "grad_norm": 0.4269554316997528, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 5469 + }, + { + "epoch": 0.6354923032239326, + "grad_norm": 0.43894824385643005, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 5470 + }, + { + "epoch": 0.6356084809758931, + "grad_norm": 0.42554864287376404, + "learning_rate": 0.0001, + "loss": 1.382, + "step": 5471 + }, + { + "epoch": 0.6357246587278537, + "grad_norm": 0.43490105867385864, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 5472 + }, + { + "epoch": 0.6358408364798142, + "grad_norm": 0.41975313425064087, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 5473 + }, + { + "epoch": 0.6359570142317746, + "grad_norm": 0.43810316920280457, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 5474 + }, + { + "epoch": 0.6360731919837351, + "grad_norm": 0.4118810296058655, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 5475 + }, + { + "epoch": 0.6361893697356956, + "grad_norm": 0.4668201804161072, + "learning_rate": 0.0001, + "loss": 1.7682, + "step": 5476 + }, + { + "epoch": 0.6363055474876561, + "grad_norm": 0.43905314803123474, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 5477 + }, + { + "epoch": 0.6364217252396166, + "grad_norm": 0.43517014384269714, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 5478 + }, + { + "epoch": 0.6365379029915771, + "grad_norm": 0.4257369339466095, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 5479 + }, + { + "epoch": 0.6366540807435376, + "grad_norm": 0.4415474534034729, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 5480 + }, + { + "epoch": 0.6367702584954981, + "grad_norm": 0.48000675439834595, + "learning_rate": 0.0001, + "loss": 1.8075, + "step": 5481 + }, + { + "epoch": 0.6368864362474587, + "grad_norm": 0.433585524559021, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 5482 + }, + { + "epoch": 0.6370026139994192, + "grad_norm": 0.46495142579078674, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 5483 + }, + { + "epoch": 0.6371187917513796, + "grad_norm": 0.4620204567909241, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 5484 + }, + { + "epoch": 0.6372349695033401, + "grad_norm": 0.457280695438385, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 5485 + }, + { + "epoch": 0.6373511472553006, + "grad_norm": 0.40286630392074585, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 5486 + }, + { + "epoch": 0.6374673250072611, + "grad_norm": 0.41824090480804443, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 5487 + }, + { + "epoch": 0.6375835027592216, + "grad_norm": 0.4374508261680603, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 5488 + }, + { + "epoch": 0.6376996805111821, + "grad_norm": 0.44496670365333557, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 5489 + }, + { + "epoch": 0.6378158582631426, + "grad_norm": 0.49528950452804565, + "learning_rate": 0.0001, + "loss": 1.9251, + "step": 5490 + }, + { + "epoch": 0.6379320360151031, + "grad_norm": 0.4341500401496887, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 5491 + }, + { + "epoch": 0.6380482137670636, + "grad_norm": 0.4738743305206299, + "learning_rate": 0.0001, + "loss": 1.8318, + "step": 5492 + }, + { + "epoch": 0.6381643915190242, + "grad_norm": 0.4284060001373291, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 5493 + }, + { + "epoch": 0.6382805692709846, + "grad_norm": 0.4526841938495636, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 5494 + }, + { + "epoch": 0.6383967470229451, + "grad_norm": 0.47774431109428406, + "learning_rate": 0.0001, + "loss": 1.8372, + "step": 5495 + }, + { + "epoch": 0.6385129247749056, + "grad_norm": 0.41493383049964905, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 5496 + }, + { + "epoch": 0.6386291025268661, + "grad_norm": 0.3971651494503021, + "learning_rate": 0.0001, + "loss": 1.3297, + "step": 5497 + }, + { + "epoch": 0.6387452802788266, + "grad_norm": 0.4250967502593994, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 5498 + }, + { + "epoch": 0.6388614580307871, + "grad_norm": 0.4181077778339386, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 5499 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.45691534876823425, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 5500 + }, + { + "epoch": 0.6390938135347081, + "grad_norm": 0.43789535760879517, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 5501 + }, + { + "epoch": 0.6392099912866686, + "grad_norm": 0.42184701561927795, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 5502 + }, + { + "epoch": 0.6393261690386292, + "grad_norm": 0.4412265717983246, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 5503 + }, + { + "epoch": 0.6394423467905896, + "grad_norm": 0.4428151845932007, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 5504 + }, + { + "epoch": 0.6395585245425501, + "grad_norm": 0.4467543959617615, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 5505 + }, + { + "epoch": 0.6396747022945106, + "grad_norm": 0.40767189860343933, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 5506 + }, + { + "epoch": 0.6397908800464711, + "grad_norm": 0.4650069773197174, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 5507 + }, + { + "epoch": 0.6399070577984316, + "grad_norm": 0.4141228199005127, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 5508 + }, + { + "epoch": 0.6400232355503921, + "grad_norm": 0.4306548833847046, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 5509 + }, + { + "epoch": 0.6401394133023526, + "grad_norm": 0.45290401577949524, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 5510 + }, + { + "epoch": 0.6402555910543131, + "grad_norm": 0.43989840149879456, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 5511 + }, + { + "epoch": 0.6403717688062736, + "grad_norm": 0.46134138107299805, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 5512 + }, + { + "epoch": 0.640487946558234, + "grad_norm": 0.4793362617492676, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 5513 + }, + { + "epoch": 0.6406041243101946, + "grad_norm": 0.4687068462371826, + "learning_rate": 0.0001, + "loss": 1.3945, + "step": 5514 + }, + { + "epoch": 0.6407203020621551, + "grad_norm": 0.4695538878440857, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 5515 + }, + { + "epoch": 0.6408364798141156, + "grad_norm": 0.4554864168167114, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 5516 + }, + { + "epoch": 0.6409526575660761, + "grad_norm": 0.45408567786216736, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 5517 + }, + { + "epoch": 0.6410688353180366, + "grad_norm": 0.4672776162624359, + "learning_rate": 0.0001, + "loss": 1.7557, + "step": 5518 + }, + { + "epoch": 0.6411850130699971, + "grad_norm": 0.43636855483055115, + "learning_rate": 0.0001, + "loss": 1.6705, + "step": 5519 + }, + { + "epoch": 0.6413011908219576, + "grad_norm": 0.4704853892326355, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 5520 + }, + { + "epoch": 0.6414173685739181, + "grad_norm": 0.45459243655204773, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 5521 + }, + { + "epoch": 0.6415335463258786, + "grad_norm": 0.4612571895122528, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 5522 + }, + { + "epoch": 0.641649724077839, + "grad_norm": 0.44641435146331787, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 5523 + }, + { + "epoch": 0.6417659018297996, + "grad_norm": 0.4449585974216461, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 5524 + }, + { + "epoch": 0.6418820795817601, + "grad_norm": 0.49576014280319214, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 5525 + }, + { + "epoch": 0.6419982573337206, + "grad_norm": 0.4535317122936249, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 5526 + }, + { + "epoch": 0.6421144350856811, + "grad_norm": 0.43689054250717163, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 5527 + }, + { + "epoch": 0.6422306128376416, + "grad_norm": 0.4465075135231018, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 5528 + }, + { + "epoch": 0.6423467905896021, + "grad_norm": 0.4437648355960846, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 5529 + }, + { + "epoch": 0.6424629683415626, + "grad_norm": 0.43459346890449524, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 5530 + }, + { + "epoch": 0.6425791460935231, + "grad_norm": 0.46662724018096924, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 5531 + }, + { + "epoch": 0.6426953238454836, + "grad_norm": 0.4122602939605713, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 5532 + }, + { + "epoch": 0.642811501597444, + "grad_norm": 0.4282759726047516, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 5533 + }, + { + "epoch": 0.6429276793494045, + "grad_norm": 0.4355936348438263, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 5534 + }, + { + "epoch": 0.6430438571013651, + "grad_norm": 0.4413192570209503, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 5535 + }, + { + "epoch": 0.6431600348533256, + "grad_norm": 0.4456358253955841, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 5536 + }, + { + "epoch": 0.6432762126052861, + "grad_norm": 0.4487074017524719, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 5537 + }, + { + "epoch": 0.6433923903572466, + "grad_norm": 0.4546639323234558, + "learning_rate": 0.0001, + "loss": 1.7977, + "step": 5538 + }, + { + "epoch": 0.6435085681092071, + "grad_norm": 0.47125598788261414, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 5539 + }, + { + "epoch": 0.6436247458611676, + "grad_norm": 0.4681374132633209, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 5540 + }, + { + "epoch": 0.6437409236131281, + "grad_norm": 0.41679567098617554, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 5541 + }, + { + "epoch": 0.6438571013650886, + "grad_norm": 0.4643932282924652, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 5542 + }, + { + "epoch": 0.643973279117049, + "grad_norm": 0.42899298667907715, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 5543 + }, + { + "epoch": 0.6440894568690095, + "grad_norm": 0.44600605964660645, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 5544 + }, + { + "epoch": 0.6442056346209701, + "grad_norm": 0.4713478088378906, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 5545 + }, + { + "epoch": 0.6443218123729306, + "grad_norm": 0.48742958903312683, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 5546 + }, + { + "epoch": 0.6444379901248911, + "grad_norm": 0.4531098008155823, + "learning_rate": 0.0001, + "loss": 1.7098, + "step": 5547 + }, + { + "epoch": 0.6445541678768516, + "grad_norm": 0.4089454710483551, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 5548 + }, + { + "epoch": 0.6446703456288121, + "grad_norm": 0.4439248740673065, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 5549 + }, + { + "epoch": 0.6447865233807726, + "grad_norm": 0.4419444799423218, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 5550 + }, + { + "epoch": 0.6449027011327331, + "grad_norm": 0.43608152866363525, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 5551 + }, + { + "epoch": 0.6450188788846936, + "grad_norm": 0.4767807722091675, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 5552 + }, + { + "epoch": 0.645135056636654, + "grad_norm": 0.4356260597705841, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 5553 + }, + { + "epoch": 0.6452512343886145, + "grad_norm": 0.42101210355758667, + "learning_rate": 0.0001, + "loss": 1.5609, + "step": 5554 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 0.4493582248687744, + "learning_rate": 0.0001, + "loss": 1.3819, + "step": 5555 + }, + { + "epoch": 0.6454835898925356, + "grad_norm": 0.4049502909183502, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 5556 + }, + { + "epoch": 0.6455997676444961, + "grad_norm": 0.42648744583129883, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 5557 + }, + { + "epoch": 0.6457159453964566, + "grad_norm": 0.45338109135627747, + "learning_rate": 0.0001, + "loss": 1.7413, + "step": 5558 + }, + { + "epoch": 0.6458321231484171, + "grad_norm": 0.4680498540401459, + "learning_rate": 0.0001, + "loss": 1.7397, + "step": 5559 + }, + { + "epoch": 0.6459483009003776, + "grad_norm": 0.48937198519706726, + "learning_rate": 0.0001, + "loss": 1.7993, + "step": 5560 + }, + { + "epoch": 0.6460644786523381, + "grad_norm": 0.41471385955810547, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 5561 + }, + { + "epoch": 0.6461806564042986, + "grad_norm": 0.4447079002857208, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 5562 + }, + { + "epoch": 0.646296834156259, + "grad_norm": 0.42607253789901733, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 5563 + }, + { + "epoch": 0.6464130119082195, + "grad_norm": 0.4518924653530121, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 5564 + }, + { + "epoch": 0.64652918966018, + "grad_norm": 0.4289340376853943, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 5565 + }, + { + "epoch": 0.6466453674121406, + "grad_norm": 0.45490723848342896, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 5566 + }, + { + "epoch": 0.6467615451641011, + "grad_norm": 0.4541492462158203, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 5567 + }, + { + "epoch": 0.6468777229160616, + "grad_norm": 0.42619723081588745, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 5568 + }, + { + "epoch": 0.6469939006680221, + "grad_norm": 0.43783965706825256, + "learning_rate": 0.0001, + "loss": 1.7282, + "step": 5569 + }, + { + "epoch": 0.6471100784199826, + "grad_norm": 0.4257659912109375, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 5570 + }, + { + "epoch": 0.6472262561719431, + "grad_norm": 0.4469713270664215, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 5571 + }, + { + "epoch": 0.6473424339239036, + "grad_norm": 0.4462829530239105, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 5572 + }, + { + "epoch": 0.647458611675864, + "grad_norm": 0.4520929455757141, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 5573 + }, + { + "epoch": 0.6475747894278245, + "grad_norm": 0.47630277276039124, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 5574 + }, + { + "epoch": 0.647690967179785, + "grad_norm": 0.435032457113266, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 5575 + }, + { + "epoch": 0.6478071449317455, + "grad_norm": 0.46478384733200073, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 5576 + }, + { + "epoch": 0.6479233226837061, + "grad_norm": 0.4479098618030548, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 5577 + }, + { + "epoch": 0.6480395004356666, + "grad_norm": 0.4304809868335724, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 5578 + }, + { + "epoch": 0.6481556781876271, + "grad_norm": 0.43313372135162354, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 5579 + }, + { + "epoch": 0.6482718559395876, + "grad_norm": 0.4252367615699768, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 5580 + }, + { + "epoch": 0.6483880336915481, + "grad_norm": 0.42418304085731506, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 5581 + }, + { + "epoch": 0.6485042114435086, + "grad_norm": 0.45233771204948425, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 5582 + }, + { + "epoch": 0.648620389195469, + "grad_norm": 0.43534642457962036, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 5583 + }, + { + "epoch": 0.6487365669474295, + "grad_norm": 0.40678098797798157, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 5584 + }, + { + "epoch": 0.64885274469939, + "grad_norm": 0.4426053762435913, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 5585 + }, + { + "epoch": 0.6489689224513505, + "grad_norm": 0.43607258796691895, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 5586 + }, + { + "epoch": 0.6490851002033111, + "grad_norm": 0.43579816818237305, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 5587 + }, + { + "epoch": 0.6492012779552716, + "grad_norm": 0.43486765027046204, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 5588 + }, + { + "epoch": 0.6493174557072321, + "grad_norm": 0.42424148321151733, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 5589 + }, + { + "epoch": 0.6494336334591926, + "grad_norm": 0.42990225553512573, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 5590 + }, + { + "epoch": 0.6495498112111531, + "grad_norm": 0.43480220437049866, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 5591 + }, + { + "epoch": 0.6496659889631136, + "grad_norm": 0.4241524040699005, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 5592 + }, + { + "epoch": 0.649782166715074, + "grad_norm": 0.4739820957183838, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 5593 + }, + { + "epoch": 0.6498983444670345, + "grad_norm": 0.44619935750961304, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 5594 + }, + { + "epoch": 0.650014522218995, + "grad_norm": 0.444965660572052, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 5595 + }, + { + "epoch": 0.6501306999709555, + "grad_norm": 0.4169558882713318, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 5596 + }, + { + "epoch": 0.6502468777229161, + "grad_norm": 0.44122716784477234, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 5597 + }, + { + "epoch": 0.6503630554748766, + "grad_norm": 0.4570583403110504, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 5598 + }, + { + "epoch": 0.6504792332268371, + "grad_norm": 0.4605511426925659, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 5599 + }, + { + "epoch": 0.6505954109787976, + "grad_norm": 0.44260162115097046, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 5600 + }, + { + "epoch": 0.6507115887307581, + "grad_norm": 0.4196053445339203, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 5601 + }, + { + "epoch": 0.6508277664827186, + "grad_norm": 0.44127750396728516, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 5602 + }, + { + "epoch": 0.650943944234679, + "grad_norm": 0.3989742398262024, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 5603 + }, + { + "epoch": 0.6510601219866395, + "grad_norm": 0.4253181517124176, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 5604 + }, + { + "epoch": 0.6511762997386, + "grad_norm": 0.43197429180145264, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 5605 + }, + { + "epoch": 0.6512924774905605, + "grad_norm": 0.41109520196914673, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 5606 + }, + { + "epoch": 0.651408655242521, + "grad_norm": 0.41764628887176514, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 5607 + }, + { + "epoch": 0.6515248329944816, + "grad_norm": 0.4168471395969391, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 5608 + }, + { + "epoch": 0.6516410107464421, + "grad_norm": 0.47764724493026733, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 5609 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.432256281375885, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 5610 + }, + { + "epoch": 0.6518733662503631, + "grad_norm": 0.4086950421333313, + "learning_rate": 0.0001, + "loss": 1.3479, + "step": 5611 + }, + { + "epoch": 0.6519895440023236, + "grad_norm": 0.4752063453197479, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 5612 + }, + { + "epoch": 0.652105721754284, + "grad_norm": 0.41802459955215454, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 5613 + }, + { + "epoch": 0.6522218995062445, + "grad_norm": 0.43375664949417114, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 5614 + }, + { + "epoch": 0.652338077258205, + "grad_norm": 0.4103739559650421, + "learning_rate": 0.0001, + "loss": 1.3726, + "step": 5615 + }, + { + "epoch": 0.6524542550101655, + "grad_norm": 0.4138474464416504, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 5616 + }, + { + "epoch": 0.652570432762126, + "grad_norm": 0.4464971423149109, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 5617 + }, + { + "epoch": 0.6526866105140866, + "grad_norm": 0.4133894741535187, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 5618 + }, + { + "epoch": 0.6528027882660471, + "grad_norm": 0.4327857792377472, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 5619 + }, + { + "epoch": 0.6529189660180076, + "grad_norm": 0.4483093321323395, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 5620 + }, + { + "epoch": 0.6530351437699681, + "grad_norm": 0.43492817878723145, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 5621 + }, + { + "epoch": 0.6531513215219286, + "grad_norm": 0.4094054400920868, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 5622 + }, + { + "epoch": 0.653267499273889, + "grad_norm": 0.4260571599006653, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 5623 + }, + { + "epoch": 0.6533836770258495, + "grad_norm": 0.46979352831840515, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 5624 + }, + { + "epoch": 0.65349985477781, + "grad_norm": 0.4641943871974945, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 5625 + }, + { + "epoch": 0.6536160325297705, + "grad_norm": 0.4234953224658966, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 5626 + }, + { + "epoch": 0.653732210281731, + "grad_norm": 0.4371975064277649, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 5627 + }, + { + "epoch": 0.6538483880336915, + "grad_norm": 0.46449965238571167, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 5628 + }, + { + "epoch": 0.6539645657856521, + "grad_norm": 0.4338880479335785, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 5629 + }, + { + "epoch": 0.6540807435376126, + "grad_norm": 0.4431876540184021, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 5630 + }, + { + "epoch": 0.6541969212895731, + "grad_norm": 0.4568195641040802, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 5631 + }, + { + "epoch": 0.6543130990415336, + "grad_norm": 0.42718222737312317, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 5632 + }, + { + "epoch": 0.654429276793494, + "grad_norm": 0.4288146197795868, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 5633 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.4955886900424957, + "learning_rate": 0.0001, + "loss": 1.8059, + "step": 5634 + }, + { + "epoch": 0.654661632297415, + "grad_norm": 0.46111878752708435, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 5635 + }, + { + "epoch": 0.6547778100493755, + "grad_norm": 0.46014752984046936, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 5636 + }, + { + "epoch": 0.654893987801336, + "grad_norm": 0.4325878322124481, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 5637 + }, + { + "epoch": 0.6550101655532965, + "grad_norm": 0.4380306899547577, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 5638 + }, + { + "epoch": 0.6551263433052571, + "grad_norm": 0.37767666578292847, + "learning_rate": 0.0001, + "loss": 1.1668, + "step": 5639 + }, + { + "epoch": 0.6552425210572176, + "grad_norm": 0.4658316969871521, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 5640 + }, + { + "epoch": 0.6553586988091781, + "grad_norm": 0.4257132411003113, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 5641 + }, + { + "epoch": 0.6554748765611386, + "grad_norm": 0.43129682540893555, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 5642 + }, + { + "epoch": 0.655591054313099, + "grad_norm": 0.4526180326938629, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 5643 + }, + { + "epoch": 0.6557072320650595, + "grad_norm": 0.5669481754302979, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 5644 + }, + { + "epoch": 0.65582340981702, + "grad_norm": 0.4606925845146179, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 5645 + }, + { + "epoch": 0.6559395875689805, + "grad_norm": 0.4286077916622162, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 5646 + }, + { + "epoch": 0.656055765320941, + "grad_norm": 0.422355979681015, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 5647 + }, + { + "epoch": 0.6561719430729015, + "grad_norm": 0.4737590551376343, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 5648 + }, + { + "epoch": 0.656288120824862, + "grad_norm": 0.47408369183540344, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 5649 + }, + { + "epoch": 0.6564042985768226, + "grad_norm": 0.4657224416732788, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 5650 + }, + { + "epoch": 0.6565204763287831, + "grad_norm": 0.4612825810909271, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 5651 + }, + { + "epoch": 0.6566366540807436, + "grad_norm": 0.4196070730686188, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 5652 + }, + { + "epoch": 0.656752831832704, + "grad_norm": 0.43713754415512085, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 5653 + }, + { + "epoch": 0.6568690095846645, + "grad_norm": 0.4284767508506775, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 5654 + }, + { + "epoch": 0.656985187336625, + "grad_norm": 0.4555676281452179, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 5655 + }, + { + "epoch": 0.6571013650885855, + "grad_norm": 0.43046674132347107, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 5656 + }, + { + "epoch": 0.657217542840546, + "grad_norm": 0.4417819678783417, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 5657 + }, + { + "epoch": 0.6573337205925065, + "grad_norm": 0.4391897916793823, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 5658 + }, + { + "epoch": 0.657449898344467, + "grad_norm": 0.4507284164428711, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 5659 + }, + { + "epoch": 0.6575660760964276, + "grad_norm": 0.43838751316070557, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 5660 + }, + { + "epoch": 0.6576822538483881, + "grad_norm": 0.4852067232131958, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 5661 + }, + { + "epoch": 0.6577984316003486, + "grad_norm": 0.416293740272522, + "learning_rate": 0.0001, + "loss": 1.5342, + "step": 5662 + }, + { + "epoch": 0.657914609352309, + "grad_norm": 0.45646652579307556, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 5663 + }, + { + "epoch": 0.6580307871042695, + "grad_norm": 0.43007123470306396, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 5664 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 0.4502851963043213, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 5665 + }, + { + "epoch": 0.6582631426081905, + "grad_norm": 0.44901666045188904, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 5666 + }, + { + "epoch": 0.658379320360151, + "grad_norm": 0.4369884431362152, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 5667 + }, + { + "epoch": 0.6584954981121115, + "grad_norm": 0.4983995854854584, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 5668 + }, + { + "epoch": 0.658611675864072, + "grad_norm": 0.5106979608535767, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 5669 + }, + { + "epoch": 0.6587278536160325, + "grad_norm": 0.4607015550136566, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 5670 + }, + { + "epoch": 0.6588440313679931, + "grad_norm": 0.44511786103248596, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 5671 + }, + { + "epoch": 0.6589602091199536, + "grad_norm": 0.44376087188720703, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 5672 + }, + { + "epoch": 0.659076386871914, + "grad_norm": 0.463748574256897, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 5673 + }, + { + "epoch": 0.6591925646238745, + "grad_norm": 0.4355449676513672, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 5674 + }, + { + "epoch": 0.659308742375835, + "grad_norm": 0.4218558967113495, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 5675 + }, + { + "epoch": 0.6594249201277955, + "grad_norm": 0.4563169479370117, + "learning_rate": 0.0001, + "loss": 1.7053, + "step": 5676 + }, + { + "epoch": 0.659541097879756, + "grad_norm": 0.4865387976169586, + "learning_rate": 0.0001, + "loss": 1.975, + "step": 5677 + }, + { + "epoch": 0.6596572756317165, + "grad_norm": 0.4624812602996826, + "learning_rate": 0.0001, + "loss": 1.7371, + "step": 5678 + }, + { + "epoch": 0.659773453383677, + "grad_norm": 0.4570534825325012, + "learning_rate": 0.0001, + "loss": 1.7626, + "step": 5679 + }, + { + "epoch": 0.6598896311356375, + "grad_norm": 0.42516836524009705, + "learning_rate": 0.0001, + "loss": 1.5708, + "step": 5680 + }, + { + "epoch": 0.6600058088875981, + "grad_norm": 0.4255434572696686, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 5681 + }, + { + "epoch": 0.6601219866395586, + "grad_norm": 0.4116609990596771, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 5682 + }, + { + "epoch": 0.6602381643915191, + "grad_norm": 0.45455899834632874, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 5683 + }, + { + "epoch": 0.6603543421434795, + "grad_norm": 0.4295278787612915, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 5684 + }, + { + "epoch": 0.66047051989544, + "grad_norm": 0.43869319558143616, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 5685 + }, + { + "epoch": 0.6605866976474005, + "grad_norm": 0.4462525546550751, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 5686 + }, + { + "epoch": 0.660702875399361, + "grad_norm": 0.4246017038822174, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 5687 + }, + { + "epoch": 0.6608190531513215, + "grad_norm": 0.43097880482673645, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 5688 + }, + { + "epoch": 0.660935230903282, + "grad_norm": 0.45352938771247864, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 5689 + }, + { + "epoch": 0.6610514086552425, + "grad_norm": 0.46061140298843384, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 5690 + }, + { + "epoch": 0.661167586407203, + "grad_norm": 0.4334444999694824, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 5691 + }, + { + "epoch": 0.6612837641591636, + "grad_norm": 0.43552273511886597, + "learning_rate": 0.0001, + "loss": 1.7532, + "step": 5692 + }, + { + "epoch": 0.6613999419111241, + "grad_norm": 0.4349510669708252, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 5693 + }, + { + "epoch": 0.6615161196630845, + "grad_norm": 0.45464491844177246, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 5694 + }, + { + "epoch": 0.661632297415045, + "grad_norm": 0.44270920753479004, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 5695 + }, + { + "epoch": 0.6617484751670055, + "grad_norm": 0.4432513117790222, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 5696 + }, + { + "epoch": 0.661864652918966, + "grad_norm": 0.4541747272014618, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 5697 + }, + { + "epoch": 0.6619808306709265, + "grad_norm": 0.43986377120018005, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 5698 + }, + { + "epoch": 0.662097008422887, + "grad_norm": 0.463550329208374, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 5699 + }, + { + "epoch": 0.6622131861748475, + "grad_norm": 0.42243123054504395, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 5700 + }, + { + "epoch": 0.662329363926808, + "grad_norm": 0.4550102651119232, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 5701 + }, + { + "epoch": 0.6624455416787686, + "grad_norm": 0.48717159032821655, + "learning_rate": 0.0001, + "loss": 1.8157, + "step": 5702 + }, + { + "epoch": 0.6625617194307291, + "grad_norm": 0.43235111236572266, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 5703 + }, + { + "epoch": 0.6626778971826895, + "grad_norm": 0.39643558859825134, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 5704 + }, + { + "epoch": 0.66279407493465, + "grad_norm": 0.445264607667923, + "learning_rate": 0.0001, + "loss": 1.5598, + "step": 5705 + }, + { + "epoch": 0.6629102526866105, + "grad_norm": 0.45641717314720154, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 5706 + }, + { + "epoch": 0.663026430438571, + "grad_norm": 0.4507564902305603, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 5707 + }, + { + "epoch": 0.6631426081905315, + "grad_norm": 0.4877043068408966, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 5708 + }, + { + "epoch": 0.663258785942492, + "grad_norm": 0.42610713839530945, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 5709 + }, + { + "epoch": 0.6633749636944525, + "grad_norm": 0.4246615767478943, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 5710 + }, + { + "epoch": 0.663491141446413, + "grad_norm": 0.46146342158317566, + "learning_rate": 0.0001, + "loss": 1.7475, + "step": 5711 + }, + { + "epoch": 0.6636073191983735, + "grad_norm": 0.45187118649482727, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 5712 + }, + { + "epoch": 0.6637234969503341, + "grad_norm": 0.4408842921257019, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 5713 + }, + { + "epoch": 0.6638396747022945, + "grad_norm": 0.4070923626422882, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 5714 + }, + { + "epoch": 0.663955852454255, + "grad_norm": 0.4318831264972687, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 5715 + }, + { + "epoch": 0.6640720302062155, + "grad_norm": 0.48511743545532227, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 5716 + }, + { + "epoch": 0.664188207958176, + "grad_norm": 0.41092416644096375, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 5717 + }, + { + "epoch": 0.6643043857101365, + "grad_norm": 0.4554840922355652, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 5718 + }, + { + "epoch": 0.664420563462097, + "grad_norm": 0.42100241780281067, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 5719 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.421871542930603, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 5720 + }, + { + "epoch": 0.664652918966018, + "grad_norm": 0.46472758054733276, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 5721 + }, + { + "epoch": 0.6647690967179785, + "grad_norm": 0.4486168920993805, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 5722 + }, + { + "epoch": 0.6648852744699391, + "grad_norm": 0.44002678990364075, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 5723 + }, + { + "epoch": 0.6650014522218995, + "grad_norm": 0.427792489528656, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 5724 + }, + { + "epoch": 0.66511762997386, + "grad_norm": 0.44811704754829407, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 5725 + }, + { + "epoch": 0.6652338077258205, + "grad_norm": 0.4508500099182129, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 5726 + }, + { + "epoch": 0.665349985477781, + "grad_norm": 0.4322895407676697, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 5727 + }, + { + "epoch": 0.6654661632297415, + "grad_norm": 0.4665115177631378, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 5728 + }, + { + "epoch": 0.665582340981702, + "grad_norm": 0.45049217343330383, + "learning_rate": 0.0001, + "loss": 1.6839, + "step": 5729 + }, + { + "epoch": 0.6656985187336625, + "grad_norm": 0.4667704701423645, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 5730 + }, + { + "epoch": 0.665814696485623, + "grad_norm": 0.43236711621284485, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 5731 + }, + { + "epoch": 0.6659308742375835, + "grad_norm": 0.4469371438026428, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 5732 + }, + { + "epoch": 0.666047051989544, + "grad_norm": 0.4933030307292938, + "learning_rate": 0.0001, + "loss": 1.7527, + "step": 5733 + }, + { + "epoch": 0.6661632297415045, + "grad_norm": 0.466074138879776, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 5734 + }, + { + "epoch": 0.666279407493465, + "grad_norm": 0.418959379196167, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 5735 + }, + { + "epoch": 0.6663955852454255, + "grad_norm": 0.4188354015350342, + "learning_rate": 0.0001, + "loss": 1.4785, + "step": 5736 + }, + { + "epoch": 0.666511762997386, + "grad_norm": 0.46846193075180054, + "learning_rate": 0.0001, + "loss": 1.6352, + "step": 5737 + }, + { + "epoch": 0.6666279407493465, + "grad_norm": 0.41429415345191956, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 5738 + }, + { + "epoch": 0.666744118501307, + "grad_norm": 0.4290158748626709, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 5739 + }, + { + "epoch": 0.6668602962532675, + "grad_norm": 0.45462852716445923, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 5740 + }, + { + "epoch": 0.666976474005228, + "grad_norm": 0.4142838716506958, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 5741 + }, + { + "epoch": 0.6670926517571885, + "grad_norm": 0.4205038249492645, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 5742 + }, + { + "epoch": 0.667208829509149, + "grad_norm": 0.45246177911758423, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 5743 + }, + { + "epoch": 0.6673250072611095, + "grad_norm": 0.4968937039375305, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 5744 + }, + { + "epoch": 0.66744118501307, + "grad_norm": 0.44130516052246094, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 5745 + }, + { + "epoch": 0.6675573627650305, + "grad_norm": 0.4321398138999939, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 5746 + }, + { + "epoch": 0.667673540516991, + "grad_norm": 0.43933340907096863, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 5747 + }, + { + "epoch": 0.6677897182689515, + "grad_norm": 0.4424324333667755, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 5748 + }, + { + "epoch": 0.667905896020912, + "grad_norm": 0.44031408429145813, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 5749 + }, + { + "epoch": 0.6680220737728725, + "grad_norm": 0.4373178482055664, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 5750 + }, + { + "epoch": 0.668138251524833, + "grad_norm": 0.517252504825592, + "learning_rate": 0.0001, + "loss": 1.8808, + "step": 5751 + }, + { + "epoch": 0.6682544292767935, + "grad_norm": 0.44176408648490906, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 5752 + }, + { + "epoch": 0.668370607028754, + "grad_norm": 0.4362224340438843, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 5753 + }, + { + "epoch": 0.6684867847807144, + "grad_norm": 0.44093477725982666, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 5754 + }, + { + "epoch": 0.668602962532675, + "grad_norm": 0.42824220657348633, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 5755 + }, + { + "epoch": 0.6687191402846355, + "grad_norm": 0.46376579999923706, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 5756 + }, + { + "epoch": 0.668835318036596, + "grad_norm": 0.47102320194244385, + "learning_rate": 0.0001, + "loss": 1.8464, + "step": 5757 + }, + { + "epoch": 0.6689514957885565, + "grad_norm": 0.4689522385597229, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 5758 + }, + { + "epoch": 0.669067673540517, + "grad_norm": 0.43695563077926636, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 5759 + }, + { + "epoch": 0.6691838512924775, + "grad_norm": 0.48102685809135437, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 5760 + }, + { + "epoch": 0.669300029044438, + "grad_norm": 0.4178447127342224, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 5761 + }, + { + "epoch": 0.6694162067963985, + "grad_norm": 0.43344780802726746, + "learning_rate": 0.0001, + "loss": 1.5005, + "step": 5762 + }, + { + "epoch": 0.669532384548359, + "grad_norm": 0.4385789632797241, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 5763 + }, + { + "epoch": 0.6696485623003194, + "grad_norm": 0.49828559160232544, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 5764 + }, + { + "epoch": 0.66976474005228, + "grad_norm": 0.45839375257492065, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 5765 + }, + { + "epoch": 0.6698809178042405, + "grad_norm": 0.45659172534942627, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 5766 + }, + { + "epoch": 0.669997095556201, + "grad_norm": 0.43861255049705505, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 5767 + }, + { + "epoch": 0.6701132733081615, + "grad_norm": 0.47588106989860535, + "learning_rate": 0.0001, + "loss": 1.7271, + "step": 5768 + }, + { + "epoch": 0.670229451060122, + "grad_norm": 0.43281859159469604, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 5769 + }, + { + "epoch": 0.6703456288120825, + "grad_norm": 0.42776212096214294, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 5770 + }, + { + "epoch": 0.670461806564043, + "grad_norm": 0.48418179154396057, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 5771 + }, + { + "epoch": 0.6705779843160035, + "grad_norm": 0.47568055987358093, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 5772 + }, + { + "epoch": 0.670694162067964, + "grad_norm": 0.46286889910697937, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 5773 + }, + { + "epoch": 0.6708103398199244, + "grad_norm": 0.42690396308898926, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 5774 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 0.5125055313110352, + "learning_rate": 0.0001, + "loss": 1.8218, + "step": 5775 + }, + { + "epoch": 0.6710426953238455, + "grad_norm": 0.4569031894207001, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 5776 + }, + { + "epoch": 0.671158873075806, + "grad_norm": 0.3980884253978729, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 5777 + }, + { + "epoch": 0.6712750508277665, + "grad_norm": 0.450810968875885, + "learning_rate": 0.0001, + "loss": 1.5016, + "step": 5778 + }, + { + "epoch": 0.671391228579727, + "grad_norm": 0.47312337160110474, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 5779 + }, + { + "epoch": 0.6715074063316875, + "grad_norm": 0.42309361696243286, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 5780 + }, + { + "epoch": 0.671623584083648, + "grad_norm": 0.4467894434928894, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 5781 + }, + { + "epoch": 0.6717397618356085, + "grad_norm": 0.41931766271591187, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 5782 + }, + { + "epoch": 0.671855939587569, + "grad_norm": 0.49468186497688293, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 5783 + }, + { + "epoch": 0.6719721173395294, + "grad_norm": 0.5051024556159973, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 5784 + }, + { + "epoch": 0.6720882950914899, + "grad_norm": 0.4598732888698578, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 5785 + }, + { + "epoch": 0.6722044728434505, + "grad_norm": 0.4720596671104431, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 5786 + }, + { + "epoch": 0.672320650595411, + "grad_norm": 0.40756291151046753, + "learning_rate": 0.0001, + "loss": 1.4437, + "step": 5787 + }, + { + "epoch": 0.6724368283473715, + "grad_norm": 0.44663846492767334, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 5788 + }, + { + "epoch": 0.672553006099332, + "grad_norm": 0.44735023379325867, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 5789 + }, + { + "epoch": 0.6726691838512925, + "grad_norm": 0.4546589255332947, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 5790 + }, + { + "epoch": 0.672785361603253, + "grad_norm": 0.44948139786720276, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 5791 + }, + { + "epoch": 0.6729015393552135, + "grad_norm": 0.45747241377830505, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 5792 + }, + { + "epoch": 0.673017717107174, + "grad_norm": 0.46398770809173584, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 5793 + }, + { + "epoch": 0.6731338948591344, + "grad_norm": 0.4531368017196655, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 5794 + }, + { + "epoch": 0.6732500726110949, + "grad_norm": 0.43872615694999695, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 5795 + }, + { + "epoch": 0.6733662503630555, + "grad_norm": 0.4827875792980194, + "learning_rate": 0.0001, + "loss": 1.7964, + "step": 5796 + }, + { + "epoch": 0.673482428115016, + "grad_norm": 0.46272164583206177, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 5797 + }, + { + "epoch": 0.6735986058669765, + "grad_norm": 0.43684569001197815, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 5798 + }, + { + "epoch": 0.673714783618937, + "grad_norm": 0.43560728430747986, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 5799 + }, + { + "epoch": 0.6738309613708975, + "grad_norm": 0.43896791338920593, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 5800 + }, + { + "epoch": 0.673947139122858, + "grad_norm": 0.4776453375816345, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 5801 + }, + { + "epoch": 0.6740633168748185, + "grad_norm": 0.44419828057289124, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 5802 + }, + { + "epoch": 0.674179494626779, + "grad_norm": 0.44315245747566223, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 5803 + }, + { + "epoch": 0.6742956723787394, + "grad_norm": 0.4680185914039612, + "learning_rate": 0.0001, + "loss": 1.7217, + "step": 5804 + }, + { + "epoch": 0.6744118501306999, + "grad_norm": 0.4504956007003784, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 5805 + }, + { + "epoch": 0.6745280278826604, + "grad_norm": 0.4400990307331085, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 5806 + }, + { + "epoch": 0.674644205634621, + "grad_norm": 0.5055308938026428, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 5807 + }, + { + "epoch": 0.6747603833865815, + "grad_norm": 0.4236086905002594, + "learning_rate": 0.0001, + "loss": 1.6601, + "step": 5808 + }, + { + "epoch": 0.674876561138542, + "grad_norm": 0.4559587836265564, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 5809 + }, + { + "epoch": 0.6749927388905025, + "grad_norm": 0.42813077569007874, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 5810 + }, + { + "epoch": 0.675108916642463, + "grad_norm": 0.4201710820198059, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 5811 + }, + { + "epoch": 0.6752250943944235, + "grad_norm": 0.4242464005947113, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 5812 + }, + { + "epoch": 0.675341272146384, + "grad_norm": 0.45076656341552734, + "learning_rate": 0.0001, + "loss": 1.698, + "step": 5813 + }, + { + "epoch": 0.6754574498983444, + "grad_norm": 0.4501783549785614, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 5814 + }, + { + "epoch": 0.6755736276503049, + "grad_norm": 0.4316518306732178, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 5815 + }, + { + "epoch": 0.6756898054022654, + "grad_norm": 0.4382632076740265, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 5816 + }, + { + "epoch": 0.675805983154226, + "grad_norm": 0.42367810010910034, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 5817 + }, + { + "epoch": 0.6759221609061865, + "grad_norm": 0.434133917093277, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 5818 + }, + { + "epoch": 0.676038338658147, + "grad_norm": 0.4283505976200104, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 5819 + }, + { + "epoch": 0.6761545164101075, + "grad_norm": 0.4194677770137787, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 5820 + }, + { + "epoch": 0.676270694162068, + "grad_norm": 0.43860924243927, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 5821 + }, + { + "epoch": 0.6763868719140285, + "grad_norm": 0.43896040320396423, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 5822 + }, + { + "epoch": 0.676503049665989, + "grad_norm": 0.41808873414993286, + "learning_rate": 0.0001, + "loss": 1.3372, + "step": 5823 + }, + { + "epoch": 0.6766192274179494, + "grad_norm": 0.5032152533531189, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 5824 + }, + { + "epoch": 0.6767354051699099, + "grad_norm": 0.48192158341407776, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 5825 + }, + { + "epoch": 0.6768515829218704, + "grad_norm": 0.4845306873321533, + "learning_rate": 0.0001, + "loss": 1.7389, + "step": 5826 + }, + { + "epoch": 0.6769677606738309, + "grad_norm": 0.46503588557243347, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 5827 + }, + { + "epoch": 0.6770839384257915, + "grad_norm": 0.42725270986557007, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 5828 + }, + { + "epoch": 0.677200116177752, + "grad_norm": 0.43792206048965454, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 5829 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.42509621381759644, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 5830 + }, + { + "epoch": 0.677432471681673, + "grad_norm": 0.4887797236442566, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 5831 + }, + { + "epoch": 0.6775486494336335, + "grad_norm": 0.4223867356777191, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 5832 + }, + { + "epoch": 0.677664827185594, + "grad_norm": 0.44735440611839294, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 5833 + }, + { + "epoch": 0.6777810049375544, + "grad_norm": 0.4342286288738251, + "learning_rate": 0.0001, + "loss": 1.4604, + "step": 5834 + }, + { + "epoch": 0.6778971826895149, + "grad_norm": 0.47264301776885986, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 5835 + }, + { + "epoch": 0.6780133604414754, + "grad_norm": 0.45144325494766235, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 5836 + }, + { + "epoch": 0.6781295381934359, + "grad_norm": 0.45070862770080566, + "learning_rate": 0.0001, + "loss": 1.7091, + "step": 5837 + }, + { + "epoch": 0.6782457159453965, + "grad_norm": 0.44926077127456665, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 5838 + }, + { + "epoch": 0.678361893697357, + "grad_norm": 0.42580538988113403, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 5839 + }, + { + "epoch": 0.6784780714493175, + "grad_norm": 0.4390771985054016, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 5840 + }, + { + "epoch": 0.678594249201278, + "grad_norm": 0.43495962023735046, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 5841 + }, + { + "epoch": 0.6787104269532385, + "grad_norm": 0.42545855045318604, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 5842 + }, + { + "epoch": 0.678826604705199, + "grad_norm": 0.40862634778022766, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 5843 + }, + { + "epoch": 0.6789427824571594, + "grad_norm": 0.4625224173069, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 5844 + }, + { + "epoch": 0.6790589602091199, + "grad_norm": 0.4572260081768036, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 5845 + }, + { + "epoch": 0.6791751379610804, + "grad_norm": 0.45550736784935, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 5846 + }, + { + "epoch": 0.6792913157130409, + "grad_norm": 0.49932238459587097, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 5847 + }, + { + "epoch": 0.6794074934650014, + "grad_norm": 0.43598851561546326, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 5848 + }, + { + "epoch": 0.679523671216962, + "grad_norm": 0.4407789707183838, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 5849 + }, + { + "epoch": 0.6796398489689225, + "grad_norm": 0.5003304481506348, + "learning_rate": 0.0001, + "loss": 1.9146, + "step": 5850 + }, + { + "epoch": 0.679756026720883, + "grad_norm": 0.45455658435821533, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 5851 + }, + { + "epoch": 0.6798722044728435, + "grad_norm": 0.429861456155777, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 5852 + }, + { + "epoch": 0.679988382224804, + "grad_norm": 0.41853785514831543, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 5853 + }, + { + "epoch": 0.6801045599767644, + "grad_norm": 0.4636412560939789, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 5854 + }, + { + "epoch": 0.6802207377287249, + "grad_norm": 0.4763379395008087, + "learning_rate": 0.0001, + "loss": 1.9099, + "step": 5855 + }, + { + "epoch": 0.6803369154806854, + "grad_norm": 0.40933549404144287, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 5856 + }, + { + "epoch": 0.6804530932326459, + "grad_norm": 0.43275657296180725, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 5857 + }, + { + "epoch": 0.6805692709846064, + "grad_norm": 0.4391117990016937, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 5858 + }, + { + "epoch": 0.680685448736567, + "grad_norm": 0.456015408039093, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 5859 + }, + { + "epoch": 0.6808016264885275, + "grad_norm": 0.4378184378147125, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 5860 + }, + { + "epoch": 0.680917804240488, + "grad_norm": 0.4603559672832489, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 5861 + }, + { + "epoch": 0.6810339819924485, + "grad_norm": 0.46735942363739014, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 5862 + }, + { + "epoch": 0.681150159744409, + "grad_norm": 0.42926350235939026, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 5863 + }, + { + "epoch": 0.6812663374963694, + "grad_norm": 0.4338090717792511, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 5864 + }, + { + "epoch": 0.6813825152483299, + "grad_norm": 0.4733595550060272, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 5865 + }, + { + "epoch": 0.6814986930002904, + "grad_norm": 0.43445518612861633, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 5866 + }, + { + "epoch": 0.6816148707522509, + "grad_norm": 0.4225948750972748, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 5867 + }, + { + "epoch": 0.6817310485042114, + "grad_norm": 0.4454067051410675, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 5868 + }, + { + "epoch": 0.6818472262561719, + "grad_norm": 0.4258386194705963, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 5869 + }, + { + "epoch": 0.6819634040081325, + "grad_norm": 0.5034085512161255, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 5870 + }, + { + "epoch": 0.682079581760093, + "grad_norm": 0.48741772770881653, + "learning_rate": 0.0001, + "loss": 1.8736, + "step": 5871 + }, + { + "epoch": 0.6821957595120535, + "grad_norm": 0.4586668908596039, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 5872 + }, + { + "epoch": 0.682311937264014, + "grad_norm": 0.445441335439682, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 5873 + }, + { + "epoch": 0.6824281150159744, + "grad_norm": 0.42984551191329956, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 5874 + }, + { + "epoch": 0.6825442927679349, + "grad_norm": 0.42550551891326904, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 5875 + }, + { + "epoch": 0.6826604705198954, + "grad_norm": 0.43332982063293457, + "learning_rate": 0.0001, + "loss": 1.4163, + "step": 5876 + }, + { + "epoch": 0.6827766482718559, + "grad_norm": 0.46797114610671997, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 5877 + }, + { + "epoch": 0.6828928260238164, + "grad_norm": 0.45049670338630676, + "learning_rate": 0.0001, + "loss": 1.7157, + "step": 5878 + }, + { + "epoch": 0.6830090037757769, + "grad_norm": 0.4513972997665405, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 5879 + }, + { + "epoch": 0.6831251815277375, + "grad_norm": 0.46550363302230835, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 5880 + }, + { + "epoch": 0.683241359279698, + "grad_norm": 0.4839429259300232, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 5881 + }, + { + "epoch": 0.6833575370316585, + "grad_norm": 0.4608222544193268, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 5882 + }, + { + "epoch": 0.683473714783619, + "grad_norm": 0.4843682050704956, + "learning_rate": 0.0001, + "loss": 1.7465, + "step": 5883 + }, + { + "epoch": 0.6835898925355794, + "grad_norm": 0.44494950771331787, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 5884 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 0.4825565814971924, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 5885 + }, + { + "epoch": 0.6838222480395004, + "grad_norm": 0.4432181715965271, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 5886 + }, + { + "epoch": 0.6839384257914609, + "grad_norm": 0.43164384365081787, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 5887 + }, + { + "epoch": 0.6840546035434214, + "grad_norm": 0.4706867039203644, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 5888 + }, + { + "epoch": 0.6841707812953819, + "grad_norm": 0.4217129647731781, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 5889 + }, + { + "epoch": 0.6842869590473424, + "grad_norm": 0.4253685772418976, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 5890 + }, + { + "epoch": 0.684403136799303, + "grad_norm": 0.431268572807312, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 5891 + }, + { + "epoch": 0.6845193145512635, + "grad_norm": 0.44175827503204346, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 5892 + }, + { + "epoch": 0.684635492303224, + "grad_norm": 0.41143807768821716, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 5893 + }, + { + "epoch": 0.6847516700551844, + "grad_norm": 0.4632609784603119, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 5894 + }, + { + "epoch": 0.6848678478071449, + "grad_norm": 0.45641466975212097, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 5895 + }, + { + "epoch": 0.6849840255591054, + "grad_norm": 0.4799252152442932, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 5896 + }, + { + "epoch": 0.6851002033110659, + "grad_norm": 0.42571592330932617, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 5897 + }, + { + "epoch": 0.6852163810630264, + "grad_norm": 0.42581233382225037, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 5898 + }, + { + "epoch": 0.6853325588149869, + "grad_norm": 0.4454191327095032, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 5899 + }, + { + "epoch": 0.6854487365669474, + "grad_norm": 0.4443952143192291, + "learning_rate": 0.0001, + "loss": 1.3731, + "step": 5900 + }, + { + "epoch": 0.685564914318908, + "grad_norm": 0.4548780918121338, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 5901 + }, + { + "epoch": 0.6856810920708685, + "grad_norm": 0.4646033048629761, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 5902 + }, + { + "epoch": 0.685797269822829, + "grad_norm": 0.4402289390563965, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 5903 + }, + { + "epoch": 0.6859134475747894, + "grad_norm": 0.462422251701355, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 5904 + }, + { + "epoch": 0.6860296253267499, + "grad_norm": 0.4708074629306793, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 5905 + }, + { + "epoch": 0.6861458030787104, + "grad_norm": 0.4279741942882538, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 5906 + }, + { + "epoch": 0.6862619808306709, + "grad_norm": 0.45395079255104065, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 5907 + }, + { + "epoch": 0.6863781585826314, + "grad_norm": 0.46580180525779724, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 5908 + }, + { + "epoch": 0.6864943363345919, + "grad_norm": 0.44675102829933167, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 5909 + }, + { + "epoch": 0.6866105140865524, + "grad_norm": 0.41608941555023193, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 5910 + }, + { + "epoch": 0.6867266918385129, + "grad_norm": 0.4041667878627777, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 5911 + }, + { + "epoch": 0.6868428695904735, + "grad_norm": 0.46621033549308777, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 5912 + }, + { + "epoch": 0.686959047342434, + "grad_norm": 0.42089179158210754, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 5913 + }, + { + "epoch": 0.6870752250943944, + "grad_norm": 0.43382903933525085, + "learning_rate": 0.0001, + "loss": 1.3902, + "step": 5914 + }, + { + "epoch": 0.6871914028463549, + "grad_norm": 0.45685744285583496, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 5915 + }, + { + "epoch": 0.6873075805983154, + "grad_norm": 0.4664427936077118, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 5916 + }, + { + "epoch": 0.6874237583502759, + "grad_norm": 0.43341436982154846, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 5917 + }, + { + "epoch": 0.6875399361022364, + "grad_norm": 0.4530865252017975, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 5918 + }, + { + "epoch": 0.6876561138541969, + "grad_norm": 0.4333666265010834, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 5919 + }, + { + "epoch": 0.6877722916061574, + "grad_norm": 0.4299314618110657, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 5920 + }, + { + "epoch": 0.6878884693581179, + "grad_norm": 0.43617698550224304, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 5921 + }, + { + "epoch": 0.6880046471100785, + "grad_norm": 0.4485790431499481, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 5922 + }, + { + "epoch": 0.688120824862039, + "grad_norm": 0.44341012835502625, + "learning_rate": 0.0001, + "loss": 1.4585, + "step": 5923 + }, + { + "epoch": 0.6882370026139994, + "grad_norm": 0.4278321862220764, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 5924 + }, + { + "epoch": 0.6883531803659599, + "grad_norm": 0.4742536246776581, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 5925 + }, + { + "epoch": 0.6884693581179204, + "grad_norm": 0.43478327989578247, + "learning_rate": 0.0001, + "loss": 1.4958, + "step": 5926 + }, + { + "epoch": 0.6885855358698809, + "grad_norm": 0.4236944317817688, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 5927 + }, + { + "epoch": 0.6887017136218414, + "grad_norm": 0.4370484948158264, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 5928 + }, + { + "epoch": 0.6888178913738019, + "grad_norm": 0.45525872707366943, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 5929 + }, + { + "epoch": 0.6889340691257624, + "grad_norm": 0.4394777715206146, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 5930 + }, + { + "epoch": 0.6890502468777229, + "grad_norm": 0.4571029841899872, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 5931 + }, + { + "epoch": 0.6891664246296834, + "grad_norm": 0.4554816782474518, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 5932 + }, + { + "epoch": 0.689282602381644, + "grad_norm": 0.43854042887687683, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 5933 + }, + { + "epoch": 0.6893987801336044, + "grad_norm": 0.4328182339668274, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 5934 + }, + { + "epoch": 0.6895149578855649, + "grad_norm": 0.44765332341194153, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 5935 + }, + { + "epoch": 0.6896311356375254, + "grad_norm": 0.43624910712242126, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 5936 + }, + { + "epoch": 0.6897473133894859, + "grad_norm": 0.46665430068969727, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 5937 + }, + { + "epoch": 0.6898634911414464, + "grad_norm": 0.45889371633529663, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 5938 + }, + { + "epoch": 0.6899796688934069, + "grad_norm": 0.4659807085990906, + "learning_rate": 0.0001, + "loss": 1.878, + "step": 5939 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 0.45350781083106995, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 5940 + }, + { + "epoch": 0.6902120243973279, + "grad_norm": 0.44755759835243225, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 5941 + }, + { + "epoch": 0.6903282021492884, + "grad_norm": 0.39925143122673035, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 5942 + }, + { + "epoch": 0.690444379901249, + "grad_norm": 0.4676305055618286, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 5943 + }, + { + "epoch": 0.6905605576532095, + "grad_norm": 0.42078080773353577, + "learning_rate": 0.0001, + "loss": 1.3661, + "step": 5944 + }, + { + "epoch": 0.6906767354051699, + "grad_norm": 0.4294394254684448, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 5945 + }, + { + "epoch": 0.6907929131571304, + "grad_norm": 0.4754204750061035, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 5946 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 0.4373209476470947, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 5947 + }, + { + "epoch": 0.6910252686610514, + "grad_norm": 0.4468858242034912, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 5948 + }, + { + "epoch": 0.6911414464130119, + "grad_norm": 0.41958677768707275, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 5949 + }, + { + "epoch": 0.6912576241649724, + "grad_norm": 0.4472271203994751, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 5950 + }, + { + "epoch": 0.6913738019169329, + "grad_norm": 0.4623844623565674, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 5951 + }, + { + "epoch": 0.6914899796688934, + "grad_norm": 0.4381425082683563, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 5952 + }, + { + "epoch": 0.691606157420854, + "grad_norm": 0.4867214858531952, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 5953 + }, + { + "epoch": 0.6917223351728145, + "grad_norm": 0.45404472947120667, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 5954 + }, + { + "epoch": 0.6918385129247749, + "grad_norm": 0.45104196667671204, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 5955 + }, + { + "epoch": 0.6919546906767354, + "grad_norm": 0.43430057168006897, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 5956 + }, + { + "epoch": 0.6920708684286959, + "grad_norm": 0.4231088161468506, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 5957 + }, + { + "epoch": 0.6921870461806564, + "grad_norm": 0.43777701258659363, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 5958 + }, + { + "epoch": 0.6923032239326169, + "grad_norm": 0.4415545165538788, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 5959 + }, + { + "epoch": 0.6924194016845774, + "grad_norm": 0.4693138599395752, + "learning_rate": 0.0001, + "loss": 1.827, + "step": 5960 + }, + { + "epoch": 0.6925355794365379, + "grad_norm": 0.458947092294693, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 5961 + }, + { + "epoch": 0.6926517571884984, + "grad_norm": 0.47753122448921204, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 5962 + }, + { + "epoch": 0.6927679349404589, + "grad_norm": 0.44865846633911133, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 5963 + }, + { + "epoch": 0.6928841126924195, + "grad_norm": 0.46769386529922485, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 5964 + }, + { + "epoch": 0.6930002904443799, + "grad_norm": 0.46594002842903137, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 5965 + }, + { + "epoch": 0.6931164681963404, + "grad_norm": 0.44469648599624634, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 5966 + }, + { + "epoch": 0.6932326459483009, + "grad_norm": 0.44019195437431335, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 5967 + }, + { + "epoch": 0.6933488237002614, + "grad_norm": 0.4828648567199707, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 5968 + }, + { + "epoch": 0.6934650014522219, + "grad_norm": 0.4316853880882263, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 5969 + }, + { + "epoch": 0.6935811792041824, + "grad_norm": 0.45684826374053955, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 5970 + }, + { + "epoch": 0.6936973569561429, + "grad_norm": 0.4338705539703369, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 5971 + }, + { + "epoch": 0.6938135347081034, + "grad_norm": 0.46950671076774597, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 5972 + }, + { + "epoch": 0.6939297124600639, + "grad_norm": 0.4339956045150757, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 5973 + }, + { + "epoch": 0.6940458902120245, + "grad_norm": 0.5043813586235046, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 5974 + }, + { + "epoch": 0.6941620679639849, + "grad_norm": 0.4209142029285431, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 5975 + }, + { + "epoch": 0.6942782457159454, + "grad_norm": 0.4390365779399872, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 5976 + }, + { + "epoch": 0.6943944234679059, + "grad_norm": 0.4435301423072815, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 5977 + }, + { + "epoch": 0.6945106012198664, + "grad_norm": 0.4381083548069, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 5978 + }, + { + "epoch": 0.6946267789718269, + "grad_norm": 0.4319905638694763, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 5979 + }, + { + "epoch": 0.6947429567237874, + "grad_norm": 0.42140626907348633, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 5980 + }, + { + "epoch": 0.6948591344757479, + "grad_norm": 0.46980568766593933, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 5981 + }, + { + "epoch": 0.6949753122277084, + "grad_norm": 0.44185250997543335, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 5982 + }, + { + "epoch": 0.6950914899796689, + "grad_norm": 0.4495440423488617, + "learning_rate": 0.0001, + "loss": 1.7767, + "step": 5983 + }, + { + "epoch": 0.6952076677316293, + "grad_norm": 0.43711593747138977, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 5984 + }, + { + "epoch": 0.6953238454835899, + "grad_norm": 0.44120272994041443, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 5985 + }, + { + "epoch": 0.6954400232355504, + "grad_norm": 0.46513938903808594, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 5986 + }, + { + "epoch": 0.6955562009875109, + "grad_norm": 0.43529126048088074, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 5987 + }, + { + "epoch": 0.6956723787394714, + "grad_norm": 0.4164985120296478, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 5988 + }, + { + "epoch": 0.6957885564914319, + "grad_norm": 0.4299083352088928, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 5989 + }, + { + "epoch": 0.6959047342433924, + "grad_norm": 0.4332889914512634, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 5990 + }, + { + "epoch": 0.6960209119953529, + "grad_norm": 0.4063509404659271, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 5991 + }, + { + "epoch": 0.6961370897473134, + "grad_norm": 0.4663069248199463, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 5992 + }, + { + "epoch": 0.6962532674992739, + "grad_norm": 0.46274805068969727, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 5993 + }, + { + "epoch": 0.6963694452512343, + "grad_norm": 0.45924481749534607, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 5994 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 0.4352482557296753, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 5995 + }, + { + "epoch": 0.6966018007551554, + "grad_norm": 0.4291403591632843, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 5996 + }, + { + "epoch": 0.6967179785071159, + "grad_norm": 0.5188395977020264, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 5997 + }, + { + "epoch": 0.6968341562590764, + "grad_norm": 0.4158150851726532, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 5998 + }, + { + "epoch": 0.6969503340110369, + "grad_norm": 0.461252897977829, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 5999 + }, + { + "epoch": 0.6970665117629974, + "grad_norm": 0.40928980708122253, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 6000 + }, + { + "epoch": 0.6971826895149579, + "grad_norm": 0.4179105758666992, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 6001 + }, + { + "epoch": 0.6972988672669184, + "grad_norm": 0.4326860010623932, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 6002 + }, + { + "epoch": 0.6974150450188789, + "grad_norm": 0.42176535725593567, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 6003 + }, + { + "epoch": 0.6975312227708393, + "grad_norm": 0.4395149350166321, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 6004 + }, + { + "epoch": 0.6976474005227998, + "grad_norm": 0.43788623809814453, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 6005 + }, + { + "epoch": 0.6977635782747604, + "grad_norm": 0.4315040111541748, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 6006 + }, + { + "epoch": 0.6978797560267209, + "grad_norm": 0.43025481700897217, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 6007 + }, + { + "epoch": 0.6979959337786814, + "grad_norm": 0.4490424692630768, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 6008 + }, + { + "epoch": 0.6981121115306419, + "grad_norm": 0.4578996002674103, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 6009 + }, + { + "epoch": 0.6982282892826024, + "grad_norm": 0.42355626821517944, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 6010 + }, + { + "epoch": 0.6983444670345629, + "grad_norm": 0.4342043399810791, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 6011 + }, + { + "epoch": 0.6984606447865234, + "grad_norm": 0.41684943437576294, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 6012 + }, + { + "epoch": 0.6985768225384839, + "grad_norm": 0.4408121705055237, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 6013 + }, + { + "epoch": 0.6986930002904443, + "grad_norm": 0.43644100427627563, + "learning_rate": 0.0001, + "loss": 1.4511, + "step": 6014 + }, + { + "epoch": 0.6988091780424048, + "grad_norm": 0.4824478328227997, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 6015 + }, + { + "epoch": 0.6989253557943654, + "grad_norm": 0.4514261782169342, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 6016 + }, + { + "epoch": 0.6990415335463259, + "grad_norm": 0.4663759768009186, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 6017 + }, + { + "epoch": 0.6991577112982864, + "grad_norm": 0.45603466033935547, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 6018 + }, + { + "epoch": 0.6992738890502469, + "grad_norm": 0.44485101103782654, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 6019 + }, + { + "epoch": 0.6993900668022074, + "grad_norm": 0.4779071509838104, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 6020 + }, + { + "epoch": 0.6995062445541679, + "grad_norm": 0.4669143259525299, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 6021 + }, + { + "epoch": 0.6996224223061284, + "grad_norm": 0.4332158863544464, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 6022 + }, + { + "epoch": 0.6997386000580889, + "grad_norm": 0.4396936595439911, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 6023 + }, + { + "epoch": 0.6998547778100493, + "grad_norm": 0.43175163865089417, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 6024 + }, + { + "epoch": 0.6999709555620098, + "grad_norm": 0.4532018005847931, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 6025 + }, + { + "epoch": 0.7000871333139703, + "grad_norm": 0.4363815188407898, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 6026 + }, + { + "epoch": 0.7002033110659309, + "grad_norm": 0.4686264097690582, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 6027 + }, + { + "epoch": 0.7003194888178914, + "grad_norm": 0.4810371696949005, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 6028 + }, + { + "epoch": 0.7004356665698519, + "grad_norm": 0.4395377039909363, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 6029 + }, + { + "epoch": 0.7005518443218124, + "grad_norm": 0.43942928314208984, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 6030 + }, + { + "epoch": 0.7006680220737729, + "grad_norm": 0.43220555782318115, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 6031 + }, + { + "epoch": 0.7007841998257334, + "grad_norm": 0.44977232813835144, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 6032 + }, + { + "epoch": 0.7009003775776939, + "grad_norm": 0.4994456171989441, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 6033 + }, + { + "epoch": 0.7010165553296543, + "grad_norm": 0.44498246908187866, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 6034 + }, + { + "epoch": 0.7011327330816148, + "grad_norm": 0.4909377992153168, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 6035 + }, + { + "epoch": 0.7012489108335753, + "grad_norm": 0.4470892548561096, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 6036 + }, + { + "epoch": 0.7013650885855359, + "grad_norm": 0.47130686044692993, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 6037 + }, + { + "epoch": 0.7014812663374964, + "grad_norm": 0.4447838068008423, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 6038 + }, + { + "epoch": 0.7015974440894569, + "grad_norm": 0.48111826181411743, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 6039 + }, + { + "epoch": 0.7017136218414174, + "grad_norm": 0.4512770175933838, + "learning_rate": 0.0001, + "loss": 1.6969, + "step": 6040 + }, + { + "epoch": 0.7018297995933779, + "grad_norm": 0.4442228078842163, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 6041 + }, + { + "epoch": 0.7019459773453384, + "grad_norm": 0.49722543358802795, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 6042 + }, + { + "epoch": 0.7020621550972989, + "grad_norm": 0.4556099474430084, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 6043 + }, + { + "epoch": 0.7021783328492593, + "grad_norm": 0.465631365776062, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 6044 + }, + { + "epoch": 0.7022945106012198, + "grad_norm": 0.4637199938297272, + "learning_rate": 0.0001, + "loss": 1.7575, + "step": 6045 + }, + { + "epoch": 0.7024106883531803, + "grad_norm": 0.5277969241142273, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 6046 + }, + { + "epoch": 0.7025268661051408, + "grad_norm": 0.43430525064468384, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 6047 + }, + { + "epoch": 0.7026430438571014, + "grad_norm": 0.4051744043827057, + "learning_rate": 0.0001, + "loss": 1.357, + "step": 6048 + }, + { + "epoch": 0.7027592216090619, + "grad_norm": 0.4759519696235657, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 6049 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.4706842005252838, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 6050 + }, + { + "epoch": 0.7029915771129829, + "grad_norm": 0.46256595849990845, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 6051 + }, + { + "epoch": 0.7031077548649434, + "grad_norm": 0.43394848704338074, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 6052 + }, + { + "epoch": 0.7032239326169039, + "grad_norm": 0.44326069951057434, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 6053 + }, + { + "epoch": 0.7033401103688643, + "grad_norm": 0.47456949949264526, + "learning_rate": 0.0001, + "loss": 1.7654, + "step": 6054 + }, + { + "epoch": 0.7034562881208248, + "grad_norm": 0.42411813139915466, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 6055 + }, + { + "epoch": 0.7035724658727853, + "grad_norm": 0.43997639417648315, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 6056 + }, + { + "epoch": 0.7036886436247458, + "grad_norm": 0.3923532962799072, + "learning_rate": 0.0001, + "loss": 1.4779, + "step": 6057 + }, + { + "epoch": 0.7038048213767064, + "grad_norm": 0.4113970994949341, + "learning_rate": 0.0001, + "loss": 1.4705, + "step": 6058 + }, + { + "epoch": 0.7039209991286669, + "grad_norm": 0.42837658524513245, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 6059 + }, + { + "epoch": 0.7040371768806274, + "grad_norm": 0.44520092010498047, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 6060 + }, + { + "epoch": 0.7041533546325879, + "grad_norm": 0.43117523193359375, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 6061 + }, + { + "epoch": 0.7042695323845484, + "grad_norm": 0.4564533233642578, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 6062 + }, + { + "epoch": 0.7043857101365089, + "grad_norm": 0.4633900225162506, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 6063 + }, + { + "epoch": 0.7045018878884693, + "grad_norm": 0.439037561416626, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 6064 + }, + { + "epoch": 0.7046180656404298, + "grad_norm": 0.42920568585395813, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 6065 + }, + { + "epoch": 0.7047342433923903, + "grad_norm": 0.44558966159820557, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 6066 + }, + { + "epoch": 0.7048504211443508, + "grad_norm": 0.4488900899887085, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 6067 + }, + { + "epoch": 0.7049665988963113, + "grad_norm": 0.4773916006088257, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 6068 + }, + { + "epoch": 0.7050827766482719, + "grad_norm": 0.4205687344074249, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 6069 + }, + { + "epoch": 0.7051989544002324, + "grad_norm": 0.4228312373161316, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 6070 + }, + { + "epoch": 0.7053151321521929, + "grad_norm": 0.42489486932754517, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 6071 + }, + { + "epoch": 0.7054313099041534, + "grad_norm": 0.432058185338974, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 6072 + }, + { + "epoch": 0.7055474876561139, + "grad_norm": 0.4359389543533325, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 6073 + }, + { + "epoch": 0.7056636654080743, + "grad_norm": 0.46697720885276794, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 6074 + }, + { + "epoch": 0.7057798431600348, + "grad_norm": 0.4296872317790985, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 6075 + }, + { + "epoch": 0.7058960209119953, + "grad_norm": 0.4327179491519928, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 6076 + }, + { + "epoch": 0.7060121986639558, + "grad_norm": 0.49642127752304077, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 6077 + }, + { + "epoch": 0.7061283764159163, + "grad_norm": 0.44212982058525085, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 6078 + }, + { + "epoch": 0.7062445541678769, + "grad_norm": 0.45311763882637024, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 6079 + }, + { + "epoch": 0.7063607319198374, + "grad_norm": 0.4574792981147766, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 6080 + }, + { + "epoch": 0.7064769096717979, + "grad_norm": 0.4645473062992096, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 6081 + }, + { + "epoch": 0.7065930874237584, + "grad_norm": 0.4442659318447113, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 6082 + }, + { + "epoch": 0.7067092651757189, + "grad_norm": 0.41226938366889954, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 6083 + }, + { + "epoch": 0.7068254429276793, + "grad_norm": 0.4444557726383209, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 6084 + }, + { + "epoch": 0.7069416206796398, + "grad_norm": 0.4641963839530945, + "learning_rate": 0.0001, + "loss": 1.4054, + "step": 6085 + }, + { + "epoch": 0.7070577984316003, + "grad_norm": 0.44319379329681396, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 6086 + }, + { + "epoch": 0.7071739761835608, + "grad_norm": 0.4771478474140167, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 6087 + }, + { + "epoch": 0.7072901539355213, + "grad_norm": 0.45707952976226807, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 6088 + }, + { + "epoch": 0.7074063316874818, + "grad_norm": 0.49724724888801575, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 6089 + }, + { + "epoch": 0.7075225094394424, + "grad_norm": 0.44829389452934265, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 6090 + }, + { + "epoch": 0.7076386871914029, + "grad_norm": 0.4388028681278229, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 6091 + }, + { + "epoch": 0.7077548649433634, + "grad_norm": 0.41982501745224, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 6092 + }, + { + "epoch": 0.7078710426953239, + "grad_norm": 0.4299807846546173, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 6093 + }, + { + "epoch": 0.7079872204472843, + "grad_norm": 0.42945340275764465, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 6094 + }, + { + "epoch": 0.7081033981992448, + "grad_norm": 0.4422987401485443, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 6095 + }, + { + "epoch": 0.7082195759512053, + "grad_norm": 0.42367589473724365, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 6096 + }, + { + "epoch": 0.7083357537031658, + "grad_norm": 0.4437691867351532, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 6097 + }, + { + "epoch": 0.7084519314551263, + "grad_norm": 0.434927374124527, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 6098 + }, + { + "epoch": 0.7085681092070868, + "grad_norm": 0.5107674598693848, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 6099 + }, + { + "epoch": 0.7086842869590474, + "grad_norm": 0.41166844964027405, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 6100 + }, + { + "epoch": 0.7088004647110079, + "grad_norm": 0.42547357082366943, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 6101 + }, + { + "epoch": 0.7089166424629684, + "grad_norm": 0.44167038798332214, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 6102 + }, + { + "epoch": 0.7090328202149289, + "grad_norm": 0.4662346839904785, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 6103 + }, + { + "epoch": 0.7091489979668893, + "grad_norm": 0.43853190541267395, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 6104 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 0.4340886175632477, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 6105 + }, + { + "epoch": 0.7093813534708103, + "grad_norm": 0.4345012605190277, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 6106 + }, + { + "epoch": 0.7094975312227708, + "grad_norm": 0.43658679723739624, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 6107 + }, + { + "epoch": 0.7096137089747313, + "grad_norm": 0.42257410287857056, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 6108 + }, + { + "epoch": 0.7097298867266918, + "grad_norm": 0.4617374539375305, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 6109 + }, + { + "epoch": 0.7098460644786523, + "grad_norm": 0.4545523524284363, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 6110 + }, + { + "epoch": 0.7099622422306129, + "grad_norm": 0.4178352355957031, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 6111 + }, + { + "epoch": 0.7100784199825734, + "grad_norm": 0.4775643050670624, + "learning_rate": 0.0001, + "loss": 1.6867, + "step": 6112 + }, + { + "epoch": 0.7101945977345339, + "grad_norm": 0.4341333508491516, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 6113 + }, + { + "epoch": 0.7103107754864944, + "grad_norm": 0.44754558801651, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 6114 + }, + { + "epoch": 0.7104269532384548, + "grad_norm": 0.40256983041763306, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 6115 + }, + { + "epoch": 0.7105431309904153, + "grad_norm": 0.4549466669559479, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 6116 + }, + { + "epoch": 0.7106593087423758, + "grad_norm": 0.45725107192993164, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 6117 + }, + { + "epoch": 0.7107754864943363, + "grad_norm": 0.4477519094944, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 6118 + }, + { + "epoch": 0.7108916642462968, + "grad_norm": 0.4426574110984802, + "learning_rate": 0.0001, + "loss": 1.4692, + "step": 6119 + }, + { + "epoch": 0.7110078419982573, + "grad_norm": 0.42777371406555176, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 6120 + }, + { + "epoch": 0.7111240197502179, + "grad_norm": 0.43052244186401367, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 6121 + }, + { + "epoch": 0.7112401975021784, + "grad_norm": 0.42970481514930725, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 6122 + }, + { + "epoch": 0.7113563752541389, + "grad_norm": 0.4782750606536865, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 6123 + }, + { + "epoch": 0.7114725530060994, + "grad_norm": 0.45479482412338257, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 6124 + }, + { + "epoch": 0.7115887307580598, + "grad_norm": 0.43917152285575867, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 6125 + }, + { + "epoch": 0.7117049085100203, + "grad_norm": 0.4511852562427521, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 6126 + }, + { + "epoch": 0.7118210862619808, + "grad_norm": 0.4351733326911926, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 6127 + }, + { + "epoch": 0.7119372640139413, + "grad_norm": 0.458448588848114, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 6128 + }, + { + "epoch": 0.7120534417659018, + "grad_norm": 0.41360458731651306, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 6129 + }, + { + "epoch": 0.7121696195178623, + "grad_norm": 0.45411717891693115, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 6130 + }, + { + "epoch": 0.7122857972698228, + "grad_norm": 0.4399266242980957, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 6131 + }, + { + "epoch": 0.7124019750217834, + "grad_norm": 0.49353474378585815, + "learning_rate": 0.0001, + "loss": 1.7608, + "step": 6132 + }, + { + "epoch": 0.7125181527737439, + "grad_norm": 0.4594828188419342, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 6133 + }, + { + "epoch": 0.7126343305257044, + "grad_norm": 0.4374282658100128, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 6134 + }, + { + "epoch": 0.7127505082776648, + "grad_norm": 0.45439621806144714, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 6135 + }, + { + "epoch": 0.7128666860296253, + "grad_norm": 0.4368216097354889, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 6136 + }, + { + "epoch": 0.7129828637815858, + "grad_norm": 0.4321901202201843, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 6137 + }, + { + "epoch": 0.7130990415335463, + "grad_norm": 0.4542466402053833, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 6138 + }, + { + "epoch": 0.7132152192855068, + "grad_norm": 0.43192481994628906, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 6139 + }, + { + "epoch": 0.7133313970374673, + "grad_norm": 0.46249476075172424, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 6140 + }, + { + "epoch": 0.7134475747894278, + "grad_norm": 0.4444749653339386, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 6141 + }, + { + "epoch": 0.7135637525413884, + "grad_norm": 0.4751194417476654, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 6142 + }, + { + "epoch": 0.7136799302933489, + "grad_norm": 0.4836347699165344, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 6143 + }, + { + "epoch": 0.7137961080453094, + "grad_norm": 0.4470170736312866, + "learning_rate": 0.0001, + "loss": 1.3949, + "step": 6144 + }, + { + "epoch": 0.7139122857972698, + "grad_norm": 0.43671542406082153, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 6145 + }, + { + "epoch": 0.7140284635492303, + "grad_norm": 0.4873625636100769, + "learning_rate": 0.0001, + "loss": 1.7472, + "step": 6146 + }, + { + "epoch": 0.7141446413011908, + "grad_norm": 0.43032151460647583, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 6147 + }, + { + "epoch": 0.7142608190531513, + "grad_norm": 0.445576936006546, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 6148 + }, + { + "epoch": 0.7143769968051118, + "grad_norm": 0.450259804725647, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 6149 + }, + { + "epoch": 0.7144931745570723, + "grad_norm": 0.46049565076828003, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 6150 + }, + { + "epoch": 0.7146093523090328, + "grad_norm": 0.44381004571914673, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 6151 + }, + { + "epoch": 0.7147255300609934, + "grad_norm": 0.44434526562690735, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 6152 + }, + { + "epoch": 0.7148417078129539, + "grad_norm": 0.4273211658000946, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 6153 + }, + { + "epoch": 0.7149578855649144, + "grad_norm": 0.46949854493141174, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 6154 + }, + { + "epoch": 0.7150740633168748, + "grad_norm": 0.40654122829437256, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 6155 + }, + { + "epoch": 0.7151902410688353, + "grad_norm": 0.4765225946903229, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 6156 + }, + { + "epoch": 0.7153064188207958, + "grad_norm": 0.46587619185447693, + "learning_rate": 0.0001, + "loss": 1.6022, + "step": 6157 + }, + { + "epoch": 0.7154225965727563, + "grad_norm": 0.4613794684410095, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 6158 + }, + { + "epoch": 0.7155387743247168, + "grad_norm": 0.4671042263507843, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 6159 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.4612120985984802, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 6160 + }, + { + "epoch": 0.7157711298286378, + "grad_norm": 0.4640682339668274, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 6161 + }, + { + "epoch": 0.7158873075805983, + "grad_norm": 0.43429458141326904, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 6162 + }, + { + "epoch": 0.7160034853325589, + "grad_norm": 0.4491255581378937, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 6163 + }, + { + "epoch": 0.7161196630845194, + "grad_norm": 0.39764824509620667, + "learning_rate": 0.0001, + "loss": 1.344, + "step": 6164 + }, + { + "epoch": 0.7162358408364798, + "grad_norm": 0.5114927887916565, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 6165 + }, + { + "epoch": 0.7163520185884403, + "grad_norm": 0.5151812434196472, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 6166 + }, + { + "epoch": 0.7164681963404008, + "grad_norm": 0.43614163994789124, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 6167 + }, + { + "epoch": 0.7165843740923613, + "grad_norm": 0.46181315183639526, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 6168 + }, + { + "epoch": 0.7167005518443218, + "grad_norm": 0.43157726526260376, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 6169 + }, + { + "epoch": 0.7168167295962823, + "grad_norm": 0.4304739534854889, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 6170 + }, + { + "epoch": 0.7169329073482428, + "grad_norm": 0.4536237120628357, + "learning_rate": 0.0001, + "loss": 1.7147, + "step": 6171 + }, + { + "epoch": 0.7170490851002033, + "grad_norm": 0.40845513343811035, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 6172 + }, + { + "epoch": 0.7171652628521639, + "grad_norm": 0.45678746700286865, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 6173 + }, + { + "epoch": 0.7172814406041244, + "grad_norm": 0.42773115634918213, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 6174 + }, + { + "epoch": 0.7173976183560848, + "grad_norm": 0.4427812993526459, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 6175 + }, + { + "epoch": 0.7175137961080453, + "grad_norm": 0.42807790637016296, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 6176 + }, + { + "epoch": 0.7176299738600058, + "grad_norm": 0.4445224404335022, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 6177 + }, + { + "epoch": 0.7177461516119663, + "grad_norm": 0.43124210834503174, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 6178 + }, + { + "epoch": 0.7178623293639268, + "grad_norm": 0.41215458512306213, + "learning_rate": 0.0001, + "loss": 1.6969, + "step": 6179 + }, + { + "epoch": 0.7179785071158873, + "grad_norm": 0.45972126722335815, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 6180 + }, + { + "epoch": 0.7180946848678478, + "grad_norm": 0.44836103916168213, + "learning_rate": 0.0001, + "loss": 1.7967, + "step": 6181 + }, + { + "epoch": 0.7182108626198083, + "grad_norm": 0.4521453082561493, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 6182 + }, + { + "epoch": 0.7183270403717688, + "grad_norm": 0.42196735739707947, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 6183 + }, + { + "epoch": 0.7184432181237294, + "grad_norm": 0.4695570170879364, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 6184 + }, + { + "epoch": 0.7185593958756898, + "grad_norm": 0.4804345369338989, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 6185 + }, + { + "epoch": 0.7186755736276503, + "grad_norm": 0.41611528396606445, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 6186 + }, + { + "epoch": 0.7187917513796108, + "grad_norm": 0.4277372658252716, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 6187 + }, + { + "epoch": 0.7189079291315713, + "grad_norm": 0.45039102435112, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 6188 + }, + { + "epoch": 0.7190241068835318, + "grad_norm": 0.4118036925792694, + "learning_rate": 0.0001, + "loss": 1.4019, + "step": 6189 + }, + { + "epoch": 0.7191402846354923, + "grad_norm": 0.42078375816345215, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 6190 + }, + { + "epoch": 0.7192564623874528, + "grad_norm": 0.4338337779045105, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 6191 + }, + { + "epoch": 0.7193726401394133, + "grad_norm": 0.440143883228302, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 6192 + }, + { + "epoch": 0.7194888178913738, + "grad_norm": 0.45626693964004517, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 6193 + }, + { + "epoch": 0.7196049956433344, + "grad_norm": 0.48190397024154663, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 6194 + }, + { + "epoch": 0.7197211733952948, + "grad_norm": 0.42678698897361755, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 6195 + }, + { + "epoch": 0.7198373511472553, + "grad_norm": 0.48554766178131104, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 6196 + }, + { + "epoch": 0.7199535288992158, + "grad_norm": 0.4570693075656891, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 6197 + }, + { + "epoch": 0.7200697066511763, + "grad_norm": 0.48876166343688965, + "learning_rate": 0.0001, + "loss": 1.678, + "step": 6198 + }, + { + "epoch": 0.7201858844031368, + "grad_norm": 0.4803100526332855, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 6199 + }, + { + "epoch": 0.7203020621550973, + "grad_norm": 0.44159939885139465, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 6200 + }, + { + "epoch": 0.7204182399070578, + "grad_norm": 0.45352402329444885, + "learning_rate": 0.0001, + "loss": 1.7286, + "step": 6201 + }, + { + "epoch": 0.7205344176590183, + "grad_norm": 0.4505062699317932, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 6202 + }, + { + "epoch": 0.7206505954109788, + "grad_norm": 0.4428583085536957, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 6203 + }, + { + "epoch": 0.7207667731629392, + "grad_norm": 0.43122392892837524, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 6204 + }, + { + "epoch": 0.7208829509148998, + "grad_norm": 0.4514487087726593, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 6205 + }, + { + "epoch": 0.7209991286668603, + "grad_norm": 0.43971630930900574, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 6206 + }, + { + "epoch": 0.7211153064188208, + "grad_norm": 0.424069344997406, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 6207 + }, + { + "epoch": 0.7212314841707813, + "grad_norm": 0.4398737847805023, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 6208 + }, + { + "epoch": 0.7213476619227418, + "grad_norm": 0.4196438491344452, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 6209 + }, + { + "epoch": 0.7214638396747023, + "grad_norm": 0.45629531145095825, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 6210 + }, + { + "epoch": 0.7215800174266628, + "grad_norm": 0.47182023525238037, + "learning_rate": 0.0001, + "loss": 1.7316, + "step": 6211 + }, + { + "epoch": 0.7216961951786233, + "grad_norm": 0.460814893245697, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 6212 + }, + { + "epoch": 0.7218123729305838, + "grad_norm": 0.41416749358177185, + "learning_rate": 0.0001, + "loss": 1.3794, + "step": 6213 + }, + { + "epoch": 0.7219285506825442, + "grad_norm": 0.4730556607246399, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 6214 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 0.4644532799720764, + "learning_rate": 0.0001, + "loss": 1.5189, + "step": 6215 + }, + { + "epoch": 0.7221609061864653, + "grad_norm": 0.44122570753097534, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 6216 + }, + { + "epoch": 0.7222770839384258, + "grad_norm": 0.46849554777145386, + "learning_rate": 0.0001, + "loss": 1.4721, + "step": 6217 + }, + { + "epoch": 0.7223932616903863, + "grad_norm": 0.43822774291038513, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 6218 + }, + { + "epoch": 0.7225094394423468, + "grad_norm": 0.42008358240127563, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 6219 + }, + { + "epoch": 0.7226256171943073, + "grad_norm": 0.45318713784217834, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 6220 + }, + { + "epoch": 0.7227417949462678, + "grad_norm": 0.4390380084514618, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 6221 + }, + { + "epoch": 0.7228579726982283, + "grad_norm": 0.46329495310783386, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 6222 + }, + { + "epoch": 0.7229741504501888, + "grad_norm": 0.47496578097343445, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 6223 + }, + { + "epoch": 0.7230903282021492, + "grad_norm": 0.46759724617004395, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 6224 + }, + { + "epoch": 0.7232065059541097, + "grad_norm": 0.46540212631225586, + "learning_rate": 0.0001, + "loss": 1.4931, + "step": 6225 + }, + { + "epoch": 0.7233226837060703, + "grad_norm": 0.4282924234867096, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 6226 + }, + { + "epoch": 0.7234388614580308, + "grad_norm": 0.44697490334510803, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 6227 + }, + { + "epoch": 0.7235550392099913, + "grad_norm": 0.4610814154148102, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 6228 + }, + { + "epoch": 0.7236712169619518, + "grad_norm": 0.45639610290527344, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 6229 + }, + { + "epoch": 0.7237873947139123, + "grad_norm": 0.4381502568721771, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 6230 + }, + { + "epoch": 0.7239035724658728, + "grad_norm": 0.440270334482193, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 6231 + }, + { + "epoch": 0.7240197502178333, + "grad_norm": 0.4624209403991699, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 6232 + }, + { + "epoch": 0.7241359279697938, + "grad_norm": 0.4678003489971161, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 6233 + }, + { + "epoch": 0.7242521057217542, + "grad_norm": 0.44897353649139404, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 6234 + }, + { + "epoch": 0.7243682834737147, + "grad_norm": 0.431941419839859, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 6235 + }, + { + "epoch": 0.7244844612256753, + "grad_norm": 0.45239877700805664, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 6236 + }, + { + "epoch": 0.7246006389776358, + "grad_norm": 0.4519427418708801, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 6237 + }, + { + "epoch": 0.7247168167295963, + "grad_norm": 0.45957690477371216, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 6238 + }, + { + "epoch": 0.7248329944815568, + "grad_norm": 0.46946290135383606, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 6239 + }, + { + "epoch": 0.7249491722335173, + "grad_norm": 0.4527909457683563, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 6240 + }, + { + "epoch": 0.7250653499854778, + "grad_norm": 0.4612141251564026, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 6241 + }, + { + "epoch": 0.7251815277374383, + "grad_norm": 0.4190555214881897, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 6242 + }, + { + "epoch": 0.7252977054893988, + "grad_norm": 0.4483005702495575, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 6243 + }, + { + "epoch": 0.7254138832413592, + "grad_norm": 0.4509059190750122, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 6244 + }, + { + "epoch": 0.7255300609933197, + "grad_norm": 0.47739991545677185, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 6245 + }, + { + "epoch": 0.7256462387452802, + "grad_norm": 0.4522044360637665, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 6246 + }, + { + "epoch": 0.7257624164972408, + "grad_norm": 0.43035435676574707, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 6247 + }, + { + "epoch": 0.7258785942492013, + "grad_norm": 0.4420571029186249, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 6248 + }, + { + "epoch": 0.7259947720011618, + "grad_norm": 0.47003141045570374, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 6249 + }, + { + "epoch": 0.7261109497531223, + "grad_norm": 0.43513643741607666, + "learning_rate": 0.0001, + "loss": 1.7224, + "step": 6250 + }, + { + "epoch": 0.7262271275050828, + "grad_norm": 0.4268497824668884, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 6251 + }, + { + "epoch": 0.7263433052570433, + "grad_norm": 0.46211907267570496, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 6252 + }, + { + "epoch": 0.7264594830090038, + "grad_norm": 0.4459565281867981, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 6253 + }, + { + "epoch": 0.7265756607609642, + "grad_norm": 0.44835829734802246, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 6254 + }, + { + "epoch": 0.7266918385129247, + "grad_norm": 0.4384959936141968, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 6255 + }, + { + "epoch": 0.7268080162648852, + "grad_norm": 0.4654468894004822, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 6256 + }, + { + "epoch": 0.7269241940168458, + "grad_norm": 0.4371379613876343, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 6257 + }, + { + "epoch": 0.7270403717688063, + "grad_norm": 0.48108625411987305, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 6258 + }, + { + "epoch": 0.7271565495207668, + "grad_norm": 0.4787601828575134, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 6259 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.43150654435157776, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 6260 + }, + { + "epoch": 0.7273889050246878, + "grad_norm": 0.46279093623161316, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 6261 + }, + { + "epoch": 0.7275050827766483, + "grad_norm": 0.4516332447528839, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 6262 + }, + { + "epoch": 0.7276212605286088, + "grad_norm": 0.43989452719688416, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 6263 + }, + { + "epoch": 0.7277374382805692, + "grad_norm": 0.44632989168167114, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 6264 + }, + { + "epoch": 0.7278536160325297, + "grad_norm": 0.44944363832473755, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 6265 + }, + { + "epoch": 0.7279697937844902, + "grad_norm": 0.4271278977394104, + "learning_rate": 0.0001, + "loss": 1.48, + "step": 6266 + }, + { + "epoch": 0.7280859715364507, + "grad_norm": 0.4227433204650879, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 6267 + }, + { + "epoch": 0.7282021492884113, + "grad_norm": 0.42294958233833313, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 6268 + }, + { + "epoch": 0.7283183270403718, + "grad_norm": 0.43251901865005493, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 6269 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.47049641609191895, + "learning_rate": 0.0001, + "loss": 1.7649, + "step": 6270 + }, + { + "epoch": 0.7285506825442928, + "grad_norm": 0.457476943731308, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 6271 + }, + { + "epoch": 0.7286668602962533, + "grad_norm": 0.4726758599281311, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 6272 + }, + { + "epoch": 0.7287830380482138, + "grad_norm": 0.5140420198440552, + "learning_rate": 0.0001, + "loss": 1.8907, + "step": 6273 + }, + { + "epoch": 0.7288992158001742, + "grad_norm": 0.4506530165672302, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 6274 + }, + { + "epoch": 0.7290153935521347, + "grad_norm": 0.4550815522670746, + "learning_rate": 0.0001, + "loss": 1.7825, + "step": 6275 + }, + { + "epoch": 0.7291315713040952, + "grad_norm": 0.45727846026420593, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 6276 + }, + { + "epoch": 0.7292477490560557, + "grad_norm": 0.4206952154636383, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 6277 + }, + { + "epoch": 0.7293639268080163, + "grad_norm": 0.4411332905292511, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 6278 + }, + { + "epoch": 0.7294801045599768, + "grad_norm": 0.4896695613861084, + "learning_rate": 0.0001, + "loss": 1.7724, + "step": 6279 + }, + { + "epoch": 0.7295962823119373, + "grad_norm": 0.463642954826355, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 6280 + }, + { + "epoch": 0.7297124600638978, + "grad_norm": 0.41118043661117554, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 6281 + }, + { + "epoch": 0.7298286378158583, + "grad_norm": 0.4534677565097809, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 6282 + }, + { + "epoch": 0.7299448155678188, + "grad_norm": 0.4662560522556305, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 6283 + }, + { + "epoch": 0.7300609933197793, + "grad_norm": 0.4774499833583832, + "learning_rate": 0.0001, + "loss": 1.8409, + "step": 6284 + }, + { + "epoch": 0.7301771710717397, + "grad_norm": 0.45723921060562134, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 6285 + }, + { + "epoch": 0.7302933488237002, + "grad_norm": 0.44660598039627075, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 6286 + }, + { + "epoch": 0.7304095265756607, + "grad_norm": 0.44800204038619995, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 6287 + }, + { + "epoch": 0.7305257043276212, + "grad_norm": 0.45778778195381165, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 6288 + }, + { + "epoch": 0.7306418820795818, + "grad_norm": 0.431100457906723, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 6289 + }, + { + "epoch": 0.7307580598315423, + "grad_norm": 0.4813309907913208, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 6290 + }, + { + "epoch": 0.7308742375835028, + "grad_norm": 0.4887335002422333, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 6291 + }, + { + "epoch": 0.7309904153354633, + "grad_norm": 0.48292210698127747, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 6292 + }, + { + "epoch": 0.7311065930874238, + "grad_norm": 0.42640790343284607, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 6293 + }, + { + "epoch": 0.7312227708393843, + "grad_norm": 0.41172870993614197, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 6294 + }, + { + "epoch": 0.7313389485913447, + "grad_norm": 0.48400574922561646, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 6295 + }, + { + "epoch": 0.7314551263433052, + "grad_norm": 0.47430482506752014, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 6296 + }, + { + "epoch": 0.7315713040952657, + "grad_norm": 0.4179590046405792, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 6297 + }, + { + "epoch": 0.7316874818472262, + "grad_norm": 0.4300008714199066, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 6298 + }, + { + "epoch": 0.7318036595991868, + "grad_norm": 0.4634290933609009, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 6299 + }, + { + "epoch": 0.7319198373511473, + "grad_norm": 0.4361773133277893, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 6300 + }, + { + "epoch": 0.7320360151031078, + "grad_norm": 0.46928146481513977, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 6301 + }, + { + "epoch": 0.7321521928550683, + "grad_norm": 0.47918713092803955, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 6302 + }, + { + "epoch": 0.7322683706070288, + "grad_norm": 0.49823591113090515, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 6303 + }, + { + "epoch": 0.7323845483589893, + "grad_norm": 0.5099037289619446, + "learning_rate": 0.0001, + "loss": 1.8295, + "step": 6304 + }, + { + "epoch": 0.7325007261109497, + "grad_norm": 0.49155983328819275, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 6305 + }, + { + "epoch": 0.7326169038629102, + "grad_norm": 0.4120127260684967, + "learning_rate": 0.0001, + "loss": 1.3311, + "step": 6306 + }, + { + "epoch": 0.7327330816148707, + "grad_norm": 0.5385419726371765, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 6307 + }, + { + "epoch": 0.7328492593668312, + "grad_norm": 0.4356537461280823, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 6308 + }, + { + "epoch": 0.7329654371187917, + "grad_norm": 0.46573251485824585, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 6309 + }, + { + "epoch": 0.7330816148707523, + "grad_norm": 0.42254379391670227, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 6310 + }, + { + "epoch": 0.7331977926227128, + "grad_norm": 0.42466604709625244, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 6311 + }, + { + "epoch": 0.7333139703746733, + "grad_norm": 0.4290546774864197, + "learning_rate": 0.0001, + "loss": 1.3726, + "step": 6312 + }, + { + "epoch": 0.7334301481266338, + "grad_norm": 0.4976603090763092, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 6313 + }, + { + "epoch": 0.7335463258785943, + "grad_norm": 0.457592248916626, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 6314 + }, + { + "epoch": 0.7336625036305547, + "grad_norm": 0.4522661864757538, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 6315 + }, + { + "epoch": 0.7337786813825152, + "grad_norm": 0.43026474118232727, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 6316 + }, + { + "epoch": 0.7338948591344757, + "grad_norm": 0.41863980889320374, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 6317 + }, + { + "epoch": 0.7340110368864362, + "grad_norm": 0.47513189911842346, + "learning_rate": 0.0001, + "loss": 1.7717, + "step": 6318 + }, + { + "epoch": 0.7341272146383967, + "grad_norm": 0.46296703815460205, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 6319 + }, + { + "epoch": 0.7342433923903573, + "grad_norm": 0.40901249647140503, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 6320 + }, + { + "epoch": 0.7343595701423178, + "grad_norm": 0.48210811614990234, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 6321 + }, + { + "epoch": 0.7344757478942783, + "grad_norm": 0.4578268826007843, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 6322 + }, + { + "epoch": 0.7345919256462388, + "grad_norm": 0.5248122811317444, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 6323 + }, + { + "epoch": 0.7347081033981993, + "grad_norm": 0.4604836404323578, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 6324 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 0.4478675425052643, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 6325 + }, + { + "epoch": 0.7349404589021202, + "grad_norm": 0.4665616452693939, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 6326 + }, + { + "epoch": 0.7350566366540807, + "grad_norm": 0.4462968409061432, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 6327 + }, + { + "epoch": 0.7351728144060412, + "grad_norm": 0.4639495313167572, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 6328 + }, + { + "epoch": 0.7352889921580017, + "grad_norm": 0.47614338994026184, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 6329 + }, + { + "epoch": 0.7354051699099623, + "grad_norm": 0.47251033782958984, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 6330 + }, + { + "epoch": 0.7355213476619228, + "grad_norm": 0.4667409062385559, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 6331 + }, + { + "epoch": 0.7356375254138833, + "grad_norm": 0.43964049220085144, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 6332 + }, + { + "epoch": 0.7357537031658438, + "grad_norm": 0.43899863958358765, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 6333 + }, + { + "epoch": 0.7358698809178043, + "grad_norm": 0.4609370231628418, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 6334 + }, + { + "epoch": 0.7359860586697647, + "grad_norm": 0.4471205472946167, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 6335 + }, + { + "epoch": 0.7361022364217252, + "grad_norm": 0.4316443204879761, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 6336 + }, + { + "epoch": 0.7362184141736857, + "grad_norm": 0.4466089606285095, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 6337 + }, + { + "epoch": 0.7363345919256462, + "grad_norm": 0.4607985317707062, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 6338 + }, + { + "epoch": 0.7364507696776067, + "grad_norm": 0.46263134479522705, + "learning_rate": 0.0001, + "loss": 1.7685, + "step": 6339 + }, + { + "epoch": 0.7365669474295672, + "grad_norm": 0.4616027772426605, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 6340 + }, + { + "epoch": 0.7366831251815278, + "grad_norm": 0.4539996385574341, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 6341 + }, + { + "epoch": 0.7367993029334883, + "grad_norm": 0.4599131643772125, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 6342 + }, + { + "epoch": 0.7369154806854488, + "grad_norm": 0.4772374927997589, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 6343 + }, + { + "epoch": 0.7370316584374093, + "grad_norm": 0.4597277343273163, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 6344 + }, + { + "epoch": 0.7371478361893697, + "grad_norm": 0.4844377934932709, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 6345 + }, + { + "epoch": 0.7372640139413302, + "grad_norm": 0.4434802532196045, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 6346 + }, + { + "epoch": 0.7373801916932907, + "grad_norm": 0.48957550525665283, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 6347 + }, + { + "epoch": 0.7374963694452512, + "grad_norm": 0.4729391932487488, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 6348 + }, + { + "epoch": 0.7376125471972117, + "grad_norm": 0.4368651807308197, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 6349 + }, + { + "epoch": 0.7377287249491722, + "grad_norm": 0.4742048978805542, + "learning_rate": 0.0001, + "loss": 1.8506, + "step": 6350 + }, + { + "epoch": 0.7378449027011328, + "grad_norm": 0.4797591269016266, + "learning_rate": 0.0001, + "loss": 1.6867, + "step": 6351 + }, + { + "epoch": 0.7379610804530933, + "grad_norm": 0.46037012338638306, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 6352 + }, + { + "epoch": 0.7380772582050538, + "grad_norm": 0.45086851716041565, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 6353 + }, + { + "epoch": 0.7381934359570143, + "grad_norm": 0.4557948410511017, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 6354 + }, + { + "epoch": 0.7383096137089747, + "grad_norm": 0.4523628354072571, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 6355 + }, + { + "epoch": 0.7384257914609352, + "grad_norm": 0.43781834840774536, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 6356 + }, + { + "epoch": 0.7385419692128957, + "grad_norm": 0.5110533833503723, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 6357 + }, + { + "epoch": 0.7386581469648562, + "grad_norm": 0.4702020585536957, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 6358 + }, + { + "epoch": 0.7387743247168167, + "grad_norm": 0.45719245076179504, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 6359 + }, + { + "epoch": 0.7388905024687772, + "grad_norm": 0.45897620916366577, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 6360 + }, + { + "epoch": 0.7390066802207377, + "grad_norm": 0.48604246973991394, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 6361 + }, + { + "epoch": 0.7391228579726983, + "grad_norm": 0.4290804862976074, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 6362 + }, + { + "epoch": 0.7392390357246588, + "grad_norm": 0.47816741466522217, + "learning_rate": 0.0001, + "loss": 1.5016, + "step": 6363 + }, + { + "epoch": 0.7393552134766193, + "grad_norm": 0.5012046098709106, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 6364 + }, + { + "epoch": 0.7394713912285797, + "grad_norm": 0.5031648874282837, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 6365 + }, + { + "epoch": 0.7395875689805402, + "grad_norm": 0.46205514669418335, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 6366 + }, + { + "epoch": 0.7397037467325007, + "grad_norm": 0.4918919801712036, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 6367 + }, + { + "epoch": 0.7398199244844612, + "grad_norm": 0.4417077898979187, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 6368 + }, + { + "epoch": 0.7399361022364217, + "grad_norm": 0.5084665417671204, + "learning_rate": 0.0001, + "loss": 1.8225, + "step": 6369 + }, + { + "epoch": 0.7400522799883822, + "grad_norm": 0.4046757221221924, + "learning_rate": 0.0001, + "loss": 1.3748, + "step": 6370 + }, + { + "epoch": 0.7401684577403427, + "grad_norm": 0.4671691358089447, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 6371 + }, + { + "epoch": 0.7402846354923033, + "grad_norm": 0.41901758313179016, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 6372 + }, + { + "epoch": 0.7404008132442638, + "grad_norm": 0.4258348345756531, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 6373 + }, + { + "epoch": 0.7405169909962243, + "grad_norm": 0.44732606410980225, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 6374 + }, + { + "epoch": 0.7406331687481847, + "grad_norm": 0.4379178583621979, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 6375 + }, + { + "epoch": 0.7407493465001452, + "grad_norm": 0.44309452176094055, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 6376 + }, + { + "epoch": 0.7408655242521057, + "grad_norm": 0.47775688767433167, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 6377 + }, + { + "epoch": 0.7409817020040662, + "grad_norm": 0.4426514208316803, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 6378 + }, + { + "epoch": 0.7410978797560267, + "grad_norm": 0.45742908120155334, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 6379 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 0.46088162064552307, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 6380 + }, + { + "epoch": 0.7413302352599477, + "grad_norm": 0.4453122317790985, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 6381 + }, + { + "epoch": 0.7414464130119082, + "grad_norm": 0.4657224714756012, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 6382 + }, + { + "epoch": 0.7415625907638688, + "grad_norm": 0.47661951184272766, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 6383 + }, + { + "epoch": 0.7416787685158293, + "grad_norm": 0.4470673203468323, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 6384 + }, + { + "epoch": 0.7417949462677897, + "grad_norm": 0.4273890554904938, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 6385 + }, + { + "epoch": 0.7419111240197502, + "grad_norm": 0.4307868778705597, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 6386 + }, + { + "epoch": 0.7420273017717107, + "grad_norm": 0.5137743353843689, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 6387 + }, + { + "epoch": 0.7421434795236712, + "grad_norm": 0.4485335648059845, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 6388 + }, + { + "epoch": 0.7422596572756317, + "grad_norm": 0.46610766649246216, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 6389 + }, + { + "epoch": 0.7423758350275922, + "grad_norm": 0.4473322629928589, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 6390 + }, + { + "epoch": 0.7424920127795527, + "grad_norm": 0.4895930886268616, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 6391 + }, + { + "epoch": 0.7426081905315132, + "grad_norm": 0.48194587230682373, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 6392 + }, + { + "epoch": 0.7427243682834738, + "grad_norm": 0.44431936740875244, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 6393 + }, + { + "epoch": 0.7428405460354343, + "grad_norm": 0.4284924864768982, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 6394 + }, + { + "epoch": 0.7429567237873947, + "grad_norm": 0.4462593197822571, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 6395 + }, + { + "epoch": 0.7430729015393552, + "grad_norm": 0.4361656606197357, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 6396 + }, + { + "epoch": 0.7431890792913157, + "grad_norm": 0.4379312992095947, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 6397 + }, + { + "epoch": 0.7433052570432762, + "grad_norm": 0.4425080418586731, + "learning_rate": 0.0001, + "loss": 1.4492, + "step": 6398 + }, + { + "epoch": 0.7434214347952367, + "grad_norm": 0.4721614718437195, + "learning_rate": 0.0001, + "loss": 1.7602, + "step": 6399 + }, + { + "epoch": 0.7435376125471972, + "grad_norm": 0.45762506127357483, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 6400 + }, + { + "epoch": 0.7436537902991577, + "grad_norm": 0.44420626759529114, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 6401 + }, + { + "epoch": 0.7437699680511182, + "grad_norm": 0.4328571856021881, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 6402 + }, + { + "epoch": 0.7438861458030787, + "grad_norm": 0.4925638735294342, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 6403 + }, + { + "epoch": 0.7440023235550393, + "grad_norm": 0.43524104356765747, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 6404 + }, + { + "epoch": 0.7441185013069997, + "grad_norm": 0.4498136043548584, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 6405 + }, + { + "epoch": 0.7442346790589602, + "grad_norm": 0.43513327836990356, + "learning_rate": 0.0001, + "loss": 1.3918, + "step": 6406 + }, + { + "epoch": 0.7443508568109207, + "grad_norm": 0.4613160789012909, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 6407 + }, + { + "epoch": 0.7444670345628812, + "grad_norm": 0.4584994614124298, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 6408 + }, + { + "epoch": 0.7445832123148417, + "grad_norm": 0.49195122718811035, + "learning_rate": 0.0001, + "loss": 1.8131, + "step": 6409 + }, + { + "epoch": 0.7446993900668022, + "grad_norm": 0.4556443393230438, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 6410 + }, + { + "epoch": 0.7448155678187627, + "grad_norm": 0.46693944931030273, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 6411 + }, + { + "epoch": 0.7449317455707232, + "grad_norm": 0.46386128664016724, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 6412 + }, + { + "epoch": 0.7450479233226837, + "grad_norm": 0.43832361698150635, + "learning_rate": 0.0001, + "loss": 1.5779, + "step": 6413 + }, + { + "epoch": 0.7451641010746443, + "grad_norm": 0.4549694359302521, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 6414 + }, + { + "epoch": 0.7452802788266047, + "grad_norm": 0.44481566548347473, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 6415 + }, + { + "epoch": 0.7453964565785652, + "grad_norm": 0.43813270330429077, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 6416 + }, + { + "epoch": 0.7455126343305257, + "grad_norm": 0.44562163949012756, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 6417 + }, + { + "epoch": 0.7456288120824862, + "grad_norm": 0.4208749532699585, + "learning_rate": 0.0001, + "loss": 1.4426, + "step": 6418 + }, + { + "epoch": 0.7457449898344467, + "grad_norm": 0.4573444426059723, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 6419 + }, + { + "epoch": 0.7458611675864072, + "grad_norm": 0.4514586329460144, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 6420 + }, + { + "epoch": 0.7459773453383677, + "grad_norm": 0.427177757024765, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 6421 + }, + { + "epoch": 0.7460935230903282, + "grad_norm": 0.43251994252204895, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 6422 + }, + { + "epoch": 0.7462097008422887, + "grad_norm": 0.4547024369239807, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 6423 + }, + { + "epoch": 0.7463258785942491, + "grad_norm": 0.454458087682724, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 6424 + }, + { + "epoch": 0.7464420563462097, + "grad_norm": 0.44266244769096375, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 6425 + }, + { + "epoch": 0.7465582340981702, + "grad_norm": 0.4643990099430084, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 6426 + }, + { + "epoch": 0.7466744118501307, + "grad_norm": 0.4686664938926697, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 6427 + }, + { + "epoch": 0.7467905896020912, + "grad_norm": 0.49029332399368286, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 6428 + }, + { + "epoch": 0.7469067673540517, + "grad_norm": 0.46670180559158325, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 6429 + }, + { + "epoch": 0.7470229451060122, + "grad_norm": 0.44535404443740845, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 6430 + }, + { + "epoch": 0.7471391228579727, + "grad_norm": 0.4496926963329315, + "learning_rate": 0.0001, + "loss": 1.4151, + "step": 6431 + }, + { + "epoch": 0.7472553006099332, + "grad_norm": 0.4497212767601013, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 6432 + }, + { + "epoch": 0.7473714783618937, + "grad_norm": 0.44064804911613464, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 6433 + }, + { + "epoch": 0.7474876561138541, + "grad_norm": 0.459143728017807, + "learning_rate": 0.0001, + "loss": 1.6105, + "step": 6434 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 0.4302321672439575, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 6435 + }, + { + "epoch": 0.7477200116177752, + "grad_norm": 0.4803028702735901, + "learning_rate": 0.0001, + "loss": 1.6813, + "step": 6436 + }, + { + "epoch": 0.7478361893697357, + "grad_norm": 0.41635861992836, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 6437 + }, + { + "epoch": 0.7479523671216962, + "grad_norm": 0.44405534863471985, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 6438 + }, + { + "epoch": 0.7480685448736567, + "grad_norm": 0.4449010491371155, + "learning_rate": 0.0001, + "loss": 1.7648, + "step": 6439 + }, + { + "epoch": 0.7481847226256172, + "grad_norm": 0.45254096388816833, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 6440 + }, + { + "epoch": 0.7483009003775777, + "grad_norm": 0.4546097218990326, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 6441 + }, + { + "epoch": 0.7484170781295382, + "grad_norm": 0.44353100657463074, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 6442 + }, + { + "epoch": 0.7485332558814987, + "grad_norm": 0.4421907365322113, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 6443 + }, + { + "epoch": 0.7486494336334591, + "grad_norm": 0.4550931751728058, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 6444 + }, + { + "epoch": 0.7487656113854196, + "grad_norm": 0.4081590175628662, + "learning_rate": 0.0001, + "loss": 1.3085, + "step": 6445 + }, + { + "epoch": 0.7488817891373802, + "grad_norm": 0.41531556844711304, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 6446 + }, + { + "epoch": 0.7489979668893407, + "grad_norm": 0.46945273876190186, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 6447 + }, + { + "epoch": 0.7491141446413012, + "grad_norm": 0.4772491455078125, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 6448 + }, + { + "epoch": 0.7492303223932617, + "grad_norm": 0.4390372335910797, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 6449 + }, + { + "epoch": 0.7493465001452222, + "grad_norm": 0.425656795501709, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 6450 + }, + { + "epoch": 0.7494626778971827, + "grad_norm": 0.44285711646080017, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 6451 + }, + { + "epoch": 0.7495788556491432, + "grad_norm": 0.44809186458587646, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 6452 + }, + { + "epoch": 0.7496950334011037, + "grad_norm": 0.44603654742240906, + "learning_rate": 0.0001, + "loss": 1.4245, + "step": 6453 + }, + { + "epoch": 0.7498112111530641, + "grad_norm": 0.45368316769599915, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 6454 + }, + { + "epoch": 0.7499273889050246, + "grad_norm": 0.464532732963562, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 6455 + }, + { + "epoch": 0.7500435666569852, + "grad_norm": 0.43661272525787354, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 6456 + }, + { + "epoch": 0.7501597444089457, + "grad_norm": 0.4373878240585327, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 6457 + }, + { + "epoch": 0.7502759221609062, + "grad_norm": 0.45285502076148987, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 6458 + }, + { + "epoch": 0.7503920999128667, + "grad_norm": 0.4704199433326721, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 6459 + }, + { + "epoch": 0.7505082776648272, + "grad_norm": 0.45915988087654114, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 6460 + }, + { + "epoch": 0.7506244554167877, + "grad_norm": 0.46170055866241455, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 6461 + }, + { + "epoch": 0.7507406331687482, + "grad_norm": 0.46552303433418274, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 6462 + }, + { + "epoch": 0.7508568109207087, + "grad_norm": 0.47524699568748474, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 6463 + }, + { + "epoch": 0.7509729886726692, + "grad_norm": 0.4851406514644623, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 6464 + }, + { + "epoch": 0.7510891664246296, + "grad_norm": 0.46212369203567505, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 6465 + }, + { + "epoch": 0.7512053441765901, + "grad_norm": 0.4637957811355591, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 6466 + }, + { + "epoch": 0.7513215219285507, + "grad_norm": 0.46081292629241943, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 6467 + }, + { + "epoch": 0.7514376996805112, + "grad_norm": 0.418517529964447, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 6468 + }, + { + "epoch": 0.7515538774324717, + "grad_norm": 0.42126187682151794, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 6469 + }, + { + "epoch": 0.7516700551844322, + "grad_norm": 0.45902541279792786, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 6470 + }, + { + "epoch": 0.7517862329363927, + "grad_norm": 0.42754048109054565, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 6471 + }, + { + "epoch": 0.7519024106883532, + "grad_norm": 0.4438011348247528, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 6472 + }, + { + "epoch": 0.7520185884403137, + "grad_norm": 0.46611130237579346, + "learning_rate": 0.0001, + "loss": 1.7748, + "step": 6473 + }, + { + "epoch": 0.7521347661922742, + "grad_norm": 0.43576914072036743, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 6474 + }, + { + "epoch": 0.7522509439442346, + "grad_norm": 0.4808332622051239, + "learning_rate": 0.0001, + "loss": 1.6467, + "step": 6475 + }, + { + "epoch": 0.7523671216961951, + "grad_norm": 0.4799876809120178, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 6476 + }, + { + "epoch": 0.7524832994481557, + "grad_norm": 0.4838355481624603, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 6477 + }, + { + "epoch": 0.7525994772001162, + "grad_norm": 0.47365084290504456, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 6478 + }, + { + "epoch": 0.7527156549520767, + "grad_norm": 0.4396180510520935, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 6479 + }, + { + "epoch": 0.7528318327040372, + "grad_norm": 0.4406949579715729, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 6480 + }, + { + "epoch": 0.7529480104559977, + "grad_norm": 0.43334612250328064, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 6481 + }, + { + "epoch": 0.7530641882079582, + "grad_norm": 0.4289649426937103, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 6482 + }, + { + "epoch": 0.7531803659599187, + "grad_norm": 0.4250386953353882, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 6483 + }, + { + "epoch": 0.7532965437118792, + "grad_norm": 0.4700089395046234, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 6484 + }, + { + "epoch": 0.7534127214638396, + "grad_norm": 0.4487655460834503, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 6485 + }, + { + "epoch": 0.7535288992158001, + "grad_norm": 0.44160258769989014, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 6486 + }, + { + "epoch": 0.7536450769677606, + "grad_norm": 0.43378591537475586, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 6487 + }, + { + "epoch": 0.7537612547197212, + "grad_norm": 0.4056842029094696, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 6488 + }, + { + "epoch": 0.7538774324716817, + "grad_norm": 0.4529426693916321, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 6489 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.5032495856285095, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 6490 + }, + { + "epoch": 0.7541097879756027, + "grad_norm": 0.4208425283432007, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 6491 + }, + { + "epoch": 0.7542259657275632, + "grad_norm": 0.429240345954895, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 6492 + }, + { + "epoch": 0.7543421434795237, + "grad_norm": 0.45296066999435425, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 6493 + }, + { + "epoch": 0.7544583212314842, + "grad_norm": 0.4730745255947113, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 6494 + }, + { + "epoch": 0.7545744989834446, + "grad_norm": 0.43891751766204834, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 6495 + }, + { + "epoch": 0.7546906767354051, + "grad_norm": 0.4880208671092987, + "learning_rate": 0.0001, + "loss": 1.7739, + "step": 6496 + }, + { + "epoch": 0.7548068544873656, + "grad_norm": 0.43888798356056213, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 6497 + }, + { + "epoch": 0.7549230322393262, + "grad_norm": 0.42402908205986023, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 6498 + }, + { + "epoch": 0.7550392099912867, + "grad_norm": 0.43991389870643616, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 6499 + }, + { + "epoch": 0.7551553877432472, + "grad_norm": 0.43535780906677246, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 6500 + }, + { + "epoch": 0.7552715654952077, + "grad_norm": 0.4613569974899292, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 6501 + }, + { + "epoch": 0.7553877432471682, + "grad_norm": 0.4456695020198822, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 6502 + }, + { + "epoch": 0.7555039209991287, + "grad_norm": 0.4481378197669983, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 6503 + }, + { + "epoch": 0.7556200987510892, + "grad_norm": 0.45384448766708374, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 6504 + }, + { + "epoch": 0.7557362765030496, + "grad_norm": 0.474298894405365, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 6505 + }, + { + "epoch": 0.7558524542550101, + "grad_norm": 0.478977233171463, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 6506 + }, + { + "epoch": 0.7559686320069706, + "grad_norm": 0.45256179571151733, + "learning_rate": 0.0001, + "loss": 1.745, + "step": 6507 + }, + { + "epoch": 0.7560848097589312, + "grad_norm": 0.465461790561676, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 6508 + }, + { + "epoch": 0.7562009875108917, + "grad_norm": 0.4733600318431854, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 6509 + }, + { + "epoch": 0.7563171652628522, + "grad_norm": 0.46702197194099426, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 6510 + }, + { + "epoch": 0.7564333430148127, + "grad_norm": 0.4695473909378052, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 6511 + }, + { + "epoch": 0.7565495207667732, + "grad_norm": 0.48179465532302856, + "learning_rate": 0.0001, + "loss": 1.7386, + "step": 6512 + }, + { + "epoch": 0.7566656985187337, + "grad_norm": 0.4340745210647583, + "learning_rate": 0.0001, + "loss": 1.385, + "step": 6513 + }, + { + "epoch": 0.7567818762706942, + "grad_norm": 0.4561876058578491, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 6514 + }, + { + "epoch": 0.7568980540226546, + "grad_norm": 0.4735701084136963, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 6515 + }, + { + "epoch": 0.7570142317746151, + "grad_norm": 0.5096227526664734, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 6516 + }, + { + "epoch": 0.7571304095265756, + "grad_norm": 0.5059962868690491, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 6517 + }, + { + "epoch": 0.7572465872785361, + "grad_norm": 0.4230511784553528, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 6518 + }, + { + "epoch": 0.7573627650304967, + "grad_norm": 0.5008574724197388, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 6519 + }, + { + "epoch": 0.7574789427824572, + "grad_norm": 0.45503777265548706, + "learning_rate": 0.0001, + "loss": 1.582, + "step": 6520 + }, + { + "epoch": 0.7575951205344177, + "grad_norm": 0.49144798517227173, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 6521 + }, + { + "epoch": 0.7577112982863782, + "grad_norm": 0.450183629989624, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 6522 + }, + { + "epoch": 0.7578274760383387, + "grad_norm": 0.4511738717556, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 6523 + }, + { + "epoch": 0.7579436537902992, + "grad_norm": 0.45936381816864014, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 6524 + }, + { + "epoch": 0.7580598315422596, + "grad_norm": 0.4909375309944153, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 6525 + }, + { + "epoch": 0.7581760092942201, + "grad_norm": 0.426647812128067, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 6526 + }, + { + "epoch": 0.7582921870461806, + "grad_norm": 0.42700493335723877, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 6527 + }, + { + "epoch": 0.7584083647981411, + "grad_norm": 0.4363797903060913, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 6528 + }, + { + "epoch": 0.7585245425501017, + "grad_norm": 0.47377872467041016, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 6529 + }, + { + "epoch": 0.7586407203020622, + "grad_norm": 0.45347169041633606, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 6530 + }, + { + "epoch": 0.7587568980540227, + "grad_norm": 0.4400898814201355, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 6531 + }, + { + "epoch": 0.7588730758059832, + "grad_norm": 0.4629836678504944, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 6532 + }, + { + "epoch": 0.7589892535579437, + "grad_norm": 0.43157103657722473, + "learning_rate": 0.0001, + "loss": 1.3997, + "step": 6533 + }, + { + "epoch": 0.7591054313099042, + "grad_norm": 0.44874757528305054, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 6534 + }, + { + "epoch": 0.7592216090618646, + "grad_norm": 0.4572241008281708, + "learning_rate": 0.0001, + "loss": 1.7732, + "step": 6535 + }, + { + "epoch": 0.7593377868138251, + "grad_norm": 0.509334146976471, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 6536 + }, + { + "epoch": 0.7594539645657856, + "grad_norm": 0.45650917291641235, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 6537 + }, + { + "epoch": 0.7595701423177461, + "grad_norm": 0.4282233715057373, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 6538 + }, + { + "epoch": 0.7596863200697066, + "grad_norm": 0.49342942237854004, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 6539 + }, + { + "epoch": 0.7598024978216672, + "grad_norm": 0.45365840196609497, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 6540 + }, + { + "epoch": 0.7599186755736277, + "grad_norm": 0.4653255343437195, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 6541 + }, + { + "epoch": 0.7600348533255882, + "grad_norm": 0.4376204013824463, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 6542 + }, + { + "epoch": 0.7601510310775487, + "grad_norm": 0.42907100915908813, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 6543 + }, + { + "epoch": 0.7602672088295092, + "grad_norm": 0.4851232171058655, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 6544 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 0.4842982590198517, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 6545 + }, + { + "epoch": 0.7604995643334301, + "grad_norm": 0.44596225023269653, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 6546 + }, + { + "epoch": 0.7606157420853906, + "grad_norm": 0.42956697940826416, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 6547 + }, + { + "epoch": 0.7607319198373511, + "grad_norm": 0.4747844934463501, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 6548 + }, + { + "epoch": 0.7608480975893116, + "grad_norm": 0.4778608977794647, + "learning_rate": 0.0001, + "loss": 1.7017, + "step": 6549 + }, + { + "epoch": 0.7609642753412722, + "grad_norm": 0.4931018352508545, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 6550 + }, + { + "epoch": 0.7610804530932327, + "grad_norm": 0.4602007567882538, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 6551 + }, + { + "epoch": 0.7611966308451932, + "grad_norm": 0.4534691572189331, + "learning_rate": 0.0001, + "loss": 1.418, + "step": 6552 + }, + { + "epoch": 0.7613128085971537, + "grad_norm": 0.4759400486946106, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 6553 + }, + { + "epoch": 0.7614289863491142, + "grad_norm": 0.44911110401153564, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 6554 + }, + { + "epoch": 0.7615451641010746, + "grad_norm": 0.4117274582386017, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 6555 + }, + { + "epoch": 0.7616613418530351, + "grad_norm": 0.44577354192733765, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 6556 + }, + { + "epoch": 0.7617775196049956, + "grad_norm": 0.4217338562011719, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 6557 + }, + { + "epoch": 0.7618936973569561, + "grad_norm": 0.4698602855205536, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 6558 + }, + { + "epoch": 0.7620098751089166, + "grad_norm": 0.44901999831199646, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 6559 + }, + { + "epoch": 0.7621260528608771, + "grad_norm": 0.41744062304496765, + "learning_rate": 0.0001, + "loss": 1.4392, + "step": 6560 + }, + { + "epoch": 0.7622422306128377, + "grad_norm": 0.4368325471878052, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 6561 + }, + { + "epoch": 0.7623584083647982, + "grad_norm": 0.43781906366348267, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 6562 + }, + { + "epoch": 0.7624745861167587, + "grad_norm": 0.4672465920448303, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 6563 + }, + { + "epoch": 0.7625907638687192, + "grad_norm": 0.44556140899658203, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 6564 + }, + { + "epoch": 0.7627069416206796, + "grad_norm": 0.47782960534095764, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 6565 + }, + { + "epoch": 0.7628231193726401, + "grad_norm": 0.4672389328479767, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 6566 + }, + { + "epoch": 0.7629392971246006, + "grad_norm": 0.44168704748153687, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 6567 + }, + { + "epoch": 0.7630554748765611, + "grad_norm": 0.47608694434165955, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 6568 + }, + { + "epoch": 0.7631716526285216, + "grad_norm": 0.4470519423484802, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 6569 + }, + { + "epoch": 0.7632878303804821, + "grad_norm": 0.41022247076034546, + "learning_rate": 0.0001, + "loss": 1.3155, + "step": 6570 + }, + { + "epoch": 0.7634040081324427, + "grad_norm": 0.4504031240940094, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 6571 + }, + { + "epoch": 0.7635201858844032, + "grad_norm": 0.48684489727020264, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 6572 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.43762362003326416, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 6573 + }, + { + "epoch": 0.7637525413883242, + "grad_norm": 0.47748515009880066, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 6574 + }, + { + "epoch": 0.7638687191402846, + "grad_norm": 0.4670480191707611, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 6575 + }, + { + "epoch": 0.7639848968922451, + "grad_norm": 0.45016875863075256, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 6576 + }, + { + "epoch": 0.7641010746442056, + "grad_norm": 0.4202631115913391, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 6577 + }, + { + "epoch": 0.7642172523961661, + "grad_norm": 0.4666217565536499, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 6578 + }, + { + "epoch": 0.7643334301481266, + "grad_norm": 0.46688103675842285, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 6579 + }, + { + "epoch": 0.7644496079000871, + "grad_norm": 0.4328000843524933, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 6580 + }, + { + "epoch": 0.7645657856520476, + "grad_norm": 0.4551675617694855, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 6581 + }, + { + "epoch": 0.7646819634040082, + "grad_norm": 0.4549828767776489, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 6582 + }, + { + "epoch": 0.7647981411559687, + "grad_norm": 0.45938196778297424, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 6583 + }, + { + "epoch": 0.7649143189079292, + "grad_norm": 0.4694269299507141, + "learning_rate": 0.0001, + "loss": 1.5216, + "step": 6584 + }, + { + "epoch": 0.7650304966598896, + "grad_norm": 0.47541555762290955, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 6585 + }, + { + "epoch": 0.7651466744118501, + "grad_norm": 0.432059645652771, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 6586 + }, + { + "epoch": 0.7652628521638106, + "grad_norm": 0.48397889733314514, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 6587 + }, + { + "epoch": 0.7653790299157711, + "grad_norm": 0.437933087348938, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 6588 + }, + { + "epoch": 0.7654952076677316, + "grad_norm": 0.4440605044364929, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 6589 + }, + { + "epoch": 0.7656113854196921, + "grad_norm": 0.447862833738327, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 6590 + }, + { + "epoch": 0.7657275631716526, + "grad_norm": 0.41971054673194885, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 6591 + }, + { + "epoch": 0.7658437409236132, + "grad_norm": 0.4456073045730591, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 6592 + }, + { + "epoch": 0.7659599186755737, + "grad_norm": 0.43009260296821594, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 6593 + }, + { + "epoch": 0.7660760964275342, + "grad_norm": 0.482006698846817, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 6594 + }, + { + "epoch": 0.7661922741794946, + "grad_norm": 0.4667458236217499, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 6595 + }, + { + "epoch": 0.7663084519314551, + "grad_norm": 0.4783475995063782, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 6596 + }, + { + "epoch": 0.7664246296834156, + "grad_norm": 0.4709714353084564, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 6597 + }, + { + "epoch": 0.7665408074353761, + "grad_norm": 0.45129141211509705, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 6598 + }, + { + "epoch": 0.7666569851873366, + "grad_norm": 0.44044631719589233, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 6599 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.4458431899547577, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 6600 + }, + { + "epoch": 0.7668893406912576, + "grad_norm": 0.4668479263782501, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 6601 + }, + { + "epoch": 0.7670055184432181, + "grad_norm": 0.4631541073322296, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 6602 + }, + { + "epoch": 0.7671216961951787, + "grad_norm": 0.47970542311668396, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 6603 + }, + { + "epoch": 0.7672378739471392, + "grad_norm": 0.4707159101963043, + "learning_rate": 0.0001, + "loss": 1.7697, + "step": 6604 + }, + { + "epoch": 0.7673540516990996, + "grad_norm": 0.48300158977508545, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 6605 + }, + { + "epoch": 0.7674702294510601, + "grad_norm": 0.47976362705230713, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 6606 + }, + { + "epoch": 0.7675864072030206, + "grad_norm": 0.4959631860256195, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 6607 + }, + { + "epoch": 0.7677025849549811, + "grad_norm": 0.4722089171409607, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 6608 + }, + { + "epoch": 0.7678187627069416, + "grad_norm": 0.5199443101882935, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 6609 + }, + { + "epoch": 0.7679349404589021, + "grad_norm": 0.4067780375480652, + "learning_rate": 0.0001, + "loss": 1.3372, + "step": 6610 + }, + { + "epoch": 0.7680511182108626, + "grad_norm": 0.43238359689712524, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 6611 + }, + { + "epoch": 0.7681672959628231, + "grad_norm": 0.5341728925704956, + "learning_rate": 0.0001, + "loss": 1.8739, + "step": 6612 + }, + { + "epoch": 0.7682834737147837, + "grad_norm": 0.46490055322647095, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 6613 + }, + { + "epoch": 0.7683996514667442, + "grad_norm": 0.4309159219264984, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 6614 + }, + { + "epoch": 0.7685158292187046, + "grad_norm": 0.4642525017261505, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 6615 + }, + { + "epoch": 0.7686320069706651, + "grad_norm": 0.46897363662719727, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 6616 + }, + { + "epoch": 0.7687481847226256, + "grad_norm": 0.43041566014289856, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 6617 + }, + { + "epoch": 0.7688643624745861, + "grad_norm": 0.42853835225105286, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 6618 + }, + { + "epoch": 0.7689805402265466, + "grad_norm": 0.4620231091976166, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 6619 + }, + { + "epoch": 0.7690967179785071, + "grad_norm": 0.44504088163375854, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 6620 + }, + { + "epoch": 0.7692128957304676, + "grad_norm": 0.48609089851379395, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 6621 + }, + { + "epoch": 0.7693290734824281, + "grad_norm": 0.483452171087265, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 6622 + }, + { + "epoch": 0.7694452512343886, + "grad_norm": 0.4754545986652374, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 6623 + }, + { + "epoch": 0.7695614289863492, + "grad_norm": 0.4533579349517822, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 6624 + }, + { + "epoch": 0.7696776067383096, + "grad_norm": 0.4664144814014435, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 6625 + }, + { + "epoch": 0.7697937844902701, + "grad_norm": 0.48113691806793213, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 6626 + }, + { + "epoch": 0.7699099622422306, + "grad_norm": 0.4325944185256958, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 6627 + }, + { + "epoch": 0.7700261399941911, + "grad_norm": 0.4415172338485718, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 6628 + }, + { + "epoch": 0.7701423177461516, + "grad_norm": 0.41996294260025024, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 6629 + }, + { + "epoch": 0.7702584954981121, + "grad_norm": 0.4572359025478363, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 6630 + }, + { + "epoch": 0.7703746732500726, + "grad_norm": 0.4630671441555023, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 6631 + }, + { + "epoch": 0.7704908510020331, + "grad_norm": 0.4352489411830902, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 6632 + }, + { + "epoch": 0.7706070287539936, + "grad_norm": 0.4531479477882385, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 6633 + }, + { + "epoch": 0.7707232065059542, + "grad_norm": 0.5102584362030029, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 6634 + }, + { + "epoch": 0.7708393842579147, + "grad_norm": 0.45892634987831116, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 6635 + }, + { + "epoch": 0.7709555620098751, + "grad_norm": 0.4718113839626312, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 6636 + }, + { + "epoch": 0.7710717397618356, + "grad_norm": 0.4592994749546051, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 6637 + }, + { + "epoch": 0.7711879175137961, + "grad_norm": 0.4589068293571472, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 6638 + }, + { + "epoch": 0.7713040952657566, + "grad_norm": 0.41312143206596375, + "learning_rate": 0.0001, + "loss": 1.3861, + "step": 6639 + }, + { + "epoch": 0.7714202730177171, + "grad_norm": 0.4337233901023865, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 6640 + }, + { + "epoch": 0.7715364507696776, + "grad_norm": 0.4643138349056244, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 6641 + }, + { + "epoch": 0.7716526285216381, + "grad_norm": 0.485331654548645, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 6642 + }, + { + "epoch": 0.7717688062735986, + "grad_norm": 0.4634449779987335, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 6643 + }, + { + "epoch": 0.771884984025559, + "grad_norm": 0.45036226511001587, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 6644 + }, + { + "epoch": 0.7720011617775197, + "grad_norm": 0.45549461245536804, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 6645 + }, + { + "epoch": 0.7721173395294801, + "grad_norm": 0.4745498299598694, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 6646 + }, + { + "epoch": 0.7722335172814406, + "grad_norm": 0.5061119198799133, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 6647 + }, + { + "epoch": 0.7723496950334011, + "grad_norm": 0.4423942565917969, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 6648 + }, + { + "epoch": 0.7724658727853616, + "grad_norm": 0.46046459674835205, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 6649 + }, + { + "epoch": 0.7725820505373221, + "grad_norm": 0.4494078755378723, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 6650 + }, + { + "epoch": 0.7726982282892826, + "grad_norm": 0.4689868986606598, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 6651 + }, + { + "epoch": 0.7728144060412431, + "grad_norm": 0.42947614192962646, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 6652 + }, + { + "epoch": 0.7729305837932036, + "grad_norm": 0.424402117729187, + "learning_rate": 0.0001, + "loss": 1.3695, + "step": 6653 + }, + { + "epoch": 0.773046761545164, + "grad_norm": 0.44652658700942993, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 6654 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 0.4878056049346924, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 6655 + }, + { + "epoch": 0.7732791170490851, + "grad_norm": 0.4504278898239136, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 6656 + }, + { + "epoch": 0.7733952948010456, + "grad_norm": 0.43324804306030273, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 6657 + }, + { + "epoch": 0.7735114725530061, + "grad_norm": 0.47235342860221863, + "learning_rate": 0.0001, + "loss": 1.7402, + "step": 6658 + }, + { + "epoch": 0.7736276503049666, + "grad_norm": 0.4258509576320648, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 6659 + }, + { + "epoch": 0.7737438280569271, + "grad_norm": 0.47323077917099, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 6660 + }, + { + "epoch": 0.7738600058088876, + "grad_norm": 0.44588038325309753, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 6661 + }, + { + "epoch": 0.7739761835608481, + "grad_norm": 0.4483187794685364, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 6662 + }, + { + "epoch": 0.7740923613128086, + "grad_norm": 0.4594082534313202, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 6663 + }, + { + "epoch": 0.774208539064769, + "grad_norm": 0.4679012894630432, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 6664 + }, + { + "epoch": 0.7743247168167295, + "grad_norm": 0.5004777312278748, + "learning_rate": 0.0001, + "loss": 1.7538, + "step": 6665 + }, + { + "epoch": 0.7744408945686901, + "grad_norm": 0.4686128795146942, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 6666 + }, + { + "epoch": 0.7745570723206506, + "grad_norm": 0.46381059288978577, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 6667 + }, + { + "epoch": 0.7746732500726111, + "grad_norm": 0.4782971739768982, + "learning_rate": 0.0001, + "loss": 1.7547, + "step": 6668 + }, + { + "epoch": 0.7747894278245716, + "grad_norm": 0.48163890838623047, + "learning_rate": 0.0001, + "loss": 1.7483, + "step": 6669 + }, + { + "epoch": 0.7749056055765321, + "grad_norm": 0.47830620408058167, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 6670 + }, + { + "epoch": 0.7750217833284926, + "grad_norm": 0.4307786524295807, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 6671 + }, + { + "epoch": 0.7751379610804531, + "grad_norm": 0.4738655090332031, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 6672 + }, + { + "epoch": 0.7752541388324136, + "grad_norm": 0.45517605543136597, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 6673 + }, + { + "epoch": 0.775370316584374, + "grad_norm": 0.44930917024612427, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 6674 + }, + { + "epoch": 0.7754864943363345, + "grad_norm": 0.4695178270339966, + "learning_rate": 0.0001, + "loss": 1.6834, + "step": 6675 + }, + { + "epoch": 0.7756026720882951, + "grad_norm": 0.4576238691806793, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 6676 + }, + { + "epoch": 0.7757188498402556, + "grad_norm": 0.44012871384620667, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 6677 + }, + { + "epoch": 0.7758350275922161, + "grad_norm": 0.4941727817058563, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 6678 + }, + { + "epoch": 0.7759512053441766, + "grad_norm": 0.4414857029914856, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 6679 + }, + { + "epoch": 0.7760673830961371, + "grad_norm": 0.468009352684021, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 6680 + }, + { + "epoch": 0.7761835608480976, + "grad_norm": 0.514825165271759, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 6681 + }, + { + "epoch": 0.7762997386000581, + "grad_norm": 0.47073277831077576, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 6682 + }, + { + "epoch": 0.7764159163520186, + "grad_norm": 0.46188485622406006, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 6683 + }, + { + "epoch": 0.776532094103979, + "grad_norm": 0.5069568753242493, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 6684 + }, + { + "epoch": 0.7766482718559395, + "grad_norm": 0.4475022256374359, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 6685 + }, + { + "epoch": 0.7767644496079001, + "grad_norm": 0.48565706610679626, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 6686 + }, + { + "epoch": 0.7768806273598606, + "grad_norm": 0.4480251967906952, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 6687 + }, + { + "epoch": 0.7769968051118211, + "grad_norm": 0.4224783480167389, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 6688 + }, + { + "epoch": 0.7771129828637816, + "grad_norm": 0.45073410868644714, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 6689 + }, + { + "epoch": 0.7772291606157421, + "grad_norm": 0.4535382390022278, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 6690 + }, + { + "epoch": 0.7773453383677026, + "grad_norm": 0.45997950434684753, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 6691 + }, + { + "epoch": 0.7774615161196631, + "grad_norm": 0.4208465814590454, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 6692 + }, + { + "epoch": 0.7775776938716236, + "grad_norm": 0.4323180317878723, + "learning_rate": 0.0001, + "loss": 1.4691, + "step": 6693 + }, + { + "epoch": 0.777693871623584, + "grad_norm": 0.47370851039886475, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 6694 + }, + { + "epoch": 0.7778100493755445, + "grad_norm": 0.46739041805267334, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 6695 + }, + { + "epoch": 0.777926227127505, + "grad_norm": 0.44516703486442566, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 6696 + }, + { + "epoch": 0.7780424048794656, + "grad_norm": 0.4348163306713104, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 6697 + }, + { + "epoch": 0.7781585826314261, + "grad_norm": 0.5044941902160645, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 6698 + }, + { + "epoch": 0.7782747603833866, + "grad_norm": 0.48483940958976746, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 6699 + }, + { + "epoch": 0.7783909381353471, + "grad_norm": 0.46066558361053467, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 6700 + }, + { + "epoch": 0.7785071158873076, + "grad_norm": 0.45025432109832764, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 6701 + }, + { + "epoch": 0.7786232936392681, + "grad_norm": 0.4388742744922638, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 6702 + }, + { + "epoch": 0.7787394713912286, + "grad_norm": 0.44050899147987366, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 6703 + }, + { + "epoch": 0.778855649143189, + "grad_norm": 0.5146862864494324, + "learning_rate": 0.0001, + "loss": 1.8927, + "step": 6704 + }, + { + "epoch": 0.7789718268951495, + "grad_norm": 0.43440109491348267, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 6705 + }, + { + "epoch": 0.77908800464711, + "grad_norm": 0.43129873275756836, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 6706 + }, + { + "epoch": 0.7792041823990706, + "grad_norm": 0.4510217308998108, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 6707 + }, + { + "epoch": 0.7793203601510311, + "grad_norm": 0.5071493983268738, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 6708 + }, + { + "epoch": 0.7794365379029916, + "grad_norm": 0.4631008505821228, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 6709 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.4254882335662842, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 6710 + }, + { + "epoch": 0.7796688934069126, + "grad_norm": 0.4777772128582001, + "learning_rate": 0.0001, + "loss": 1.7763, + "step": 6711 + }, + { + "epoch": 0.7797850711588731, + "grad_norm": 0.4478752017021179, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 6712 + }, + { + "epoch": 0.7799012489108336, + "grad_norm": 0.4405520260334015, + "learning_rate": 0.0001, + "loss": 1.7456, + "step": 6713 + }, + { + "epoch": 0.7800174266627941, + "grad_norm": 0.43905889987945557, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 6714 + }, + { + "epoch": 0.7801336044147545, + "grad_norm": 0.46866193413734436, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 6715 + }, + { + "epoch": 0.780249782166715, + "grad_norm": 0.4609702229499817, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 6716 + }, + { + "epoch": 0.7803659599186755, + "grad_norm": 0.4288838803768158, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 6717 + }, + { + "epoch": 0.7804821376706361, + "grad_norm": 0.4898937940597534, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 6718 + }, + { + "epoch": 0.7805983154225966, + "grad_norm": 0.46226128935813904, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 6719 + }, + { + "epoch": 0.7807144931745571, + "grad_norm": 0.4708699584007263, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 6720 + }, + { + "epoch": 0.7808306709265176, + "grad_norm": 0.4597412943840027, + "learning_rate": 0.0001, + "loss": 1.7016, + "step": 6721 + }, + { + "epoch": 0.7809468486784781, + "grad_norm": 0.44381624460220337, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 6722 + }, + { + "epoch": 0.7810630264304386, + "grad_norm": 0.43926021456718445, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 6723 + }, + { + "epoch": 0.7811792041823991, + "grad_norm": 0.4407069683074951, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 6724 + }, + { + "epoch": 0.7812953819343595, + "grad_norm": 0.44746869802474976, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 6725 + }, + { + "epoch": 0.78141155968632, + "grad_norm": 0.44408538937568665, + "learning_rate": 0.0001, + "loss": 1.378, + "step": 6726 + }, + { + "epoch": 0.7815277374382805, + "grad_norm": 0.44895273447036743, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 6727 + }, + { + "epoch": 0.7816439151902411, + "grad_norm": 0.4479658901691437, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 6728 + }, + { + "epoch": 0.7817600929422016, + "grad_norm": 0.4522935450077057, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 6729 + }, + { + "epoch": 0.7818762706941621, + "grad_norm": 0.4975508749485016, + "learning_rate": 0.0001, + "loss": 1.5601, + "step": 6730 + }, + { + "epoch": 0.7819924484461226, + "grad_norm": 0.4425521194934845, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 6731 + }, + { + "epoch": 0.7821086261980831, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 6732 + }, + { + "epoch": 0.7822248039500436, + "grad_norm": 0.45263275504112244, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 6733 + }, + { + "epoch": 0.7823409817020041, + "grad_norm": 0.4428475797176361, + "learning_rate": 0.0001, + "loss": 1.4382, + "step": 6734 + }, + { + "epoch": 0.7824571594539645, + "grad_norm": 0.5304811596870422, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 6735 + }, + { + "epoch": 0.782573337205925, + "grad_norm": 0.43402206897735596, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 6736 + }, + { + "epoch": 0.7826895149578855, + "grad_norm": 0.43947911262512207, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 6737 + }, + { + "epoch": 0.782805692709846, + "grad_norm": 0.4783528447151184, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 6738 + }, + { + "epoch": 0.7829218704618066, + "grad_norm": 0.4822714924812317, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 6739 + }, + { + "epoch": 0.7830380482137671, + "grad_norm": 0.4645395576953888, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 6740 + }, + { + "epoch": 0.7831542259657276, + "grad_norm": 0.47700071334838867, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 6741 + }, + { + "epoch": 0.7832704037176881, + "grad_norm": 0.48509228229522705, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 6742 + }, + { + "epoch": 0.7833865814696486, + "grad_norm": 0.43715542554855347, + "learning_rate": 0.0001, + "loss": 1.5105, + "step": 6743 + }, + { + "epoch": 0.7835027592216091, + "grad_norm": 0.5839574933052063, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 6744 + }, + { + "epoch": 0.7836189369735695, + "grad_norm": 0.4410605728626251, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 6745 + }, + { + "epoch": 0.78373511472553, + "grad_norm": 0.44699952006340027, + "learning_rate": 0.0001, + "loss": 1.4267, + "step": 6746 + }, + { + "epoch": 0.7838512924774905, + "grad_norm": 0.43901586532592773, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 6747 + }, + { + "epoch": 0.783967470229451, + "grad_norm": 0.43384185433387756, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 6748 + }, + { + "epoch": 0.7840836479814116, + "grad_norm": 0.41548269987106323, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 6749 + }, + { + "epoch": 0.7841998257333721, + "grad_norm": 0.4889026880264282, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 6750 + }, + { + "epoch": 0.7843160034853326, + "grad_norm": 0.46144333481788635, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 6751 + }, + { + "epoch": 0.7844321812372931, + "grad_norm": 0.4552895128726959, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 6752 + }, + { + "epoch": 0.7845483589892536, + "grad_norm": 0.4651104807853699, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 6753 + }, + { + "epoch": 0.7846645367412141, + "grad_norm": 0.4177487790584564, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 6754 + }, + { + "epoch": 0.7847807144931745, + "grad_norm": 0.4314608573913574, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 6755 + }, + { + "epoch": 0.784896892245135, + "grad_norm": 0.4462542235851288, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 6756 + }, + { + "epoch": 0.7850130699970955, + "grad_norm": 0.4506712853908539, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 6757 + }, + { + "epoch": 0.785129247749056, + "grad_norm": 0.465562641620636, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 6758 + }, + { + "epoch": 0.7852454255010165, + "grad_norm": 0.5159258842468262, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 6759 + }, + { + "epoch": 0.7853616032529771, + "grad_norm": 0.4616967439651489, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 6760 + }, + { + "epoch": 0.7854777810049376, + "grad_norm": 0.4350794553756714, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 6761 + }, + { + "epoch": 0.7855939587568981, + "grad_norm": 0.4288870692253113, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 6762 + }, + { + "epoch": 0.7857101365088586, + "grad_norm": 0.4555493891239166, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 6763 + }, + { + "epoch": 0.7858263142608191, + "grad_norm": 0.4390559196472168, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 6764 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 0.47772979736328125, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 6765 + }, + { + "epoch": 0.78605866976474, + "grad_norm": 0.45631012320518494, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 6766 + }, + { + "epoch": 0.7861748475167005, + "grad_norm": 0.4568979740142822, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 6767 + }, + { + "epoch": 0.786291025268661, + "grad_norm": 0.4683852195739746, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 6768 + }, + { + "epoch": 0.7864072030206215, + "grad_norm": 0.45670732855796814, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 6769 + }, + { + "epoch": 0.7865233807725821, + "grad_norm": 0.4741782248020172, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 6770 + }, + { + "epoch": 0.7866395585245426, + "grad_norm": 0.4552614092826843, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 6771 + }, + { + "epoch": 0.7867557362765031, + "grad_norm": 0.4711034297943115, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 6772 + }, + { + "epoch": 0.7868719140284636, + "grad_norm": 0.4594586491584778, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 6773 + }, + { + "epoch": 0.7869880917804241, + "grad_norm": 0.458446204662323, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 6774 + }, + { + "epoch": 0.7871042695323845, + "grad_norm": 0.48481476306915283, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 6775 + }, + { + "epoch": 0.787220447284345, + "grad_norm": 0.5112566947937012, + "learning_rate": 0.0001, + "loss": 1.7064, + "step": 6776 + }, + { + "epoch": 0.7873366250363055, + "grad_norm": 0.43871763348579407, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 6777 + }, + { + "epoch": 0.787452802788266, + "grad_norm": 0.47852954268455505, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 6778 + }, + { + "epoch": 0.7875689805402265, + "grad_norm": 0.4647180736064911, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 6779 + }, + { + "epoch": 0.787685158292187, + "grad_norm": 0.454166978597641, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 6780 + }, + { + "epoch": 0.7878013360441476, + "grad_norm": 0.447322279214859, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 6781 + }, + { + "epoch": 0.7879175137961081, + "grad_norm": 0.4487408399581909, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 6782 + }, + { + "epoch": 0.7880336915480686, + "grad_norm": 0.46517065167427063, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 6783 + }, + { + "epoch": 0.7881498693000291, + "grad_norm": 0.4516124427318573, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 6784 + }, + { + "epoch": 0.7882660470519895, + "grad_norm": 0.46726909279823303, + "learning_rate": 0.0001, + "loss": 1.5802, + "step": 6785 + }, + { + "epoch": 0.78838222480395, + "grad_norm": 0.44458258152008057, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 6786 + }, + { + "epoch": 0.7884984025559105, + "grad_norm": 0.44695961475372314, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 6787 + }, + { + "epoch": 0.788614580307871, + "grad_norm": 0.46932363510131836, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 6788 + }, + { + "epoch": 0.7887307580598315, + "grad_norm": 0.4439738392829895, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 6789 + }, + { + "epoch": 0.788846935811792, + "grad_norm": 0.45402002334594727, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 6790 + }, + { + "epoch": 0.7889631135637526, + "grad_norm": 0.45656514167785645, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 6791 + }, + { + "epoch": 0.7890792913157131, + "grad_norm": 0.4795583486557007, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 6792 + }, + { + "epoch": 0.7891954690676736, + "grad_norm": 0.45822617411613464, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 6793 + }, + { + "epoch": 0.7893116468196341, + "grad_norm": 0.44518178701400757, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 6794 + }, + { + "epoch": 0.7894278245715945, + "grad_norm": 0.4444418251514435, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 6795 + }, + { + "epoch": 0.789544002323555, + "grad_norm": 0.4494422972202301, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 6796 + }, + { + "epoch": 0.7896601800755155, + "grad_norm": 0.43979451060295105, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 6797 + }, + { + "epoch": 0.789776357827476, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 6798 + }, + { + "epoch": 0.7898925355794365, + "grad_norm": 0.44533100724220276, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 6799 + }, + { + "epoch": 0.790008713331397, + "grad_norm": 0.4789431393146515, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 6800 + }, + { + "epoch": 0.7901248910833575, + "grad_norm": 0.4464295506477356, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 6801 + }, + { + "epoch": 0.7902410688353181, + "grad_norm": 0.48397591710090637, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 6802 + }, + { + "epoch": 0.7903572465872786, + "grad_norm": 0.47774365544319153, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 6803 + }, + { + "epoch": 0.7904734243392391, + "grad_norm": 0.4871893525123596, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 6804 + }, + { + "epoch": 0.7905896020911996, + "grad_norm": 0.4551542103290558, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 6805 + }, + { + "epoch": 0.79070577984316, + "grad_norm": 0.4858977496623993, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 6806 + }, + { + "epoch": 0.7908219575951205, + "grad_norm": 0.4872024953365326, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 6807 + }, + { + "epoch": 0.790938135347081, + "grad_norm": 0.45094436407089233, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 6808 + }, + { + "epoch": 0.7910543130990415, + "grad_norm": 0.43320778012275696, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 6809 + }, + { + "epoch": 0.791170490851002, + "grad_norm": 0.4422098696231842, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 6810 + }, + { + "epoch": 0.7912866686029625, + "grad_norm": 0.44262856245040894, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 6811 + }, + { + "epoch": 0.7914028463549231, + "grad_norm": 0.4632859230041504, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 6812 + }, + { + "epoch": 0.7915190241068836, + "grad_norm": 0.4445875883102417, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 6813 + }, + { + "epoch": 0.7916352018588441, + "grad_norm": 0.47772374749183655, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 6814 + }, + { + "epoch": 0.7917513796108046, + "grad_norm": 0.4446466863155365, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 6815 + }, + { + "epoch": 0.791867557362765, + "grad_norm": 0.47620153427124023, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 6816 + }, + { + "epoch": 0.7919837351147255, + "grad_norm": 0.496855229139328, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 6817 + }, + { + "epoch": 0.792099912866686, + "grad_norm": 0.44331079721450806, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 6818 + }, + { + "epoch": 0.7922160906186465, + "grad_norm": 0.4731941223144531, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 6819 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.507358729839325, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 6820 + }, + { + "epoch": 0.7924484461225675, + "grad_norm": 0.46159496903419495, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 6821 + }, + { + "epoch": 0.792564623874528, + "grad_norm": 0.4719959497451782, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 6822 + }, + { + "epoch": 0.7926808016264886, + "grad_norm": 0.4765816330909729, + "learning_rate": 0.0001, + "loss": 1.6144, + "step": 6823 + }, + { + "epoch": 0.7927969793784491, + "grad_norm": 0.43941232562065125, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 6824 + }, + { + "epoch": 0.7929131571304096, + "grad_norm": 0.47497496008872986, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 6825 + }, + { + "epoch": 0.79302933488237, + "grad_norm": 0.43414729833602905, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 6826 + }, + { + "epoch": 0.7931455126343305, + "grad_norm": 0.48383721709251404, + "learning_rate": 0.0001, + "loss": 1.7554, + "step": 6827 + }, + { + "epoch": 0.793261690386291, + "grad_norm": 0.4651719033718109, + "learning_rate": 0.0001, + "loss": 1.5808, + "step": 6828 + }, + { + "epoch": 0.7933778681382515, + "grad_norm": 0.42594683170318604, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 6829 + }, + { + "epoch": 0.793494045890212, + "grad_norm": 0.4392074644565582, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 6830 + }, + { + "epoch": 0.7936102236421725, + "grad_norm": 0.43708086013793945, + "learning_rate": 0.0001, + "loss": 1.3342, + "step": 6831 + }, + { + "epoch": 0.793726401394133, + "grad_norm": 0.46385717391967773, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 6832 + }, + { + "epoch": 0.7938425791460936, + "grad_norm": 0.4499506652355194, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 6833 + }, + { + "epoch": 0.7939587568980541, + "grad_norm": 0.46915948390960693, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 6834 + }, + { + "epoch": 0.7940749346500146, + "grad_norm": 0.4685263931751251, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 6835 + }, + { + "epoch": 0.794191112401975, + "grad_norm": 0.45696932077407837, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 6836 + }, + { + "epoch": 0.7943072901539355, + "grad_norm": 0.47869807481765747, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 6837 + }, + { + "epoch": 0.794423467905896, + "grad_norm": 0.45035579800605774, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 6838 + }, + { + "epoch": 0.7945396456578565, + "grad_norm": 0.48194530606269836, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 6839 + }, + { + "epoch": 0.794655823409817, + "grad_norm": 0.47445181012153625, + "learning_rate": 0.0001, + "loss": 1.7277, + "step": 6840 + }, + { + "epoch": 0.7947720011617775, + "grad_norm": 0.4426630139350891, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 6841 + }, + { + "epoch": 0.794888178913738, + "grad_norm": 0.45886486768722534, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 6842 + }, + { + "epoch": 0.7950043566656985, + "grad_norm": 0.4601069986820221, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 6843 + }, + { + "epoch": 0.7951205344176591, + "grad_norm": 0.42290568351745605, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 6844 + }, + { + "epoch": 0.7952367121696196, + "grad_norm": 0.4962608516216278, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 6845 + }, + { + "epoch": 0.79535288992158, + "grad_norm": 0.47025734186172485, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 6846 + }, + { + "epoch": 0.7954690676735405, + "grad_norm": 0.441499263048172, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 6847 + }, + { + "epoch": 0.795585245425501, + "grad_norm": 0.4636131823062897, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 6848 + }, + { + "epoch": 0.7957014231774615, + "grad_norm": 0.46920451521873474, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 6849 + }, + { + "epoch": 0.795817600929422, + "grad_norm": 0.49390605092048645, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 6850 + }, + { + "epoch": 0.7959337786813825, + "grad_norm": 0.46050357818603516, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 6851 + }, + { + "epoch": 0.796049956433343, + "grad_norm": 0.4840867817401886, + "learning_rate": 0.0001, + "loss": 1.4195, + "step": 6852 + }, + { + "epoch": 0.7961661341853035, + "grad_norm": 0.46949562430381775, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 6853 + }, + { + "epoch": 0.7962823119372641, + "grad_norm": 0.5006006360054016, + "learning_rate": 0.0001, + "loss": 1.8093, + "step": 6854 + }, + { + "epoch": 0.7963984896892246, + "grad_norm": 0.5085091590881348, + "learning_rate": 0.0001, + "loss": 1.8903, + "step": 6855 + }, + { + "epoch": 0.796514667441185, + "grad_norm": 0.4508131742477417, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 6856 + }, + { + "epoch": 0.7966308451931455, + "grad_norm": 0.4634523391723633, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 6857 + }, + { + "epoch": 0.796747022945106, + "grad_norm": 0.4478987753391266, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 6858 + }, + { + "epoch": 0.7968632006970665, + "grad_norm": 0.4543631374835968, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 6859 + }, + { + "epoch": 0.796979378449027, + "grad_norm": 0.48462975025177, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 6860 + }, + { + "epoch": 0.7970955562009875, + "grad_norm": 0.45995092391967773, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 6861 + }, + { + "epoch": 0.797211733952948, + "grad_norm": 0.48978012800216675, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 6862 + }, + { + "epoch": 0.7973279117049085, + "grad_norm": 0.45018213987350464, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 6863 + }, + { + "epoch": 0.797444089456869, + "grad_norm": 0.47435247898101807, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 6864 + }, + { + "epoch": 0.7975602672088296, + "grad_norm": 0.48207005858421326, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 6865 + }, + { + "epoch": 0.79767644496079, + "grad_norm": 0.4846184253692627, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 6866 + }, + { + "epoch": 0.7977926227127505, + "grad_norm": 0.42367056012153625, + "learning_rate": 0.0001, + "loss": 1.3894, + "step": 6867 + }, + { + "epoch": 0.797908800464711, + "grad_norm": 0.46958428621292114, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 6868 + }, + { + "epoch": 0.7980249782166715, + "grad_norm": 0.4629022777080536, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 6869 + }, + { + "epoch": 0.798141155968632, + "grad_norm": 0.5330944657325745, + "learning_rate": 0.0001, + "loss": 1.8337, + "step": 6870 + }, + { + "epoch": 0.7982573337205925, + "grad_norm": 0.45015132427215576, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 6871 + }, + { + "epoch": 0.798373511472553, + "grad_norm": 0.447221964597702, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 6872 + }, + { + "epoch": 0.7984896892245135, + "grad_norm": 0.4698033928871155, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 6873 + }, + { + "epoch": 0.798605866976474, + "grad_norm": 0.4613732099533081, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 6874 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 0.45149853825569153, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 6875 + }, + { + "epoch": 0.798838222480395, + "grad_norm": 0.44822239875793457, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 6876 + }, + { + "epoch": 0.7989544002323555, + "grad_norm": 0.4859652817249298, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 6877 + }, + { + "epoch": 0.799070577984316, + "grad_norm": 0.4371628165245056, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 6878 + }, + { + "epoch": 0.7991867557362765, + "grad_norm": 0.432081013917923, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 6879 + }, + { + "epoch": 0.799302933488237, + "grad_norm": 0.4589148461818695, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 6880 + }, + { + "epoch": 0.7994191112401975, + "grad_norm": 0.43646013736724854, + "learning_rate": 0.0001, + "loss": 1.5708, + "step": 6881 + }, + { + "epoch": 0.799535288992158, + "grad_norm": 0.42627108097076416, + "learning_rate": 0.0001, + "loss": 1.5185, + "step": 6882 + }, + { + "epoch": 0.7996514667441185, + "grad_norm": 0.4646453559398651, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 6883 + }, + { + "epoch": 0.799767644496079, + "grad_norm": 0.46586647629737854, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 6884 + }, + { + "epoch": 0.7998838222480396, + "grad_norm": 0.512482762336731, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 6885 + }, + { + "epoch": 0.8, + "grad_norm": 0.4631596803665161, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 6886 + }, + { + "epoch": 0.8001161777519605, + "grad_norm": 0.472889244556427, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 6887 + }, + { + "epoch": 0.800232355503921, + "grad_norm": 0.5098186731338501, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 6888 + }, + { + "epoch": 0.8003485332558815, + "grad_norm": 0.48305651545524597, + "learning_rate": 0.0001, + "loss": 1.777, + "step": 6889 + }, + { + "epoch": 0.800464711007842, + "grad_norm": 0.48298099637031555, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 6890 + }, + { + "epoch": 0.8005808887598025, + "grad_norm": 0.4996420741081238, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 6891 + }, + { + "epoch": 0.800697066511763, + "grad_norm": 0.44727784395217896, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 6892 + }, + { + "epoch": 0.8008132442637235, + "grad_norm": 0.4710618555545807, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 6893 + }, + { + "epoch": 0.800929422015684, + "grad_norm": 0.4605047404766083, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 6894 + }, + { + "epoch": 0.8010455997676444, + "grad_norm": 0.4915280342102051, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 6895 + }, + { + "epoch": 0.801161777519605, + "grad_norm": 0.45687055587768555, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 6896 + }, + { + "epoch": 0.8012779552715655, + "grad_norm": 0.4345172345638275, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 6897 + }, + { + "epoch": 0.801394133023526, + "grad_norm": 0.4714672863483429, + "learning_rate": 0.0001, + "loss": 1.4299, + "step": 6898 + }, + { + "epoch": 0.8015103107754865, + "grad_norm": 0.46883341670036316, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 6899 + }, + { + "epoch": 0.801626488527447, + "grad_norm": 0.43329375982284546, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 6900 + }, + { + "epoch": 0.8017426662794075, + "grad_norm": 0.5150987505912781, + "learning_rate": 0.0001, + "loss": 1.8174, + "step": 6901 + }, + { + "epoch": 0.801858844031368, + "grad_norm": 0.43868762254714966, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 6902 + }, + { + "epoch": 0.8019750217833285, + "grad_norm": 0.4465371072292328, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 6903 + }, + { + "epoch": 0.802091199535289, + "grad_norm": 0.4565187096595764, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 6904 + }, + { + "epoch": 0.8022073772872494, + "grad_norm": 0.5145065784454346, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 6905 + }, + { + "epoch": 0.80232355503921, + "grad_norm": 0.5177064538002014, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 6906 + }, + { + "epoch": 0.8024397327911705, + "grad_norm": 0.42869505286216736, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 6907 + }, + { + "epoch": 0.802555910543131, + "grad_norm": 0.4332997500896454, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 6908 + }, + { + "epoch": 0.8026720882950915, + "grad_norm": 0.4914308786392212, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 6909 + }, + { + "epoch": 0.802788266047052, + "grad_norm": 0.4771595299243927, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 6910 + }, + { + "epoch": 0.8029044437990125, + "grad_norm": 0.45163875818252563, + "learning_rate": 0.0001, + "loss": 1.7303, + "step": 6911 + }, + { + "epoch": 0.803020621550973, + "grad_norm": 0.4456539452075958, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 6912 + }, + { + "epoch": 0.8031367993029335, + "grad_norm": 0.42746564745903015, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 6913 + }, + { + "epoch": 0.803252977054894, + "grad_norm": 0.4743218421936035, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 6914 + }, + { + "epoch": 0.8033691548068544, + "grad_norm": 0.5129088759422302, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 6915 + }, + { + "epoch": 0.8034853325588149, + "grad_norm": 0.4551659822463989, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 6916 + }, + { + "epoch": 0.8036015103107755, + "grad_norm": 0.4753081202507019, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 6917 + }, + { + "epoch": 0.803717688062736, + "grad_norm": 0.4394998252391815, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 6918 + }, + { + "epoch": 0.8038338658146965, + "grad_norm": 0.45924627780914307, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 6919 + }, + { + "epoch": 0.803950043566657, + "grad_norm": 0.44786056876182556, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 6920 + }, + { + "epoch": 0.8040662213186175, + "grad_norm": 0.4409099519252777, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 6921 + }, + { + "epoch": 0.804182399070578, + "grad_norm": 0.45234692096710205, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 6922 + }, + { + "epoch": 0.8042985768225385, + "grad_norm": 0.5276682376861572, + "learning_rate": 0.0001, + "loss": 1.7412, + "step": 6923 + }, + { + "epoch": 0.804414754574499, + "grad_norm": 0.4487822353839874, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 6924 + }, + { + "epoch": 0.8045309323264594, + "grad_norm": 0.4596274495124817, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 6925 + }, + { + "epoch": 0.8046471100784199, + "grad_norm": 0.44739964604377747, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 6926 + }, + { + "epoch": 0.8047632878303805, + "grad_norm": 0.46213769912719727, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 6927 + }, + { + "epoch": 0.804879465582341, + "grad_norm": 0.4504436254501343, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 6928 + }, + { + "epoch": 0.8049956433343015, + "grad_norm": 0.4626392722129822, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 6929 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.4738352596759796, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 6930 + }, + { + "epoch": 0.8052279988382225, + "grad_norm": 0.45543593168258667, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 6931 + }, + { + "epoch": 0.805344176590183, + "grad_norm": 0.4740353226661682, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 6932 + }, + { + "epoch": 0.8054603543421435, + "grad_norm": 0.4481174647808075, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 6933 + }, + { + "epoch": 0.805576532094104, + "grad_norm": 0.461093544960022, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 6934 + }, + { + "epoch": 0.8056927098460644, + "grad_norm": 0.4422491490840912, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 6935 + }, + { + "epoch": 0.8058088875980249, + "grad_norm": 0.4397140443325043, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 6936 + }, + { + "epoch": 0.8059250653499854, + "grad_norm": 0.47454312443733215, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 6937 + }, + { + "epoch": 0.806041243101946, + "grad_norm": 0.46560293436050415, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 6938 + }, + { + "epoch": 0.8061574208539065, + "grad_norm": 0.46185746788978577, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 6939 + }, + { + "epoch": 0.806273598605867, + "grad_norm": 0.4402056634426117, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 6940 + }, + { + "epoch": 0.8063897763578275, + "grad_norm": 0.452150821685791, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 6941 + }, + { + "epoch": 0.806505954109788, + "grad_norm": 0.4527072012424469, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 6942 + }, + { + "epoch": 0.8066221318617485, + "grad_norm": 0.4439299702644348, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 6943 + }, + { + "epoch": 0.806738309613709, + "grad_norm": 0.4649060070514679, + "learning_rate": 0.0001, + "loss": 1.7965, + "step": 6944 + }, + { + "epoch": 0.8068544873656694, + "grad_norm": 0.46530160307884216, + "learning_rate": 0.0001, + "loss": 1.7508, + "step": 6945 + }, + { + "epoch": 0.8069706651176299, + "grad_norm": 0.456967294216156, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 6946 + }, + { + "epoch": 0.8070868428695904, + "grad_norm": 0.47573670744895935, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 6947 + }, + { + "epoch": 0.807203020621551, + "grad_norm": 0.4661976099014282, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 6948 + }, + { + "epoch": 0.8073191983735115, + "grad_norm": 0.4907025694847107, + "learning_rate": 0.0001, + "loss": 1.6961, + "step": 6949 + }, + { + "epoch": 0.807435376125472, + "grad_norm": 0.4918571412563324, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 6950 + }, + { + "epoch": 0.8075515538774325, + "grad_norm": 0.48200660943984985, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 6951 + }, + { + "epoch": 0.807667731629393, + "grad_norm": 0.42808154225349426, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 6952 + }, + { + "epoch": 0.8077839093813535, + "grad_norm": 0.473086416721344, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 6953 + }, + { + "epoch": 0.807900087133314, + "grad_norm": 0.44156551361083984, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 6954 + }, + { + "epoch": 0.8080162648852744, + "grad_norm": 0.5178338289260864, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 6955 + }, + { + "epoch": 0.8081324426372349, + "grad_norm": 0.4564967751502991, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 6956 + }, + { + "epoch": 0.8082486203891954, + "grad_norm": 0.44008997082710266, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 6957 + }, + { + "epoch": 0.8083647981411559, + "grad_norm": 0.4413783848285675, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 6958 + }, + { + "epoch": 0.8084809758931165, + "grad_norm": 0.4661567807197571, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 6959 + }, + { + "epoch": 0.808597153645077, + "grad_norm": 0.47281137108802795, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 6960 + }, + { + "epoch": 0.8087133313970375, + "grad_norm": 0.47660407423973083, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 6961 + }, + { + "epoch": 0.808829509148998, + "grad_norm": 0.4914090633392334, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 6962 + }, + { + "epoch": 0.8089456869009585, + "grad_norm": 0.4621322453022003, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 6963 + }, + { + "epoch": 0.809061864652919, + "grad_norm": 0.4495106339454651, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 6964 + }, + { + "epoch": 0.8091780424048794, + "grad_norm": 0.49163323640823364, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 6965 + }, + { + "epoch": 0.8092942201568399, + "grad_norm": 0.47235107421875, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 6966 + }, + { + "epoch": 0.8094103979088004, + "grad_norm": 0.45100662112236023, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 6967 + }, + { + "epoch": 0.8095265756607609, + "grad_norm": 0.46376436948776245, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 6968 + }, + { + "epoch": 0.8096427534127215, + "grad_norm": 0.490784227848053, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 6969 + }, + { + "epoch": 0.809758931164682, + "grad_norm": 0.49541133642196655, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 6970 + }, + { + "epoch": 0.8098751089166425, + "grad_norm": 0.47032544016838074, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 6971 + }, + { + "epoch": 0.809991286668603, + "grad_norm": 0.4594431519508362, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 6972 + }, + { + "epoch": 0.8101074644205635, + "grad_norm": 0.4627406895160675, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 6973 + }, + { + "epoch": 0.810223642172524, + "grad_norm": 0.4937112331390381, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 6974 + }, + { + "epoch": 0.8103398199244845, + "grad_norm": 0.4783715605735779, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 6975 + }, + { + "epoch": 0.8104559976764449, + "grad_norm": 0.47671520709991455, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 6976 + }, + { + "epoch": 0.8105721754284054, + "grad_norm": 0.47149521112442017, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 6977 + }, + { + "epoch": 0.8106883531803659, + "grad_norm": 0.45699021220207214, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 6978 + }, + { + "epoch": 0.8108045309323264, + "grad_norm": 0.4693375527858734, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 6979 + }, + { + "epoch": 0.810920708684287, + "grad_norm": 0.4451500475406647, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 6980 + }, + { + "epoch": 0.8110368864362475, + "grad_norm": 0.45721450448036194, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 6981 + }, + { + "epoch": 0.811153064188208, + "grad_norm": 0.44827860593795776, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 6982 + }, + { + "epoch": 0.8112692419401685, + "grad_norm": 0.47101083397865295, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 6983 + }, + { + "epoch": 0.811385419692129, + "grad_norm": 0.4695024788379669, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 6984 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 0.5033894777297974, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 6985 + }, + { + "epoch": 0.8116177751960499, + "grad_norm": 0.49278390407562256, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 6986 + }, + { + "epoch": 0.8117339529480104, + "grad_norm": 0.5109890699386597, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 6987 + }, + { + "epoch": 0.8118501306999709, + "grad_norm": 0.4835306704044342, + "learning_rate": 0.0001, + "loss": 1.7316, + "step": 6988 + }, + { + "epoch": 0.8119663084519314, + "grad_norm": 0.48664847016334534, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 6989 + }, + { + "epoch": 0.812082486203892, + "grad_norm": 0.4511289596557617, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 6990 + }, + { + "epoch": 0.8121986639558525, + "grad_norm": 0.46078622341156006, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 6991 + }, + { + "epoch": 0.812314841707813, + "grad_norm": 0.5006919503211975, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 6992 + }, + { + "epoch": 0.8124310194597735, + "grad_norm": 0.46968936920166016, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 6993 + }, + { + "epoch": 0.812547197211734, + "grad_norm": 0.46966198086738586, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 6994 + }, + { + "epoch": 0.8126633749636945, + "grad_norm": 0.5085934400558472, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 6995 + }, + { + "epoch": 0.8127795527156549, + "grad_norm": 0.42895692586898804, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 6996 + }, + { + "epoch": 0.8128957304676154, + "grad_norm": 0.46299561858177185, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 6997 + }, + { + "epoch": 0.8130119082195759, + "grad_norm": 0.4561334550380707, + "learning_rate": 0.0001, + "loss": 1.5164, + "step": 6998 + }, + { + "epoch": 0.8131280859715364, + "grad_norm": 0.49617594480514526, + "learning_rate": 0.0001, + "loss": 1.7706, + "step": 6999 + }, + { + "epoch": 0.8132442637234969, + "grad_norm": 0.4899764955043793, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 7000 + }, + { + "epoch": 0.8133604414754575, + "grad_norm": 0.4907947778701782, + "learning_rate": 0.0001, + "loss": 1.7775, + "step": 7001 + }, + { + "epoch": 0.813476619227418, + "grad_norm": 0.45711711049079895, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 7002 + }, + { + "epoch": 0.8135927969793785, + "grad_norm": 0.449201375246048, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 7003 + }, + { + "epoch": 0.813708974731339, + "grad_norm": 0.4838063418865204, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 7004 + }, + { + "epoch": 0.8138251524832995, + "grad_norm": 0.46976253390312195, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 7005 + }, + { + "epoch": 0.8139413302352599, + "grad_norm": 0.4605077803134918, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 7006 + }, + { + "epoch": 0.8140575079872204, + "grad_norm": 0.4310915470123291, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 7007 + }, + { + "epoch": 0.8141736857391809, + "grad_norm": 0.49787524342536926, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 7008 + }, + { + "epoch": 0.8142898634911414, + "grad_norm": 0.4633225202560425, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 7009 + }, + { + "epoch": 0.8144060412431019, + "grad_norm": 0.5344363451004028, + "learning_rate": 0.0001, + "loss": 1.8373, + "step": 7010 + }, + { + "epoch": 0.8145222189950625, + "grad_norm": 0.4966965615749359, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 7011 + }, + { + "epoch": 0.814638396747023, + "grad_norm": 0.4843759834766388, + "learning_rate": 0.0001, + "loss": 1.8051, + "step": 7012 + }, + { + "epoch": 0.8147545744989835, + "grad_norm": 0.46458831429481506, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 7013 + }, + { + "epoch": 0.814870752250944, + "grad_norm": 0.46197426319122314, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 7014 + }, + { + "epoch": 0.8149869300029045, + "grad_norm": 0.4449380934238434, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 7015 + }, + { + "epoch": 0.8151031077548649, + "grad_norm": 0.47970300912857056, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 7016 + }, + { + "epoch": 0.8152192855068254, + "grad_norm": 0.48018476366996765, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 7017 + }, + { + "epoch": 0.8153354632587859, + "grad_norm": 0.45871859788894653, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 7018 + }, + { + "epoch": 0.8154516410107464, + "grad_norm": 0.509146511554718, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 7019 + }, + { + "epoch": 0.8155678187627069, + "grad_norm": 0.4545764625072479, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 7020 + }, + { + "epoch": 0.8156839965146674, + "grad_norm": 0.46723830699920654, + "learning_rate": 0.0001, + "loss": 1.5216, + "step": 7021 + }, + { + "epoch": 0.815800174266628, + "grad_norm": 0.4666821360588074, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 7022 + }, + { + "epoch": 0.8159163520185885, + "grad_norm": 0.45362234115600586, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 7023 + }, + { + "epoch": 0.816032529770549, + "grad_norm": 0.5081374049186707, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 7024 + }, + { + "epoch": 0.8161487075225095, + "grad_norm": 0.4551977515220642, + "learning_rate": 0.0001, + "loss": 1.4208, + "step": 7025 + }, + { + "epoch": 0.8162648852744699, + "grad_norm": 0.4469984471797943, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 7026 + }, + { + "epoch": 0.8163810630264304, + "grad_norm": 0.4428351819515228, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 7027 + }, + { + "epoch": 0.8164972407783909, + "grad_norm": 0.46681028604507446, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 7028 + }, + { + "epoch": 0.8166134185303514, + "grad_norm": 0.47523221373558044, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 7029 + }, + { + "epoch": 0.8167295962823119, + "grad_norm": 0.42008304595947266, + "learning_rate": 0.0001, + "loss": 1.2919, + "step": 7030 + }, + { + "epoch": 0.8168457740342724, + "grad_norm": 0.4836232364177704, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 7031 + }, + { + "epoch": 0.816961951786233, + "grad_norm": 0.49851134419441223, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 7032 + }, + { + "epoch": 0.8170781295381935, + "grad_norm": 0.4806590974330902, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 7033 + }, + { + "epoch": 0.817194307290154, + "grad_norm": 0.46328291296958923, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 7034 + }, + { + "epoch": 0.8173104850421145, + "grad_norm": 0.44353359937667847, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 7035 + }, + { + "epoch": 0.8174266627940749, + "grad_norm": 0.5083478689193726, + "learning_rate": 0.0001, + "loss": 1.7822, + "step": 7036 + }, + { + "epoch": 0.8175428405460354, + "grad_norm": 0.4776815176010132, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 7037 + }, + { + "epoch": 0.8176590182979959, + "grad_norm": 0.43044784665107727, + "learning_rate": 0.0001, + "loss": 1.3441, + "step": 7038 + }, + { + "epoch": 0.8177751960499564, + "grad_norm": 0.4387872815132141, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 7039 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.46325990557670593, + "learning_rate": 0.0001, + "loss": 1.7339, + "step": 7040 + }, + { + "epoch": 0.8180075515538774, + "grad_norm": 0.4589924216270447, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 7041 + }, + { + "epoch": 0.8181237293058379, + "grad_norm": 0.4716587960720062, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 7042 + }, + { + "epoch": 0.8182399070577985, + "grad_norm": 0.4652487635612488, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 7043 + }, + { + "epoch": 0.818356084809759, + "grad_norm": 0.4787328839302063, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 7044 + }, + { + "epoch": 0.8184722625617195, + "grad_norm": 0.47911664843559265, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 7045 + }, + { + "epoch": 0.8185884403136799, + "grad_norm": 0.4904523491859436, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 7046 + }, + { + "epoch": 0.8187046180656404, + "grad_norm": 0.4955161213874817, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 7047 + }, + { + "epoch": 0.8188207958176009, + "grad_norm": 0.44562825560569763, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 7048 + }, + { + "epoch": 0.8189369735695614, + "grad_norm": 0.44809067249298096, + "learning_rate": 0.0001, + "loss": 1.3182, + "step": 7049 + }, + { + "epoch": 0.8190531513215219, + "grad_norm": 0.4440357983112335, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 7050 + }, + { + "epoch": 0.8191693290734824, + "grad_norm": 0.4690278172492981, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 7051 + }, + { + "epoch": 0.8192855068254429, + "grad_norm": 0.4551823139190674, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 7052 + }, + { + "epoch": 0.8194016845774035, + "grad_norm": 0.48354125022888184, + "learning_rate": 0.0001, + "loss": 1.7015, + "step": 7053 + }, + { + "epoch": 0.819517862329364, + "grad_norm": 0.4836772084236145, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 7054 + }, + { + "epoch": 0.8196340400813245, + "grad_norm": 0.4699805974960327, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 7055 + }, + { + "epoch": 0.8197502178332849, + "grad_norm": 0.41965410113334656, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 7056 + }, + { + "epoch": 0.8198663955852454, + "grad_norm": 0.5057850480079651, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 7057 + }, + { + "epoch": 0.8199825733372059, + "grad_norm": 0.4643802046775818, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 7058 + }, + { + "epoch": 0.8200987510891664, + "grad_norm": 0.4550783932209015, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 7059 + }, + { + "epoch": 0.8202149288411269, + "grad_norm": 0.5149709582328796, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 7060 + }, + { + "epoch": 0.8203311065930874, + "grad_norm": 0.46510255336761475, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 7061 + }, + { + "epoch": 0.8204472843450479, + "grad_norm": 0.46841713786125183, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 7062 + }, + { + "epoch": 0.8205634620970085, + "grad_norm": 0.4657427966594696, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 7063 + }, + { + "epoch": 0.820679639848969, + "grad_norm": 0.43857282400131226, + "learning_rate": 0.0001, + "loss": 1.3911, + "step": 7064 + }, + { + "epoch": 0.8207958176009295, + "grad_norm": 0.457220196723938, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 7065 + }, + { + "epoch": 0.82091199535289, + "grad_norm": 0.462664932012558, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 7066 + }, + { + "epoch": 0.8210281731048504, + "grad_norm": 0.5375829339027405, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 7067 + }, + { + "epoch": 0.8211443508568109, + "grad_norm": 0.47955790162086487, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 7068 + }, + { + "epoch": 0.8212605286087714, + "grad_norm": 0.4525638818740845, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 7069 + }, + { + "epoch": 0.8213767063607319, + "grad_norm": 0.6293335556983948, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 7070 + }, + { + "epoch": 0.8214928841126924, + "grad_norm": 0.46683555841445923, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 7071 + }, + { + "epoch": 0.8216090618646529, + "grad_norm": 0.4560864269733429, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 7072 + }, + { + "epoch": 0.8217252396166134, + "grad_norm": 0.4356245696544647, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 7073 + }, + { + "epoch": 0.821841417368574, + "grad_norm": 0.4432215988636017, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 7074 + }, + { + "epoch": 0.8219575951205345, + "grad_norm": 0.43178898096084595, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 7075 + }, + { + "epoch": 0.822073772872495, + "grad_norm": 0.45065784454345703, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 7076 + }, + { + "epoch": 0.8221899506244554, + "grad_norm": 0.44402584433555603, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 7077 + }, + { + "epoch": 0.8223061283764159, + "grad_norm": 0.4661881625652313, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 7078 + }, + { + "epoch": 0.8224223061283764, + "grad_norm": 0.46382853388786316, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 7079 + }, + { + "epoch": 0.8225384838803369, + "grad_norm": 0.4864356219768524, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 7080 + }, + { + "epoch": 0.8226546616322974, + "grad_norm": 0.48990336060523987, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 7081 + }, + { + "epoch": 0.8227708393842579, + "grad_norm": 0.4853106141090393, + "learning_rate": 0.0001, + "loss": 1.8482, + "step": 7082 + }, + { + "epoch": 0.8228870171362184, + "grad_norm": 0.42953288555145264, + "learning_rate": 0.0001, + "loss": 1.3854, + "step": 7083 + }, + { + "epoch": 0.823003194888179, + "grad_norm": 0.4772704541683197, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 7084 + }, + { + "epoch": 0.8231193726401395, + "grad_norm": 0.4406863749027252, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 7085 + }, + { + "epoch": 0.8232355503921, + "grad_norm": 0.47718164324760437, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 7086 + }, + { + "epoch": 0.8233517281440604, + "grad_norm": 0.4515807628631592, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 7087 + }, + { + "epoch": 0.8234679058960209, + "grad_norm": 0.4711925685405731, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 7088 + }, + { + "epoch": 0.8235840836479814, + "grad_norm": 0.4708985984325409, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 7089 + }, + { + "epoch": 0.8237002613999419, + "grad_norm": 0.48707452416419983, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 7090 + }, + { + "epoch": 0.8238164391519024, + "grad_norm": 0.5250394940376282, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 7091 + }, + { + "epoch": 0.8239326169038629, + "grad_norm": 0.4977336525917053, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 7092 + }, + { + "epoch": 0.8240487946558234, + "grad_norm": 0.4466916024684906, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 7093 + }, + { + "epoch": 0.8241649724077839, + "grad_norm": 0.4977428913116455, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 7094 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 0.4770013093948364, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 7095 + }, + { + "epoch": 0.824397327911705, + "grad_norm": 0.4783066511154175, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 7096 + }, + { + "epoch": 0.8245135056636654, + "grad_norm": 0.44045940041542053, + "learning_rate": 0.0001, + "loss": 1.4502, + "step": 7097 + }, + { + "epoch": 0.8246296834156259, + "grad_norm": 0.449008584022522, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 7098 + }, + { + "epoch": 0.8247458611675864, + "grad_norm": 0.45892423391342163, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 7099 + }, + { + "epoch": 0.8248620389195469, + "grad_norm": 0.4626208245754242, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 7100 + }, + { + "epoch": 0.8249782166715074, + "grad_norm": 0.5038914084434509, + "learning_rate": 0.0001, + "loss": 1.651, + "step": 7101 + }, + { + "epoch": 0.8250943944234679, + "grad_norm": 0.4759518802165985, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 7102 + }, + { + "epoch": 0.8252105721754284, + "grad_norm": 0.48517483472824097, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 7103 + }, + { + "epoch": 0.8253267499273889, + "grad_norm": 0.47098508477211, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 7104 + }, + { + "epoch": 0.8254429276793495, + "grad_norm": 0.47671204805374146, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 7105 + }, + { + "epoch": 0.82555910543131, + "grad_norm": 0.4556223750114441, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 7106 + }, + { + "epoch": 0.8256752831832704, + "grad_norm": 0.47399070858955383, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 7107 + }, + { + "epoch": 0.8257914609352309, + "grad_norm": 0.4748283624649048, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 7108 + }, + { + "epoch": 0.8259076386871914, + "grad_norm": 0.48463886976242065, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 7109 + }, + { + "epoch": 0.8260238164391519, + "grad_norm": 0.46149513125419617, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 7110 + }, + { + "epoch": 0.8261399941911124, + "grad_norm": 0.47687479853630066, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 7111 + }, + { + "epoch": 0.8262561719430729, + "grad_norm": 0.46911653876304626, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 7112 + }, + { + "epoch": 0.8263723496950334, + "grad_norm": 0.5078893899917603, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 7113 + }, + { + "epoch": 0.8264885274469939, + "grad_norm": 0.455218642950058, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 7114 + }, + { + "epoch": 0.8266047051989543, + "grad_norm": 0.44931599497795105, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 7115 + }, + { + "epoch": 0.826720882950915, + "grad_norm": 0.50384521484375, + "learning_rate": 0.0001, + "loss": 1.7711, + "step": 7116 + }, + { + "epoch": 0.8268370607028754, + "grad_norm": 0.48653748631477356, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 7117 + }, + { + "epoch": 0.8269532384548359, + "grad_norm": 0.44801607728004456, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 7118 + }, + { + "epoch": 0.8270694162067964, + "grad_norm": 0.45984333753585815, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 7119 + }, + { + "epoch": 0.8271855939587569, + "grad_norm": 0.49128204584121704, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 7120 + }, + { + "epoch": 0.8273017717107174, + "grad_norm": 0.47482722997665405, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 7121 + }, + { + "epoch": 0.8274179494626779, + "grad_norm": 0.46835750341415405, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 7122 + }, + { + "epoch": 0.8275341272146384, + "grad_norm": 0.4840836524963379, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 7123 + }, + { + "epoch": 0.8276503049665989, + "grad_norm": 0.4466482102870941, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 7124 + }, + { + "epoch": 0.8277664827185593, + "grad_norm": 0.47686582803726196, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 7125 + }, + { + "epoch": 0.82788266047052, + "grad_norm": 0.46509337425231934, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 7126 + }, + { + "epoch": 0.8279988382224804, + "grad_norm": 0.44292381405830383, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 7127 + }, + { + "epoch": 0.8281150159744409, + "grad_norm": 0.42537015676498413, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 7128 + }, + { + "epoch": 0.8282311937264014, + "grad_norm": 0.45168519020080566, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 7129 + }, + { + "epoch": 0.8283473714783619, + "grad_norm": 0.48862767219543457, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 7130 + }, + { + "epoch": 0.8284635492303224, + "grad_norm": 0.4335499405860901, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 7131 + }, + { + "epoch": 0.8285797269822829, + "grad_norm": 0.4990119934082031, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 7132 + }, + { + "epoch": 0.8286959047342434, + "grad_norm": 0.4781734347343445, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 7133 + }, + { + "epoch": 0.8288120824862039, + "grad_norm": 0.46874862909317017, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 7134 + }, + { + "epoch": 0.8289282602381643, + "grad_norm": 0.49614983797073364, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 7135 + }, + { + "epoch": 0.8290444379901248, + "grad_norm": 0.45961683988571167, + "learning_rate": 0.0001, + "loss": 1.5863, + "step": 7136 + }, + { + "epoch": 0.8291606157420854, + "grad_norm": 0.42698389291763306, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 7137 + }, + { + "epoch": 0.8292767934940459, + "grad_norm": 0.5121428370475769, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 7138 + }, + { + "epoch": 0.8293929712460064, + "grad_norm": 0.45941177010536194, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 7139 + }, + { + "epoch": 0.8295091489979669, + "grad_norm": 0.487868994474411, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 7140 + }, + { + "epoch": 0.8296253267499274, + "grad_norm": 0.49762672185897827, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 7141 + }, + { + "epoch": 0.8297415045018879, + "grad_norm": 0.4626729488372803, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 7142 + }, + { + "epoch": 0.8298576822538484, + "grad_norm": 0.4500117301940918, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 7143 + }, + { + "epoch": 0.8299738600058089, + "grad_norm": 0.45620059967041016, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 7144 + }, + { + "epoch": 0.8300900377577694, + "grad_norm": 0.48835253715515137, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 7145 + }, + { + "epoch": 0.8302062155097298, + "grad_norm": 0.468001127243042, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 7146 + }, + { + "epoch": 0.8303223932616904, + "grad_norm": 0.45825132727622986, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 7147 + }, + { + "epoch": 0.8304385710136509, + "grad_norm": 0.46828025579452515, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 7148 + }, + { + "epoch": 0.8305547487656114, + "grad_norm": 0.49574530124664307, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 7149 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.4577179551124573, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 7150 + }, + { + "epoch": 0.8307871042695324, + "grad_norm": 0.47962459921836853, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 7151 + }, + { + "epoch": 0.8309032820214929, + "grad_norm": 0.5886511206626892, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 7152 + }, + { + "epoch": 0.8310194597734534, + "grad_norm": 0.5050860047340393, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 7153 + }, + { + "epoch": 0.8311356375254139, + "grad_norm": 0.48599928617477417, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 7154 + }, + { + "epoch": 0.8312518152773744, + "grad_norm": 0.4886664152145386, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 7155 + }, + { + "epoch": 0.8313679930293348, + "grad_norm": 0.502704381942749, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 7156 + }, + { + "epoch": 0.8314841707812953, + "grad_norm": 0.4662174880504608, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 7157 + }, + { + "epoch": 0.8316003485332559, + "grad_norm": 0.5194843411445618, + "learning_rate": 0.0001, + "loss": 1.8369, + "step": 7158 + }, + { + "epoch": 0.8317165262852164, + "grad_norm": 0.458987832069397, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 7159 + }, + { + "epoch": 0.8318327040371769, + "grad_norm": 0.4516839385032654, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 7160 + }, + { + "epoch": 0.8319488817891374, + "grad_norm": 0.47868669033050537, + "learning_rate": 0.0001, + "loss": 1.4402, + "step": 7161 + }, + { + "epoch": 0.8320650595410979, + "grad_norm": 0.4523623585700989, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 7162 + }, + { + "epoch": 0.8321812372930584, + "grad_norm": 0.4759959876537323, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 7163 + }, + { + "epoch": 0.8322974150450189, + "grad_norm": 0.47161024808883667, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 7164 + }, + { + "epoch": 0.8324135927969794, + "grad_norm": 0.43895629048347473, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 7165 + }, + { + "epoch": 0.8325297705489398, + "grad_norm": 0.47241824865341187, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 7166 + }, + { + "epoch": 0.8326459483009003, + "grad_norm": 0.43697863817214966, + "learning_rate": 0.0001, + "loss": 1.4951, + "step": 7167 + }, + { + "epoch": 0.8327621260528609, + "grad_norm": 0.5089771747589111, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 7168 + }, + { + "epoch": 0.8328783038048214, + "grad_norm": 0.47586750984191895, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 7169 + }, + { + "epoch": 0.8329944815567819, + "grad_norm": 0.4991025924682617, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 7170 + }, + { + "epoch": 0.8331106593087424, + "grad_norm": 0.4552896022796631, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 7171 + }, + { + "epoch": 0.8332268370607029, + "grad_norm": 0.4508793354034424, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 7172 + }, + { + "epoch": 0.8333430148126634, + "grad_norm": 0.4660915732383728, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 7173 + }, + { + "epoch": 0.8334591925646239, + "grad_norm": 0.48091113567352295, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 7174 + }, + { + "epoch": 0.8335753703165844, + "grad_norm": 0.4672652781009674, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 7175 + }, + { + "epoch": 0.8336915480685448, + "grad_norm": 0.46777546405792236, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 7176 + }, + { + "epoch": 0.8338077258205053, + "grad_norm": 0.422648161649704, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 7177 + }, + { + "epoch": 0.8339239035724658, + "grad_norm": 0.4612855613231659, + "learning_rate": 0.0001, + "loss": 1.695, + "step": 7178 + }, + { + "epoch": 0.8340400813244264, + "grad_norm": 0.4481313228607178, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 7179 + }, + { + "epoch": 0.8341562590763869, + "grad_norm": 0.4890344440937042, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 7180 + }, + { + "epoch": 0.8342724368283474, + "grad_norm": 0.4647752046585083, + "learning_rate": 0.0001, + "loss": 1.5956, + "step": 7181 + }, + { + "epoch": 0.8343886145803079, + "grad_norm": 0.5206274390220642, + "learning_rate": 0.0001, + "loss": 1.7667, + "step": 7182 + }, + { + "epoch": 0.8345047923322684, + "grad_norm": 0.4441780745983124, + "learning_rate": 0.0001, + "loss": 1.3936, + "step": 7183 + }, + { + "epoch": 0.8346209700842289, + "grad_norm": 0.4659591019153595, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 7184 + }, + { + "epoch": 0.8347371478361894, + "grad_norm": 0.45973634719848633, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 7185 + }, + { + "epoch": 0.8348533255881498, + "grad_norm": 0.43164822459220886, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 7186 + }, + { + "epoch": 0.8349695033401103, + "grad_norm": 0.4463837742805481, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 7187 + }, + { + "epoch": 0.8350856810920708, + "grad_norm": 0.4984694719314575, + "learning_rate": 0.0001, + "loss": 1.8769, + "step": 7188 + }, + { + "epoch": 0.8352018588440314, + "grad_norm": 0.476712167263031, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 7189 + }, + { + "epoch": 0.8353180365959919, + "grad_norm": 0.4878515899181366, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 7190 + }, + { + "epoch": 0.8354342143479524, + "grad_norm": 0.44668495655059814, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 7191 + }, + { + "epoch": 0.8355503920999129, + "grad_norm": 0.46840810775756836, + "learning_rate": 0.0001, + "loss": 1.6875, + "step": 7192 + }, + { + "epoch": 0.8356665698518734, + "grad_norm": 0.47092488408088684, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 7193 + }, + { + "epoch": 0.8357827476038339, + "grad_norm": 0.4265216588973999, + "learning_rate": 0.0001, + "loss": 1.3333, + "step": 7194 + }, + { + "epoch": 0.8358989253557944, + "grad_norm": 0.4651343822479248, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 7195 + }, + { + "epoch": 0.8360151031077548, + "grad_norm": 0.4326002299785614, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 7196 + }, + { + "epoch": 0.8361312808597153, + "grad_norm": 0.46833327412605286, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 7197 + }, + { + "epoch": 0.8362474586116758, + "grad_norm": 0.46911799907684326, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 7198 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 0.43018439412117004, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 7199 + }, + { + "epoch": 0.8364798141155969, + "grad_norm": 0.5141427516937256, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 7200 + }, + { + "epoch": 0.8365959918675574, + "grad_norm": 0.4784258306026459, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 7201 + }, + { + "epoch": 0.8367121696195179, + "grad_norm": 0.47277989983558655, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 7202 + }, + { + "epoch": 0.8368283473714784, + "grad_norm": 0.46376681327819824, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 7203 + }, + { + "epoch": 0.8369445251234389, + "grad_norm": 0.4796522855758667, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 7204 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 0.42169952392578125, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 7205 + }, + { + "epoch": 0.8371768806273598, + "grad_norm": 0.4597862660884857, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 7206 + }, + { + "epoch": 0.8372930583793203, + "grad_norm": 0.4585164189338684, + "learning_rate": 0.0001, + "loss": 1.5235, + "step": 7207 + }, + { + "epoch": 0.8374092361312808, + "grad_norm": 0.4696848392486572, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 7208 + }, + { + "epoch": 0.8375254138832413, + "grad_norm": 0.4311649203300476, + "learning_rate": 0.0001, + "loss": 1.4602, + "step": 7209 + }, + { + "epoch": 0.8376415916352019, + "grad_norm": 0.48763203620910645, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 7210 + }, + { + "epoch": 0.8377577693871624, + "grad_norm": 0.4315451383590698, + "learning_rate": 0.0001, + "loss": 1.5024, + "step": 7211 + }, + { + "epoch": 0.8378739471391229, + "grad_norm": 0.44684144854545593, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 7212 + }, + { + "epoch": 0.8379901248910834, + "grad_norm": 0.4800936281681061, + "learning_rate": 0.0001, + "loss": 1.4873, + "step": 7213 + }, + { + "epoch": 0.8381063026430439, + "grad_norm": 0.45008590817451477, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 7214 + }, + { + "epoch": 0.8382224803950044, + "grad_norm": 0.5298113822937012, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 7215 + }, + { + "epoch": 0.8383386581469648, + "grad_norm": 0.4812602400779724, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 7216 + }, + { + "epoch": 0.8384548358989253, + "grad_norm": 0.48588478565216064, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 7217 + }, + { + "epoch": 0.8385710136508858, + "grad_norm": 0.5386226177215576, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 7218 + }, + { + "epoch": 0.8386871914028463, + "grad_norm": 0.5037118196487427, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 7219 + }, + { + "epoch": 0.8388033691548068, + "grad_norm": 0.48345088958740234, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 7220 + }, + { + "epoch": 0.8389195469067674, + "grad_norm": 0.4569917321205139, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 7221 + }, + { + "epoch": 0.8390357246587279, + "grad_norm": 0.4810996651649475, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 7222 + }, + { + "epoch": 0.8391519024106884, + "grad_norm": 0.49533677101135254, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 7223 + }, + { + "epoch": 0.8392680801626489, + "grad_norm": 0.4612615704536438, + "learning_rate": 0.0001, + "loss": 1.7384, + "step": 7224 + }, + { + "epoch": 0.8393842579146094, + "grad_norm": 0.4266413152217865, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 7225 + }, + { + "epoch": 0.8395004356665698, + "grad_norm": 0.490537166595459, + "learning_rate": 0.0001, + "loss": 1.8198, + "step": 7226 + }, + { + "epoch": 0.8396166134185303, + "grad_norm": 0.45766183733940125, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 7227 + }, + { + "epoch": 0.8397327911704908, + "grad_norm": 0.48349621891975403, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 7228 + }, + { + "epoch": 0.8398489689224513, + "grad_norm": 0.44390740990638733, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 7229 + }, + { + "epoch": 0.8399651466744118, + "grad_norm": 0.4712617099285126, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 7230 + }, + { + "epoch": 0.8400813244263724, + "grad_norm": 0.47112375497817993, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 7231 + }, + { + "epoch": 0.8401975021783329, + "grad_norm": 0.491479754447937, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 7232 + }, + { + "epoch": 0.8403136799302934, + "grad_norm": 0.42350316047668457, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 7233 + }, + { + "epoch": 0.8404298576822539, + "grad_norm": 0.4231340289115906, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 7234 + }, + { + "epoch": 0.8405460354342144, + "grad_norm": 0.47028040885925293, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 7235 + }, + { + "epoch": 0.8406622131861748, + "grad_norm": 0.4882940351963043, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 7236 + }, + { + "epoch": 0.8407783909381353, + "grad_norm": 0.4849685728549957, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 7237 + }, + { + "epoch": 0.8408945686900958, + "grad_norm": 0.45504260063171387, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 7238 + }, + { + "epoch": 0.8410107464420563, + "grad_norm": 0.4550893008708954, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 7239 + }, + { + "epoch": 0.8411269241940168, + "grad_norm": 0.5196647047996521, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 7240 + }, + { + "epoch": 0.8412431019459774, + "grad_norm": 0.525698721408844, + "learning_rate": 0.0001, + "loss": 1.7223, + "step": 7241 + }, + { + "epoch": 0.8413592796979379, + "grad_norm": 0.46245935559272766, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 7242 + }, + { + "epoch": 0.8414754574498984, + "grad_norm": 0.4684200584888458, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 7243 + }, + { + "epoch": 0.8415916352018589, + "grad_norm": 0.5075876116752625, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 7244 + }, + { + "epoch": 0.8417078129538194, + "grad_norm": 0.4788230061531067, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 7245 + }, + { + "epoch": 0.8418239907057798, + "grad_norm": 0.4579117000102997, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 7246 + }, + { + "epoch": 0.8419401684577403, + "grad_norm": 0.46064794063568115, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 7247 + }, + { + "epoch": 0.8420563462097008, + "grad_norm": 0.49048542976379395, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 7248 + }, + { + "epoch": 0.8421725239616613, + "grad_norm": 0.46786949038505554, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 7249 + }, + { + "epoch": 0.8422887017136218, + "grad_norm": 0.4660671651363373, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 7250 + }, + { + "epoch": 0.8424048794655823, + "grad_norm": 0.45976850390434265, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 7251 + }, + { + "epoch": 0.8425210572175429, + "grad_norm": 0.45167800784111023, + "learning_rate": 0.0001, + "loss": 1.4037, + "step": 7252 + }, + { + "epoch": 0.8426372349695034, + "grad_norm": 0.45298877358436584, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 7253 + }, + { + "epoch": 0.8427534127214639, + "grad_norm": 0.4448161721229553, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 7254 + }, + { + "epoch": 0.8428695904734244, + "grad_norm": 0.4482094943523407, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 7255 + }, + { + "epoch": 0.8429857682253848, + "grad_norm": 0.45849940180778503, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 7256 + }, + { + "epoch": 0.8431019459773453, + "grad_norm": 0.4591526985168457, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 7257 + }, + { + "epoch": 0.8432181237293058, + "grad_norm": 0.4684211015701294, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 7258 + }, + { + "epoch": 0.8433343014812663, + "grad_norm": 0.4497279226779938, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 7259 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.46544843912124634, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 7260 + }, + { + "epoch": 0.8435666569851873, + "grad_norm": 0.4916938841342926, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 7261 + }, + { + "epoch": 0.8436828347371479, + "grad_norm": 0.4782385230064392, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 7262 + }, + { + "epoch": 0.8437990124891084, + "grad_norm": 0.46342313289642334, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 7263 + }, + { + "epoch": 0.8439151902410689, + "grad_norm": 0.456551194190979, + "learning_rate": 0.0001, + "loss": 1.7548, + "step": 7264 + }, + { + "epoch": 0.8440313679930294, + "grad_norm": 0.4580911695957184, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 7265 + }, + { + "epoch": 0.8441475457449898, + "grad_norm": 0.47985199093818665, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 7266 + }, + { + "epoch": 0.8442637234969503, + "grad_norm": 0.47257447242736816, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 7267 + }, + { + "epoch": 0.8443799012489108, + "grad_norm": 0.4965929687023163, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 7268 + }, + { + "epoch": 0.8444960790008713, + "grad_norm": 0.4734271168708801, + "learning_rate": 0.0001, + "loss": 1.7729, + "step": 7269 + }, + { + "epoch": 0.8446122567528318, + "grad_norm": 0.4869614839553833, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 7270 + }, + { + "epoch": 0.8447284345047923, + "grad_norm": 0.4773048162460327, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 7271 + }, + { + "epoch": 0.8448446122567528, + "grad_norm": 0.47592759132385254, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 7272 + }, + { + "epoch": 0.8449607900087134, + "grad_norm": 0.4415343105792999, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 7273 + }, + { + "epoch": 0.8450769677606739, + "grad_norm": 0.4446001350879669, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 7274 + }, + { + "epoch": 0.8451931455126344, + "grad_norm": 0.4620625674724579, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 7275 + }, + { + "epoch": 0.8453093232645948, + "grad_norm": 0.4809649884700775, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 7276 + }, + { + "epoch": 0.8454255010165553, + "grad_norm": 0.45134058594703674, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 7277 + }, + { + "epoch": 0.8455416787685158, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 7278 + }, + { + "epoch": 0.8456578565204763, + "grad_norm": 0.5055558085441589, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 7279 + }, + { + "epoch": 0.8457740342724368, + "grad_norm": 0.418148398399353, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 7280 + }, + { + "epoch": 0.8458902120243973, + "grad_norm": 0.4533846378326416, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 7281 + }, + { + "epoch": 0.8460063897763578, + "grad_norm": 0.49870097637176514, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 7282 + }, + { + "epoch": 0.8461225675283184, + "grad_norm": 0.4689449071884155, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 7283 + }, + { + "epoch": 0.8462387452802789, + "grad_norm": 0.466709166765213, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 7284 + }, + { + "epoch": 0.8463549230322394, + "grad_norm": 0.46668291091918945, + "learning_rate": 0.0001, + "loss": 1.7183, + "step": 7285 + }, + { + "epoch": 0.8464711007841998, + "grad_norm": 0.43597686290740967, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 7286 + }, + { + "epoch": 0.8465872785361603, + "grad_norm": 0.5049334168434143, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 7287 + }, + { + "epoch": 0.8467034562881208, + "grad_norm": 0.46801841259002686, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 7288 + }, + { + "epoch": 0.8468196340400813, + "grad_norm": 0.46203526854515076, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 7289 + }, + { + "epoch": 0.8469358117920418, + "grad_norm": 0.45195382833480835, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 7290 + }, + { + "epoch": 0.8470519895440023, + "grad_norm": 0.4815501868724823, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 7291 + }, + { + "epoch": 0.8471681672959628, + "grad_norm": 0.4791460931301117, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 7292 + }, + { + "epoch": 0.8472843450479233, + "grad_norm": 0.4468856155872345, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 7293 + }, + { + "epoch": 0.8474005227998839, + "grad_norm": 0.45526090264320374, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 7294 + }, + { + "epoch": 0.8475167005518444, + "grad_norm": 0.5207736492156982, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 7295 + }, + { + "epoch": 0.8476328783038048, + "grad_norm": 0.47375261783599854, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 7296 + }, + { + "epoch": 0.8477490560557653, + "grad_norm": 0.4951147139072418, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 7297 + }, + { + "epoch": 0.8478652338077258, + "grad_norm": 0.4843972325325012, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 7298 + }, + { + "epoch": 0.8479814115596863, + "grad_norm": 0.44949018955230713, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 7299 + }, + { + "epoch": 0.8480975893116468, + "grad_norm": 0.467597633600235, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 7300 + }, + { + "epoch": 0.8482137670636073, + "grad_norm": 0.44474077224731445, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 7301 + }, + { + "epoch": 0.8483299448155678, + "grad_norm": 0.455203115940094, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 7302 + }, + { + "epoch": 0.8484461225675283, + "grad_norm": 0.44709402322769165, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 7303 + }, + { + "epoch": 0.8485623003194889, + "grad_norm": 0.4874759316444397, + "learning_rate": 0.0001, + "loss": 1.7365, + "step": 7304 + }, + { + "epoch": 0.8486784780714494, + "grad_norm": 0.4757806956768036, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 7305 + }, + { + "epoch": 0.8487946558234098, + "grad_norm": 0.4443889260292053, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 7306 + }, + { + "epoch": 0.8489108335753703, + "grad_norm": 0.4977072477340698, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 7307 + }, + { + "epoch": 0.8490270113273308, + "grad_norm": 0.5108775496482849, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 7308 + }, + { + "epoch": 0.8491431890792913, + "grad_norm": 0.5324129462242126, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 7309 + }, + { + "epoch": 0.8492593668312518, + "grad_norm": 0.46875059604644775, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 7310 + }, + { + "epoch": 0.8493755445832123, + "grad_norm": 0.4918031096458435, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 7311 + }, + { + "epoch": 0.8494917223351728, + "grad_norm": 0.4679587185382843, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 7312 + }, + { + "epoch": 0.8496079000871333, + "grad_norm": 0.4425588846206665, + "learning_rate": 0.0001, + "loss": 1.4041, + "step": 7313 + }, + { + "epoch": 0.8497240778390938, + "grad_norm": 0.48238202929496765, + "learning_rate": 0.0001, + "loss": 1.8244, + "step": 7314 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 0.42519932985305786, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 7315 + }, + { + "epoch": 0.8499564333430148, + "grad_norm": 0.4286966919898987, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 7316 + }, + { + "epoch": 0.8500726110949753, + "grad_norm": 0.4630890190601349, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 7317 + }, + { + "epoch": 0.8501887888469358, + "grad_norm": 0.4862578809261322, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 7318 + }, + { + "epoch": 0.8503049665988963, + "grad_norm": 0.45061859488487244, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 7319 + }, + { + "epoch": 0.8504211443508568, + "grad_norm": 0.47194933891296387, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 7320 + }, + { + "epoch": 0.8505373221028173, + "grad_norm": 0.457969605922699, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 7321 + }, + { + "epoch": 0.8506534998547778, + "grad_norm": 0.4440132677555084, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 7322 + }, + { + "epoch": 0.8507696776067383, + "grad_norm": 0.5148141384124756, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 7323 + }, + { + "epoch": 0.8508858553586988, + "grad_norm": 0.481653094291687, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 7324 + }, + { + "epoch": 0.8510020331106594, + "grad_norm": 0.4533710777759552, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 7325 + }, + { + "epoch": 0.8511182108626199, + "grad_norm": 0.4878830909729004, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 7326 + }, + { + "epoch": 0.8512343886145803, + "grad_norm": 0.5046993494033813, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 7327 + }, + { + "epoch": 0.8513505663665408, + "grad_norm": 0.48228132724761963, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 7328 + }, + { + "epoch": 0.8514667441185013, + "grad_norm": 0.47209233045578003, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 7329 + }, + { + "epoch": 0.8515829218704618, + "grad_norm": 0.4597385823726654, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 7330 + }, + { + "epoch": 0.8516990996224223, + "grad_norm": 0.478766530752182, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 7331 + }, + { + "epoch": 0.8518152773743828, + "grad_norm": 0.4749451279640198, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 7332 + }, + { + "epoch": 0.8519314551263433, + "grad_norm": 0.4681585431098938, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 7333 + }, + { + "epoch": 0.8520476328783038, + "grad_norm": 0.5120987892150879, + "learning_rate": 0.0001, + "loss": 1.651, + "step": 7334 + }, + { + "epoch": 0.8521638106302643, + "grad_norm": 0.4379187822341919, + "learning_rate": 0.0001, + "loss": 1.4779, + "step": 7335 + }, + { + "epoch": 0.8522799883822249, + "grad_norm": 0.5244159698486328, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 7336 + }, + { + "epoch": 0.8523961661341853, + "grad_norm": 0.4366852343082428, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 7337 + }, + { + "epoch": 0.8525123438861458, + "grad_norm": 0.468220591545105, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 7338 + }, + { + "epoch": 0.8526285216381063, + "grad_norm": 0.43382033705711365, + "learning_rate": 0.0001, + "loss": 1.4661, + "step": 7339 + }, + { + "epoch": 0.8527446993900668, + "grad_norm": 0.48206859827041626, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 7340 + }, + { + "epoch": 0.8528608771420273, + "grad_norm": 0.448639839887619, + "learning_rate": 0.0001, + "loss": 1.3691, + "step": 7341 + }, + { + "epoch": 0.8529770548939878, + "grad_norm": 0.48015591502189636, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 7342 + }, + { + "epoch": 0.8530932326459483, + "grad_norm": 0.44810763001441956, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 7343 + }, + { + "epoch": 0.8532094103979088, + "grad_norm": 0.5089064240455627, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 7344 + }, + { + "epoch": 0.8533255881498693, + "grad_norm": 0.4648088216781616, + "learning_rate": 0.0001, + "loss": 1.3528, + "step": 7345 + }, + { + "epoch": 0.8534417659018299, + "grad_norm": 0.46756070852279663, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 7346 + }, + { + "epoch": 0.8535579436537903, + "grad_norm": 0.4743233621120453, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 7347 + }, + { + "epoch": 0.8536741214057508, + "grad_norm": 0.4871358573436737, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 7348 + }, + { + "epoch": 0.8537902991577113, + "grad_norm": 0.4481533467769623, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 7349 + }, + { + "epoch": 0.8539064769096718, + "grad_norm": 0.49137020111083984, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 7350 + }, + { + "epoch": 0.8540226546616323, + "grad_norm": 0.45045387744903564, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 7351 + }, + { + "epoch": 0.8541388324135928, + "grad_norm": 0.45191293954849243, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 7352 + }, + { + "epoch": 0.8542550101655533, + "grad_norm": 0.46247225999832153, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 7353 + }, + { + "epoch": 0.8543711879175138, + "grad_norm": 0.4777053892612457, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 7354 + }, + { + "epoch": 0.8544873656694743, + "grad_norm": 0.44908279180526733, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 7355 + }, + { + "epoch": 0.8546035434214347, + "grad_norm": 0.4714983403682709, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 7356 + }, + { + "epoch": 0.8547197211733953, + "grad_norm": 0.4688902497291565, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 7357 + }, + { + "epoch": 0.8548358989253558, + "grad_norm": 0.46939048171043396, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 7358 + }, + { + "epoch": 0.8549520766773163, + "grad_norm": 0.47825509309768677, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 7359 + }, + { + "epoch": 0.8550682544292768, + "grad_norm": 0.49258682131767273, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 7360 + }, + { + "epoch": 0.8551844321812373, + "grad_norm": 0.5022354125976562, + "learning_rate": 0.0001, + "loss": 1.7366, + "step": 7361 + }, + { + "epoch": 0.8553006099331978, + "grad_norm": 0.4815436899662018, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 7362 + }, + { + "epoch": 0.8554167876851583, + "grad_norm": 0.4701344668865204, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 7363 + }, + { + "epoch": 0.8555329654371188, + "grad_norm": 0.4595559537410736, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 7364 + }, + { + "epoch": 0.8556491431890793, + "grad_norm": 0.45530787110328674, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 7365 + }, + { + "epoch": 0.8557653209410397, + "grad_norm": 0.45153534412384033, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 7366 + }, + { + "epoch": 0.8558814986930003, + "grad_norm": 0.4568248391151428, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 7367 + }, + { + "epoch": 0.8559976764449608, + "grad_norm": 0.5031229853630066, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 7368 + }, + { + "epoch": 0.8561138541969213, + "grad_norm": 0.5217198729515076, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 7369 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.46275079250335693, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 7370 + }, + { + "epoch": 0.8563462097008423, + "grad_norm": 0.47801512479782104, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 7371 + }, + { + "epoch": 0.8564623874528028, + "grad_norm": 0.48373323678970337, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 7372 + }, + { + "epoch": 0.8565785652047633, + "grad_norm": 0.4380189776420593, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 7373 + }, + { + "epoch": 0.8566947429567238, + "grad_norm": 0.48953357338905334, + "learning_rate": 0.0001, + "loss": 1.7664, + "step": 7374 + }, + { + "epoch": 0.8568109207086843, + "grad_norm": 0.4443288743495941, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 7375 + }, + { + "epoch": 0.8569270984606447, + "grad_norm": 0.4462949335575104, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 7376 + }, + { + "epoch": 0.8570432762126052, + "grad_norm": 0.5047136545181274, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 7377 + }, + { + "epoch": 0.8571594539645658, + "grad_norm": 0.440891832113266, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 7378 + }, + { + "epoch": 0.8572756317165263, + "grad_norm": 0.49732422828674316, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 7379 + }, + { + "epoch": 0.8573918094684868, + "grad_norm": 0.47000056505203247, + "learning_rate": 0.0001, + "loss": 1.6352, + "step": 7380 + }, + { + "epoch": 0.8575079872204473, + "grad_norm": 0.4483802020549774, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 7381 + }, + { + "epoch": 0.8576241649724078, + "grad_norm": 0.4606070816516876, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 7382 + }, + { + "epoch": 0.8577403427243683, + "grad_norm": 0.4387550950050354, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 7383 + }, + { + "epoch": 0.8578565204763288, + "grad_norm": 0.4888121783733368, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 7384 + }, + { + "epoch": 0.8579726982282893, + "grad_norm": 0.4582982659339905, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 7385 + }, + { + "epoch": 0.8580888759802497, + "grad_norm": 0.46754390001296997, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 7386 + }, + { + "epoch": 0.8582050537322102, + "grad_norm": 0.44355496764183044, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 7387 + }, + { + "epoch": 0.8583212314841708, + "grad_norm": 0.628402829170227, + "learning_rate": 0.0001, + "loss": 1.4312, + "step": 7388 + }, + { + "epoch": 0.8584374092361313, + "grad_norm": 0.5051394701004028, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 7389 + }, + { + "epoch": 0.8585535869880918, + "grad_norm": 0.4363487660884857, + "learning_rate": 0.0001, + "loss": 1.3303, + "step": 7390 + }, + { + "epoch": 0.8586697647400523, + "grad_norm": 0.4447265863418579, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 7391 + }, + { + "epoch": 0.8587859424920128, + "grad_norm": 0.4490847885608673, + "learning_rate": 0.0001, + "loss": 1.3502, + "step": 7392 + }, + { + "epoch": 0.8589021202439733, + "grad_norm": 0.6224667429924011, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 7393 + }, + { + "epoch": 0.8590182979959338, + "grad_norm": 0.4750422537326813, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 7394 + }, + { + "epoch": 0.8591344757478943, + "grad_norm": 0.47161757946014404, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 7395 + }, + { + "epoch": 0.8592506534998547, + "grad_norm": 0.4862884283065796, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 7396 + }, + { + "epoch": 0.8593668312518152, + "grad_norm": 0.6144393682479858, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 7397 + }, + { + "epoch": 0.8594830090037757, + "grad_norm": 0.4535844326019287, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 7398 + }, + { + "epoch": 0.8595991867557363, + "grad_norm": 0.4707335829734802, + "learning_rate": 0.0001, + "loss": 1.6917, + "step": 7399 + }, + { + "epoch": 0.8597153645076968, + "grad_norm": 0.45508313179016113, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 7400 + }, + { + "epoch": 0.8598315422596573, + "grad_norm": 0.45613574981689453, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 7401 + }, + { + "epoch": 0.8599477200116178, + "grad_norm": 0.46452754735946655, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 7402 + }, + { + "epoch": 0.8600638977635783, + "grad_norm": 0.5027807354927063, + "learning_rate": 0.0001, + "loss": 1.7318, + "step": 7403 + }, + { + "epoch": 0.8601800755155388, + "grad_norm": 0.4823223352432251, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 7404 + }, + { + "epoch": 0.8602962532674993, + "grad_norm": 0.4800436794757843, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 7405 + }, + { + "epoch": 0.8604124310194597, + "grad_norm": 0.48085838556289673, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 7406 + }, + { + "epoch": 0.8605286087714202, + "grad_norm": 0.45660898089408875, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 7407 + }, + { + "epoch": 0.8606447865233807, + "grad_norm": 0.4891624450683594, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 7408 + }, + { + "epoch": 0.8607609642753413, + "grad_norm": 0.4807065427303314, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 7409 + }, + { + "epoch": 0.8608771420273018, + "grad_norm": 0.45160871744155884, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 7410 + }, + { + "epoch": 0.8609933197792623, + "grad_norm": 0.4672984182834625, + "learning_rate": 0.0001, + "loss": 1.5387, + "step": 7411 + }, + { + "epoch": 0.8611094975312228, + "grad_norm": 0.4706302881240845, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 7412 + }, + { + "epoch": 0.8612256752831833, + "grad_norm": 0.4920727014541626, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 7413 + }, + { + "epoch": 0.8613418530351438, + "grad_norm": 0.46764302253723145, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 7414 + }, + { + "epoch": 0.8614580307871043, + "grad_norm": 0.4732871651649475, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 7415 + }, + { + "epoch": 0.8615742085390647, + "grad_norm": 0.4615955054759979, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 7416 + }, + { + "epoch": 0.8616903862910252, + "grad_norm": 1.543933629989624, + "learning_rate": 0.0001, + "loss": 1.7691, + "step": 7417 + }, + { + "epoch": 0.8618065640429857, + "grad_norm": 0.4970760941505432, + "learning_rate": 0.0001, + "loss": 1.5598, + "step": 7418 + }, + { + "epoch": 0.8619227417949463, + "grad_norm": 0.45834749937057495, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 7419 + }, + { + "epoch": 0.8620389195469068, + "grad_norm": 0.49793627858161926, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 7420 + }, + { + "epoch": 0.8621550972988673, + "grad_norm": 0.4896315336227417, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 7421 + }, + { + "epoch": 0.8622712750508278, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 7422 + }, + { + "epoch": 0.8623874528027883, + "grad_norm": 0.47271421551704407, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 7423 + }, + { + "epoch": 0.8625036305547488, + "grad_norm": 0.5026324987411499, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 7424 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 0.515398383140564, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 7425 + }, + { + "epoch": 0.8627359860586697, + "grad_norm": 0.49251848459243774, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 7426 + }, + { + "epoch": 0.8628521638106302, + "grad_norm": 0.44753625988960266, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 7427 + }, + { + "epoch": 0.8629683415625907, + "grad_norm": 0.4866905212402344, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 7428 + }, + { + "epoch": 0.8630845193145512, + "grad_norm": 0.4585065245628357, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 7429 + }, + { + "epoch": 0.8632006970665118, + "grad_norm": 0.47094056010246277, + "learning_rate": 0.0001, + "loss": 1.7554, + "step": 7430 + }, + { + "epoch": 0.8633168748184723, + "grad_norm": 0.45176732540130615, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 7431 + }, + { + "epoch": 0.8634330525704328, + "grad_norm": 0.48504602909088135, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 7432 + }, + { + "epoch": 0.8635492303223933, + "grad_norm": 0.49550312757492065, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 7433 + }, + { + "epoch": 0.8636654080743538, + "grad_norm": 0.4960317313671112, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 7434 + }, + { + "epoch": 0.8637815858263143, + "grad_norm": 0.4894813597202301, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 7435 + }, + { + "epoch": 0.8638977635782747, + "grad_norm": 0.47758471965789795, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 7436 + }, + { + "epoch": 0.8640139413302352, + "grad_norm": 0.5021971464157104, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 7437 + }, + { + "epoch": 0.8641301190821957, + "grad_norm": 0.5115821957588196, + "learning_rate": 0.0001, + "loss": 1.8215, + "step": 7438 + }, + { + "epoch": 0.8642462968341562, + "grad_norm": 0.45494914054870605, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 7439 + }, + { + "epoch": 0.8643624745861168, + "grad_norm": 0.4635239839553833, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 7440 + }, + { + "epoch": 0.8644786523380773, + "grad_norm": 0.4838850200176239, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 7441 + }, + { + "epoch": 0.8645948300900378, + "grad_norm": 0.48782113194465637, + "learning_rate": 0.0001, + "loss": 1.8607, + "step": 7442 + }, + { + "epoch": 0.8647110078419983, + "grad_norm": 0.4964597225189209, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 7443 + }, + { + "epoch": 0.8648271855939588, + "grad_norm": 0.4533204734325409, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 7444 + }, + { + "epoch": 0.8649433633459193, + "grad_norm": 0.4995560348033905, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 7445 + }, + { + "epoch": 0.8650595410978797, + "grad_norm": 0.48218008875846863, + "learning_rate": 0.0001, + "loss": 1.5235, + "step": 7446 + }, + { + "epoch": 0.8651757188498402, + "grad_norm": 0.4977954626083374, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 7447 + }, + { + "epoch": 0.8652918966018007, + "grad_norm": 0.4879086911678314, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 7448 + }, + { + "epoch": 0.8654080743537612, + "grad_norm": 0.4788166880607605, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 7449 + }, + { + "epoch": 0.8655242521057217, + "grad_norm": 0.5052685737609863, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 7450 + }, + { + "epoch": 0.8656404298576823, + "grad_norm": 0.48076626658439636, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 7451 + }, + { + "epoch": 0.8657566076096428, + "grad_norm": 0.481800377368927, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 7452 + }, + { + "epoch": 0.8658727853616033, + "grad_norm": 0.4728825092315674, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 7453 + }, + { + "epoch": 0.8659889631135638, + "grad_norm": 0.4901738464832306, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 7454 + }, + { + "epoch": 0.8661051408655243, + "grad_norm": 0.47152256965637207, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 7455 + }, + { + "epoch": 0.8662213186174847, + "grad_norm": 0.4627699851989746, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 7456 + }, + { + "epoch": 0.8663374963694452, + "grad_norm": 0.47442302107810974, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 7457 + }, + { + "epoch": 0.8664536741214057, + "grad_norm": 0.45434120297431946, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 7458 + }, + { + "epoch": 0.8665698518733662, + "grad_norm": 0.5147332549095154, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 7459 + }, + { + "epoch": 0.8666860296253267, + "grad_norm": 0.4700101315975189, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 7460 + }, + { + "epoch": 0.8668022073772873, + "grad_norm": 0.5192029476165771, + "learning_rate": 0.0001, + "loss": 1.562, + "step": 7461 + }, + { + "epoch": 0.8669183851292478, + "grad_norm": 0.44885125756263733, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 7462 + }, + { + "epoch": 0.8670345628812083, + "grad_norm": 0.45993509888648987, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 7463 + }, + { + "epoch": 0.8671507406331688, + "grad_norm": 0.46040210127830505, + "learning_rate": 0.0001, + "loss": 1.3868, + "step": 7464 + }, + { + "epoch": 0.8672669183851293, + "grad_norm": 0.47678741812705994, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 7465 + }, + { + "epoch": 0.8673830961370897, + "grad_norm": 0.4918549358844757, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 7466 + }, + { + "epoch": 0.8674992738890502, + "grad_norm": 0.45147234201431274, + "learning_rate": 0.0001, + "loss": 1.3795, + "step": 7467 + }, + { + "epoch": 0.8676154516410107, + "grad_norm": 0.4544316530227661, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 7468 + }, + { + "epoch": 0.8677316293929712, + "grad_norm": 0.4829142987728119, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 7469 + }, + { + "epoch": 0.8678478071449317, + "grad_norm": 0.5195518136024475, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 7470 + }, + { + "epoch": 0.8679639848968922, + "grad_norm": 0.48266440629959106, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 7471 + }, + { + "epoch": 0.8680801626488528, + "grad_norm": 0.5128846764564514, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 7472 + }, + { + "epoch": 0.8681963404008133, + "grad_norm": 0.48383504152297974, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 7473 + }, + { + "epoch": 0.8683125181527738, + "grad_norm": 0.49934422969818115, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 7474 + }, + { + "epoch": 0.8684286959047343, + "grad_norm": 0.4719254672527313, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 7475 + }, + { + "epoch": 0.8685448736566947, + "grad_norm": 0.45778128504753113, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 7476 + }, + { + "epoch": 0.8686610514086552, + "grad_norm": 0.4514407813549042, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 7477 + }, + { + "epoch": 0.8687772291606157, + "grad_norm": 0.4867977201938629, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 7478 + }, + { + "epoch": 0.8688934069125762, + "grad_norm": 0.4617425799369812, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 7479 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.44265344738960266, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 7480 + }, + { + "epoch": 0.8691257624164972, + "grad_norm": 0.4775390625, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 7481 + }, + { + "epoch": 0.8692419401684578, + "grad_norm": 0.5168774127960205, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 7482 + }, + { + "epoch": 0.8693581179204183, + "grad_norm": 0.48702213168144226, + "learning_rate": 0.0001, + "loss": 1.7377, + "step": 7483 + }, + { + "epoch": 0.8694742956723788, + "grad_norm": 0.47748100757598877, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 7484 + }, + { + "epoch": 0.8695904734243393, + "grad_norm": 0.45373064279556274, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 7485 + }, + { + "epoch": 0.8697066511762997, + "grad_norm": 0.4954831898212433, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 7486 + }, + { + "epoch": 0.8698228289282602, + "grad_norm": 0.4588451683521271, + "learning_rate": 0.0001, + "loss": 1.3947, + "step": 7487 + }, + { + "epoch": 0.8699390066802207, + "grad_norm": 0.4915754199028015, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 7488 + }, + { + "epoch": 0.8700551844321812, + "grad_norm": 0.4902786612510681, + "learning_rate": 0.0001, + "loss": 1.4462, + "step": 7489 + }, + { + "epoch": 0.8701713621841417, + "grad_norm": 0.4758920967578888, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 7490 + }, + { + "epoch": 0.8702875399361022, + "grad_norm": 0.4647005796432495, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 7491 + }, + { + "epoch": 0.8704037176880627, + "grad_norm": 0.4310697615146637, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 7492 + }, + { + "epoch": 0.8705198954400233, + "grad_norm": 0.4658608138561249, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 7493 + }, + { + "epoch": 0.8706360731919838, + "grad_norm": 0.5647340416908264, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 7494 + }, + { + "epoch": 0.8707522509439443, + "grad_norm": 0.4370400607585907, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 7495 + }, + { + "epoch": 0.8708684286959048, + "grad_norm": 0.5290013551712036, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 7496 + }, + { + "epoch": 0.8709846064478652, + "grad_norm": 0.49788692593574524, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 7497 + }, + { + "epoch": 0.8711007841998257, + "grad_norm": 0.5271779894828796, + "learning_rate": 0.0001, + "loss": 1.4642, + "step": 7498 + }, + { + "epoch": 0.8712169619517862, + "grad_norm": 0.5094662308692932, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 7499 + }, + { + "epoch": 0.8713331397037467, + "grad_norm": 0.45193493366241455, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 7500 + }, + { + "epoch": 0.8714493174557072, + "grad_norm": 0.4743613600730896, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 7501 + }, + { + "epoch": 0.8715654952076677, + "grad_norm": 0.5035197734832764, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 7502 + }, + { + "epoch": 0.8716816729596283, + "grad_norm": 0.4376752972602844, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 7503 + }, + { + "epoch": 0.8717978507115888, + "grad_norm": 0.5224100947380066, + "learning_rate": 0.0001, + "loss": 1.7625, + "step": 7504 + }, + { + "epoch": 0.8719140284635493, + "grad_norm": 0.44658803939819336, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 7505 + }, + { + "epoch": 0.8720302062155098, + "grad_norm": 0.44167301058769226, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 7506 + }, + { + "epoch": 0.8721463839674702, + "grad_norm": 0.45513877272605896, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 7507 + }, + { + "epoch": 0.8722625617194307, + "grad_norm": 0.5717625617980957, + "learning_rate": 0.0001, + "loss": 1.7618, + "step": 7508 + }, + { + "epoch": 0.8723787394713912, + "grad_norm": 0.4953731298446655, + "learning_rate": 0.0001, + "loss": 1.7237, + "step": 7509 + }, + { + "epoch": 0.8724949172233517, + "grad_norm": 0.49298036098480225, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 7510 + }, + { + "epoch": 0.8726110949753122, + "grad_norm": 0.4598557651042938, + "learning_rate": 0.0001, + "loss": 1.5216, + "step": 7511 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.46120190620422363, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 7512 + }, + { + "epoch": 0.8728434504792332, + "grad_norm": 0.48204609751701355, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 7513 + }, + { + "epoch": 0.8729596282311938, + "grad_norm": 0.5170610547065735, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 7514 + }, + { + "epoch": 0.8730758059831543, + "grad_norm": 0.47179120779037476, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 7515 + }, + { + "epoch": 0.8731919837351148, + "grad_norm": 0.5068784952163696, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 7516 + }, + { + "epoch": 0.8733081614870752, + "grad_norm": 0.46290868520736694, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 7517 + }, + { + "epoch": 0.8734243392390357, + "grad_norm": 0.45205244421958923, + "learning_rate": 0.0001, + "loss": 1.3574, + "step": 7518 + }, + { + "epoch": 0.8735405169909962, + "grad_norm": 0.934231162071228, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 7519 + }, + { + "epoch": 0.8736566947429567, + "grad_norm": 0.45129621028900146, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 7520 + }, + { + "epoch": 0.8737728724949172, + "grad_norm": 0.456668496131897, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 7521 + }, + { + "epoch": 0.8738890502468777, + "grad_norm": 0.5230050086975098, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 7522 + }, + { + "epoch": 0.8740052279988382, + "grad_norm": 0.47268834710121155, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 7523 + }, + { + "epoch": 0.8741214057507988, + "grad_norm": 0.5117980241775513, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 7524 + }, + { + "epoch": 0.8742375835027593, + "grad_norm": 0.44424542784690857, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 7525 + }, + { + "epoch": 0.8743537612547198, + "grad_norm": 0.479316771030426, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 7526 + }, + { + "epoch": 0.8744699390066802, + "grad_norm": 0.442326158285141, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 7527 + }, + { + "epoch": 0.8745861167586407, + "grad_norm": 0.4649803936481476, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 7528 + }, + { + "epoch": 0.8747022945106012, + "grad_norm": 0.41753268241882324, + "learning_rate": 0.0001, + "loss": 1.4568, + "step": 7529 + }, + { + "epoch": 0.8748184722625617, + "grad_norm": 0.5000750422477722, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 7530 + }, + { + "epoch": 0.8749346500145222, + "grad_norm": 0.4585053026676178, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 7531 + }, + { + "epoch": 0.8750508277664827, + "grad_norm": 0.48473891615867615, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 7532 + }, + { + "epoch": 0.8751670055184432, + "grad_norm": 0.4806990325450897, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 7533 + }, + { + "epoch": 0.8752831832704037, + "grad_norm": 0.4711168110370636, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 7534 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 0.4625207483768463, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 7535 + }, + { + "epoch": 0.8755155387743248, + "grad_norm": 0.47735491394996643, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 7536 + }, + { + "epoch": 0.8756317165262852, + "grad_norm": 0.47820740938186646, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 7537 + }, + { + "epoch": 0.8757478942782457, + "grad_norm": 0.43879976868629456, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 7538 + }, + { + "epoch": 0.8758640720302062, + "grad_norm": 0.5144027471542358, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 7539 + }, + { + "epoch": 0.8759802497821667, + "grad_norm": 0.5422682166099548, + "learning_rate": 0.0001, + "loss": 1.765, + "step": 7540 + }, + { + "epoch": 0.8760964275341272, + "grad_norm": 0.49460479617118835, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 7541 + }, + { + "epoch": 0.8762126052860877, + "grad_norm": 0.4671952426433563, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 7542 + }, + { + "epoch": 0.8763287830380482, + "grad_norm": 0.4591098725795746, + "learning_rate": 0.0001, + "loss": 1.5281, + "step": 7543 + }, + { + "epoch": 0.8764449607900087, + "grad_norm": 0.49451717734336853, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 7544 + }, + { + "epoch": 0.8765611385419693, + "grad_norm": 0.48347243666648865, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 7545 + }, + { + "epoch": 0.8766773162939298, + "grad_norm": 0.4897051453590393, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 7546 + }, + { + "epoch": 0.8767934940458902, + "grad_norm": 0.45810526609420776, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 7547 + }, + { + "epoch": 0.8769096717978507, + "grad_norm": 0.4647473394870758, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 7548 + }, + { + "epoch": 0.8770258495498112, + "grad_norm": 0.4482647776603699, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 7549 + }, + { + "epoch": 0.8771420273017717, + "grad_norm": 0.44632047414779663, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 7550 + }, + { + "epoch": 0.8772582050537322, + "grad_norm": 0.5201831459999084, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 7551 + }, + { + "epoch": 0.8773743828056927, + "grad_norm": 0.46722298860549927, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 7552 + }, + { + "epoch": 0.8774905605576532, + "grad_norm": 0.5357077121734619, + "learning_rate": 0.0001, + "loss": 1.7269, + "step": 7553 + }, + { + "epoch": 0.8776067383096137, + "grad_norm": 0.481431782245636, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 7554 + }, + { + "epoch": 0.8777229160615742, + "grad_norm": 0.4767739474773407, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 7555 + }, + { + "epoch": 0.8778390938135348, + "grad_norm": 0.4820916950702667, + "learning_rate": 0.0001, + "loss": 1.3223, + "step": 7556 + }, + { + "epoch": 0.8779552715654952, + "grad_norm": 0.49660539627075195, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 7557 + }, + { + "epoch": 0.8780714493174557, + "grad_norm": 0.4909195303916931, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 7558 + }, + { + "epoch": 0.8781876270694162, + "grad_norm": 0.47243842482566833, + "learning_rate": 0.0001, + "loss": 1.5658, + "step": 7559 + }, + { + "epoch": 0.8783038048213767, + "grad_norm": 0.501683235168457, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 7560 + }, + { + "epoch": 0.8784199825733372, + "grad_norm": 0.4446217119693756, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 7561 + }, + { + "epoch": 0.8785361603252977, + "grad_norm": 0.5229566693305969, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 7562 + }, + { + "epoch": 0.8786523380772582, + "grad_norm": 0.479064404964447, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 7563 + }, + { + "epoch": 0.8787685158292187, + "grad_norm": 0.4563969075679779, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 7564 + }, + { + "epoch": 0.8788846935811792, + "grad_norm": 0.47300323843955994, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 7565 + }, + { + "epoch": 0.8790008713331398, + "grad_norm": 0.48597297072410583, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 7566 + }, + { + "epoch": 0.8791170490851002, + "grad_norm": 0.4700767695903778, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 7567 + }, + { + "epoch": 0.8792332268370607, + "grad_norm": 0.50390625, + "learning_rate": 0.0001, + "loss": 1.7064, + "step": 7568 + }, + { + "epoch": 0.8793494045890212, + "grad_norm": 0.4735111892223358, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 7569 + }, + { + "epoch": 0.8794655823409817, + "grad_norm": 0.43248268961906433, + "learning_rate": 0.0001, + "loss": 1.4049, + "step": 7570 + }, + { + "epoch": 0.8795817600929422, + "grad_norm": 0.4565393328666687, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 7571 + }, + { + "epoch": 0.8796979378449027, + "grad_norm": 0.46394291520118713, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 7572 + }, + { + "epoch": 0.8798141155968632, + "grad_norm": 0.47517409920692444, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 7573 + }, + { + "epoch": 0.8799302933488237, + "grad_norm": 0.515457272529602, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 7574 + }, + { + "epoch": 0.8800464711007842, + "grad_norm": 0.4533648192882538, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 7575 + }, + { + "epoch": 0.8801626488527446, + "grad_norm": 0.4539215862751007, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 7576 + }, + { + "epoch": 0.8802788266047052, + "grad_norm": 0.5214062333106995, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 7577 + }, + { + "epoch": 0.8803950043566657, + "grad_norm": 0.4697798192501068, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 7578 + }, + { + "epoch": 0.8805111821086262, + "grad_norm": 0.4855616092681885, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 7579 + }, + { + "epoch": 0.8806273598605867, + "grad_norm": 0.49214091897010803, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 7580 + }, + { + "epoch": 0.8807435376125472, + "grad_norm": 0.5328443646430969, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 7581 + }, + { + "epoch": 0.8808597153645077, + "grad_norm": 0.5216013789176941, + "learning_rate": 0.0001, + "loss": 1.8298, + "step": 7582 + }, + { + "epoch": 0.8809758931164682, + "grad_norm": 0.46522417664527893, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 7583 + }, + { + "epoch": 0.8810920708684287, + "grad_norm": 0.4373832046985626, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 7584 + }, + { + "epoch": 0.8812082486203892, + "grad_norm": 0.49534451961517334, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 7585 + }, + { + "epoch": 0.8813244263723496, + "grad_norm": 0.42766979336738586, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 7586 + }, + { + "epoch": 0.8814406041243102, + "grad_norm": 0.47507309913635254, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 7587 + }, + { + "epoch": 0.8815567818762707, + "grad_norm": 0.4551963210105896, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 7588 + }, + { + "epoch": 0.8816729596282312, + "grad_norm": 0.46101507544517517, + "learning_rate": 0.0001, + "loss": 1.7364, + "step": 7589 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.45468607544898987, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 7590 + }, + { + "epoch": 0.8819053151321522, + "grad_norm": 0.4612409174442291, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 7591 + }, + { + "epoch": 0.8820214928841127, + "grad_norm": 0.4820188581943512, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 7592 + }, + { + "epoch": 0.8821376706360732, + "grad_norm": 0.49493300914764404, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 7593 + }, + { + "epoch": 0.8822538483880337, + "grad_norm": 0.45542287826538086, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 7594 + }, + { + "epoch": 0.8823700261399942, + "grad_norm": 0.47837620973587036, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 7595 + }, + { + "epoch": 0.8824862038919546, + "grad_norm": 0.45609262585639954, + "learning_rate": 0.0001, + "loss": 1.4127, + "step": 7596 + }, + { + "epoch": 0.8826023816439151, + "grad_norm": 0.46420490741729736, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 7597 + }, + { + "epoch": 0.8827185593958757, + "grad_norm": 0.48322582244873047, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 7598 + }, + { + "epoch": 0.8828347371478362, + "grad_norm": 0.4640806317329407, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 7599 + }, + { + "epoch": 0.8829509148997967, + "grad_norm": 0.47058844566345215, + "learning_rate": 0.0001, + "loss": 1.6128, + "step": 7600 + }, + { + "epoch": 0.8830670926517572, + "grad_norm": 0.5269159078598022, + "learning_rate": 0.0001, + "loss": 1.7702, + "step": 7601 + }, + { + "epoch": 0.8831832704037177, + "grad_norm": 0.4986213445663452, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 7602 + }, + { + "epoch": 0.8832994481556782, + "grad_norm": 0.4375273585319519, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 7603 + }, + { + "epoch": 0.8834156259076387, + "grad_norm": 0.45944470167160034, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 7604 + }, + { + "epoch": 0.8835318036595992, + "grad_norm": 0.4776669442653656, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 7605 + }, + { + "epoch": 0.8836479814115596, + "grad_norm": 0.47542616724967957, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 7606 + }, + { + "epoch": 0.8837641591635201, + "grad_norm": 0.4733422100543976, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 7607 + }, + { + "epoch": 0.8838803369154807, + "grad_norm": 0.47752857208251953, + "learning_rate": 0.0001, + "loss": 1.4314, + "step": 7608 + }, + { + "epoch": 0.8839965146674412, + "grad_norm": 0.4597974121570587, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 7609 + }, + { + "epoch": 0.8841126924194017, + "grad_norm": 0.46217969059944153, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 7610 + }, + { + "epoch": 0.8842288701713622, + "grad_norm": 0.5063923001289368, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 7611 + }, + { + "epoch": 0.8843450479233227, + "grad_norm": 0.4687163233757019, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 7612 + }, + { + "epoch": 0.8844612256752832, + "grad_norm": 0.45635896921157837, + "learning_rate": 0.0001, + "loss": 1.4355, + "step": 7613 + }, + { + "epoch": 0.8845774034272437, + "grad_norm": 0.46440935134887695, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 7614 + }, + { + "epoch": 0.8846935811792042, + "grad_norm": 0.48333290219306946, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 7615 + }, + { + "epoch": 0.8848097589311646, + "grad_norm": 0.4907698929309845, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 7616 + }, + { + "epoch": 0.8849259366831251, + "grad_norm": 0.48579859733581543, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 7617 + }, + { + "epoch": 0.8850421144350857, + "grad_norm": 0.46631836891174316, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 7618 + }, + { + "epoch": 0.8851582921870462, + "grad_norm": 0.47939544916152954, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 7619 + }, + { + "epoch": 0.8852744699390067, + "grad_norm": 0.4607776701450348, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 7620 + }, + { + "epoch": 0.8853906476909672, + "grad_norm": 0.47920143604278564, + "learning_rate": 0.0001, + "loss": 1.7053, + "step": 7621 + }, + { + "epoch": 0.8855068254429277, + "grad_norm": 0.48284971714019775, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 7622 + }, + { + "epoch": 0.8856230031948882, + "grad_norm": 0.4558715224266052, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 7623 + }, + { + "epoch": 0.8857391809468487, + "grad_norm": 0.47470736503601074, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 7624 + }, + { + "epoch": 0.8858553586988092, + "grad_norm": 0.4660652279853821, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 7625 + }, + { + "epoch": 0.8859715364507696, + "grad_norm": 0.5015971064567566, + "learning_rate": 0.0001, + "loss": 1.8973, + "step": 7626 + }, + { + "epoch": 0.8860877142027301, + "grad_norm": 0.48925477266311646, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 7627 + }, + { + "epoch": 0.8862038919546906, + "grad_norm": 0.4653438925743103, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 7628 + }, + { + "epoch": 0.8863200697066512, + "grad_norm": 0.4613575041294098, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 7629 + }, + { + "epoch": 0.8864362474586117, + "grad_norm": 0.4613388776779175, + "learning_rate": 0.0001, + "loss": 1.7134, + "step": 7630 + }, + { + "epoch": 0.8865524252105722, + "grad_norm": 0.4807787835597992, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 7631 + }, + { + "epoch": 0.8866686029625327, + "grad_norm": 0.4956919252872467, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 7632 + }, + { + "epoch": 0.8867847807144932, + "grad_norm": 0.4471133053302765, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 7633 + }, + { + "epoch": 0.8869009584664537, + "grad_norm": 0.42097005248069763, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 7634 + }, + { + "epoch": 0.8870171362184142, + "grad_norm": 0.45755913853645325, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 7635 + }, + { + "epoch": 0.8871333139703746, + "grad_norm": 0.46528616547584534, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 7636 + }, + { + "epoch": 0.8872494917223351, + "grad_norm": 0.44611406326293945, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 7637 + }, + { + "epoch": 0.8873656694742956, + "grad_norm": 0.4587550759315491, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 7638 + }, + { + "epoch": 0.8874818472262562, + "grad_norm": 0.48842063546180725, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 7639 + }, + { + "epoch": 0.8875980249782167, + "grad_norm": 0.4626847505569458, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 7640 + }, + { + "epoch": 0.8877142027301772, + "grad_norm": 0.45964398980140686, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 7641 + }, + { + "epoch": 0.8878303804821377, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 7642 + }, + { + "epoch": 0.8879465582340982, + "grad_norm": 0.4622969329357147, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 7643 + }, + { + "epoch": 0.8880627359860587, + "grad_norm": 0.47453299164772034, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 7644 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 0.4532757103443146, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 7645 + }, + { + "epoch": 0.8882950914899796, + "grad_norm": 0.47530779242515564, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 7646 + }, + { + "epoch": 0.8884112692419401, + "grad_norm": 0.4514038562774658, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 7647 + }, + { + "epoch": 0.8885274469939006, + "grad_norm": 0.48598358035087585, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 7648 + }, + { + "epoch": 0.8886436247458611, + "grad_norm": 0.4912118911743164, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 7649 + }, + { + "epoch": 0.8887598024978217, + "grad_norm": 0.4736858010292053, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 7650 + }, + { + "epoch": 0.8888759802497822, + "grad_norm": 0.4599950611591339, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 7651 + }, + { + "epoch": 0.8889921580017427, + "grad_norm": 0.45858752727508545, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 7652 + }, + { + "epoch": 0.8891083357537032, + "grad_norm": 0.4588874280452728, + "learning_rate": 0.0001, + "loss": 1.3829, + "step": 7653 + }, + { + "epoch": 0.8892245135056637, + "grad_norm": 0.4724344313144684, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 7654 + }, + { + "epoch": 0.8893406912576242, + "grad_norm": 0.46815189719200134, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 7655 + }, + { + "epoch": 0.8894568690095846, + "grad_norm": 0.47574499249458313, + "learning_rate": 0.0001, + "loss": 1.6022, + "step": 7656 + }, + { + "epoch": 0.8895730467615451, + "grad_norm": 0.5131924152374268, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 7657 + }, + { + "epoch": 0.8896892245135056, + "grad_norm": 0.47800272703170776, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 7658 + }, + { + "epoch": 0.8898054022654661, + "grad_norm": 0.4923827052116394, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 7659 + }, + { + "epoch": 0.8899215800174267, + "grad_norm": 0.4540409743785858, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 7660 + }, + { + "epoch": 0.8900377577693872, + "grad_norm": 0.4510956108570099, + "learning_rate": 0.0001, + "loss": 1.428, + "step": 7661 + }, + { + "epoch": 0.8901539355213477, + "grad_norm": 0.47217369079589844, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 7662 + }, + { + "epoch": 0.8902701132733082, + "grad_norm": 0.5065828561782837, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 7663 + }, + { + "epoch": 0.8903862910252687, + "grad_norm": 0.47260862588882446, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 7664 + }, + { + "epoch": 0.8905024687772292, + "grad_norm": 0.4974091947078705, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 7665 + }, + { + "epoch": 0.8906186465291897, + "grad_norm": 0.5020241737365723, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 7666 + }, + { + "epoch": 0.8907348242811501, + "grad_norm": 0.47812047600746155, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 7667 + }, + { + "epoch": 0.8908510020331106, + "grad_norm": 0.5125158429145813, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 7668 + }, + { + "epoch": 0.8909671797850711, + "grad_norm": 0.4838445484638214, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 7669 + }, + { + "epoch": 0.8910833575370316, + "grad_norm": 0.4530005156993866, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 7670 + }, + { + "epoch": 0.8911995352889922, + "grad_norm": 0.4553331732749939, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 7671 + }, + { + "epoch": 0.8913157130409527, + "grad_norm": 0.46588602662086487, + "learning_rate": 0.0001, + "loss": 1.3147, + "step": 7672 + }, + { + "epoch": 0.8914318907929132, + "grad_norm": 0.46226534247398376, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 7673 + }, + { + "epoch": 0.8915480685448737, + "grad_norm": 0.506314754486084, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 7674 + }, + { + "epoch": 0.8916642462968342, + "grad_norm": 0.49548977613449097, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 7675 + }, + { + "epoch": 0.8917804240487947, + "grad_norm": 0.4775904715061188, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 7676 + }, + { + "epoch": 0.8918966018007551, + "grad_norm": 0.4993993639945984, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 7677 + }, + { + "epoch": 0.8920127795527156, + "grad_norm": 0.49439752101898193, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 7678 + }, + { + "epoch": 0.8921289573046761, + "grad_norm": 0.4990589916706085, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 7679 + }, + { + "epoch": 0.8922451350566366, + "grad_norm": 0.4424578547477722, + "learning_rate": 0.0001, + "loss": 1.2997, + "step": 7680 + }, + { + "epoch": 0.8923613128085972, + "grad_norm": 0.4604630768299103, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 7681 + }, + { + "epoch": 0.8924774905605577, + "grad_norm": 0.4625977575778961, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 7682 + }, + { + "epoch": 0.8925936683125182, + "grad_norm": 0.4803260266780853, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 7683 + }, + { + "epoch": 0.8927098460644787, + "grad_norm": 0.464473694562912, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 7684 + }, + { + "epoch": 0.8928260238164392, + "grad_norm": 0.44447073340415955, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 7685 + }, + { + "epoch": 0.8929422015683997, + "grad_norm": 0.44388943910598755, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 7686 + }, + { + "epoch": 0.8930583793203601, + "grad_norm": 0.5310772657394409, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 7687 + }, + { + "epoch": 0.8931745570723206, + "grad_norm": 0.4848250150680542, + "learning_rate": 0.0001, + "loss": 1.4879, + "step": 7688 + }, + { + "epoch": 0.8932907348242811, + "grad_norm": 0.4669688940048218, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 7689 + }, + { + "epoch": 0.8934069125762416, + "grad_norm": 0.47490620613098145, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 7690 + }, + { + "epoch": 0.8935230903282021, + "grad_norm": 0.49228766560554504, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 7691 + }, + { + "epoch": 0.8936392680801627, + "grad_norm": 0.5638328790664673, + "learning_rate": 0.0001, + "loss": 1.9294, + "step": 7692 + }, + { + "epoch": 0.8937554458321232, + "grad_norm": 0.48162275552749634, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 7693 + }, + { + "epoch": 0.8938716235840837, + "grad_norm": 0.4998082220554352, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 7694 + }, + { + "epoch": 0.8939878013360442, + "grad_norm": 0.4761600196361542, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 7695 + }, + { + "epoch": 0.8941039790880047, + "grad_norm": 0.4868890345096588, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 7696 + }, + { + "epoch": 0.8942201568399651, + "grad_norm": 0.4814538061618805, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 7697 + }, + { + "epoch": 0.8943363345919256, + "grad_norm": 0.46864473819732666, + "learning_rate": 0.0001, + "loss": 1.7473, + "step": 7698 + }, + { + "epoch": 0.8944525123438861, + "grad_norm": 0.514991044998169, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 7699 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.4469473361968994, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 7700 + }, + { + "epoch": 0.8946848678478071, + "grad_norm": 0.42899852991104126, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 7701 + }, + { + "epoch": 0.8948010455997677, + "grad_norm": 0.44255563616752625, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 7702 + }, + { + "epoch": 0.8949172233517282, + "grad_norm": 0.4918566942214966, + "learning_rate": 0.0001, + "loss": 1.578, + "step": 7703 + }, + { + "epoch": 0.8950334011036887, + "grad_norm": 0.506798505783081, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 7704 + }, + { + "epoch": 0.8951495788556492, + "grad_norm": 0.4588955044746399, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 7705 + }, + { + "epoch": 0.8952657566076097, + "grad_norm": 0.4813918173313141, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 7706 + }, + { + "epoch": 0.8953819343595701, + "grad_norm": 0.4796220064163208, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 7707 + }, + { + "epoch": 0.8954981121115306, + "grad_norm": 0.5280551910400391, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 7708 + }, + { + "epoch": 0.8956142898634911, + "grad_norm": 0.5038082599639893, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 7709 + }, + { + "epoch": 0.8957304676154516, + "grad_norm": 0.45545947551727295, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 7710 + }, + { + "epoch": 0.8958466453674121, + "grad_norm": 0.5030218958854675, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 7711 + }, + { + "epoch": 0.8959628231193726, + "grad_norm": 0.4473370909690857, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 7712 + }, + { + "epoch": 0.8960790008713332, + "grad_norm": 0.7040799856185913, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 7713 + }, + { + "epoch": 0.8961951786232937, + "grad_norm": 0.46077761054039, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 7714 + }, + { + "epoch": 0.8963113563752542, + "grad_norm": 0.44979485869407654, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 7715 + }, + { + "epoch": 0.8964275341272147, + "grad_norm": 0.46903368830680847, + "learning_rate": 0.0001, + "loss": 1.5326, + "step": 7716 + }, + { + "epoch": 0.8965437118791751, + "grad_norm": 0.4809918701648712, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 7717 + }, + { + "epoch": 0.8966598896311356, + "grad_norm": 0.4725591838359833, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 7718 + }, + { + "epoch": 0.8967760673830961, + "grad_norm": 0.49973586201667786, + "learning_rate": 0.0001, + "loss": 1.783, + "step": 7719 + }, + { + "epoch": 0.8968922451350566, + "grad_norm": 0.4993938207626343, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 7720 + }, + { + "epoch": 0.8970084228870171, + "grad_norm": 0.5148173570632935, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 7721 + }, + { + "epoch": 0.8971246006389776, + "grad_norm": 0.49831125140190125, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 7722 + }, + { + "epoch": 0.8972407783909382, + "grad_norm": 0.46448323130607605, + "learning_rate": 0.0001, + "loss": 1.6107, + "step": 7723 + }, + { + "epoch": 0.8973569561428987, + "grad_norm": 0.47163113951683044, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 7724 + }, + { + "epoch": 0.8974731338948592, + "grad_norm": 0.45095062255859375, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 7725 + }, + { + "epoch": 0.8975893116468197, + "grad_norm": 0.4865681231021881, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 7726 + }, + { + "epoch": 0.8977054893987801, + "grad_norm": 0.4516942501068115, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 7727 + }, + { + "epoch": 0.8978216671507406, + "grad_norm": 0.45870745182037354, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 7728 + }, + { + "epoch": 0.8979378449027011, + "grad_norm": 0.5079413056373596, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 7729 + }, + { + "epoch": 0.8980540226546616, + "grad_norm": 0.46579909324645996, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 7730 + }, + { + "epoch": 0.8981702004066221, + "grad_norm": 0.46913209557533264, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 7731 + }, + { + "epoch": 0.8982863781585826, + "grad_norm": 0.46891793608665466, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 7732 + }, + { + "epoch": 0.8984025559105431, + "grad_norm": 0.5008355379104614, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 7733 + }, + { + "epoch": 0.8985187336625037, + "grad_norm": 0.48343855142593384, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 7734 + }, + { + "epoch": 0.8986349114144642, + "grad_norm": 0.47268950939178467, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 7735 + }, + { + "epoch": 0.8987510891664247, + "grad_norm": 0.5122753381729126, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 7736 + }, + { + "epoch": 0.8988672669183851, + "grad_norm": 0.45394930243492126, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 7737 + }, + { + "epoch": 0.8989834446703456, + "grad_norm": 0.4667399823665619, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 7738 + }, + { + "epoch": 0.8990996224223061, + "grad_norm": 0.5103424191474915, + "learning_rate": 0.0001, + "loss": 1.7418, + "step": 7739 + }, + { + "epoch": 0.8992158001742666, + "grad_norm": 0.5049645304679871, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 7740 + }, + { + "epoch": 0.8993319779262271, + "grad_norm": 0.45100435614585876, + "learning_rate": 0.0001, + "loss": 1.3736, + "step": 7741 + }, + { + "epoch": 0.8994481556781876, + "grad_norm": 0.4995354115962982, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 7742 + }, + { + "epoch": 0.8995643334301481, + "grad_norm": 0.5154250264167786, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 7743 + }, + { + "epoch": 0.8996805111821087, + "grad_norm": 0.4704994261264801, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 7744 + }, + { + "epoch": 0.8997966889340692, + "grad_norm": 0.4912912845611572, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 7745 + }, + { + "epoch": 0.8999128666860297, + "grad_norm": 0.4594641327857971, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 7746 + }, + { + "epoch": 0.9000290444379901, + "grad_norm": 0.48515209555625916, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 7747 + }, + { + "epoch": 0.9001452221899506, + "grad_norm": 0.4500701129436493, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 7748 + }, + { + "epoch": 0.9002613999419111, + "grad_norm": 0.5031145811080933, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 7749 + }, + { + "epoch": 0.9003775776938716, + "grad_norm": 0.46528854966163635, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 7750 + }, + { + "epoch": 0.9004937554458321, + "grad_norm": 0.4735311269760132, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 7751 + }, + { + "epoch": 0.9006099331977926, + "grad_norm": 0.4692305028438568, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 7752 + }, + { + "epoch": 0.9007261109497531, + "grad_norm": 0.48819053173065186, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 7753 + }, + { + "epoch": 0.9008422887017136, + "grad_norm": 0.494018018245697, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 7754 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 0.48794278502464294, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 7755 + }, + { + "epoch": 0.9010746442056347, + "grad_norm": 0.48159360885620117, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 7756 + }, + { + "epoch": 0.9011908219575951, + "grad_norm": 0.5053829550743103, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 7757 + }, + { + "epoch": 0.9013069997095556, + "grad_norm": 0.47055330872535706, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 7758 + }, + { + "epoch": 0.9014231774615161, + "grad_norm": 0.46060261130332947, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 7759 + }, + { + "epoch": 0.9015393552134766, + "grad_norm": 0.46970701217651367, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 7760 + }, + { + "epoch": 0.9016555329654371, + "grad_norm": 0.45572981238365173, + "learning_rate": 0.0001, + "loss": 1.4527, + "step": 7761 + }, + { + "epoch": 0.9017717107173976, + "grad_norm": 0.44444963335990906, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 7762 + }, + { + "epoch": 0.9018878884693581, + "grad_norm": 0.43386489152908325, + "learning_rate": 0.0001, + "loss": 1.3637, + "step": 7763 + }, + { + "epoch": 0.9020040662213186, + "grad_norm": 0.4735313057899475, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 7764 + }, + { + "epoch": 0.9021202439732792, + "grad_norm": 0.5182105898857117, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 7765 + }, + { + "epoch": 0.9022364217252397, + "grad_norm": 0.5161397457122803, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 7766 + }, + { + "epoch": 0.9023525994772001, + "grad_norm": 0.45563217997550964, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 7767 + }, + { + "epoch": 0.9024687772291606, + "grad_norm": 0.5249543786048889, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 7768 + }, + { + "epoch": 0.9025849549811211, + "grad_norm": 0.49920180439949036, + "learning_rate": 0.0001, + "loss": 1.7509, + "step": 7769 + }, + { + "epoch": 0.9027011327330816, + "grad_norm": 0.43166783452033997, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 7770 + }, + { + "epoch": 0.9028173104850421, + "grad_norm": 0.4483865201473236, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 7771 + }, + { + "epoch": 0.9029334882370026, + "grad_norm": 0.43736976385116577, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 7772 + }, + { + "epoch": 0.9030496659889631, + "grad_norm": 0.48580455780029297, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 7773 + }, + { + "epoch": 0.9031658437409236, + "grad_norm": 0.47388365864753723, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 7774 + }, + { + "epoch": 0.9032820214928841, + "grad_norm": 0.49823474884033203, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 7775 + }, + { + "epoch": 0.9033981992448447, + "grad_norm": 0.451608270406723, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 7776 + }, + { + "epoch": 0.9035143769968051, + "grad_norm": 0.4777389168739319, + "learning_rate": 0.0001, + "loss": 1.7107, + "step": 7777 + }, + { + "epoch": 0.9036305547487656, + "grad_norm": 0.4792795777320862, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 7778 + }, + { + "epoch": 0.9037467325007261, + "grad_norm": 0.4765215814113617, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 7779 + }, + { + "epoch": 0.9038629102526866, + "grad_norm": 0.46686357259750366, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 7780 + }, + { + "epoch": 0.9039790880046471, + "grad_norm": 0.5087867379188538, + "learning_rate": 0.0001, + "loss": 1.9357, + "step": 7781 + }, + { + "epoch": 0.9040952657566076, + "grad_norm": 0.46744218468666077, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 7782 + }, + { + "epoch": 0.9042114435085681, + "grad_norm": 0.5227089524269104, + "learning_rate": 0.0001, + "loss": 1.5938, + "step": 7783 + }, + { + "epoch": 0.9043276212605286, + "grad_norm": 0.4557849168777466, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 7784 + }, + { + "epoch": 0.9044437990124891, + "grad_norm": 0.4599103033542633, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 7785 + }, + { + "epoch": 0.9045599767644497, + "grad_norm": 0.5397565364837646, + "learning_rate": 0.0001, + "loss": 1.8295, + "step": 7786 + }, + { + "epoch": 0.9046761545164101, + "grad_norm": 0.5359669923782349, + "learning_rate": 0.0001, + "loss": 1.7149, + "step": 7787 + }, + { + "epoch": 0.9047923322683706, + "grad_norm": 0.49455010890960693, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 7788 + }, + { + "epoch": 0.9049085100203311, + "grad_norm": 0.4927608072757721, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 7789 + }, + { + "epoch": 0.9050246877722916, + "grad_norm": 0.45446282625198364, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 7790 + }, + { + "epoch": 0.9051408655242521, + "grad_norm": 0.4715528190135956, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 7791 + }, + { + "epoch": 0.9052570432762126, + "grad_norm": 0.47455599904060364, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 7792 + }, + { + "epoch": 0.9053732210281731, + "grad_norm": 0.4646841585636139, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 7793 + }, + { + "epoch": 0.9054893987801336, + "grad_norm": 0.4721209406852722, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 7794 + }, + { + "epoch": 0.9056055765320941, + "grad_norm": 0.4585552215576172, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 7795 + }, + { + "epoch": 0.9057217542840547, + "grad_norm": 0.49558770656585693, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 7796 + }, + { + "epoch": 0.9058379320360151, + "grad_norm": 0.45324012637138367, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 7797 + }, + { + "epoch": 0.9059541097879756, + "grad_norm": 0.44756069779396057, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 7798 + }, + { + "epoch": 0.9060702875399361, + "grad_norm": 0.48996299505233765, + "learning_rate": 0.0001, + "loss": 1.8288, + "step": 7799 + }, + { + "epoch": 0.9061864652918966, + "grad_norm": 0.47252756357192993, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 7800 + }, + { + "epoch": 0.9063026430438571, + "grad_norm": 0.4882994294166565, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 7801 + }, + { + "epoch": 0.9064188207958176, + "grad_norm": 0.49475234746932983, + "learning_rate": 0.0001, + "loss": 1.7413, + "step": 7802 + }, + { + "epoch": 0.9065349985477781, + "grad_norm": 0.467546671628952, + "learning_rate": 0.0001, + "loss": 1.3964, + "step": 7803 + }, + { + "epoch": 0.9066511762997386, + "grad_norm": 0.47339338064193726, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 7804 + }, + { + "epoch": 0.9067673540516991, + "grad_norm": 0.4243144094944, + "learning_rate": 0.0001, + "loss": 1.2828, + "step": 7805 + }, + { + "epoch": 0.9068835318036595, + "grad_norm": 0.4705776572227478, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 7806 + }, + { + "epoch": 0.9069997095556201, + "grad_norm": 0.48262354731559753, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 7807 + }, + { + "epoch": 0.9071158873075806, + "grad_norm": 0.46224817633628845, + "learning_rate": 0.0001, + "loss": 1.4478, + "step": 7808 + }, + { + "epoch": 0.9072320650595411, + "grad_norm": 0.48688849806785583, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 7809 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.47182363271713257, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 7810 + }, + { + "epoch": 0.9074644205634621, + "grad_norm": 0.5122577548027039, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 7811 + }, + { + "epoch": 0.9075805983154226, + "grad_norm": 0.4881529211997986, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 7812 + }, + { + "epoch": 0.9076967760673831, + "grad_norm": 0.5338044166564941, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 7813 + }, + { + "epoch": 0.9078129538193436, + "grad_norm": 0.4970211684703827, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 7814 + }, + { + "epoch": 0.9079291315713041, + "grad_norm": 0.574657142162323, + "learning_rate": 0.0001, + "loss": 1.3376, + "step": 7815 + }, + { + "epoch": 0.9080453093232645, + "grad_norm": 0.44657719135284424, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 7816 + }, + { + "epoch": 0.9081614870752251, + "grad_norm": 0.4609968066215515, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 7817 + }, + { + "epoch": 0.9082776648271856, + "grad_norm": 0.4657510221004486, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 7818 + }, + { + "epoch": 0.9083938425791461, + "grad_norm": 0.4992734491825104, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 7819 + }, + { + "epoch": 0.9085100203311066, + "grad_norm": 0.5076075196266174, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 7820 + }, + { + "epoch": 0.9086261980830671, + "grad_norm": 0.4936392605304718, + "learning_rate": 0.0001, + "loss": 1.7559, + "step": 7821 + }, + { + "epoch": 0.9087423758350276, + "grad_norm": 0.47953271865844727, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 7822 + }, + { + "epoch": 0.9088585535869881, + "grad_norm": 0.4652204215526581, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 7823 + }, + { + "epoch": 0.9089747313389486, + "grad_norm": 0.4872530698776245, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 7824 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.4713476300239563, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 7825 + }, + { + "epoch": 0.9092070868428695, + "grad_norm": 0.4624953269958496, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 7826 + }, + { + "epoch": 0.90932326459483, + "grad_norm": 0.49820998311042786, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 7827 + }, + { + "epoch": 0.9094394423467906, + "grad_norm": 0.4489380419254303, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 7828 + }, + { + "epoch": 0.9095556200987511, + "grad_norm": 0.48509761691093445, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 7829 + }, + { + "epoch": 0.9096717978507116, + "grad_norm": 0.5044791102409363, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 7830 + }, + { + "epoch": 0.9097879756026721, + "grad_norm": 0.47124698758125305, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 7831 + }, + { + "epoch": 0.9099041533546326, + "grad_norm": 0.4781220555305481, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 7832 + }, + { + "epoch": 0.9100203311065931, + "grad_norm": 0.47968727350234985, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 7833 + }, + { + "epoch": 0.9101365088585536, + "grad_norm": 0.5137500166893005, + "learning_rate": 0.0001, + "loss": 1.7598, + "step": 7834 + }, + { + "epoch": 0.9102526866105141, + "grad_norm": 0.4631037414073944, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 7835 + }, + { + "epoch": 0.9103688643624746, + "grad_norm": 0.5219733715057373, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 7836 + }, + { + "epoch": 0.910485042114435, + "grad_norm": 0.4807119071483612, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 7837 + }, + { + "epoch": 0.9106012198663956, + "grad_norm": 0.5032156705856323, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 7838 + }, + { + "epoch": 0.9107173976183561, + "grad_norm": 0.7016561031341553, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 7839 + }, + { + "epoch": 0.9108335753703166, + "grad_norm": 0.4516977071762085, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 7840 + }, + { + "epoch": 0.9109497531222771, + "grad_norm": 0.4732464849948883, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 7841 + }, + { + "epoch": 0.9110659308742376, + "grad_norm": 0.4913138151168823, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 7842 + }, + { + "epoch": 0.9111821086261981, + "grad_norm": 0.5138623118400574, + "learning_rate": 0.0001, + "loss": 1.7446, + "step": 7843 + }, + { + "epoch": 0.9112982863781586, + "grad_norm": 0.46417149901390076, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 7844 + }, + { + "epoch": 0.9114144641301191, + "grad_norm": 0.47088223695755005, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 7845 + }, + { + "epoch": 0.9115306418820796, + "grad_norm": 0.45091691613197327, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 7846 + }, + { + "epoch": 0.91164681963404, + "grad_norm": 0.45545732975006104, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 7847 + }, + { + "epoch": 0.9117629973860005, + "grad_norm": 0.45981520414352417, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 7848 + }, + { + "epoch": 0.9118791751379611, + "grad_norm": 0.472409725189209, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 7849 + }, + { + "epoch": 0.9119953528899216, + "grad_norm": 0.5488510131835938, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 7850 + }, + { + "epoch": 0.9121115306418821, + "grad_norm": 0.48794418573379517, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 7851 + }, + { + "epoch": 0.9122277083938426, + "grad_norm": 0.4115673899650574, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 7852 + }, + { + "epoch": 0.9123438861458031, + "grad_norm": 0.46749910712242126, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 7853 + }, + { + "epoch": 0.9124600638977636, + "grad_norm": 0.47917798161506653, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 7854 + }, + { + "epoch": 0.9125762416497241, + "grad_norm": 0.46126171946525574, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 7855 + }, + { + "epoch": 0.9126924194016846, + "grad_norm": 0.4459565281867981, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 7856 + }, + { + "epoch": 0.912808597153645, + "grad_norm": 0.4595984220504761, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 7857 + }, + { + "epoch": 0.9129247749056055, + "grad_norm": 0.45594367384910583, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 7858 + }, + { + "epoch": 0.9130409526575661, + "grad_norm": 0.4766393303871155, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 7859 + }, + { + "epoch": 0.9131571304095266, + "grad_norm": 0.5518864393234253, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 7860 + }, + { + "epoch": 0.9132733081614871, + "grad_norm": 0.48562371730804443, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 7861 + }, + { + "epoch": 0.9133894859134476, + "grad_norm": 0.5005999207496643, + "learning_rate": 0.0001, + "loss": 1.7962, + "step": 7862 + }, + { + "epoch": 0.9135056636654081, + "grad_norm": 0.4571722149848938, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 7863 + }, + { + "epoch": 0.9136218414173686, + "grad_norm": 0.4847254753112793, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 7864 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 0.5055410265922546, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 7865 + }, + { + "epoch": 0.9138541969212896, + "grad_norm": 0.5655259490013123, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 7866 + }, + { + "epoch": 0.91397037467325, + "grad_norm": 0.46798625588417053, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 7867 + }, + { + "epoch": 0.9140865524252105, + "grad_norm": 0.48041674494743347, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 7868 + }, + { + "epoch": 0.914202730177171, + "grad_norm": 0.4875914752483368, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 7869 + }, + { + "epoch": 0.9143189079291316, + "grad_norm": 0.49609968066215515, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 7870 + }, + { + "epoch": 0.9144350856810921, + "grad_norm": 0.43055441975593567, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 7871 + }, + { + "epoch": 0.9145512634330526, + "grad_norm": 0.4403667747974396, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 7872 + }, + { + "epoch": 0.9146674411850131, + "grad_norm": 0.4711524546146393, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 7873 + }, + { + "epoch": 0.9147836189369736, + "grad_norm": 0.49612268805503845, + "learning_rate": 0.0001, + "loss": 1.7506, + "step": 7874 + }, + { + "epoch": 0.9148997966889341, + "grad_norm": 0.4640529751777649, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 7875 + }, + { + "epoch": 0.9150159744408946, + "grad_norm": 0.45342904329299927, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 7876 + }, + { + "epoch": 0.915132152192855, + "grad_norm": 0.4637288749217987, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 7877 + }, + { + "epoch": 0.9152483299448155, + "grad_norm": 0.5066947937011719, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 7878 + }, + { + "epoch": 0.915364507696776, + "grad_norm": 0.47863486409187317, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 7879 + }, + { + "epoch": 0.9154806854487366, + "grad_norm": 0.4799175262451172, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 7880 + }, + { + "epoch": 0.9155968632006971, + "grad_norm": 0.49706557393074036, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 7881 + }, + { + "epoch": 0.9157130409526576, + "grad_norm": 0.49601349234580994, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 7882 + }, + { + "epoch": 0.9158292187046181, + "grad_norm": 0.5072823762893677, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 7883 + }, + { + "epoch": 0.9159453964565786, + "grad_norm": 0.5159144401550293, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 7884 + }, + { + "epoch": 0.9160615742085391, + "grad_norm": 0.46649453043937683, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 7885 + }, + { + "epoch": 0.9161777519604996, + "grad_norm": 0.48782074451446533, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 7886 + }, + { + "epoch": 0.91629392971246, + "grad_norm": 0.5250930190086365, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 7887 + }, + { + "epoch": 0.9164101074644205, + "grad_norm": 0.4878339469432831, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 7888 + }, + { + "epoch": 0.916526285216381, + "grad_norm": 0.45729678869247437, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 7889 + }, + { + "epoch": 0.9166424629683415, + "grad_norm": 0.47600996494293213, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 7890 + }, + { + "epoch": 0.9167586407203021, + "grad_norm": 0.5157418847084045, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 7891 + }, + { + "epoch": 0.9168748184722626, + "grad_norm": 0.5000462532043457, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 7892 + }, + { + "epoch": 0.9169909962242231, + "grad_norm": 0.48779532313346863, + "learning_rate": 0.0001, + "loss": 1.3709, + "step": 7893 + }, + { + "epoch": 0.9171071739761836, + "grad_norm": 0.49037328362464905, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 7894 + }, + { + "epoch": 0.9172233517281441, + "grad_norm": 0.47151613235473633, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 7895 + }, + { + "epoch": 0.9173395294801046, + "grad_norm": 0.5137687921524048, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 7896 + }, + { + "epoch": 0.917455707232065, + "grad_norm": 0.4644807279109955, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 7897 + }, + { + "epoch": 0.9175718849840255, + "grad_norm": 0.45312634110450745, + "learning_rate": 0.0001, + "loss": 1.5498, + "step": 7898 + }, + { + "epoch": 0.917688062735986, + "grad_norm": 0.5129163861274719, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 7899 + }, + { + "epoch": 0.9178042404879465, + "grad_norm": 0.474354088306427, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 7900 + }, + { + "epoch": 0.9179204182399071, + "grad_norm": 0.47130098938941956, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 7901 + }, + { + "epoch": 0.9180365959918676, + "grad_norm": 0.4573417901992798, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 7902 + }, + { + "epoch": 0.9181527737438281, + "grad_norm": 0.46235814690589905, + "learning_rate": 0.0001, + "loss": 1.758, + "step": 7903 + }, + { + "epoch": 0.9182689514957886, + "grad_norm": 0.5168223977088928, + "learning_rate": 0.0001, + "loss": 1.2772, + "step": 7904 + }, + { + "epoch": 0.9183851292477491, + "grad_norm": 0.4721146523952484, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 7905 + }, + { + "epoch": 0.9185013069997096, + "grad_norm": 0.49531880021095276, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 7906 + }, + { + "epoch": 0.91861748475167, + "grad_norm": 0.49574729800224304, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 7907 + }, + { + "epoch": 0.9187336625036305, + "grad_norm": 0.4491695761680603, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 7908 + }, + { + "epoch": 0.918849840255591, + "grad_norm": 0.45107486844062805, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 7909 + }, + { + "epoch": 0.9189660180075515, + "grad_norm": 0.4571503698825836, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 7910 + }, + { + "epoch": 0.919082195759512, + "grad_norm": 0.49854135513305664, + "learning_rate": 0.0001, + "loss": 1.7238, + "step": 7911 + }, + { + "epoch": 0.9191983735114726, + "grad_norm": 0.4973702132701874, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 7912 + }, + { + "epoch": 0.9193145512634331, + "grad_norm": 0.47900450229644775, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 7913 + }, + { + "epoch": 0.9194307290153936, + "grad_norm": 0.42754286527633667, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 7914 + }, + { + "epoch": 0.9195469067673541, + "grad_norm": 0.5205299854278564, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 7915 + }, + { + "epoch": 0.9196630845193146, + "grad_norm": 0.49486666917800903, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 7916 + }, + { + "epoch": 0.919779262271275, + "grad_norm": 0.47060513496398926, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 7917 + }, + { + "epoch": 0.9198954400232355, + "grad_norm": 0.4912223517894745, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 7918 + }, + { + "epoch": 0.920011617775196, + "grad_norm": 0.46059104800224304, + "learning_rate": 0.0001, + "loss": 1.3697, + "step": 7919 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.4507901072502136, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 7920 + }, + { + "epoch": 0.920243973279117, + "grad_norm": 0.49140775203704834, + "learning_rate": 0.0001, + "loss": 1.4478, + "step": 7921 + }, + { + "epoch": 0.9203601510310776, + "grad_norm": 0.4491073489189148, + "learning_rate": 0.0001, + "loss": 1.3941, + "step": 7922 + }, + { + "epoch": 0.9204763287830381, + "grad_norm": 0.49387529492378235, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 7923 + }, + { + "epoch": 0.9205925065349986, + "grad_norm": 0.4687308669090271, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 7924 + }, + { + "epoch": 0.9207086842869591, + "grad_norm": 0.5228381752967834, + "learning_rate": 0.0001, + "loss": 1.8851, + "step": 7925 + }, + { + "epoch": 0.9208248620389196, + "grad_norm": 0.49759143590927124, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 7926 + }, + { + "epoch": 0.92094103979088, + "grad_norm": 0.47675225138664246, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 7927 + }, + { + "epoch": 0.9210572175428405, + "grad_norm": 0.45978063344955444, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 7928 + }, + { + "epoch": 0.921173395294801, + "grad_norm": 0.5017638802528381, + "learning_rate": 0.0001, + "loss": 1.7266, + "step": 7929 + }, + { + "epoch": 0.9212895730467615, + "grad_norm": 0.467979371547699, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 7930 + }, + { + "epoch": 0.921405750798722, + "grad_norm": 0.4964231848716736, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 7931 + }, + { + "epoch": 0.9215219285506825, + "grad_norm": 0.46611857414245605, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 7932 + }, + { + "epoch": 0.9216381063026431, + "grad_norm": 0.47594428062438965, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 7933 + }, + { + "epoch": 0.9217542840546036, + "grad_norm": 0.4936331510543823, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 7934 + }, + { + "epoch": 0.9218704618065641, + "grad_norm": 0.4947721064090729, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 7935 + }, + { + "epoch": 0.9219866395585246, + "grad_norm": 0.4754369854927063, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 7936 + }, + { + "epoch": 0.922102817310485, + "grad_norm": 0.46831005811691284, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 7937 + }, + { + "epoch": 0.9222189950624455, + "grad_norm": 0.498220294713974, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 7938 + }, + { + "epoch": 0.922335172814406, + "grad_norm": 0.4809699058532715, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 7939 + }, + { + "epoch": 0.9224513505663665, + "grad_norm": 0.4623294174671173, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 7940 + }, + { + "epoch": 0.922567528318327, + "grad_norm": 0.49049609899520874, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 7941 + }, + { + "epoch": 0.9226837060702875, + "grad_norm": 0.47794732451438904, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 7942 + }, + { + "epoch": 0.9227998838222481, + "grad_norm": 0.49489712715148926, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 7943 + }, + { + "epoch": 0.9229160615742086, + "grad_norm": 0.4838644564151764, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 7944 + }, + { + "epoch": 0.9230322393261691, + "grad_norm": 0.542927622795105, + "learning_rate": 0.0001, + "loss": 1.8534, + "step": 7945 + }, + { + "epoch": 0.9231484170781296, + "grad_norm": 0.484602153301239, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 7946 + }, + { + "epoch": 0.92326459483009, + "grad_norm": 0.45211806893348694, + "learning_rate": 0.0001, + "loss": 1.3895, + "step": 7947 + }, + { + "epoch": 0.9233807725820505, + "grad_norm": 0.5238228440284729, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 7948 + }, + { + "epoch": 0.923496950334011, + "grad_norm": 0.4958471655845642, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 7949 + }, + { + "epoch": 0.9236131280859715, + "grad_norm": 0.4387954771518707, + "learning_rate": 0.0001, + "loss": 1.3532, + "step": 7950 + }, + { + "epoch": 0.923729305837932, + "grad_norm": 0.4402277171611786, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 7951 + }, + { + "epoch": 0.9238454835898925, + "grad_norm": 0.5100318193435669, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 7952 + }, + { + "epoch": 0.923961661341853, + "grad_norm": 0.4846263527870178, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 7953 + }, + { + "epoch": 0.9240778390938136, + "grad_norm": 0.4743453562259674, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 7954 + }, + { + "epoch": 0.9241940168457741, + "grad_norm": 0.5055150985717773, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 7955 + }, + { + "epoch": 0.9243101945977346, + "grad_norm": 0.46084272861480713, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 7956 + }, + { + "epoch": 0.924426372349695, + "grad_norm": 0.47752270102500916, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 7957 + }, + { + "epoch": 0.9245425501016555, + "grad_norm": 0.47318798303604126, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 7958 + }, + { + "epoch": 0.924658727853616, + "grad_norm": 0.4867374897003174, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 7959 + }, + { + "epoch": 0.9247749056055765, + "grad_norm": 0.48629096150398254, + "learning_rate": 0.0001, + "loss": 1.7713, + "step": 7960 + }, + { + "epoch": 0.924891083357537, + "grad_norm": 0.47836753726005554, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 7961 + }, + { + "epoch": 0.9250072611094975, + "grad_norm": 0.49589991569519043, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 7962 + }, + { + "epoch": 0.925123438861458, + "grad_norm": 0.46878737211227417, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 7963 + }, + { + "epoch": 0.9252396166134186, + "grad_norm": 0.4647105634212494, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 7964 + }, + { + "epoch": 0.9253557943653791, + "grad_norm": 0.46214747428894043, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 7965 + }, + { + "epoch": 0.9254719721173396, + "grad_norm": 0.5062007904052734, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 7966 + }, + { + "epoch": 0.9255881498693, + "grad_norm": 0.5121557116508484, + "learning_rate": 0.0001, + "loss": 1.7192, + "step": 7967 + }, + { + "epoch": 0.9257043276212605, + "grad_norm": 0.44176679849624634, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 7968 + }, + { + "epoch": 0.925820505373221, + "grad_norm": 0.47141146659851074, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 7969 + }, + { + "epoch": 0.9259366831251815, + "grad_norm": 0.48892006278038025, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 7970 + }, + { + "epoch": 0.926052860877142, + "grad_norm": 0.512973964214325, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 7971 + }, + { + "epoch": 0.9261690386291025, + "grad_norm": 0.4414759576320648, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 7972 + }, + { + "epoch": 0.926285216381063, + "grad_norm": 0.48887619376182556, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 7973 + }, + { + "epoch": 0.9264013941330236, + "grad_norm": 0.45762237906455994, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 7974 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 0.45847946405410767, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 7975 + }, + { + "epoch": 0.9266337496369446, + "grad_norm": 0.49288585782051086, + "learning_rate": 0.0001, + "loss": 1.762, + "step": 7976 + }, + { + "epoch": 0.926749927388905, + "grad_norm": 0.48561185598373413, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 7977 + }, + { + "epoch": 0.9268661051408655, + "grad_norm": 0.447745680809021, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 7978 + }, + { + "epoch": 0.926982282892826, + "grad_norm": 0.49611517786979675, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 7979 + }, + { + "epoch": 0.9270984606447865, + "grad_norm": 0.46112751960754395, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 7980 + }, + { + "epoch": 0.927214638396747, + "grad_norm": 0.4908156991004944, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 7981 + }, + { + "epoch": 0.9273308161487075, + "grad_norm": 0.4530208706855774, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 7982 + }, + { + "epoch": 0.927446993900668, + "grad_norm": 0.49181050062179565, + "learning_rate": 0.0001, + "loss": 1.7461, + "step": 7983 + }, + { + "epoch": 0.9275631716526285, + "grad_norm": 0.49187758564949036, + "learning_rate": 0.0001, + "loss": 1.7731, + "step": 7984 + }, + { + "epoch": 0.9276793494045891, + "grad_norm": 0.5021129846572876, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 7985 + }, + { + "epoch": 0.9277955271565496, + "grad_norm": 0.508470892906189, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 7986 + }, + { + "epoch": 0.92791170490851, + "grad_norm": 0.5105016231536865, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 7987 + }, + { + "epoch": 0.9280278826604705, + "grad_norm": 0.4812169671058655, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 7988 + }, + { + "epoch": 0.928144060412431, + "grad_norm": 0.4714551568031311, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 7989 + }, + { + "epoch": 0.9282602381643915, + "grad_norm": 0.4296482801437378, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 7990 + }, + { + "epoch": 0.928376415916352, + "grad_norm": 0.45855334401130676, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 7991 + }, + { + "epoch": 0.9284925936683125, + "grad_norm": 0.4766031801700592, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 7992 + }, + { + "epoch": 0.928608771420273, + "grad_norm": 0.45450130105018616, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 7993 + }, + { + "epoch": 0.9287249491722335, + "grad_norm": 0.512444794178009, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 7994 + }, + { + "epoch": 0.9288411269241941, + "grad_norm": 0.49401575326919556, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 7995 + }, + { + "epoch": 0.9289573046761546, + "grad_norm": 0.5121031999588013, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 7996 + }, + { + "epoch": 0.929073482428115, + "grad_norm": 0.4602169692516327, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 7997 + }, + { + "epoch": 0.9291896601800755, + "grad_norm": 0.49888116121292114, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 7998 + }, + { + "epoch": 0.929305837932036, + "grad_norm": 0.5206019878387451, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 7999 + }, + { + "epoch": 0.9294220156839965, + "grad_norm": 0.4638976454734802, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 8000 + }, + { + "epoch": 0.929538193435957, + "grad_norm": 0.48082998394966125, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 8001 + }, + { + "epoch": 0.9296543711879175, + "grad_norm": 0.4941411018371582, + "learning_rate": 0.0001, + "loss": 1.8171, + "step": 8002 + }, + { + "epoch": 0.929770548939878, + "grad_norm": 0.45711395144462585, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 8003 + }, + { + "epoch": 0.9298867266918385, + "grad_norm": 0.4651217758655548, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 8004 + }, + { + "epoch": 0.930002904443799, + "grad_norm": 0.45442432165145874, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 8005 + }, + { + "epoch": 0.9301190821957596, + "grad_norm": 0.4956342577934265, + "learning_rate": 0.0001, + "loss": 1.4252, + "step": 8006 + }, + { + "epoch": 0.93023525994772, + "grad_norm": 0.5071079730987549, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 8007 + }, + { + "epoch": 0.9303514376996805, + "grad_norm": 0.4929546117782593, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 8008 + }, + { + "epoch": 0.930467615451641, + "grad_norm": 0.5169224739074707, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 8009 + }, + { + "epoch": 0.9305837932036015, + "grad_norm": 0.5088503956794739, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 8010 + }, + { + "epoch": 0.930699970955562, + "grad_norm": 0.48332953453063965, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 8011 + }, + { + "epoch": 0.9308161487075225, + "grad_norm": 0.49917006492614746, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 8012 + }, + { + "epoch": 0.930932326459483, + "grad_norm": 0.4794350564479828, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 8013 + }, + { + "epoch": 0.9310485042114435, + "grad_norm": 0.4891558885574341, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 8014 + }, + { + "epoch": 0.931164681963404, + "grad_norm": 0.4725422263145447, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 8015 + }, + { + "epoch": 0.9312808597153646, + "grad_norm": 0.48339328169822693, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 8016 + }, + { + "epoch": 0.931397037467325, + "grad_norm": 0.4951247572898865, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 8017 + }, + { + "epoch": 0.9315132152192855, + "grad_norm": 0.46006613969802856, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 8018 + }, + { + "epoch": 0.931629392971246, + "grad_norm": 0.4528697431087494, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 8019 + }, + { + "epoch": 0.9317455707232065, + "grad_norm": 0.46240419149398804, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 8020 + }, + { + "epoch": 0.931861748475167, + "grad_norm": 0.4489692449569702, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 8021 + }, + { + "epoch": 0.9319779262271275, + "grad_norm": 0.473308265209198, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 8022 + }, + { + "epoch": 0.932094103979088, + "grad_norm": 0.46006840467453003, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 8023 + }, + { + "epoch": 0.9322102817310485, + "grad_norm": 0.501011848449707, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 8024 + }, + { + "epoch": 0.932326459483009, + "grad_norm": 0.46970438957214355, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 8025 + }, + { + "epoch": 0.9324426372349695, + "grad_norm": 0.45778888463974, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 8026 + }, + { + "epoch": 0.93255881498693, + "grad_norm": 0.46483826637268066, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 8027 + }, + { + "epoch": 0.9326749927388905, + "grad_norm": 0.4459386169910431, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 8028 + }, + { + "epoch": 0.932791170490851, + "grad_norm": 0.5162267088890076, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 8029 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.46646633744239807, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 8030 + }, + { + "epoch": 0.933023525994772, + "grad_norm": 0.5139045119285583, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 8031 + }, + { + "epoch": 0.9331397037467325, + "grad_norm": 0.44446662068367004, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 8032 + }, + { + "epoch": 0.933255881498693, + "grad_norm": 0.46956273913383484, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 8033 + }, + { + "epoch": 0.9333720592506535, + "grad_norm": 0.4521263837814331, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 8034 + }, + { + "epoch": 0.933488237002614, + "grad_norm": 0.5222106575965881, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 8035 + }, + { + "epoch": 0.9336044147545745, + "grad_norm": 0.44853895902633667, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 8036 + }, + { + "epoch": 0.933720592506535, + "grad_norm": 0.48177504539489746, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 8037 + }, + { + "epoch": 0.9338367702584955, + "grad_norm": 0.4503477215766907, + "learning_rate": 0.0001, + "loss": 1.3327, + "step": 8038 + }, + { + "epoch": 0.933952948010456, + "grad_norm": 0.507217288017273, + "learning_rate": 0.0001, + "loss": 1.7489, + "step": 8039 + }, + { + "epoch": 0.9340691257624165, + "grad_norm": 0.4814956486225128, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 8040 + }, + { + "epoch": 0.934185303514377, + "grad_norm": 0.4742831885814667, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 8041 + }, + { + "epoch": 0.9343014812663375, + "grad_norm": 0.4570356607437134, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 8042 + }, + { + "epoch": 0.934417659018298, + "grad_norm": 0.46834975481033325, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 8043 + }, + { + "epoch": 0.9345338367702585, + "grad_norm": 0.5158945322036743, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 8044 + }, + { + "epoch": 0.934650014522219, + "grad_norm": 0.5279756188392639, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 8045 + }, + { + "epoch": 0.9347661922741795, + "grad_norm": 0.47560861706733704, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 8046 + }, + { + "epoch": 0.9348823700261399, + "grad_norm": 0.4888511300086975, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 8047 + }, + { + "epoch": 0.9349985477781005, + "grad_norm": 0.4677184820175171, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 8048 + }, + { + "epoch": 0.935114725530061, + "grad_norm": 0.4719541370868683, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 8049 + }, + { + "epoch": 0.9352309032820215, + "grad_norm": 0.477180540561676, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 8050 + }, + { + "epoch": 0.935347081033982, + "grad_norm": 0.46396854519844055, + "learning_rate": 0.0001, + "loss": 1.3669, + "step": 8051 + }, + { + "epoch": 0.9354632587859425, + "grad_norm": 0.4677002727985382, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 8052 + }, + { + "epoch": 0.935579436537903, + "grad_norm": 0.45009395480155945, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 8053 + }, + { + "epoch": 0.9356956142898635, + "grad_norm": 0.47856685519218445, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 8054 + }, + { + "epoch": 0.935811792041824, + "grad_norm": 0.5348851084709167, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 8055 + }, + { + "epoch": 0.9359279697937845, + "grad_norm": 0.4984420835971832, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 8056 + }, + { + "epoch": 0.9360441475457449, + "grad_norm": 0.5199939608573914, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 8057 + }, + { + "epoch": 0.9361603252977055, + "grad_norm": 0.5202213525772095, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 8058 + }, + { + "epoch": 0.936276503049666, + "grad_norm": 0.5361291766166687, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 8059 + }, + { + "epoch": 0.9363926808016265, + "grad_norm": 0.49866026639938354, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 8060 + }, + { + "epoch": 0.936508858553587, + "grad_norm": 0.49306201934814453, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 8061 + }, + { + "epoch": 0.9366250363055475, + "grad_norm": 0.4813399910926819, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 8062 + }, + { + "epoch": 0.936741214057508, + "grad_norm": 0.47636452317237854, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 8063 + }, + { + "epoch": 0.9368573918094685, + "grad_norm": 0.4818163812160492, + "learning_rate": 0.0001, + "loss": 1.4483, + "step": 8064 + }, + { + "epoch": 0.936973569561429, + "grad_norm": 0.46799615025520325, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 8065 + }, + { + "epoch": 0.9370897473133895, + "grad_norm": 0.48917245864868164, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 8066 + }, + { + "epoch": 0.9372059250653499, + "grad_norm": 0.47310730814933777, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 8067 + }, + { + "epoch": 0.9373221028173104, + "grad_norm": 0.5221018195152283, + "learning_rate": 0.0001, + "loss": 1.8971, + "step": 8068 + }, + { + "epoch": 0.937438280569271, + "grad_norm": 0.48749321699142456, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 8069 + }, + { + "epoch": 0.9375544583212315, + "grad_norm": 0.4679155945777893, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 8070 + }, + { + "epoch": 0.937670636073192, + "grad_norm": 0.47157520055770874, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 8071 + }, + { + "epoch": 0.9377868138251525, + "grad_norm": 0.4769766628742218, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 8072 + }, + { + "epoch": 0.937902991577113, + "grad_norm": 0.47961485385894775, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 8073 + }, + { + "epoch": 0.9380191693290735, + "grad_norm": 0.4806070327758789, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 8074 + }, + { + "epoch": 0.938135347081034, + "grad_norm": 0.4962315261363983, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 8075 + }, + { + "epoch": 0.9382515248329945, + "grad_norm": 0.5138833522796631, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 8076 + }, + { + "epoch": 0.9383677025849549, + "grad_norm": 0.4957931935787201, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 8077 + }, + { + "epoch": 0.9384838803369154, + "grad_norm": 0.49878206849098206, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 8078 + }, + { + "epoch": 0.938600058088876, + "grad_norm": 0.49658435583114624, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 8079 + }, + { + "epoch": 0.9387162358408365, + "grad_norm": 0.508560836315155, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 8080 + }, + { + "epoch": 0.938832413592797, + "grad_norm": 0.4853934943675995, + "learning_rate": 0.0001, + "loss": 1.5385, + "step": 8081 + }, + { + "epoch": 0.9389485913447575, + "grad_norm": 0.478221595287323, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 8082 + }, + { + "epoch": 0.939064769096718, + "grad_norm": 0.4989098012447357, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 8083 + }, + { + "epoch": 0.9391809468486785, + "grad_norm": 0.4413130283355713, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 8084 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 0.448485791683197, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 8085 + }, + { + "epoch": 0.9394133023525995, + "grad_norm": 0.4658021330833435, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 8086 + }, + { + "epoch": 0.9395294801045599, + "grad_norm": 0.48055848479270935, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 8087 + }, + { + "epoch": 0.9396456578565204, + "grad_norm": 0.47321316599845886, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 8088 + }, + { + "epoch": 0.9397618356084809, + "grad_norm": 0.44191011786460876, + "learning_rate": 0.0001, + "loss": 1.3483, + "step": 8089 + }, + { + "epoch": 0.9398780133604415, + "grad_norm": 0.5220785140991211, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 8090 + }, + { + "epoch": 0.939994191112402, + "grad_norm": 0.4720672070980072, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 8091 + }, + { + "epoch": 0.9401103688643625, + "grad_norm": 0.46814772486686707, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 8092 + }, + { + "epoch": 0.940226546616323, + "grad_norm": 0.5168691873550415, + "learning_rate": 0.0001, + "loss": 1.8196, + "step": 8093 + }, + { + "epoch": 0.9403427243682835, + "grad_norm": 0.4674499034881592, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 8094 + }, + { + "epoch": 0.940458902120244, + "grad_norm": 0.49029046297073364, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 8095 + }, + { + "epoch": 0.9405750798722045, + "grad_norm": 0.5169382691383362, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 8096 + }, + { + "epoch": 0.940691257624165, + "grad_norm": 0.48217153549194336, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 8097 + }, + { + "epoch": 0.9408074353761254, + "grad_norm": 0.49331241846084595, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 8098 + }, + { + "epoch": 0.9409236131280859, + "grad_norm": 0.46810704469680786, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 8099 + }, + { + "epoch": 0.9410397908800465, + "grad_norm": 0.5021561980247498, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 8100 + }, + { + "epoch": 0.941155968632007, + "grad_norm": 0.44212183356285095, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 8101 + }, + { + "epoch": 0.9412721463839675, + "grad_norm": 0.4898991882801056, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 8102 + }, + { + "epoch": 0.941388324135928, + "grad_norm": 0.45801275968551636, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 8103 + }, + { + "epoch": 0.9415045018878885, + "grad_norm": 0.45079344511032104, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 8104 + }, + { + "epoch": 0.941620679639849, + "grad_norm": 0.4727324843406677, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 8105 + }, + { + "epoch": 0.9417368573918095, + "grad_norm": 0.4644958972930908, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 8106 + }, + { + "epoch": 0.94185303514377, + "grad_norm": 0.5391181707382202, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 8107 + }, + { + "epoch": 0.9419692128957304, + "grad_norm": 0.5218836665153503, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 8108 + }, + { + "epoch": 0.9420853906476909, + "grad_norm": 0.486215740442276, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 8109 + }, + { + "epoch": 0.9422015683996514, + "grad_norm": 0.4889783263206482, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 8110 + }, + { + "epoch": 0.942317746151612, + "grad_norm": 0.44267192482948303, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 8111 + }, + { + "epoch": 0.9424339239035725, + "grad_norm": 0.49176719784736633, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 8112 + }, + { + "epoch": 0.942550101655533, + "grad_norm": 0.5220740437507629, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 8113 + }, + { + "epoch": 0.9426662794074935, + "grad_norm": 0.47036683559417725, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 8114 + }, + { + "epoch": 0.942782457159454, + "grad_norm": 0.4946513772010803, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 8115 + }, + { + "epoch": 0.9428986349114145, + "grad_norm": 0.48926377296447754, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 8116 + }, + { + "epoch": 0.943014812663375, + "grad_norm": 0.46401751041412354, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 8117 + }, + { + "epoch": 0.9431309904153354, + "grad_norm": 0.4936821162700653, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 8118 + }, + { + "epoch": 0.9432471681672959, + "grad_norm": 0.5162927508354187, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 8119 + }, + { + "epoch": 0.9433633459192564, + "grad_norm": 0.49203792214393616, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 8120 + }, + { + "epoch": 0.943479523671217, + "grad_norm": 0.48828327655792236, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 8121 + }, + { + "epoch": 0.9435957014231775, + "grad_norm": 0.4814288914203644, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 8122 + }, + { + "epoch": 0.943711879175138, + "grad_norm": 0.4744342267513275, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 8123 + }, + { + "epoch": 0.9438280569270985, + "grad_norm": 0.4669947922229767, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 8124 + }, + { + "epoch": 0.943944234679059, + "grad_norm": 0.4419528841972351, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 8125 + }, + { + "epoch": 0.9440604124310195, + "grad_norm": 0.47628259658813477, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 8126 + }, + { + "epoch": 0.94417659018298, + "grad_norm": 0.4776514768600464, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 8127 + }, + { + "epoch": 0.9442927679349404, + "grad_norm": 0.4446832537651062, + "learning_rate": 0.0001, + "loss": 1.5658, + "step": 8128 + }, + { + "epoch": 0.9444089456869009, + "grad_norm": 0.5115212202072144, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 8129 + }, + { + "epoch": 0.9445251234388614, + "grad_norm": 0.47382161021232605, + "learning_rate": 0.0001, + "loss": 1.409, + "step": 8130 + }, + { + "epoch": 0.9446413011908219, + "grad_norm": 0.4738979637622833, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 8131 + }, + { + "epoch": 0.9447574789427825, + "grad_norm": 0.4864945113658905, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 8132 + }, + { + "epoch": 0.944873656694743, + "grad_norm": 0.5098589658737183, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 8133 + }, + { + "epoch": 0.9449898344467035, + "grad_norm": 0.4890642464160919, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 8134 + }, + { + "epoch": 0.945106012198664, + "grad_norm": 0.499066025018692, + "learning_rate": 0.0001, + "loss": 1.722, + "step": 8135 + }, + { + "epoch": 0.9452221899506245, + "grad_norm": 0.4744119942188263, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 8136 + }, + { + "epoch": 0.945338367702585, + "grad_norm": 0.45313483476638794, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 8137 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 0.4957541227340698, + "learning_rate": 0.0001, + "loss": 1.7737, + "step": 8138 + }, + { + "epoch": 0.9455707232065059, + "grad_norm": 0.46510589122772217, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 8139 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 0.49345633387565613, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 8140 + }, + { + "epoch": 0.9458030787104269, + "grad_norm": 0.44916290044784546, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 8141 + }, + { + "epoch": 0.9459192564623875, + "grad_norm": 0.5268651247024536, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 8142 + }, + { + "epoch": 0.946035434214348, + "grad_norm": 0.4611073136329651, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 8143 + }, + { + "epoch": 0.9461516119663085, + "grad_norm": 0.4878610372543335, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 8144 + }, + { + "epoch": 0.946267789718269, + "grad_norm": 0.48265448212623596, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 8145 + }, + { + "epoch": 0.9463839674702295, + "grad_norm": 0.4883013963699341, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 8146 + }, + { + "epoch": 0.94650014522219, + "grad_norm": 0.4902404844760895, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 8147 + }, + { + "epoch": 0.9466163229741504, + "grad_norm": 0.48211053013801575, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 8148 + }, + { + "epoch": 0.9467325007261109, + "grad_norm": 0.4793367087841034, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 8149 + }, + { + "epoch": 0.9468486784780714, + "grad_norm": 0.47337889671325684, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 8150 + }, + { + "epoch": 0.9469648562300319, + "grad_norm": 0.45876795053482056, + "learning_rate": 0.0001, + "loss": 1.4361, + "step": 8151 + }, + { + "epoch": 0.9470810339819925, + "grad_norm": 0.5049482583999634, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 8152 + }, + { + "epoch": 0.947197211733953, + "grad_norm": 0.4585045874118805, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 8153 + }, + { + "epoch": 0.9473133894859135, + "grad_norm": 0.48491230607032776, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 8154 + }, + { + "epoch": 0.947429567237874, + "grad_norm": 0.48654916882514954, + "learning_rate": 0.0001, + "loss": 1.7573, + "step": 8155 + }, + { + "epoch": 0.9475457449898345, + "grad_norm": 0.4684811234474182, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 8156 + }, + { + "epoch": 0.947661922741795, + "grad_norm": 0.4712132513523102, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 8157 + }, + { + "epoch": 0.9477781004937554, + "grad_norm": 0.4901881217956543, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 8158 + }, + { + "epoch": 0.9478942782457159, + "grad_norm": 0.4755280017852783, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 8159 + }, + { + "epoch": 0.9480104559976764, + "grad_norm": 0.5146501064300537, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 8160 + }, + { + "epoch": 0.9481266337496369, + "grad_norm": 0.4786891043186188, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 8161 + }, + { + "epoch": 0.9482428115015974, + "grad_norm": 0.44507408142089844, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 8162 + }, + { + "epoch": 0.948358989253558, + "grad_norm": 0.4719574749469757, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 8163 + }, + { + "epoch": 0.9484751670055185, + "grad_norm": 0.48670482635498047, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 8164 + }, + { + "epoch": 0.948591344757479, + "grad_norm": 0.4823938012123108, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 8165 + }, + { + "epoch": 0.9487075225094395, + "grad_norm": 0.4881124794483185, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 8166 + }, + { + "epoch": 0.9488237002614, + "grad_norm": 0.4872508943080902, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 8167 + }, + { + "epoch": 0.9489398780133604, + "grad_norm": 0.5031945109367371, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 8168 + }, + { + "epoch": 0.9490560557653209, + "grad_norm": 0.5044823884963989, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 8169 + }, + { + "epoch": 0.9491722335172814, + "grad_norm": 0.48246756196022034, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 8170 + }, + { + "epoch": 0.9492884112692419, + "grad_norm": 0.4963812232017517, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 8171 + }, + { + "epoch": 0.9494045890212024, + "grad_norm": 0.4816388785839081, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 8172 + }, + { + "epoch": 0.949520766773163, + "grad_norm": 0.4748592674732208, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 8173 + }, + { + "epoch": 0.9496369445251235, + "grad_norm": 0.46375009417533875, + "learning_rate": 0.0001, + "loss": 1.4255, + "step": 8174 + }, + { + "epoch": 0.949753122277084, + "grad_norm": 0.45289376378059387, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 8175 + }, + { + "epoch": 0.9498693000290445, + "grad_norm": 0.48634836077690125, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 8176 + }, + { + "epoch": 0.949985477781005, + "grad_norm": 0.49181076884269714, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 8177 + }, + { + "epoch": 0.9501016555329654, + "grad_norm": 0.45793280005455017, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 8178 + }, + { + "epoch": 0.9502178332849259, + "grad_norm": 0.4974653720855713, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 8179 + }, + { + "epoch": 0.9503340110368864, + "grad_norm": 0.4716089367866516, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 8180 + }, + { + "epoch": 0.9504501887888469, + "grad_norm": 0.45210596919059753, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 8181 + }, + { + "epoch": 0.9505663665408074, + "grad_norm": 0.5031945705413818, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 8182 + }, + { + "epoch": 0.9506825442927679, + "grad_norm": 0.46104729175567627, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 8183 + }, + { + "epoch": 0.9507987220447285, + "grad_norm": 0.45279985666275024, + "learning_rate": 0.0001, + "loss": 1.6108, + "step": 8184 + }, + { + "epoch": 0.950914899796689, + "grad_norm": 0.618247389793396, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 8185 + }, + { + "epoch": 0.9510310775486495, + "grad_norm": 0.47192826867103577, + "learning_rate": 0.0001, + "loss": 1.4801, + "step": 8186 + }, + { + "epoch": 0.95114725530061, + "grad_norm": 0.46734699606895447, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 8187 + }, + { + "epoch": 0.9512634330525704, + "grad_norm": 0.46649816632270813, + "learning_rate": 0.0001, + "loss": 1.3187, + "step": 8188 + }, + { + "epoch": 0.9513796108045309, + "grad_norm": 0.5192746520042419, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 8189 + }, + { + "epoch": 0.9514957885564914, + "grad_norm": 0.476370632648468, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 8190 + }, + { + "epoch": 0.9516119663084519, + "grad_norm": 0.49778425693511963, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 8191 + }, + { + "epoch": 0.9517281440604124, + "grad_norm": 0.4732555150985718, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 8192 + }, + { + "epoch": 0.9518443218123729, + "grad_norm": 0.47929319739341736, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 8193 + }, + { + "epoch": 0.9519604995643335, + "grad_norm": 0.44977056980133057, + "learning_rate": 0.0001, + "loss": 1.4695, + "step": 8194 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 0.5259702205657959, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 8195 + }, + { + "epoch": 0.9521928550682545, + "grad_norm": 0.4790816307067871, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 8196 + }, + { + "epoch": 0.952309032820215, + "grad_norm": 0.48329922556877136, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 8197 + }, + { + "epoch": 0.9524252105721754, + "grad_norm": 0.451102077960968, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 8198 + }, + { + "epoch": 0.9525413883241359, + "grad_norm": 0.4747765064239502, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 8199 + }, + { + "epoch": 0.9526575660760964, + "grad_norm": 0.5516743063926697, + "learning_rate": 0.0001, + "loss": 1.7372, + "step": 8200 + }, + { + "epoch": 0.9527737438280569, + "grad_norm": 0.4885726869106293, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 8201 + }, + { + "epoch": 0.9528899215800174, + "grad_norm": 0.4775313436985016, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 8202 + }, + { + "epoch": 0.9530060993319779, + "grad_norm": 0.4628119170665741, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 8203 + }, + { + "epoch": 0.9531222770839384, + "grad_norm": 0.4696086049079895, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 8204 + }, + { + "epoch": 0.953238454835899, + "grad_norm": 0.4801238179206848, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 8205 + }, + { + "epoch": 0.9533546325878595, + "grad_norm": 0.46630024909973145, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 8206 + }, + { + "epoch": 0.95347081033982, + "grad_norm": 0.5154693722724915, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 8207 + }, + { + "epoch": 0.9535869880917804, + "grad_norm": 0.456986665725708, + "learning_rate": 0.0001, + "loss": 1.4658, + "step": 8208 + }, + { + "epoch": 0.9537031658437409, + "grad_norm": 0.4490494430065155, + "learning_rate": 0.0001, + "loss": 1.4067, + "step": 8209 + }, + { + "epoch": 0.9538193435957014, + "grad_norm": 0.4910639226436615, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 8210 + }, + { + "epoch": 0.9539355213476619, + "grad_norm": 0.5007901787757874, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 8211 + }, + { + "epoch": 0.9540516990996224, + "grad_norm": 0.45628076791763306, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 8212 + }, + { + "epoch": 0.9541678768515829, + "grad_norm": 0.48491212725639343, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 8213 + }, + { + "epoch": 0.9542840546035434, + "grad_norm": 0.48505932092666626, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 8214 + }, + { + "epoch": 0.954400232355504, + "grad_norm": 0.5037823915481567, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 8215 + }, + { + "epoch": 0.9545164101074645, + "grad_norm": 0.5135097503662109, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 8216 + }, + { + "epoch": 0.954632587859425, + "grad_norm": 0.48048651218414307, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 8217 + }, + { + "epoch": 0.9547487656113854, + "grad_norm": 0.48271870613098145, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 8218 + }, + { + "epoch": 0.9548649433633459, + "grad_norm": 0.48280853033065796, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 8219 + }, + { + "epoch": 0.9549811211153064, + "grad_norm": 0.4850626289844513, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 8220 + }, + { + "epoch": 0.9550972988672669, + "grad_norm": 0.48917168378829956, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 8221 + }, + { + "epoch": 0.9552134766192274, + "grad_norm": 0.4904051423072815, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 8222 + }, + { + "epoch": 0.9553296543711879, + "grad_norm": 0.47541365027427673, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 8223 + }, + { + "epoch": 0.9554458321231484, + "grad_norm": 0.49688780307769775, + "learning_rate": 0.0001, + "loss": 1.7412, + "step": 8224 + }, + { + "epoch": 0.9555620098751089, + "grad_norm": 0.5080480575561523, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 8225 + }, + { + "epoch": 0.9556781876270695, + "grad_norm": 0.48701146245002747, + "learning_rate": 0.0001, + "loss": 1.5155, + "step": 8226 + }, + { + "epoch": 0.95579436537903, + "grad_norm": 0.4538536071777344, + "learning_rate": 0.0001, + "loss": 1.5677, + "step": 8227 + }, + { + "epoch": 0.9559105431309904, + "grad_norm": 0.47831106185913086, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 8228 + }, + { + "epoch": 0.9560267208829509, + "grad_norm": 0.4827806055545807, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 8229 + }, + { + "epoch": 0.9561428986349114, + "grad_norm": 0.4946243166923523, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 8230 + }, + { + "epoch": 0.9562590763868719, + "grad_norm": 0.5115143060684204, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 8231 + }, + { + "epoch": 0.9563752541388324, + "grad_norm": 0.4955557584762573, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 8232 + }, + { + "epoch": 0.9564914318907929, + "grad_norm": 0.45829346776008606, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 8233 + }, + { + "epoch": 0.9566076096427534, + "grad_norm": 0.4455798864364624, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 8234 + }, + { + "epoch": 0.9567237873947139, + "grad_norm": 0.48835891485214233, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 8235 + }, + { + "epoch": 0.9568399651466745, + "grad_norm": 0.46689534187316895, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 8236 + }, + { + "epoch": 0.956956142898635, + "grad_norm": 0.5193210244178772, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 8237 + }, + { + "epoch": 0.9570723206505954, + "grad_norm": 0.46074503660202026, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 8238 + }, + { + "epoch": 0.9571884984025559, + "grad_norm": 0.48494353890419006, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 8239 + }, + { + "epoch": 0.9573046761545164, + "grad_norm": 0.45777463912963867, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 8240 + }, + { + "epoch": 0.9574208539064769, + "grad_norm": 0.49893805384635925, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 8241 + }, + { + "epoch": 0.9575370316584374, + "grad_norm": 0.500220537185669, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 8242 + }, + { + "epoch": 0.9576532094103979, + "grad_norm": 0.4701831638813019, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 8243 + }, + { + "epoch": 0.9577693871623584, + "grad_norm": 0.465260773897171, + "learning_rate": 0.0001, + "loss": 1.5155, + "step": 8244 + }, + { + "epoch": 0.9578855649143189, + "grad_norm": 0.4423646628856659, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 8245 + }, + { + "epoch": 0.9580017426662794, + "grad_norm": 0.49591514468193054, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 8246 + }, + { + "epoch": 0.95811792041824, + "grad_norm": 0.4624507427215576, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 8247 + }, + { + "epoch": 0.9582340981702004, + "grad_norm": 0.4537637531757355, + "learning_rate": 0.0001, + "loss": 1.3326, + "step": 8248 + }, + { + "epoch": 0.9583502759221609, + "grad_norm": 0.45610177516937256, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 8249 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.5058176517486572, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 8250 + }, + { + "epoch": 0.9585826314260819, + "grad_norm": 0.47740715742111206, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 8251 + }, + { + "epoch": 0.9586988091780424, + "grad_norm": 0.5462493300437927, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 8252 + }, + { + "epoch": 0.9588149869300029, + "grad_norm": 0.4755356013774872, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 8253 + }, + { + "epoch": 0.9589311646819634, + "grad_norm": 0.4820975661277771, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 8254 + }, + { + "epoch": 0.9590473424339239, + "grad_norm": 0.4660854637622833, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 8255 + }, + { + "epoch": 0.9591635201858844, + "grad_norm": 0.5366430282592773, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 8256 + }, + { + "epoch": 0.959279697937845, + "grad_norm": 0.5088802576065063, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 8257 + }, + { + "epoch": 0.9593958756898054, + "grad_norm": 0.5079680681228638, + "learning_rate": 0.0001, + "loss": 1.753, + "step": 8258 + }, + { + "epoch": 0.9595120534417659, + "grad_norm": 0.4622834622859955, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 8259 + }, + { + "epoch": 0.9596282311937264, + "grad_norm": 0.5069734454154968, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 8260 + }, + { + "epoch": 0.9597444089456869, + "grad_norm": 0.493930459022522, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 8261 + }, + { + "epoch": 0.9598605866976474, + "grad_norm": 0.5060780644416809, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 8262 + }, + { + "epoch": 0.9599767644496079, + "grad_norm": 0.48548170924186707, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 8263 + }, + { + "epoch": 0.9600929422015684, + "grad_norm": 0.45788517594337463, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 8264 + }, + { + "epoch": 0.9602091199535289, + "grad_norm": 0.46271592378616333, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 8265 + }, + { + "epoch": 0.9603252977054894, + "grad_norm": 0.48142072558403015, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 8266 + }, + { + "epoch": 0.9604414754574498, + "grad_norm": 0.47520777583122253, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 8267 + }, + { + "epoch": 0.9605576532094104, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 8268 + }, + { + "epoch": 0.9606738309613709, + "grad_norm": 0.4608597159385681, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 8269 + }, + { + "epoch": 0.9607900087133314, + "grad_norm": 0.4679330289363861, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 8270 + }, + { + "epoch": 0.9609061864652919, + "grad_norm": 0.49732252955436707, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 8271 + }, + { + "epoch": 0.9610223642172524, + "grad_norm": 0.4775616526603699, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 8272 + }, + { + "epoch": 0.9611385419692129, + "grad_norm": 0.5359445214271545, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 8273 + }, + { + "epoch": 0.9612547197211734, + "grad_norm": 0.4927753508090973, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 8274 + }, + { + "epoch": 0.9613708974731339, + "grad_norm": 0.48681047558784485, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 8275 + }, + { + "epoch": 0.9614870752250944, + "grad_norm": 0.49308332800865173, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 8276 + }, + { + "epoch": 0.9616032529770548, + "grad_norm": 0.48825904726982117, + "learning_rate": 0.0001, + "loss": 1.6006, + "step": 8277 + }, + { + "epoch": 0.9617194307290154, + "grad_norm": 0.5138652324676514, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 8278 + }, + { + "epoch": 0.9618356084809759, + "grad_norm": 0.5201594233512878, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 8279 + }, + { + "epoch": 0.9619517862329364, + "grad_norm": 0.5101884603500366, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 8280 + }, + { + "epoch": 0.9620679639848969, + "grad_norm": 0.45568764209747314, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 8281 + }, + { + "epoch": 0.9621841417368574, + "grad_norm": 0.46922385692596436, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 8282 + }, + { + "epoch": 0.9623003194888179, + "grad_norm": 0.47019103169441223, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 8283 + }, + { + "epoch": 0.9624164972407784, + "grad_norm": 0.5026654601097107, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 8284 + }, + { + "epoch": 0.9625326749927389, + "grad_norm": 0.5346139669418335, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 8285 + }, + { + "epoch": 0.9626488527446994, + "grad_norm": 0.4690621495246887, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 8286 + }, + { + "epoch": 0.9627650304966598, + "grad_norm": 0.5523574948310852, + "learning_rate": 0.0001, + "loss": 1.8614, + "step": 8287 + }, + { + "epoch": 0.9628812082486203, + "grad_norm": 0.5208613872528076, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 8288 + }, + { + "epoch": 0.9629973860005809, + "grad_norm": 0.5181357264518738, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 8289 + }, + { + "epoch": 0.9631135637525414, + "grad_norm": 0.49998828768730164, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 8290 + }, + { + "epoch": 0.9632297415045019, + "grad_norm": 0.47236523032188416, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 8291 + }, + { + "epoch": 0.9633459192564624, + "grad_norm": 0.49933215975761414, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 8292 + }, + { + "epoch": 0.9634620970084229, + "grad_norm": 0.48672953248023987, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 8293 + }, + { + "epoch": 0.9635782747603834, + "grad_norm": 0.45596054196357727, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 8294 + }, + { + "epoch": 0.9636944525123439, + "grad_norm": 0.47312960028648376, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 8295 + }, + { + "epoch": 0.9638106302643044, + "grad_norm": 0.5012199282646179, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 8296 + }, + { + "epoch": 0.9639268080162648, + "grad_norm": 0.516236424446106, + "learning_rate": 0.0001, + "loss": 1.6011, + "step": 8297 + }, + { + "epoch": 0.9640429857682253, + "grad_norm": 0.44847118854522705, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 8298 + }, + { + "epoch": 0.9641591635201859, + "grad_norm": 0.4940093457698822, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 8299 + }, + { + "epoch": 0.9642753412721464, + "grad_norm": 0.4744294285774231, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 8300 + }, + { + "epoch": 0.9643915190241069, + "grad_norm": 0.4880334436893463, + "learning_rate": 0.0001, + "loss": 1.5232, + "step": 8301 + }, + { + "epoch": 0.9645076967760674, + "grad_norm": 0.49763065576553345, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 8302 + }, + { + "epoch": 0.9646238745280279, + "grad_norm": 0.49351394176483154, + "learning_rate": 0.0001, + "loss": 1.7411, + "step": 8303 + }, + { + "epoch": 0.9647400522799884, + "grad_norm": 0.4660297930240631, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 8304 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 0.4791782796382904, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 8305 + }, + { + "epoch": 0.9649724077839094, + "grad_norm": 0.49400556087493896, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 8306 + }, + { + "epoch": 0.9650885855358698, + "grad_norm": 0.4803016483783722, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 8307 + }, + { + "epoch": 0.9652047632878303, + "grad_norm": 0.49062761664390564, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 8308 + }, + { + "epoch": 0.9653209410397908, + "grad_norm": 0.5109800696372986, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 8309 + }, + { + "epoch": 0.9654371187917514, + "grad_norm": 0.503459632396698, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 8310 + }, + { + "epoch": 0.9655532965437119, + "grad_norm": 0.47792255878448486, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 8311 + }, + { + "epoch": 0.9656694742956724, + "grad_norm": 0.4999959468841553, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 8312 + }, + { + "epoch": 0.9657856520476329, + "grad_norm": 0.49376481771469116, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 8313 + }, + { + "epoch": 0.9659018297995934, + "grad_norm": 0.4859766662120819, + "learning_rate": 0.0001, + "loss": 1.5698, + "step": 8314 + }, + { + "epoch": 0.9660180075515539, + "grad_norm": 0.4656546413898468, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 8315 + }, + { + "epoch": 0.9661341853035144, + "grad_norm": 0.48763415217399597, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 8316 + }, + { + "epoch": 0.9662503630554748, + "grad_norm": 0.479206919670105, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 8317 + }, + { + "epoch": 0.9663665408074353, + "grad_norm": 0.48830246925354004, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 8318 + }, + { + "epoch": 0.9664827185593958, + "grad_norm": 0.49852919578552246, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 8319 + }, + { + "epoch": 0.9665988963113564, + "grad_norm": 0.45353084802627563, + "learning_rate": 0.0001, + "loss": 1.4958, + "step": 8320 + }, + { + "epoch": 0.9667150740633169, + "grad_norm": 0.49439936876296997, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 8321 + }, + { + "epoch": 0.9668312518152774, + "grad_norm": 0.48684781789779663, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 8322 + }, + { + "epoch": 0.9669474295672379, + "grad_norm": 0.47375577688217163, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 8323 + }, + { + "epoch": 0.9670636073191984, + "grad_norm": 0.45029014348983765, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 8324 + }, + { + "epoch": 0.9671797850711589, + "grad_norm": 0.49063849449157715, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 8325 + }, + { + "epoch": 0.9672959628231194, + "grad_norm": 0.5382382273674011, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 8326 + }, + { + "epoch": 0.9674121405750798, + "grad_norm": 0.49472776055336, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 8327 + }, + { + "epoch": 0.9675283183270403, + "grad_norm": 0.4821051359176636, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 8328 + }, + { + "epoch": 0.9676444960790008, + "grad_norm": 0.47823530435562134, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 8329 + }, + { + "epoch": 0.9677606738309613, + "grad_norm": 0.4891481399536133, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 8330 + }, + { + "epoch": 0.9678768515829219, + "grad_norm": 0.49164843559265137, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 8331 + }, + { + "epoch": 0.9679930293348824, + "grad_norm": 0.473734974861145, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 8332 + }, + { + "epoch": 0.9681092070868429, + "grad_norm": 0.47365325689315796, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 8333 + }, + { + "epoch": 0.9682253848388034, + "grad_norm": 0.4796622395515442, + "learning_rate": 0.0001, + "loss": 1.4766, + "step": 8334 + }, + { + "epoch": 0.9683415625907639, + "grad_norm": 0.5099849700927734, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 8335 + }, + { + "epoch": 0.9684577403427244, + "grad_norm": 0.4894762337207794, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 8336 + }, + { + "epoch": 0.9685739180946848, + "grad_norm": 0.5052227973937988, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 8337 + }, + { + "epoch": 0.9686900958466453, + "grad_norm": 0.5599405169487, + "learning_rate": 0.0001, + "loss": 1.7586, + "step": 8338 + }, + { + "epoch": 0.9688062735986058, + "grad_norm": 0.5493488907814026, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 8339 + }, + { + "epoch": 0.9689224513505663, + "grad_norm": 0.47974830865859985, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 8340 + }, + { + "epoch": 0.9690386291025269, + "grad_norm": 0.46246853470802307, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 8341 + }, + { + "epoch": 0.9691548068544874, + "grad_norm": 0.5237904787063599, + "learning_rate": 0.0001, + "loss": 1.7271, + "step": 8342 + }, + { + "epoch": 0.9692709846064479, + "grad_norm": 0.49900978803634644, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 8343 + }, + { + "epoch": 0.9693871623584084, + "grad_norm": 0.45971980690956116, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 8344 + }, + { + "epoch": 0.9695033401103689, + "grad_norm": 0.46642741560935974, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 8345 + }, + { + "epoch": 0.9696195178623294, + "grad_norm": 0.4861880838871002, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 8346 + }, + { + "epoch": 0.9697356956142898, + "grad_norm": 0.49903160333633423, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 8347 + }, + { + "epoch": 0.9698518733662503, + "grad_norm": 0.47082969546318054, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 8348 + }, + { + "epoch": 0.9699680511182108, + "grad_norm": 0.48401105403900146, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 8349 + }, + { + "epoch": 0.9700842288701713, + "grad_norm": 0.4795251190662384, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 8350 + }, + { + "epoch": 0.9702004066221319, + "grad_norm": 0.4561566114425659, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 8351 + }, + { + "epoch": 0.9703165843740924, + "grad_norm": 0.48859137296676636, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 8352 + }, + { + "epoch": 0.9704327621260529, + "grad_norm": 0.447553426027298, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 8353 + }, + { + "epoch": 0.9705489398780134, + "grad_norm": 0.4990467429161072, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 8354 + }, + { + "epoch": 0.9706651176299739, + "grad_norm": 0.5028551816940308, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 8355 + }, + { + "epoch": 0.9707812953819344, + "grad_norm": 0.49683597683906555, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 8356 + }, + { + "epoch": 0.9708974731338949, + "grad_norm": 0.47198501229286194, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 8357 + }, + { + "epoch": 0.9710136508858553, + "grad_norm": 0.5202149152755737, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 8358 + }, + { + "epoch": 0.9711298286378158, + "grad_norm": 0.505222737789154, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 8359 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.5158810615539551, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 8360 + }, + { + "epoch": 0.9713621841417368, + "grad_norm": 0.5185474157333374, + "learning_rate": 0.0001, + "loss": 1.8292, + "step": 8361 + }, + { + "epoch": 0.9714783618936974, + "grad_norm": 0.479807049036026, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 8362 + }, + { + "epoch": 0.9715945396456579, + "grad_norm": 0.47190988063812256, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 8363 + }, + { + "epoch": 0.9717107173976184, + "grad_norm": 0.47049620747566223, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 8364 + }, + { + "epoch": 0.9718268951495789, + "grad_norm": 0.49020835757255554, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 8365 + }, + { + "epoch": 0.9719430729015394, + "grad_norm": 0.49755486845970154, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 8366 + }, + { + "epoch": 0.9720592506534999, + "grad_norm": 0.4959685504436493, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 8367 + }, + { + "epoch": 0.9721754284054603, + "grad_norm": 0.5201342701911926, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 8368 + }, + { + "epoch": 0.9722916061574208, + "grad_norm": 0.4995202422142029, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 8369 + }, + { + "epoch": 0.9724077839093813, + "grad_norm": 0.4514632225036621, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 8370 + }, + { + "epoch": 0.9725239616613418, + "grad_norm": 0.46928879618644714, + "learning_rate": 0.0001, + "loss": 1.4085, + "step": 8371 + }, + { + "epoch": 0.9726401394133024, + "grad_norm": 0.4802158772945404, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 8372 + }, + { + "epoch": 0.9727563171652629, + "grad_norm": 0.4651092290878296, + "learning_rate": 0.0001, + "loss": 1.5232, + "step": 8373 + }, + { + "epoch": 0.9728724949172234, + "grad_norm": 0.46335577964782715, + "learning_rate": 0.0001, + "loss": 1.4136, + "step": 8374 + }, + { + "epoch": 0.9729886726691839, + "grad_norm": 0.48499464988708496, + "learning_rate": 0.0001, + "loss": 1.7017, + "step": 8375 + }, + { + "epoch": 0.9731048504211444, + "grad_norm": 0.4987207055091858, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 8376 + }, + { + "epoch": 0.9732210281731049, + "grad_norm": 0.505811333656311, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 8377 + }, + { + "epoch": 0.9733372059250653, + "grad_norm": 0.49306151270866394, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 8378 + }, + { + "epoch": 0.9734533836770258, + "grad_norm": 0.4923427700996399, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 8379 + }, + { + "epoch": 0.9735695614289863, + "grad_norm": 0.4896833002567291, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 8380 + }, + { + "epoch": 0.9736857391809468, + "grad_norm": 0.52273029088974, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 8381 + }, + { + "epoch": 0.9738019169329073, + "grad_norm": 0.48786941170692444, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 8382 + }, + { + "epoch": 0.9739180946848679, + "grad_norm": 0.49467676877975464, + "learning_rate": 0.0001, + "loss": 1.7052, + "step": 8383 + }, + { + "epoch": 0.9740342724368284, + "grad_norm": 0.49212512373924255, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 8384 + }, + { + "epoch": 0.9741504501887889, + "grad_norm": 0.5036439299583435, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 8385 + }, + { + "epoch": 0.9742666279407494, + "grad_norm": 0.5129532814025879, + "learning_rate": 0.0001, + "loss": 1.7383, + "step": 8386 + }, + { + "epoch": 0.9743828056927099, + "grad_norm": 0.5009684562683105, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 8387 + }, + { + "epoch": 0.9744989834446703, + "grad_norm": 0.47049927711486816, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 8388 + }, + { + "epoch": 0.9746151611966308, + "grad_norm": 0.47478753328323364, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 8389 + }, + { + "epoch": 0.9747313389485913, + "grad_norm": 0.46627920866012573, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 8390 + }, + { + "epoch": 0.9748475167005518, + "grad_norm": 0.47100529074668884, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 8391 + }, + { + "epoch": 0.9749636944525123, + "grad_norm": 0.4588838815689087, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 8392 + }, + { + "epoch": 0.9750798722044729, + "grad_norm": 0.5259383916854858, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 8393 + }, + { + "epoch": 0.9751960499564334, + "grad_norm": 0.46649569272994995, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 8394 + }, + { + "epoch": 0.9753122277083939, + "grad_norm": 0.48280128836631775, + "learning_rate": 0.0001, + "loss": 1.8095, + "step": 8395 + }, + { + "epoch": 0.9754284054603544, + "grad_norm": 0.510216474533081, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 8396 + }, + { + "epoch": 0.9755445832123149, + "grad_norm": 0.48745256662368774, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 8397 + }, + { + "epoch": 0.9756607609642753, + "grad_norm": 0.5059899091720581, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 8398 + }, + { + "epoch": 0.9757769387162358, + "grad_norm": 0.4656660854816437, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 8399 + }, + { + "epoch": 0.9758931164681963, + "grad_norm": 0.5184774398803711, + "learning_rate": 0.0001, + "loss": 1.8107, + "step": 8400 + }, + { + "epoch": 0.9760092942201568, + "grad_norm": 0.4714641273021698, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 8401 + }, + { + "epoch": 0.9761254719721173, + "grad_norm": 0.4753645062446594, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 8402 + }, + { + "epoch": 0.9762416497240778, + "grad_norm": 0.5242862701416016, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 8403 + }, + { + "epoch": 0.9763578274760384, + "grad_norm": 0.5038365721702576, + "learning_rate": 0.0001, + "loss": 1.8278, + "step": 8404 + }, + { + "epoch": 0.9764740052279989, + "grad_norm": 0.5028054714202881, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 8405 + }, + { + "epoch": 0.9765901829799594, + "grad_norm": 0.4805586636066437, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 8406 + }, + { + "epoch": 0.9767063607319199, + "grad_norm": 0.4857881963253021, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 8407 + }, + { + "epoch": 0.9768225384838803, + "grad_norm": 0.47471508383750916, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 8408 + }, + { + "epoch": 0.9769387162358408, + "grad_norm": 0.4627721309661865, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 8409 + }, + { + "epoch": 0.9770548939878013, + "grad_norm": 0.4718911647796631, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 8410 + }, + { + "epoch": 0.9771710717397618, + "grad_norm": 0.5181329846382141, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 8411 + }, + { + "epoch": 0.9772872494917223, + "grad_norm": 0.4619660973548889, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 8412 + }, + { + "epoch": 0.9774034272436828, + "grad_norm": 0.5440229773521423, + "learning_rate": 0.0001, + "loss": 1.7901, + "step": 8413 + }, + { + "epoch": 0.9775196049956434, + "grad_norm": 0.4952663481235504, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 8414 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 0.4936034083366394, + "learning_rate": 0.0001, + "loss": 1.6011, + "step": 8415 + }, + { + "epoch": 0.9777519604995644, + "grad_norm": 0.5232906341552734, + "learning_rate": 0.0001, + "loss": 1.5554, + "step": 8416 + }, + { + "epoch": 0.9778681382515249, + "grad_norm": 0.5204361081123352, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 8417 + }, + { + "epoch": 0.9779843160034853, + "grad_norm": 0.4749402701854706, + "learning_rate": 0.0001, + "loss": 1.4276, + "step": 8418 + }, + { + "epoch": 0.9781004937554458, + "grad_norm": 0.4584287106990814, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 8419 + }, + { + "epoch": 0.9782166715074063, + "grad_norm": 0.4839183986186981, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 8420 + }, + { + "epoch": 0.9783328492593668, + "grad_norm": 0.4972332715988159, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 8421 + }, + { + "epoch": 0.9784490270113273, + "grad_norm": 0.5022913813591003, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 8422 + }, + { + "epoch": 0.9785652047632878, + "grad_norm": 0.5144555568695068, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 8423 + }, + { + "epoch": 0.9786813825152483, + "grad_norm": 0.5038486123085022, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 8424 + }, + { + "epoch": 0.9787975602672089, + "grad_norm": 0.45622316002845764, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 8425 + }, + { + "epoch": 0.9789137380191694, + "grad_norm": 0.48347601294517517, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 8426 + }, + { + "epoch": 0.9790299157711299, + "grad_norm": 0.5117625594139099, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 8427 + }, + { + "epoch": 0.9791460935230903, + "grad_norm": 0.47401314973831177, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 8428 + }, + { + "epoch": 0.9792622712750508, + "grad_norm": 0.4935537278652191, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 8429 + }, + { + "epoch": 0.9793784490270113, + "grad_norm": 0.47367969155311584, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 8430 + }, + { + "epoch": 0.9794946267789718, + "grad_norm": 0.4980182647705078, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 8431 + }, + { + "epoch": 0.9796108045309323, + "grad_norm": 0.5033068656921387, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 8432 + }, + { + "epoch": 0.9797269822828928, + "grad_norm": 0.49150750041007996, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 8433 + }, + { + "epoch": 0.9798431600348533, + "grad_norm": 0.4907017648220062, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 8434 + }, + { + "epoch": 0.9799593377868139, + "grad_norm": 0.46434399485588074, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 8435 + }, + { + "epoch": 0.9800755155387744, + "grad_norm": 0.4952322840690613, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 8436 + }, + { + "epoch": 0.9801916932907349, + "grad_norm": 0.4482409358024597, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 8437 + }, + { + "epoch": 0.9803078710426953, + "grad_norm": 0.4462791979312897, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 8438 + }, + { + "epoch": 0.9804240487946558, + "grad_norm": 0.5374612808227539, + "learning_rate": 0.0001, + "loss": 1.8228, + "step": 8439 + }, + { + "epoch": 0.9805402265466163, + "grad_norm": 0.5060397386550903, + "learning_rate": 0.0001, + "loss": 1.693, + "step": 8440 + }, + { + "epoch": 0.9806564042985768, + "grad_norm": 0.4491170048713684, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 8441 + }, + { + "epoch": 0.9807725820505373, + "grad_norm": 0.49389106035232544, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 8442 + }, + { + "epoch": 0.9808887598024978, + "grad_norm": 0.49476566910743713, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 8443 + }, + { + "epoch": 0.9810049375544583, + "grad_norm": 0.45577380061149597, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 8444 + }, + { + "epoch": 0.9811211153064188, + "grad_norm": 0.48937976360321045, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 8445 + }, + { + "epoch": 0.9812372930583794, + "grad_norm": 0.4766305983066559, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 8446 + }, + { + "epoch": 0.9813534708103399, + "grad_norm": 0.4743248224258423, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 8447 + }, + { + "epoch": 0.9814696485623003, + "grad_norm": 0.504391610622406, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 8448 + }, + { + "epoch": 0.9815858263142608, + "grad_norm": 0.46172428131103516, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 8449 + }, + { + "epoch": 0.9817020040662213, + "grad_norm": 0.4999525249004364, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 8450 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 0.49794474244117737, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 8451 + }, + { + "epoch": 0.9819343595701423, + "grad_norm": 0.47993290424346924, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 8452 + }, + { + "epoch": 0.9820505373221028, + "grad_norm": 0.5240808725357056, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 8453 + }, + { + "epoch": 0.9821667150740633, + "grad_norm": 0.486801415681839, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 8454 + }, + { + "epoch": 0.9822828928260238, + "grad_norm": 0.5116367936134338, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 8455 + }, + { + "epoch": 0.9823990705779844, + "grad_norm": 0.4694158732891083, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 8456 + }, + { + "epoch": 0.9825152483299449, + "grad_norm": 0.4877493679523468, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 8457 + }, + { + "epoch": 0.9826314260819053, + "grad_norm": 0.5048372745513916, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 8458 + }, + { + "epoch": 0.9827476038338658, + "grad_norm": 0.4648038148880005, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 8459 + }, + { + "epoch": 0.9828637815858263, + "grad_norm": 0.48997968435287476, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 8460 + }, + { + "epoch": 0.9829799593377868, + "grad_norm": 0.4944762885570526, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 8461 + }, + { + "epoch": 0.9830961370897473, + "grad_norm": 0.4919930398464203, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 8462 + }, + { + "epoch": 0.9832123148417078, + "grad_norm": 0.49641650915145874, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 8463 + }, + { + "epoch": 0.9833284925936683, + "grad_norm": 0.4891132414340973, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 8464 + }, + { + "epoch": 0.9834446703456288, + "grad_norm": 0.4601823687553406, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 8465 + }, + { + "epoch": 0.9835608480975893, + "grad_norm": 0.4825168550014496, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 8466 + }, + { + "epoch": 0.9836770258495499, + "grad_norm": 0.5182327032089233, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 8467 + }, + { + "epoch": 0.9837932036015103, + "grad_norm": 0.47299832105636597, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 8468 + }, + { + "epoch": 0.9839093813534708, + "grad_norm": 0.505413293838501, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 8469 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.4697488844394684, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 8470 + }, + { + "epoch": 0.9841417368573918, + "grad_norm": 0.4823164641857147, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 8471 + }, + { + "epoch": 0.9842579146093523, + "grad_norm": 0.49613675475120544, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 8472 + }, + { + "epoch": 0.9843740923613128, + "grad_norm": 0.49066096544265747, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 8473 + }, + { + "epoch": 0.9844902701132733, + "grad_norm": 0.4947955012321472, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 8474 + }, + { + "epoch": 0.9846064478652338, + "grad_norm": 0.4665144681930542, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 8475 + }, + { + "epoch": 0.9847226256171943, + "grad_norm": 0.5288201570510864, + "learning_rate": 0.0001, + "loss": 1.8544, + "step": 8476 + }, + { + "epoch": 0.9848388033691549, + "grad_norm": 0.4513753354549408, + "learning_rate": 0.0001, + "loss": 1.3675, + "step": 8477 + }, + { + "epoch": 0.9849549811211153, + "grad_norm": 0.49009808897972107, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 8478 + }, + { + "epoch": 0.9850711588730758, + "grad_norm": 0.5423754453659058, + "learning_rate": 0.0001, + "loss": 1.7736, + "step": 8479 + }, + { + "epoch": 0.9851873366250363, + "grad_norm": 0.5365590453147888, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 8480 + }, + { + "epoch": 0.9853035143769968, + "grad_norm": 0.44759395718574524, + "learning_rate": 0.0001, + "loss": 1.5299, + "step": 8481 + }, + { + "epoch": 0.9854196921289573, + "grad_norm": 0.5050853490829468, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 8482 + }, + { + "epoch": 0.9855358698809178, + "grad_norm": 0.4968460202217102, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 8483 + }, + { + "epoch": 0.9856520476328783, + "grad_norm": 0.4942067265510559, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 8484 + }, + { + "epoch": 0.9857682253848388, + "grad_norm": 0.5043184161186218, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 8485 + }, + { + "epoch": 0.9858844031367993, + "grad_norm": 0.4849433898925781, + "learning_rate": 0.0001, + "loss": 1.6144, + "step": 8486 + }, + { + "epoch": 0.9860005808887597, + "grad_norm": 0.5094505548477173, + "learning_rate": 0.0001, + "loss": 1.6876, + "step": 8487 + }, + { + "epoch": 0.9861167586407203, + "grad_norm": 0.5001594424247742, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 8488 + }, + { + "epoch": 0.9862329363926808, + "grad_norm": 0.47105178236961365, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 8489 + }, + { + "epoch": 0.9863491141446413, + "grad_norm": 0.4758048951625824, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 8490 + }, + { + "epoch": 0.9864652918966018, + "grad_norm": 0.4678146541118622, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 8491 + }, + { + "epoch": 0.9865814696485623, + "grad_norm": 0.4687998294830322, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 8492 + }, + { + "epoch": 0.9866976474005228, + "grad_norm": 0.5320387482643127, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 8493 + }, + { + "epoch": 0.9868138251524833, + "grad_norm": 0.49444085359573364, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 8494 + }, + { + "epoch": 0.9869300029044438, + "grad_norm": 0.4462701678276062, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 8495 + }, + { + "epoch": 0.9870461806564043, + "grad_norm": 0.46784061193466187, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 8496 + }, + { + "epoch": 0.9871623584083647, + "grad_norm": 0.4919097423553467, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 8497 + }, + { + "epoch": 0.9872785361603253, + "grad_norm": 0.48356717824935913, + "learning_rate": 0.0001, + "loss": 1.7458, + "step": 8498 + }, + { + "epoch": 0.9873947139122858, + "grad_norm": 0.4529293477535248, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 8499 + }, + { + "epoch": 0.9875108916642463, + "grad_norm": 0.48209211230278015, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 8500 + }, + { + "epoch": 0.9876270694162068, + "grad_norm": 0.4654987156391144, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 8501 + }, + { + "epoch": 0.9877432471681673, + "grad_norm": 0.4693775177001953, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 8502 + }, + { + "epoch": 0.9878594249201278, + "grad_norm": 0.4983534514904022, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 8503 + }, + { + "epoch": 0.9879756026720883, + "grad_norm": 0.4841345250606537, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 8504 + }, + { + "epoch": 0.9880917804240488, + "grad_norm": 0.5288339853286743, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 8505 + }, + { + "epoch": 0.9882079581760093, + "grad_norm": 0.482524037361145, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 8506 + }, + { + "epoch": 0.9883241359279697, + "grad_norm": 0.48337680101394653, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 8507 + }, + { + "epoch": 0.9884403136799302, + "grad_norm": 0.4871644079685211, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 8508 + }, + { + "epoch": 0.9885564914318908, + "grad_norm": 0.47832533717155457, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 8509 + }, + { + "epoch": 0.9886726691838513, + "grad_norm": 0.4998105764389038, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 8510 + }, + { + "epoch": 0.9887888469358118, + "grad_norm": 0.4856521189212799, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 8511 + }, + { + "epoch": 0.9889050246877723, + "grad_norm": 0.4702942967414856, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 8512 + }, + { + "epoch": 0.9890212024397328, + "grad_norm": 0.4836369752883911, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 8513 + }, + { + "epoch": 0.9891373801916933, + "grad_norm": 0.4650129973888397, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 8514 + }, + { + "epoch": 0.9892535579436538, + "grad_norm": 0.47725415229797363, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 8515 + }, + { + "epoch": 0.9893697356956143, + "grad_norm": 0.4913800358772278, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 8516 + }, + { + "epoch": 0.9894859134475747, + "grad_norm": 0.5163478851318359, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 8517 + }, + { + "epoch": 0.9896020911995352, + "grad_norm": 0.4827187657356262, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 8518 + }, + { + "epoch": 0.9897182689514958, + "grad_norm": 0.5060868859291077, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 8519 + }, + { + "epoch": 0.9898344467034563, + "grad_norm": 0.5102714896202087, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 8520 + }, + { + "epoch": 0.9899506244554168, + "grad_norm": 0.4683992862701416, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 8521 + }, + { + "epoch": 0.9900668022073773, + "grad_norm": 0.4988732635974884, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 8522 + }, + { + "epoch": 0.9901829799593378, + "grad_norm": 0.48030632734298706, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 8523 + }, + { + "epoch": 0.9902991577112983, + "grad_norm": 0.48717671632766724, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 8524 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 0.4652939736843109, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 8525 + }, + { + "epoch": 0.9905315132152193, + "grad_norm": 0.4697415232658386, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 8526 + }, + { + "epoch": 0.9906476909671798, + "grad_norm": 0.4873330593109131, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 8527 + }, + { + "epoch": 0.9907638687191402, + "grad_norm": 0.5050543546676636, + "learning_rate": 0.0001, + "loss": 1.6988, + "step": 8528 + }, + { + "epoch": 0.9908800464711008, + "grad_norm": 0.499187171459198, + "learning_rate": 0.0001, + "loss": 1.7552, + "step": 8529 + }, + { + "epoch": 0.9909962242230613, + "grad_norm": 0.5250015258789062, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 8530 + }, + { + "epoch": 0.9911124019750218, + "grad_norm": 0.48826849460601807, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 8531 + }, + { + "epoch": 0.9912285797269823, + "grad_norm": 0.45894527435302734, + "learning_rate": 0.0001, + "loss": 1.4252, + "step": 8532 + }, + { + "epoch": 0.9913447574789428, + "grad_norm": 0.45296138525009155, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 8533 + }, + { + "epoch": 0.9914609352309033, + "grad_norm": 0.5120823383331299, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 8534 + }, + { + "epoch": 0.9915771129828638, + "grad_norm": 0.5219103693962097, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 8535 + }, + { + "epoch": 0.9916932907348243, + "grad_norm": 0.41183364391326904, + "learning_rate": 0.0001, + "loss": 1.2066, + "step": 8536 + }, + { + "epoch": 0.9918094684867848, + "grad_norm": 0.49815478920936584, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 8537 + }, + { + "epoch": 0.9919256462387452, + "grad_norm": 0.4828040301799774, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 8538 + }, + { + "epoch": 0.9920418239907057, + "grad_norm": 0.48174577951431274, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 8539 + }, + { + "epoch": 0.9921580017426663, + "grad_norm": 0.4945439100265503, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 8540 + }, + { + "epoch": 0.9922741794946268, + "grad_norm": 0.5396121144294739, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 8541 + }, + { + "epoch": 0.9923903572465873, + "grad_norm": 0.4893397390842438, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 8542 + }, + { + "epoch": 0.9925065349985478, + "grad_norm": 0.4512374699115753, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 8543 + }, + { + "epoch": 0.9926227127505083, + "grad_norm": 0.5038131475448608, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 8544 + }, + { + "epoch": 0.9927388905024688, + "grad_norm": 0.47863298654556274, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 8545 + }, + { + "epoch": 0.9928550682544293, + "grad_norm": 0.4746008515357971, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 8546 + }, + { + "epoch": 0.9929712460063898, + "grad_norm": 0.5020937323570251, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 8547 + }, + { + "epoch": 0.9930874237583502, + "grad_norm": 0.5293759703636169, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 8548 + }, + { + "epoch": 0.9932036015103107, + "grad_norm": 0.49491435289382935, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 8549 + }, + { + "epoch": 0.9933197792622713, + "grad_norm": 0.4743204414844513, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 8550 + }, + { + "epoch": 0.9934359570142318, + "grad_norm": 0.47521528601646423, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 8551 + }, + { + "epoch": 0.9935521347661923, + "grad_norm": 0.5073684453964233, + "learning_rate": 0.0001, + "loss": 1.7728, + "step": 8552 + }, + { + "epoch": 0.9936683125181528, + "grad_norm": 0.5444392561912537, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 8553 + }, + { + "epoch": 0.9937844902701133, + "grad_norm": 0.47420382499694824, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 8554 + }, + { + "epoch": 0.9939006680220738, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 8555 + }, + { + "epoch": 0.9940168457740343, + "grad_norm": 0.49144667387008667, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 8556 + }, + { + "epoch": 0.9941330235259948, + "grad_norm": 0.5177491307258606, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 8557 + }, + { + "epoch": 0.9942492012779552, + "grad_norm": 0.4622519612312317, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 8558 + }, + { + "epoch": 0.9943653790299157, + "grad_norm": 0.463818222284317, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 8559 + }, + { + "epoch": 0.9944815567818762, + "grad_norm": 0.46386709809303284, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 8560 + }, + { + "epoch": 0.9945977345338368, + "grad_norm": 0.4772911071777344, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 8561 + }, + { + "epoch": 0.9947139122857973, + "grad_norm": 0.47931206226348877, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 8562 + }, + { + "epoch": 0.9948300900377578, + "grad_norm": 0.5052580237388611, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 8563 + }, + { + "epoch": 0.9949462677897183, + "grad_norm": 0.46733200550079346, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 8564 + }, + { + "epoch": 0.9950624455416788, + "grad_norm": 0.5169939994812012, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 8565 + }, + { + "epoch": 0.9951786232936393, + "grad_norm": 0.5090740919113159, + "learning_rate": 0.0001, + "loss": 1.7412, + "step": 8566 + }, + { + "epoch": 0.9952948010455998, + "grad_norm": 0.4987412691116333, + "learning_rate": 0.0001, + "loss": 1.4844, + "step": 8567 + }, + { + "epoch": 0.9954109787975602, + "grad_norm": 0.44403526186943054, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 8568 + }, + { + "epoch": 0.9955271565495207, + "grad_norm": 0.5056188702583313, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 8569 + }, + { + "epoch": 0.9956433343014812, + "grad_norm": 0.4768769145011902, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 8570 + }, + { + "epoch": 0.9957595120534418, + "grad_norm": 0.5366693735122681, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 8571 + }, + { + "epoch": 0.9958756898054023, + "grad_norm": 0.49038922786712646, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 8572 + }, + { + "epoch": 0.9959918675573628, + "grad_norm": 0.4839872419834137, + "learning_rate": 0.0001, + "loss": 1.7012, + "step": 8573 + }, + { + "epoch": 0.9961080453093233, + "grad_norm": 0.4965837001800537, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 8574 + }, + { + "epoch": 0.9962242230612838, + "grad_norm": 0.4981982111930847, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 8575 + }, + { + "epoch": 0.9963404008132443, + "grad_norm": 0.47916489839553833, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 8576 + }, + { + "epoch": 0.9964565785652048, + "grad_norm": 0.5100079774856567, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 8577 + }, + { + "epoch": 0.9965727563171652, + "grad_norm": 0.46579378843307495, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 8578 + }, + { + "epoch": 0.9966889340691257, + "grad_norm": 0.4757480323314667, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 8579 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.49027368426322937, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 8580 + }, + { + "epoch": 0.9969212895730467, + "grad_norm": 0.4813475012779236, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 8581 + }, + { + "epoch": 0.9970374673250073, + "grad_norm": 0.4723621606826782, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 8582 + }, + { + "epoch": 0.9971536450769678, + "grad_norm": 0.476460337638855, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 8583 + }, + { + "epoch": 0.9972698228289283, + "grad_norm": 0.4537820518016815, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 8584 + }, + { + "epoch": 0.9973860005808888, + "grad_norm": 0.4895983636379242, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 8585 + }, + { + "epoch": 0.9975021783328493, + "grad_norm": 0.4980904161930084, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 8586 + }, + { + "epoch": 0.9976183560848098, + "grad_norm": 0.4741244912147522, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 8587 + }, + { + "epoch": 0.9977345338367702, + "grad_norm": 0.4763984680175781, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 8588 + }, + { + "epoch": 0.9978507115887307, + "grad_norm": 0.49783235788345337, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 8589 + }, + { + "epoch": 0.9979668893406912, + "grad_norm": 0.5360357165336609, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 8590 + }, + { + "epoch": 0.9980830670926517, + "grad_norm": 0.5079225301742554, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 8591 + }, + { + "epoch": 0.9981992448446123, + "grad_norm": 0.5056329965591431, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 8592 + }, + { + "epoch": 0.9983154225965728, + "grad_norm": 0.46236658096313477, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 8593 + }, + { + "epoch": 0.9984316003485333, + "grad_norm": 0.5396600365638733, + "learning_rate": 0.0001, + "loss": 1.807, + "step": 8594 + }, + { + "epoch": 0.9985477781004938, + "grad_norm": 0.46349719166755676, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 8595 + }, + { + "epoch": 0.9986639558524543, + "grad_norm": 0.5006660223007202, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 8596 + }, + { + "epoch": 0.9987801336044148, + "grad_norm": 0.4902550280094147, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 8597 + }, + { + "epoch": 0.9988963113563752, + "grad_norm": 0.48567691445350647, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 8598 + }, + { + "epoch": 0.9990124891083357, + "grad_norm": 0.47548311948776245, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 8599 + }, + { + "epoch": 0.9991286668602962, + "grad_norm": 0.47243523597717285, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 8600 + }, + { + "epoch": 0.9992448446122567, + "grad_norm": 0.4965050518512726, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 8601 + }, + { + "epoch": 0.9993610223642172, + "grad_norm": 0.5007203817367554, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 8602 + }, + { + "epoch": 0.9994772001161778, + "grad_norm": 0.48062342405319214, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 8603 + }, + { + "epoch": 0.9995933778681383, + "grad_norm": 0.5184653401374817, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 8604 + }, + { + "epoch": 0.9997095556200988, + "grad_norm": 0.4720516502857208, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 8605 + }, + { + "epoch": 0.9998257333720593, + "grad_norm": 0.4640704393386841, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 8606 + }, + { + "epoch": 0.9999419111240198, + "grad_norm": 0.4718928933143616, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 8607 + }, + { + "epoch": 1.0000580888759802, + "grad_norm": 0.4901491701602936, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 8608 + }, + { + "epoch": 1.0001742666279407, + "grad_norm": 0.5047919154167175, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 8609 + }, + { + "epoch": 1.0002904443799012, + "grad_norm": 0.5121291279792786, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 8610 + }, + { + "epoch": 1.0004066221318617, + "grad_norm": 0.46516066789627075, + "learning_rate": 0.0001, + "loss": 1.3715, + "step": 8611 + }, + { + "epoch": 1.0005227998838222, + "grad_norm": 0.4786158800125122, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 8612 + }, + { + "epoch": 1.0006389776357827, + "grad_norm": 0.507445752620697, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 8613 + }, + { + "epoch": 1.0007551553877432, + "grad_norm": 0.5036141872406006, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 8614 + }, + { + "epoch": 1.0008713331397037, + "grad_norm": 0.46509867906570435, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 8615 + }, + { + "epoch": 1.0009875108916642, + "grad_norm": 0.5023954510688782, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 8616 + }, + { + "epoch": 1.0011036886436246, + "grad_norm": 0.4837322235107422, + "learning_rate": 0.0001, + "loss": 1.3964, + "step": 8617 + }, + { + "epoch": 1.0012198663955854, + "grad_norm": 0.5714748501777649, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 8618 + }, + { + "epoch": 1.0013360441475458, + "grad_norm": 0.4612061381340027, + "learning_rate": 0.0001, + "loss": 1.3439, + "step": 8619 + }, + { + "epoch": 1.0014522218995063, + "grad_norm": 0.5135992765426636, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 8620 + }, + { + "epoch": 1.0015683996514668, + "grad_norm": 0.5336510539054871, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 8621 + }, + { + "epoch": 1.0016845774034273, + "grad_norm": 0.531232476234436, + "learning_rate": 0.0001, + "loss": 1.3025, + "step": 8622 + }, + { + "epoch": 1.0018007551553878, + "grad_norm": 0.5298891663551331, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 8623 + }, + { + "epoch": 1.0019169329073483, + "grad_norm": 0.5009737014770508, + "learning_rate": 0.0001, + "loss": 1.3511, + "step": 8624 + }, + { + "epoch": 1.0020331106593088, + "grad_norm": 0.5414840579032898, + "learning_rate": 0.0001, + "loss": 1.3387, + "step": 8625 + }, + { + "epoch": 1.0021492884112693, + "grad_norm": 0.5156089067459106, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 8626 + }, + { + "epoch": 1.0022654661632298, + "grad_norm": 0.48899945616722107, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 8627 + }, + { + "epoch": 1.0023816439151902, + "grad_norm": 0.5076591968536377, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 8628 + }, + { + "epoch": 1.0024978216671507, + "grad_norm": 0.5568975806236267, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 8629 + }, + { + "epoch": 1.0026139994191112, + "grad_norm": 0.5163382887840271, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 8630 + }, + { + "epoch": 1.0027301771710717, + "grad_norm": 0.5191821455955505, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 8631 + }, + { + "epoch": 1.0028463549230322, + "grad_norm": 0.5488196015357971, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 8632 + }, + { + "epoch": 1.0029625326749927, + "grad_norm": 0.49666348099708557, + "learning_rate": 0.0001, + "loss": 1.3315, + "step": 8633 + }, + { + "epoch": 1.0030787104269532, + "grad_norm": 0.5014142394065857, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 8634 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.48046401143074036, + "learning_rate": 0.0001, + "loss": 1.4761, + "step": 8635 + }, + { + "epoch": 1.0033110659308742, + "grad_norm": 0.6161479353904724, + "learning_rate": 0.0001, + "loss": 1.364, + "step": 8636 + }, + { + "epoch": 1.0034272436828346, + "grad_norm": 0.5767279267311096, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 8637 + }, + { + "epoch": 1.0035434214347951, + "grad_norm": 0.517012357711792, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 8638 + }, + { + "epoch": 1.0036595991867558, + "grad_norm": 0.5002815127372742, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 8639 + }, + { + "epoch": 1.0037757769387163, + "grad_norm": 0.5128475427627563, + "learning_rate": 0.0001, + "loss": 1.3022, + "step": 8640 + }, + { + "epoch": 1.0038919546906768, + "grad_norm": 0.5304717421531677, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 8641 + }, + { + "epoch": 1.0040081324426373, + "grad_norm": 0.4929862320423126, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 8642 + }, + { + "epoch": 1.0041243101945978, + "grad_norm": 0.5075370669364929, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 8643 + }, + { + "epoch": 1.0042404879465583, + "grad_norm": 0.5167858004570007, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 8644 + }, + { + "epoch": 1.0043566656985188, + "grad_norm": 0.45497381687164307, + "learning_rate": 0.0001, + "loss": 1.2107, + "step": 8645 + }, + { + "epoch": 1.0044728434504793, + "grad_norm": 0.5639305710792542, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 8646 + }, + { + "epoch": 1.0045890212024398, + "grad_norm": 0.558202862739563, + "learning_rate": 0.0001, + "loss": 1.5464, + "step": 8647 + }, + { + "epoch": 1.0047051989544002, + "grad_norm": 0.5788266658782959, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 8648 + }, + { + "epoch": 1.0048213767063607, + "grad_norm": 0.5463119745254517, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 8649 + }, + { + "epoch": 1.0049375544583212, + "grad_norm": 0.5739423632621765, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 8650 + }, + { + "epoch": 1.0050537322102817, + "grad_norm": 0.5138026475906372, + "learning_rate": 0.0001, + "loss": 1.366, + "step": 8651 + }, + { + "epoch": 1.0051699099622422, + "grad_norm": 0.5081404447555542, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 8652 + }, + { + "epoch": 1.0052860877142027, + "grad_norm": 0.5575280785560608, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 8653 + }, + { + "epoch": 1.0054022654661632, + "grad_norm": 0.49208971858024597, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 8654 + }, + { + "epoch": 1.0055184432181237, + "grad_norm": 0.4909859001636505, + "learning_rate": 0.0001, + "loss": 1.3758, + "step": 8655 + }, + { + "epoch": 1.0056346209700842, + "grad_norm": 0.5197198987007141, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 8656 + }, + { + "epoch": 1.0057507987220446, + "grad_norm": 0.4799864888191223, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 8657 + }, + { + "epoch": 1.0058669764740051, + "grad_norm": 0.4831238389015198, + "learning_rate": 0.0001, + "loss": 1.3808, + "step": 8658 + }, + { + "epoch": 1.0059831542259656, + "grad_norm": 0.47417446970939636, + "learning_rate": 0.0001, + "loss": 1.3522, + "step": 8659 + }, + { + "epoch": 1.0060993319779263, + "grad_norm": 0.48708805441856384, + "learning_rate": 0.0001, + "loss": 1.4166, + "step": 8660 + }, + { + "epoch": 1.0062155097298868, + "grad_norm": 0.5076435208320618, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 8661 + }, + { + "epoch": 1.0063316874818473, + "grad_norm": 0.4805087745189667, + "learning_rate": 0.0001, + "loss": 1.3286, + "step": 8662 + }, + { + "epoch": 1.0064478652338078, + "grad_norm": 0.5486977696418762, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 8663 + }, + { + "epoch": 1.0065640429857683, + "grad_norm": 0.5629132390022278, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 8664 + }, + { + "epoch": 1.0066802207377288, + "grad_norm": 0.5157844424247742, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 8665 + }, + { + "epoch": 1.0067963984896893, + "grad_norm": 0.5176374316215515, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 8666 + }, + { + "epoch": 1.0069125762416498, + "grad_norm": 0.5113467574119568, + "learning_rate": 0.0001, + "loss": 1.3973, + "step": 8667 + }, + { + "epoch": 1.0070287539936102, + "grad_norm": 0.5078527331352234, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 8668 + }, + { + "epoch": 1.0071449317455707, + "grad_norm": 0.5093343257904053, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 8669 + }, + { + "epoch": 1.0072611094975312, + "grad_norm": 0.5552733540534973, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 8670 + }, + { + "epoch": 1.0073772872494917, + "grad_norm": 0.5340274572372437, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 8671 + }, + { + "epoch": 1.0074934650014522, + "grad_norm": 0.5053258538246155, + "learning_rate": 0.0001, + "loss": 1.3473, + "step": 8672 + }, + { + "epoch": 1.0076096427534127, + "grad_norm": 0.5026618838310242, + "learning_rate": 0.0001, + "loss": 1.3966, + "step": 8673 + }, + { + "epoch": 1.0077258205053732, + "grad_norm": 0.6316371560096741, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 8674 + }, + { + "epoch": 1.0078419982573337, + "grad_norm": 0.48417550325393677, + "learning_rate": 0.0001, + "loss": 1.3751, + "step": 8675 + }, + { + "epoch": 1.0079581760092942, + "grad_norm": 0.4940205514431, + "learning_rate": 0.0001, + "loss": 1.4245, + "step": 8676 + }, + { + "epoch": 1.0080743537612546, + "grad_norm": 0.5635389089584351, + "learning_rate": 0.0001, + "loss": 1.4426, + "step": 8677 + }, + { + "epoch": 1.0081905315132151, + "grad_norm": 0.5025110840797424, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 8678 + }, + { + "epoch": 1.0083067092651756, + "grad_norm": 0.5255551338195801, + "learning_rate": 0.0001, + "loss": 1.347, + "step": 8679 + }, + { + "epoch": 1.0084228870171361, + "grad_norm": 0.5457799434661865, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 8680 + }, + { + "epoch": 1.0085390647690968, + "grad_norm": 0.5311607122421265, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 8681 + }, + { + "epoch": 1.0086552425210573, + "grad_norm": 0.5294631719589233, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 8682 + }, + { + "epoch": 1.0087714202730178, + "grad_norm": 0.5186182856559753, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 8683 + }, + { + "epoch": 1.0088875980249783, + "grad_norm": 0.502144455909729, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 8684 + }, + { + "epoch": 1.0090037757769388, + "grad_norm": 0.49139168858528137, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 8685 + }, + { + "epoch": 1.0091199535288993, + "grad_norm": 0.5063096284866333, + "learning_rate": 0.0001, + "loss": 1.3168, + "step": 8686 + }, + { + "epoch": 1.0092361312808598, + "grad_norm": 0.539588212966919, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 8687 + }, + { + "epoch": 1.0093523090328202, + "grad_norm": 0.5028448104858398, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 8688 + }, + { + "epoch": 1.0094684867847807, + "grad_norm": 0.5191101431846619, + "learning_rate": 0.0001, + "loss": 1.4249, + "step": 8689 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 0.5024988055229187, + "learning_rate": 0.0001, + "loss": 1.3312, + "step": 8690 + }, + { + "epoch": 1.0097008422887017, + "grad_norm": 0.5392231941223145, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 8691 + }, + { + "epoch": 1.0098170200406622, + "grad_norm": 0.5703687071800232, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 8692 + }, + { + "epoch": 1.0099331977926227, + "grad_norm": 0.5280400514602661, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 8693 + }, + { + "epoch": 1.0100493755445832, + "grad_norm": 0.5094478726387024, + "learning_rate": 0.0001, + "loss": 1.4625, + "step": 8694 + }, + { + "epoch": 1.0101655532965437, + "grad_norm": 0.5534915328025818, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 8695 + }, + { + "epoch": 1.0102817310485042, + "grad_norm": 0.49775946140289307, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 8696 + }, + { + "epoch": 1.0103979088004647, + "grad_norm": 0.5414407253265381, + "learning_rate": 0.0001, + "loss": 1.3478, + "step": 8697 + }, + { + "epoch": 1.0105140865524251, + "grad_norm": 0.5187419652938843, + "learning_rate": 0.0001, + "loss": 1.3347, + "step": 8698 + }, + { + "epoch": 1.0106302643043856, + "grad_norm": 0.5660606026649475, + "learning_rate": 0.0001, + "loss": 1.4956, + "step": 8699 + }, + { + "epoch": 1.0107464420563461, + "grad_norm": 0.5465399026870728, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 8700 + }, + { + "epoch": 1.0108626198083066, + "grad_norm": 0.5141976475715637, + "learning_rate": 0.0001, + "loss": 1.3697, + "step": 8701 + }, + { + "epoch": 1.0109787975602673, + "grad_norm": 0.5536482930183411, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 8702 + }, + { + "epoch": 1.0110949753122278, + "grad_norm": 0.5275995135307312, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 8703 + }, + { + "epoch": 1.0112111530641883, + "grad_norm": 0.5400773286819458, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 8704 + }, + { + "epoch": 1.0113273308161488, + "grad_norm": 0.5452625751495361, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 8705 + }, + { + "epoch": 1.0114435085681093, + "grad_norm": 0.5686435699462891, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 8706 + }, + { + "epoch": 1.0115596863200698, + "grad_norm": 0.5052348971366882, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 8707 + }, + { + "epoch": 1.0116758640720303, + "grad_norm": 0.5007266998291016, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 8708 + }, + { + "epoch": 1.0117920418239907, + "grad_norm": 0.5304220914840698, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 8709 + }, + { + "epoch": 1.0119082195759512, + "grad_norm": 0.5678626298904419, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 8710 + }, + { + "epoch": 1.0120243973279117, + "grad_norm": 0.5126900672912598, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 8711 + }, + { + "epoch": 1.0121405750798722, + "grad_norm": 0.543702244758606, + "learning_rate": 0.0001, + "loss": 1.404, + "step": 8712 + }, + { + "epoch": 1.0122567528318327, + "grad_norm": 0.5045793056488037, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 8713 + }, + { + "epoch": 1.0123729305837932, + "grad_norm": 0.496380478143692, + "learning_rate": 0.0001, + "loss": 1.3823, + "step": 8714 + }, + { + "epoch": 1.0124891083357537, + "grad_norm": 0.5149445533752441, + "learning_rate": 0.0001, + "loss": 1.2453, + "step": 8715 + }, + { + "epoch": 1.0126052860877142, + "grad_norm": 0.5445921421051025, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 8716 + }, + { + "epoch": 1.0127214638396747, + "grad_norm": 0.536666214466095, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 8717 + }, + { + "epoch": 1.0128376415916351, + "grad_norm": 0.49998173117637634, + "learning_rate": 0.0001, + "loss": 1.4359, + "step": 8718 + }, + { + "epoch": 1.0129538193435956, + "grad_norm": 0.5400097966194153, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 8719 + }, + { + "epoch": 1.0130699970955561, + "grad_norm": 0.5498408079147339, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 8720 + }, + { + "epoch": 1.0131861748475166, + "grad_norm": 0.4792373478412628, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 8721 + }, + { + "epoch": 1.013302352599477, + "grad_norm": 0.49711140990257263, + "learning_rate": 0.0001, + "loss": 1.4074, + "step": 8722 + }, + { + "epoch": 1.0134185303514378, + "grad_norm": 0.5019938349723816, + "learning_rate": 0.0001, + "loss": 1.3793, + "step": 8723 + }, + { + "epoch": 1.0135347081033983, + "grad_norm": 0.4867321252822876, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 8724 + }, + { + "epoch": 1.0136508858553588, + "grad_norm": 0.5257205367088318, + "learning_rate": 0.0001, + "loss": 1.4509, + "step": 8725 + }, + { + "epoch": 1.0137670636073193, + "grad_norm": 0.5636011362075806, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 8726 + }, + { + "epoch": 1.0138832413592798, + "grad_norm": 0.5135155916213989, + "learning_rate": 0.0001, + "loss": 1.2232, + "step": 8727 + }, + { + "epoch": 1.0139994191112403, + "grad_norm": 0.524466872215271, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 8728 + }, + { + "epoch": 1.0141155968632007, + "grad_norm": 0.5000982284545898, + "learning_rate": 0.0001, + "loss": 1.3986, + "step": 8729 + }, + { + "epoch": 1.0142317746151612, + "grad_norm": 0.5312855243682861, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 8730 + }, + { + "epoch": 1.0143479523671217, + "grad_norm": 0.530696451663971, + "learning_rate": 0.0001, + "loss": 1.5603, + "step": 8731 + }, + { + "epoch": 1.0144641301190822, + "grad_norm": 0.5051412582397461, + "learning_rate": 0.0001, + "loss": 1.2977, + "step": 8732 + }, + { + "epoch": 1.0145803078710427, + "grad_norm": 0.5274531245231628, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 8733 + }, + { + "epoch": 1.0146964856230032, + "grad_norm": 0.5309851169586182, + "learning_rate": 0.0001, + "loss": 1.47, + "step": 8734 + }, + { + "epoch": 1.0148126633749637, + "grad_norm": 0.575518012046814, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 8735 + }, + { + "epoch": 1.0149288411269242, + "grad_norm": 0.5956768989562988, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 8736 + }, + { + "epoch": 1.0150450188788847, + "grad_norm": 0.532591700553894, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 8737 + }, + { + "epoch": 1.0151611966308451, + "grad_norm": 0.534724235534668, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 8738 + }, + { + "epoch": 1.0152773743828056, + "grad_norm": 0.5360231995582581, + "learning_rate": 0.0001, + "loss": 1.2831, + "step": 8739 + }, + { + "epoch": 1.0153935521347661, + "grad_norm": 0.557018518447876, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 8740 + }, + { + "epoch": 1.0155097298867266, + "grad_norm": 0.6012376546859741, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 8741 + }, + { + "epoch": 1.015625907638687, + "grad_norm": 0.4950985908508301, + "learning_rate": 0.0001, + "loss": 1.3743, + "step": 8742 + }, + { + "epoch": 1.0157420853906476, + "grad_norm": 0.5329170227050781, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 8743 + }, + { + "epoch": 1.0158582631426083, + "grad_norm": 0.5413834452629089, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 8744 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 0.5318317413330078, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 8745 + }, + { + "epoch": 1.0160906186465293, + "grad_norm": 0.49791741371154785, + "learning_rate": 0.0001, + "loss": 1.3419, + "step": 8746 + }, + { + "epoch": 1.0162067963984898, + "grad_norm": 0.5165941119194031, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 8747 + }, + { + "epoch": 1.0163229741504503, + "grad_norm": 0.5537638068199158, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 8748 + }, + { + "epoch": 1.0164391519024107, + "grad_norm": 0.5026431083679199, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 8749 + }, + { + "epoch": 1.0165553296543712, + "grad_norm": 0.5347225069999695, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 8750 + }, + { + "epoch": 1.0166715074063317, + "grad_norm": 0.5551308393478394, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 8751 + }, + { + "epoch": 1.0167876851582922, + "grad_norm": 0.5152553915977478, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 8752 + }, + { + "epoch": 1.0169038629102527, + "grad_norm": 0.5854957699775696, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 8753 + }, + { + "epoch": 1.0170200406622132, + "grad_norm": 0.5672193169593811, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 8754 + }, + { + "epoch": 1.0171362184141737, + "grad_norm": 0.5420659184455872, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 8755 + }, + { + "epoch": 1.0172523961661342, + "grad_norm": 0.5565413236618042, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 8756 + }, + { + "epoch": 1.0173685739180947, + "grad_norm": 0.49962347745895386, + "learning_rate": 0.0001, + "loss": 1.4468, + "step": 8757 + }, + { + "epoch": 1.0174847516700551, + "grad_norm": 0.5187664031982422, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 8758 + }, + { + "epoch": 1.0176009294220156, + "grad_norm": 0.5551736354827881, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 8759 + }, + { + "epoch": 1.0177171071739761, + "grad_norm": 0.5217881202697754, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 8760 + }, + { + "epoch": 1.0178332849259366, + "grad_norm": 0.5211256742477417, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 8761 + }, + { + "epoch": 1.017949462677897, + "grad_norm": 0.5577038526535034, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 8762 + }, + { + "epoch": 1.0180656404298576, + "grad_norm": 0.5176113247871399, + "learning_rate": 0.0001, + "loss": 1.3198, + "step": 8763 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 0.5276638865470886, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 8764 + }, + { + "epoch": 1.0182979959337788, + "grad_norm": 0.5093098282814026, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 8765 + }, + { + "epoch": 1.0184141736857393, + "grad_norm": 0.5525769591331482, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 8766 + }, + { + "epoch": 1.0185303514376998, + "grad_norm": 0.4998970329761505, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 8767 + }, + { + "epoch": 1.0186465291896603, + "grad_norm": 0.5910646915435791, + "learning_rate": 0.0001, + "loss": 1.7389, + "step": 8768 + }, + { + "epoch": 1.0187627069416207, + "grad_norm": 0.5479773879051208, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 8769 + }, + { + "epoch": 1.0188788846935812, + "grad_norm": 0.5090250968933105, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 8770 + }, + { + "epoch": 1.0189950624455417, + "grad_norm": 0.5310608148574829, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 8771 + }, + { + "epoch": 1.0191112401975022, + "grad_norm": 0.4970023036003113, + "learning_rate": 0.0001, + "loss": 1.3572, + "step": 8772 + }, + { + "epoch": 1.0192274179494627, + "grad_norm": 0.504270613193512, + "learning_rate": 0.0001, + "loss": 1.373, + "step": 8773 + }, + { + "epoch": 1.0193435957014232, + "grad_norm": 0.49610963463783264, + "learning_rate": 0.0001, + "loss": 1.3157, + "step": 8774 + }, + { + "epoch": 1.0194597734533837, + "grad_norm": 0.5968664884567261, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 8775 + }, + { + "epoch": 1.0195759512053442, + "grad_norm": 0.4930863082408905, + "learning_rate": 0.0001, + "loss": 1.4478, + "step": 8776 + }, + { + "epoch": 1.0196921289573047, + "grad_norm": 0.5297977924346924, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 8777 + }, + { + "epoch": 1.0198083067092651, + "grad_norm": 0.5384182333946228, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 8778 + }, + { + "epoch": 1.0199244844612256, + "grad_norm": 0.5289637446403503, + "learning_rate": 0.0001, + "loss": 1.3421, + "step": 8779 + }, + { + "epoch": 1.0200406622131861, + "grad_norm": 0.4995555877685547, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 8780 + }, + { + "epoch": 1.0201568399651466, + "grad_norm": 0.5270565748214722, + "learning_rate": 0.0001, + "loss": 1.4003, + "step": 8781 + }, + { + "epoch": 1.020273017717107, + "grad_norm": 0.5530638694763184, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 8782 + }, + { + "epoch": 1.0203891954690676, + "grad_norm": 0.519996702671051, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 8783 + }, + { + "epoch": 1.020505373221028, + "grad_norm": 0.5551260113716125, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 8784 + }, + { + "epoch": 1.0206215509729886, + "grad_norm": 0.5324636697769165, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 8785 + }, + { + "epoch": 1.0207377287249493, + "grad_norm": 0.5127756595611572, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 8786 + }, + { + "epoch": 1.0208539064769098, + "grad_norm": 0.5320945978164673, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 8787 + }, + { + "epoch": 1.0209700842288703, + "grad_norm": 0.5293431282043457, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 8788 + }, + { + "epoch": 1.0210862619808307, + "grad_norm": 0.5258355140686035, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 8789 + }, + { + "epoch": 1.0212024397327912, + "grad_norm": 0.6581287980079651, + "learning_rate": 0.0001, + "loss": 1.3883, + "step": 8790 + }, + { + "epoch": 1.0213186174847517, + "grad_norm": 0.5154805779457092, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 8791 + }, + { + "epoch": 1.0214347952367122, + "grad_norm": 0.48728328943252563, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 8792 + }, + { + "epoch": 1.0215509729886727, + "grad_norm": 0.5164970755577087, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 8793 + }, + { + "epoch": 1.0216671507406332, + "grad_norm": 0.5162052512168884, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 8794 + }, + { + "epoch": 1.0217833284925937, + "grad_norm": 0.5044278502464294, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 8795 + }, + { + "epoch": 1.0218995062445542, + "grad_norm": 0.5441913604736328, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 8796 + }, + { + "epoch": 1.0220156839965147, + "grad_norm": 0.4957961440086365, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 8797 + }, + { + "epoch": 1.0221318617484751, + "grad_norm": 0.49223870038986206, + "learning_rate": 0.0001, + "loss": 1.2865, + "step": 8798 + }, + { + "epoch": 1.0222480395004356, + "grad_norm": 0.5398341417312622, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 8799 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 0.5927699208259583, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 8800 + }, + { + "epoch": 1.0224803950043566, + "grad_norm": 0.5420244932174683, + "learning_rate": 0.0001, + "loss": 1.3955, + "step": 8801 + }, + { + "epoch": 1.022596572756317, + "grad_norm": 0.5336340069770813, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 8802 + }, + { + "epoch": 1.0227127505082776, + "grad_norm": 0.5515368580818176, + "learning_rate": 0.0001, + "loss": 1.4957, + "step": 8803 + }, + { + "epoch": 1.022828928260238, + "grad_norm": 0.5734601020812988, + "learning_rate": 0.0001, + "loss": 1.5024, + "step": 8804 + }, + { + "epoch": 1.0229451060121986, + "grad_norm": 0.4861997067928314, + "learning_rate": 0.0001, + "loss": 1.346, + "step": 8805 + }, + { + "epoch": 1.023061283764159, + "grad_norm": 0.501109778881073, + "learning_rate": 0.0001, + "loss": 1.3825, + "step": 8806 + }, + { + "epoch": 1.0231774615161198, + "grad_norm": 0.5470065474510193, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 8807 + }, + { + "epoch": 1.0232936392680803, + "grad_norm": 0.5535728335380554, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 8808 + }, + { + "epoch": 1.0234098170200407, + "grad_norm": 0.555833101272583, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 8809 + }, + { + "epoch": 1.0235259947720012, + "grad_norm": 0.48432549834251404, + "learning_rate": 0.0001, + "loss": 1.2699, + "step": 8810 + }, + { + "epoch": 1.0236421725239617, + "grad_norm": 0.5172594785690308, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 8811 + }, + { + "epoch": 1.0237583502759222, + "grad_norm": 0.5628485083580017, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 8812 + }, + { + "epoch": 1.0238745280278827, + "grad_norm": 0.6112015247344971, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 8813 + }, + { + "epoch": 1.0239907057798432, + "grad_norm": 0.5038037896156311, + "learning_rate": 0.0001, + "loss": 1.4484, + "step": 8814 + }, + { + "epoch": 1.0241068835318037, + "grad_norm": 0.5269456505775452, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 8815 + }, + { + "epoch": 1.0242230612837642, + "grad_norm": 0.48238101601600647, + "learning_rate": 0.0001, + "loss": 1.3186, + "step": 8816 + }, + { + "epoch": 1.0243392390357247, + "grad_norm": 0.5162169337272644, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 8817 + }, + { + "epoch": 1.0244554167876851, + "grad_norm": 0.5213837027549744, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 8818 + }, + { + "epoch": 1.0245715945396456, + "grad_norm": 0.5037201642990112, + "learning_rate": 0.0001, + "loss": 1.2898, + "step": 8819 + }, + { + "epoch": 1.0246877722916061, + "grad_norm": 0.517815113067627, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 8820 + }, + { + "epoch": 1.0248039500435666, + "grad_norm": 0.5647580027580261, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 8821 + }, + { + "epoch": 1.024920127795527, + "grad_norm": 0.549393355846405, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 8822 + }, + { + "epoch": 1.0250363055474876, + "grad_norm": 0.5423721075057983, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 8823 + }, + { + "epoch": 1.025152483299448, + "grad_norm": 0.5369595289230347, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 8824 + }, + { + "epoch": 1.0252686610514086, + "grad_norm": 0.6080834269523621, + "learning_rate": 0.0001, + "loss": 1.7438, + "step": 8825 + }, + { + "epoch": 1.025384838803369, + "grad_norm": 0.5140199065208435, + "learning_rate": 0.0001, + "loss": 1.3756, + "step": 8826 + }, + { + "epoch": 1.0255010165553295, + "grad_norm": 0.5014476776123047, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 8827 + }, + { + "epoch": 1.0256171943072903, + "grad_norm": 0.5367702841758728, + "learning_rate": 0.0001, + "loss": 1.3651, + "step": 8828 + }, + { + "epoch": 1.0257333720592507, + "grad_norm": 0.5167638659477234, + "learning_rate": 0.0001, + "loss": 1.329, + "step": 8829 + }, + { + "epoch": 1.0258495498112112, + "grad_norm": 0.5390440225601196, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 8830 + }, + { + "epoch": 1.0259657275631717, + "grad_norm": 0.5453746914863586, + "learning_rate": 0.0001, + "loss": 1.4783, + "step": 8831 + }, + { + "epoch": 1.0260819053151322, + "grad_norm": 0.5189757943153381, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 8832 + }, + { + "epoch": 1.0261980830670927, + "grad_norm": 0.5460232496261597, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 8833 + }, + { + "epoch": 1.0263142608190532, + "grad_norm": 0.4965575337409973, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 8834 + }, + { + "epoch": 1.0264304385710137, + "grad_norm": 0.5070030689239502, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 8835 + }, + { + "epoch": 1.0265466163229742, + "grad_norm": 0.5291348099708557, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 8836 + }, + { + "epoch": 1.0266627940749347, + "grad_norm": 0.49816176295280457, + "learning_rate": 0.0001, + "loss": 1.3712, + "step": 8837 + }, + { + "epoch": 1.0267789718268951, + "grad_norm": 0.5235051512718201, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 8838 + }, + { + "epoch": 1.0268951495788556, + "grad_norm": 0.5131754875183105, + "learning_rate": 0.0001, + "loss": 1.336, + "step": 8839 + }, + { + "epoch": 1.0270113273308161, + "grad_norm": 0.5183280110359192, + "learning_rate": 0.0001, + "loss": 1.4141, + "step": 8840 + }, + { + "epoch": 1.0271275050827766, + "grad_norm": 0.5118065476417542, + "learning_rate": 0.0001, + "loss": 1.4272, + "step": 8841 + }, + { + "epoch": 1.027243682834737, + "grad_norm": 0.5284550189971924, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 8842 + }, + { + "epoch": 1.0273598605866976, + "grad_norm": 0.5649248957633972, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 8843 + }, + { + "epoch": 1.027476038338658, + "grad_norm": 0.5072284936904907, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 8844 + }, + { + "epoch": 1.0275922160906186, + "grad_norm": 0.5180877447128296, + "learning_rate": 0.0001, + "loss": 1.3545, + "step": 8845 + }, + { + "epoch": 1.027708393842579, + "grad_norm": 0.5194306373596191, + "learning_rate": 0.0001, + "loss": 1.4195, + "step": 8846 + }, + { + "epoch": 1.0278245715945395, + "grad_norm": 0.5667744278907776, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 8847 + }, + { + "epoch": 1.0279407493465, + "grad_norm": 0.5082692503929138, + "learning_rate": 0.0001, + "loss": 1.5024, + "step": 8848 + }, + { + "epoch": 1.0280569270984607, + "grad_norm": 0.6028507947921753, + "learning_rate": 0.0001, + "loss": 1.4596, + "step": 8849 + }, + { + "epoch": 1.0281731048504212, + "grad_norm": 0.5387246012687683, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 8850 + }, + { + "epoch": 1.0282892826023817, + "grad_norm": 0.4912740886211395, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 8851 + }, + { + "epoch": 1.0284054603543422, + "grad_norm": 0.5244132876396179, + "learning_rate": 0.0001, + "loss": 1.3191, + "step": 8852 + }, + { + "epoch": 1.0285216381063027, + "grad_norm": 0.5031388998031616, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 8853 + }, + { + "epoch": 1.0286378158582632, + "grad_norm": 0.544710636138916, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 8854 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 0.54068523645401, + "learning_rate": 0.0001, + "loss": 1.3927, + "step": 8855 + }, + { + "epoch": 1.0288701713621842, + "grad_norm": 0.5172038078308105, + "learning_rate": 0.0001, + "loss": 1.4375, + "step": 8856 + }, + { + "epoch": 1.0289863491141447, + "grad_norm": 0.49291446805000305, + "learning_rate": 0.0001, + "loss": 1.3361, + "step": 8857 + }, + { + "epoch": 1.0291025268661051, + "grad_norm": 0.5305606722831726, + "learning_rate": 0.0001, + "loss": 1.4012, + "step": 8858 + }, + { + "epoch": 1.0292187046180656, + "grad_norm": 0.4981054365634918, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 8859 + }, + { + "epoch": 1.0293348823700261, + "grad_norm": 0.510947585105896, + "learning_rate": 0.0001, + "loss": 1.3286, + "step": 8860 + }, + { + "epoch": 1.0294510601219866, + "grad_norm": 0.5207487940788269, + "learning_rate": 0.0001, + "loss": 1.4023, + "step": 8861 + }, + { + "epoch": 1.029567237873947, + "grad_norm": 0.5454730987548828, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 8862 + }, + { + "epoch": 1.0296834156259076, + "grad_norm": 0.5361663699150085, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 8863 + }, + { + "epoch": 1.029799593377868, + "grad_norm": 0.5152962803840637, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 8864 + }, + { + "epoch": 1.0299157711298286, + "grad_norm": 0.5426158905029297, + "learning_rate": 0.0001, + "loss": 1.2973, + "step": 8865 + }, + { + "epoch": 1.030031948881789, + "grad_norm": 0.5150105357170105, + "learning_rate": 0.0001, + "loss": 1.4212, + "step": 8866 + }, + { + "epoch": 1.0301481266337496, + "grad_norm": 0.5968256592750549, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 8867 + }, + { + "epoch": 1.03026430438571, + "grad_norm": 0.4991264343261719, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 8868 + }, + { + "epoch": 1.0303804821376705, + "grad_norm": 0.5225552916526794, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 8869 + }, + { + "epoch": 1.0304966598896312, + "grad_norm": 0.5236073732376099, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 8870 + }, + { + "epoch": 1.0306128376415917, + "grad_norm": 0.5573896169662476, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 8871 + }, + { + "epoch": 1.0307290153935522, + "grad_norm": 0.47595614194869995, + "learning_rate": 0.0001, + "loss": 1.312, + "step": 8872 + }, + { + "epoch": 1.0308451931455127, + "grad_norm": 0.5099783539772034, + "learning_rate": 0.0001, + "loss": 1.3819, + "step": 8873 + }, + { + "epoch": 1.0309613708974732, + "grad_norm": 0.5189265608787537, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 8874 + }, + { + "epoch": 1.0310775486494337, + "grad_norm": 0.567966103553772, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 8875 + }, + { + "epoch": 1.0311937264013942, + "grad_norm": 0.5577686429023743, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 8876 + }, + { + "epoch": 1.0313099041533547, + "grad_norm": 0.5853254795074463, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 8877 + }, + { + "epoch": 1.0314260819053152, + "grad_norm": 0.5423609018325806, + "learning_rate": 0.0001, + "loss": 1.4577, + "step": 8878 + }, + { + "epoch": 1.0315422596572756, + "grad_norm": 0.5053103566169739, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 8879 + }, + { + "epoch": 1.0316584374092361, + "grad_norm": 0.5344913601875305, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 8880 + }, + { + "epoch": 1.0317746151611966, + "grad_norm": 0.5342625379562378, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 8881 + }, + { + "epoch": 1.031890792913157, + "grad_norm": 0.48883843421936035, + "learning_rate": 0.0001, + "loss": 1.2327, + "step": 8882 + }, + { + "epoch": 1.0320069706651176, + "grad_norm": 0.49293336272239685, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 8883 + }, + { + "epoch": 1.032123148417078, + "grad_norm": 0.4973764717578888, + "learning_rate": 0.0001, + "loss": 1.1467, + "step": 8884 + }, + { + "epoch": 1.0322393261690386, + "grad_norm": 0.49885573983192444, + "learning_rate": 0.0001, + "loss": 1.4539, + "step": 8885 + }, + { + "epoch": 1.032355503920999, + "grad_norm": 0.5181958079338074, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 8886 + }, + { + "epoch": 1.0324716816729596, + "grad_norm": 0.5104524493217468, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 8887 + }, + { + "epoch": 1.03258785942492, + "grad_norm": 0.5194922089576721, + "learning_rate": 0.0001, + "loss": 1.4931, + "step": 8888 + }, + { + "epoch": 1.0327040371768805, + "grad_norm": 0.5585922002792358, + "learning_rate": 0.0001, + "loss": 1.4493, + "step": 8889 + }, + { + "epoch": 1.032820214928841, + "grad_norm": 0.49765944480895996, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 8890 + }, + { + "epoch": 1.0329363926808017, + "grad_norm": 0.5335497260093689, + "learning_rate": 0.0001, + "loss": 1.3586, + "step": 8891 + }, + { + "epoch": 1.0330525704327622, + "grad_norm": 0.5823356509208679, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 8892 + }, + { + "epoch": 1.0331687481847227, + "grad_norm": 0.5257899165153503, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 8893 + }, + { + "epoch": 1.0332849259366832, + "grad_norm": 0.5384433269500732, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 8894 + }, + { + "epoch": 1.0334011036886437, + "grad_norm": 0.5588715672492981, + "learning_rate": 0.0001, + "loss": 1.3805, + "step": 8895 + }, + { + "epoch": 1.0335172814406042, + "grad_norm": 0.5330429673194885, + "learning_rate": 0.0001, + "loss": 1.482, + "step": 8896 + }, + { + "epoch": 1.0336334591925647, + "grad_norm": 0.515548050403595, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 8897 + }, + { + "epoch": 1.0337496369445252, + "grad_norm": 0.5171681046485901, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 8898 + }, + { + "epoch": 1.0338658146964856, + "grad_norm": 0.5828492045402527, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 8899 + }, + { + "epoch": 1.0339819924484461, + "grad_norm": 0.4846728444099426, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 8900 + }, + { + "epoch": 1.0340981702004066, + "grad_norm": 0.5182422995567322, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 8901 + }, + { + "epoch": 1.034214347952367, + "grad_norm": 0.5431578755378723, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 8902 + }, + { + "epoch": 1.0343305257043276, + "grad_norm": 0.5584523677825928, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 8903 + }, + { + "epoch": 1.034446703456288, + "grad_norm": 0.5252351760864258, + "learning_rate": 0.0001, + "loss": 1.269, + "step": 8904 + }, + { + "epoch": 1.0345628812082486, + "grad_norm": 0.575616717338562, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 8905 + }, + { + "epoch": 1.034679058960209, + "grad_norm": 0.6257792115211487, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 8906 + }, + { + "epoch": 1.0347952367121696, + "grad_norm": 0.532507598400116, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 8907 + }, + { + "epoch": 1.03491141446413, + "grad_norm": 0.5430809855461121, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 8908 + }, + { + "epoch": 1.0350275922160905, + "grad_norm": 0.5150136351585388, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 8909 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 0.5215170383453369, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 8910 + }, + { + "epoch": 1.0352599477200117, + "grad_norm": 0.5841111540794373, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 8911 + }, + { + "epoch": 1.0353761254719722, + "grad_norm": 0.5070576667785645, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 8912 + }, + { + "epoch": 1.0354923032239327, + "grad_norm": 0.5849988460540771, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 8913 + }, + { + "epoch": 1.0356084809758932, + "grad_norm": 0.5653141736984253, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 8914 + }, + { + "epoch": 1.0357246587278537, + "grad_norm": 0.5517808198928833, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 8915 + }, + { + "epoch": 1.0358408364798142, + "grad_norm": 0.5292582511901855, + "learning_rate": 0.0001, + "loss": 1.3979, + "step": 8916 + }, + { + "epoch": 1.0359570142317747, + "grad_norm": 0.5182827711105347, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 8917 + }, + { + "epoch": 1.0360731919837352, + "grad_norm": 0.5552460551261902, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 8918 + }, + { + "epoch": 1.0361893697356956, + "grad_norm": 0.5514026880264282, + "learning_rate": 0.0001, + "loss": 1.3159, + "step": 8919 + }, + { + "epoch": 1.0363055474876561, + "grad_norm": 0.5646169185638428, + "learning_rate": 0.0001, + "loss": 1.4833, + "step": 8920 + }, + { + "epoch": 1.0364217252396166, + "grad_norm": 0.5406813621520996, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 8921 + }, + { + "epoch": 1.036537902991577, + "grad_norm": 0.5190636515617371, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 8922 + }, + { + "epoch": 1.0366540807435376, + "grad_norm": 0.538845956325531, + "learning_rate": 0.0001, + "loss": 1.6992, + "step": 8923 + }, + { + "epoch": 1.036770258495498, + "grad_norm": 0.5209506154060364, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 8924 + }, + { + "epoch": 1.0368864362474586, + "grad_norm": 0.5400375127792358, + "learning_rate": 0.0001, + "loss": 1.332, + "step": 8925 + }, + { + "epoch": 1.037002613999419, + "grad_norm": 0.5286649465560913, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 8926 + }, + { + "epoch": 1.0371187917513796, + "grad_norm": 0.5743568539619446, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 8927 + }, + { + "epoch": 1.03723496950334, + "grad_norm": 0.5251564979553223, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 8928 + }, + { + "epoch": 1.0373511472553005, + "grad_norm": 0.5093726515769958, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 8929 + }, + { + "epoch": 1.037467325007261, + "grad_norm": 0.5467512011528015, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 8930 + }, + { + "epoch": 1.0375835027592215, + "grad_norm": 0.5583242774009705, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 8931 + }, + { + "epoch": 1.037699680511182, + "grad_norm": 0.5613623857498169, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 8932 + }, + { + "epoch": 1.0378158582631427, + "grad_norm": 0.5468906760215759, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 8933 + }, + { + "epoch": 1.0379320360151032, + "grad_norm": 0.5442972183227539, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 8934 + }, + { + "epoch": 1.0380482137670637, + "grad_norm": 0.5066025853157043, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 8935 + }, + { + "epoch": 1.0381643915190242, + "grad_norm": 0.548570990562439, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 8936 + }, + { + "epoch": 1.0382805692709847, + "grad_norm": 0.5386464595794678, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 8937 + }, + { + "epoch": 1.0383967470229452, + "grad_norm": 0.6019245386123657, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 8938 + }, + { + "epoch": 1.0385129247749056, + "grad_norm": 0.5137844085693359, + "learning_rate": 0.0001, + "loss": 1.3028, + "step": 8939 + }, + { + "epoch": 1.0386291025268661, + "grad_norm": 0.5185123085975647, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 8940 + }, + { + "epoch": 1.0387452802788266, + "grad_norm": 0.5555708408355713, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 8941 + }, + { + "epoch": 1.038861458030787, + "grad_norm": 0.49165698885917664, + "learning_rate": 0.0001, + "loss": 1.3102, + "step": 8942 + }, + { + "epoch": 1.0389776357827476, + "grad_norm": 0.568077564239502, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 8943 + }, + { + "epoch": 1.039093813534708, + "grad_norm": 0.5384320020675659, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 8944 + }, + { + "epoch": 1.0392099912866686, + "grad_norm": 0.5384372472763062, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 8945 + }, + { + "epoch": 1.039326169038629, + "grad_norm": 0.5323529839515686, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 8946 + }, + { + "epoch": 1.0394423467905896, + "grad_norm": 0.5654020309448242, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 8947 + }, + { + "epoch": 1.03955852454255, + "grad_norm": 0.5455036163330078, + "learning_rate": 0.0001, + "loss": 1.2164, + "step": 8948 + }, + { + "epoch": 1.0396747022945105, + "grad_norm": 0.5437000393867493, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 8949 + }, + { + "epoch": 1.039790880046471, + "grad_norm": 0.5596999526023865, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 8950 + }, + { + "epoch": 1.0399070577984315, + "grad_norm": 0.5869429707527161, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 8951 + }, + { + "epoch": 1.040023235550392, + "grad_norm": 0.5478681921958923, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 8952 + }, + { + "epoch": 1.0401394133023527, + "grad_norm": 0.5384049415588379, + "learning_rate": 0.0001, + "loss": 1.4315, + "step": 8953 + }, + { + "epoch": 1.0402555910543132, + "grad_norm": 0.5583698749542236, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 8954 + }, + { + "epoch": 1.0403717688062737, + "grad_norm": 0.5423256754875183, + "learning_rate": 0.0001, + "loss": 1.4291, + "step": 8955 + }, + { + "epoch": 1.0404879465582342, + "grad_norm": 0.5298671722412109, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 8956 + }, + { + "epoch": 1.0406041243101947, + "grad_norm": 0.5103389024734497, + "learning_rate": 0.0001, + "loss": 1.2942, + "step": 8957 + }, + { + "epoch": 1.0407203020621552, + "grad_norm": 0.5083001852035522, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 8958 + }, + { + "epoch": 1.0408364798141156, + "grad_norm": 0.5489501953125, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 8959 + }, + { + "epoch": 1.0409526575660761, + "grad_norm": 0.5339622497558594, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 8960 + }, + { + "epoch": 1.0410688353180366, + "grad_norm": 0.5265421271324158, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 8961 + }, + { + "epoch": 1.0411850130699971, + "grad_norm": 0.539472222328186, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 8962 + }, + { + "epoch": 1.0413011908219576, + "grad_norm": 0.5840896964073181, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 8963 + }, + { + "epoch": 1.041417368573918, + "grad_norm": 0.5356101393699646, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 8964 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 0.5115817785263062, + "learning_rate": 0.0001, + "loss": 1.4016, + "step": 8965 + }, + { + "epoch": 1.041649724077839, + "grad_norm": 0.5461002588272095, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 8966 + }, + { + "epoch": 1.0417659018297996, + "grad_norm": 0.515606701374054, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 8967 + }, + { + "epoch": 1.04188207958176, + "grad_norm": 0.5375041365623474, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 8968 + }, + { + "epoch": 1.0419982573337205, + "grad_norm": 0.5649837851524353, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 8969 + }, + { + "epoch": 1.042114435085681, + "grad_norm": 0.5564467310905457, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 8970 + }, + { + "epoch": 1.0422306128376415, + "grad_norm": 0.5804434418678284, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 8971 + }, + { + "epoch": 1.042346790589602, + "grad_norm": 0.5008756518363953, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 8972 + }, + { + "epoch": 1.0424629683415625, + "grad_norm": 0.4874401092529297, + "learning_rate": 0.0001, + "loss": 1.4006, + "step": 8973 + }, + { + "epoch": 1.042579146093523, + "grad_norm": 0.5213797688484192, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 8974 + }, + { + "epoch": 1.0426953238454837, + "grad_norm": 0.5059088468551636, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 8975 + }, + { + "epoch": 1.0428115015974442, + "grad_norm": 0.5125690698623657, + "learning_rate": 0.0001, + "loss": 1.3756, + "step": 8976 + }, + { + "epoch": 1.0429276793494047, + "grad_norm": 0.4973143935203552, + "learning_rate": 0.0001, + "loss": 1.4351, + "step": 8977 + }, + { + "epoch": 1.0430438571013652, + "grad_norm": 0.5088719129562378, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 8978 + }, + { + "epoch": 1.0431600348533256, + "grad_norm": 0.5613342523574829, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 8979 + }, + { + "epoch": 1.0432762126052861, + "grad_norm": 0.5364037752151489, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 8980 + }, + { + "epoch": 1.0433923903572466, + "grad_norm": 0.5083293914794922, + "learning_rate": 0.0001, + "loss": 1.4213, + "step": 8981 + }, + { + "epoch": 1.0435085681092071, + "grad_norm": 0.5561593174934387, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 8982 + }, + { + "epoch": 1.0436247458611676, + "grad_norm": 0.49919378757476807, + "learning_rate": 0.0001, + "loss": 1.4196, + "step": 8983 + }, + { + "epoch": 1.043740923613128, + "grad_norm": 0.5361339449882507, + "learning_rate": 0.0001, + "loss": 1.3825, + "step": 8984 + }, + { + "epoch": 1.0438571013650886, + "grad_norm": 0.5248799920082092, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 8985 + }, + { + "epoch": 1.043973279117049, + "grad_norm": 0.5760166049003601, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 8986 + }, + { + "epoch": 1.0440894568690096, + "grad_norm": 0.580518364906311, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 8987 + }, + { + "epoch": 1.04420563462097, + "grad_norm": 0.5657472610473633, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 8988 + }, + { + "epoch": 1.0443218123729305, + "grad_norm": 0.5147629380226135, + "learning_rate": 0.0001, + "loss": 1.3792, + "step": 8989 + }, + { + "epoch": 1.044437990124891, + "grad_norm": 0.5364604592323303, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 8990 + }, + { + "epoch": 1.0445541678768515, + "grad_norm": 0.4889623820781708, + "learning_rate": 0.0001, + "loss": 1.3463, + "step": 8991 + }, + { + "epoch": 1.044670345628812, + "grad_norm": 0.5112910270690918, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 8992 + }, + { + "epoch": 1.0447865233807725, + "grad_norm": 0.589764416217804, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 8993 + }, + { + "epoch": 1.044902701132733, + "grad_norm": 0.5439825057983398, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 8994 + }, + { + "epoch": 1.0450188788846937, + "grad_norm": 0.5245374441146851, + "learning_rate": 0.0001, + "loss": 1.4242, + "step": 8995 + }, + { + "epoch": 1.0451350566366542, + "grad_norm": 0.5435076951980591, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 8996 + }, + { + "epoch": 1.0452512343886147, + "grad_norm": 0.6293114423751831, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 8997 + }, + { + "epoch": 1.0453674121405752, + "grad_norm": 0.5348904728889465, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 8998 + }, + { + "epoch": 1.0454835898925356, + "grad_norm": 0.5541291832923889, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 8999 + }, + { + "epoch": 1.0455997676444961, + "grad_norm": 0.6162405610084534, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 9000 + }, + { + "epoch": 1.0457159453964566, + "grad_norm": 0.5342404842376709, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 9001 + }, + { + "epoch": 1.0458321231484171, + "grad_norm": 0.5075395107269287, + "learning_rate": 0.0001, + "loss": 1.2412, + "step": 9002 + }, + { + "epoch": 1.0459483009003776, + "grad_norm": 0.529941737651825, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 9003 + }, + { + "epoch": 1.046064478652338, + "grad_norm": 0.5113204717636108, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 9004 + }, + { + "epoch": 1.0461806564042986, + "grad_norm": 0.5787484049797058, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 9005 + }, + { + "epoch": 1.046296834156259, + "grad_norm": 0.5223624110221863, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 9006 + }, + { + "epoch": 1.0464130119082196, + "grad_norm": 0.5437747836112976, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 9007 + }, + { + "epoch": 1.04652918966018, + "grad_norm": 0.5204196572303772, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 9008 + }, + { + "epoch": 1.0466453674121405, + "grad_norm": 0.5550678372383118, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 9009 + }, + { + "epoch": 1.046761545164101, + "grad_norm": 0.5623777508735657, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 9010 + }, + { + "epoch": 1.0468777229160615, + "grad_norm": 0.5182955861091614, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 9011 + }, + { + "epoch": 1.046993900668022, + "grad_norm": 0.5236296653747559, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 9012 + }, + { + "epoch": 1.0471100784199825, + "grad_norm": 0.5745497345924377, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 9013 + }, + { + "epoch": 1.047226256171943, + "grad_norm": 0.5346052646636963, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 9014 + }, + { + "epoch": 1.0473424339239035, + "grad_norm": 0.523341178894043, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 9015 + }, + { + "epoch": 1.047458611675864, + "grad_norm": 0.5185964703559875, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 9016 + }, + { + "epoch": 1.0475747894278247, + "grad_norm": 0.5521393418312073, + "learning_rate": 0.0001, + "loss": 1.7436, + "step": 9017 + }, + { + "epoch": 1.0476909671797852, + "grad_norm": 0.566999077796936, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 9018 + }, + { + "epoch": 1.0478071449317456, + "grad_norm": 0.5497294068336487, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 9019 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 0.5116751194000244, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 9020 + }, + { + "epoch": 1.0480395004356666, + "grad_norm": 0.5744738578796387, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 9021 + }, + { + "epoch": 1.0481556781876271, + "grad_norm": 0.4972259998321533, + "learning_rate": 0.0001, + "loss": 1.3011, + "step": 9022 + }, + { + "epoch": 1.0482718559395876, + "grad_norm": 0.5275663137435913, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 9023 + }, + { + "epoch": 1.048388033691548, + "grad_norm": 0.4991401135921478, + "learning_rate": 0.0001, + "loss": 1.2919, + "step": 9024 + }, + { + "epoch": 1.0485042114435086, + "grad_norm": 0.507801353931427, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 9025 + }, + { + "epoch": 1.048620389195469, + "grad_norm": 0.528578519821167, + "learning_rate": 0.0001, + "loss": 1.522, + "step": 9026 + }, + { + "epoch": 1.0487365669474296, + "grad_norm": 0.5243181586265564, + "learning_rate": 0.0001, + "loss": 1.466, + "step": 9027 + }, + { + "epoch": 1.04885274469939, + "grad_norm": 0.5625602602958679, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 9028 + }, + { + "epoch": 1.0489689224513505, + "grad_norm": 0.539786159992218, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 9029 + }, + { + "epoch": 1.049085100203311, + "grad_norm": 0.5501077175140381, + "learning_rate": 0.0001, + "loss": 1.269, + "step": 9030 + }, + { + "epoch": 1.0492012779552715, + "grad_norm": 0.5288931131362915, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 9031 + }, + { + "epoch": 1.049317455707232, + "grad_norm": 0.5670494437217712, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 9032 + }, + { + "epoch": 1.0494336334591925, + "grad_norm": 0.5463694930076599, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 9033 + }, + { + "epoch": 1.049549811211153, + "grad_norm": 0.5625814199447632, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 9034 + }, + { + "epoch": 1.0496659889631135, + "grad_norm": 0.5289028286933899, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 9035 + }, + { + "epoch": 1.049782166715074, + "grad_norm": 0.5531147122383118, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 9036 + }, + { + "epoch": 1.0498983444670347, + "grad_norm": 0.5495567917823792, + "learning_rate": 0.0001, + "loss": 1.3533, + "step": 9037 + }, + { + "epoch": 1.0500145222189952, + "grad_norm": 0.5757434368133545, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 9038 + }, + { + "epoch": 1.0501306999709557, + "grad_norm": 0.5772126317024231, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 9039 + }, + { + "epoch": 1.0502468777229161, + "grad_norm": 0.5771605968475342, + "learning_rate": 0.0001, + "loss": 1.3436, + "step": 9040 + }, + { + "epoch": 1.0503630554748766, + "grad_norm": 0.5607427358627319, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 9041 + }, + { + "epoch": 1.0504792332268371, + "grad_norm": 0.5379540920257568, + "learning_rate": 0.0001, + "loss": 1.4105, + "step": 9042 + }, + { + "epoch": 1.0505954109787976, + "grad_norm": 0.5043210387229919, + "learning_rate": 0.0001, + "loss": 1.4431, + "step": 9043 + }, + { + "epoch": 1.050711588730758, + "grad_norm": 0.5489790439605713, + "learning_rate": 0.0001, + "loss": 1.4465, + "step": 9044 + }, + { + "epoch": 1.0508277664827186, + "grad_norm": 0.5171908736228943, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 9045 + }, + { + "epoch": 1.050943944234679, + "grad_norm": 0.5216082334518433, + "learning_rate": 0.0001, + "loss": 1.3013, + "step": 9046 + }, + { + "epoch": 1.0510601219866396, + "grad_norm": 0.5552801489830017, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 9047 + }, + { + "epoch": 1.0511762997386, + "grad_norm": 0.5355478525161743, + "learning_rate": 0.0001, + "loss": 1.3406, + "step": 9048 + }, + { + "epoch": 1.0512924774905605, + "grad_norm": 0.5976464748382568, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 9049 + }, + { + "epoch": 1.051408655242521, + "grad_norm": 0.5584911704063416, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 9050 + }, + { + "epoch": 1.0515248329944815, + "grad_norm": 0.5827659964561462, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 9051 + }, + { + "epoch": 1.051641010746442, + "grad_norm": 0.5411894917488098, + "learning_rate": 0.0001, + "loss": 1.397, + "step": 9052 + }, + { + "epoch": 1.0517571884984025, + "grad_norm": 0.507165789604187, + "learning_rate": 0.0001, + "loss": 1.3847, + "step": 9053 + }, + { + "epoch": 1.051873366250363, + "grad_norm": 0.5528204441070557, + "learning_rate": 0.0001, + "loss": 1.3638, + "step": 9054 + }, + { + "epoch": 1.0519895440023235, + "grad_norm": 0.5597609877586365, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 9055 + }, + { + "epoch": 1.052105721754284, + "grad_norm": 0.5310388207435608, + "learning_rate": 0.0001, + "loss": 1.4972, + "step": 9056 + }, + { + "epoch": 1.0522218995062445, + "grad_norm": 0.5567108392715454, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 9057 + }, + { + "epoch": 1.052338077258205, + "grad_norm": 0.539630651473999, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 9058 + }, + { + "epoch": 1.0524542550101657, + "grad_norm": 0.5464910864830017, + "learning_rate": 0.0001, + "loss": 1.4159, + "step": 9059 + }, + { + "epoch": 1.0525704327621261, + "grad_norm": 0.5596455931663513, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 9060 + }, + { + "epoch": 1.0526866105140866, + "grad_norm": 0.5715380311012268, + "learning_rate": 0.0001, + "loss": 1.3316, + "step": 9061 + }, + { + "epoch": 1.0528027882660471, + "grad_norm": 0.5546104907989502, + "learning_rate": 0.0001, + "loss": 1.3358, + "step": 9062 + }, + { + "epoch": 1.0529189660180076, + "grad_norm": 0.7018982768058777, + "learning_rate": 0.0001, + "loss": 1.3523, + "step": 9063 + }, + { + "epoch": 1.053035143769968, + "grad_norm": 0.5631259679794312, + "learning_rate": 0.0001, + "loss": 1.3676, + "step": 9064 + }, + { + "epoch": 1.0531513215219286, + "grad_norm": 0.5323575735092163, + "learning_rate": 0.0001, + "loss": 1.3987, + "step": 9065 + }, + { + "epoch": 1.053267499273889, + "grad_norm": 0.5142088532447815, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 9066 + }, + { + "epoch": 1.0533836770258496, + "grad_norm": 0.5592623353004456, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 9067 + }, + { + "epoch": 1.05349985477781, + "grad_norm": 0.5052597522735596, + "learning_rate": 0.0001, + "loss": 1.4116, + "step": 9068 + }, + { + "epoch": 1.0536160325297705, + "grad_norm": 0.517086923122406, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 9069 + }, + { + "epoch": 1.053732210281731, + "grad_norm": 0.5268409252166748, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 9070 + }, + { + "epoch": 1.0538483880336915, + "grad_norm": 0.536361038684845, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 9071 + }, + { + "epoch": 1.053964565785652, + "grad_norm": 0.5307349562644958, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 9072 + }, + { + "epoch": 1.0540807435376125, + "grad_norm": 0.5793299078941345, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 9073 + }, + { + "epoch": 1.054196921289573, + "grad_norm": 0.5216554999351501, + "learning_rate": 0.0001, + "loss": 1.4039, + "step": 9074 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 0.49403879046440125, + "learning_rate": 0.0001, + "loss": 1.2771, + "step": 9075 + }, + { + "epoch": 1.054429276793494, + "grad_norm": 0.5500255823135376, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 9076 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 0.5305203795433044, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 9077 + }, + { + "epoch": 1.054661632297415, + "grad_norm": 0.5569433569908142, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 9078 + }, + { + "epoch": 1.0547778100493757, + "grad_norm": 0.4997643232345581, + "learning_rate": 0.0001, + "loss": 1.3793, + "step": 9079 + }, + { + "epoch": 1.0548939878013361, + "grad_norm": 0.5268417596817017, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 9080 + }, + { + "epoch": 1.0550101655532966, + "grad_norm": 0.5548385381698608, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 9081 + }, + { + "epoch": 1.0551263433052571, + "grad_norm": 0.5406033992767334, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 9082 + }, + { + "epoch": 1.0552425210572176, + "grad_norm": 0.5218345522880554, + "learning_rate": 0.0001, + "loss": 1.4086, + "step": 9083 + }, + { + "epoch": 1.055358698809178, + "grad_norm": 0.5393182635307312, + "learning_rate": 0.0001, + "loss": 1.3487, + "step": 9084 + }, + { + "epoch": 1.0554748765611386, + "grad_norm": 0.5356934070587158, + "learning_rate": 0.0001, + "loss": 1.3364, + "step": 9085 + }, + { + "epoch": 1.055591054313099, + "grad_norm": 0.5603475570678711, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 9086 + }, + { + "epoch": 1.0557072320650596, + "grad_norm": 0.5328572988510132, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 9087 + }, + { + "epoch": 1.05582340981702, + "grad_norm": 0.5267220139503479, + "learning_rate": 0.0001, + "loss": 1.383, + "step": 9088 + }, + { + "epoch": 1.0559395875689805, + "grad_norm": 0.5259425640106201, + "learning_rate": 0.0001, + "loss": 1.4592, + "step": 9089 + }, + { + "epoch": 1.056055765320941, + "grad_norm": 0.5833221673965454, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 9090 + }, + { + "epoch": 1.0561719430729015, + "grad_norm": 0.5569556951522827, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 9091 + }, + { + "epoch": 1.056288120824862, + "grad_norm": 0.5657941699028015, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 9092 + }, + { + "epoch": 1.0564042985768225, + "grad_norm": 0.5313854813575745, + "learning_rate": 0.0001, + "loss": 1.4161, + "step": 9093 + }, + { + "epoch": 1.056520476328783, + "grad_norm": 0.530001699924469, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 9094 + }, + { + "epoch": 1.0566366540807435, + "grad_norm": 0.5262467265129089, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 9095 + }, + { + "epoch": 1.056752831832704, + "grad_norm": 0.567335844039917, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 9096 + }, + { + "epoch": 1.0568690095846645, + "grad_norm": 0.5013755559921265, + "learning_rate": 0.0001, + "loss": 1.4375, + "step": 9097 + }, + { + "epoch": 1.056985187336625, + "grad_norm": 0.494312047958374, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 9098 + }, + { + "epoch": 1.0571013650885854, + "grad_norm": 0.5399910807609558, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 9099 + }, + { + "epoch": 1.0572175428405461, + "grad_norm": 0.5516524314880371, + "learning_rate": 0.0001, + "loss": 1.3564, + "step": 9100 + }, + { + "epoch": 1.0573337205925066, + "grad_norm": 0.5304887294769287, + "learning_rate": 0.0001, + "loss": 1.3785, + "step": 9101 + }, + { + "epoch": 1.0574498983444671, + "grad_norm": 0.591498851776123, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 9102 + }, + { + "epoch": 1.0575660760964276, + "grad_norm": 0.5390315651893616, + "learning_rate": 0.0001, + "loss": 1.3806, + "step": 9103 + }, + { + "epoch": 1.057682253848388, + "grad_norm": 0.5507586598396301, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 9104 + }, + { + "epoch": 1.0577984316003486, + "grad_norm": 0.5363011956214905, + "learning_rate": 0.0001, + "loss": 1.3498, + "step": 9105 + }, + { + "epoch": 1.057914609352309, + "grad_norm": 0.5194579362869263, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 9106 + }, + { + "epoch": 1.0580307871042696, + "grad_norm": 0.5246658325195312, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 9107 + }, + { + "epoch": 1.05814696485623, + "grad_norm": 0.5200863480567932, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 9108 + }, + { + "epoch": 1.0582631426081905, + "grad_norm": 0.5483285188674927, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 9109 + }, + { + "epoch": 1.058379320360151, + "grad_norm": 0.49789682030677795, + "learning_rate": 0.0001, + "loss": 1.3264, + "step": 9110 + }, + { + "epoch": 1.0584954981121115, + "grad_norm": 0.5474060773849487, + "learning_rate": 0.0001, + "loss": 1.4245, + "step": 9111 + }, + { + "epoch": 1.058611675864072, + "grad_norm": 0.6262152791023254, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 9112 + }, + { + "epoch": 1.0587278536160325, + "grad_norm": 0.522517740726471, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 9113 + }, + { + "epoch": 1.058844031367993, + "grad_norm": 0.5017138123512268, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 9114 + }, + { + "epoch": 1.0589602091199535, + "grad_norm": 0.5721921920776367, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 9115 + }, + { + "epoch": 1.059076386871914, + "grad_norm": 0.5460672378540039, + "learning_rate": 0.0001, + "loss": 1.3668, + "step": 9116 + }, + { + "epoch": 1.0591925646238745, + "grad_norm": 0.5231673121452332, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 9117 + }, + { + "epoch": 1.059308742375835, + "grad_norm": 0.5345351099967957, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 9118 + }, + { + "epoch": 1.0594249201277954, + "grad_norm": 0.5771772861480713, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 9119 + }, + { + "epoch": 1.059541097879756, + "grad_norm": 0.5495395660400391, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 9120 + }, + { + "epoch": 1.0596572756317166, + "grad_norm": 0.5115154981613159, + "learning_rate": 0.0001, + "loss": 1.2167, + "step": 9121 + }, + { + "epoch": 1.0597734533836771, + "grad_norm": 0.5737767219543457, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 9122 + }, + { + "epoch": 1.0598896311356376, + "grad_norm": 0.5117427706718445, + "learning_rate": 0.0001, + "loss": 1.3973, + "step": 9123 + }, + { + "epoch": 1.060005808887598, + "grad_norm": 0.5852498412132263, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 9124 + }, + { + "epoch": 1.0601219866395586, + "grad_norm": 0.5291174054145813, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 9125 + }, + { + "epoch": 1.060238164391519, + "grad_norm": 0.5075642466545105, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 9126 + }, + { + "epoch": 1.0603543421434796, + "grad_norm": 0.47634080052375793, + "learning_rate": 0.0001, + "loss": 1.3369, + "step": 9127 + }, + { + "epoch": 1.06047051989544, + "grad_norm": 0.5397040843963623, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 9128 + }, + { + "epoch": 1.0605866976474005, + "grad_norm": 0.5401850938796997, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 9129 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 0.5514304041862488, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 9130 + }, + { + "epoch": 1.0608190531513215, + "grad_norm": 0.5192503929138184, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 9131 + }, + { + "epoch": 1.060935230903282, + "grad_norm": 0.5016432404518127, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 9132 + }, + { + "epoch": 1.0610514086552425, + "grad_norm": 0.5191652178764343, + "learning_rate": 0.0001, + "loss": 1.3155, + "step": 9133 + }, + { + "epoch": 1.061167586407203, + "grad_norm": 0.5235947966575623, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 9134 + }, + { + "epoch": 1.0612837641591635, + "grad_norm": 0.538102924823761, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 9135 + }, + { + "epoch": 1.061399941911124, + "grad_norm": 0.5283055901527405, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 9136 + }, + { + "epoch": 1.0615161196630845, + "grad_norm": 0.5631850361824036, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 9137 + }, + { + "epoch": 1.061632297415045, + "grad_norm": 0.5427637100219727, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 9138 + }, + { + "epoch": 1.0617484751670054, + "grad_norm": 0.5537014603614807, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 9139 + }, + { + "epoch": 1.061864652918966, + "grad_norm": 0.5393050312995911, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 9140 + }, + { + "epoch": 1.0619808306709264, + "grad_norm": 0.5826035141944885, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 9141 + }, + { + "epoch": 1.0620970084228871, + "grad_norm": 0.5376782417297363, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 9142 + }, + { + "epoch": 1.0622131861748476, + "grad_norm": 0.5235744714736938, + "learning_rate": 0.0001, + "loss": 1.3827, + "step": 9143 + }, + { + "epoch": 1.062329363926808, + "grad_norm": 0.5324127674102783, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 9144 + }, + { + "epoch": 1.0624455416787686, + "grad_norm": 0.5413472652435303, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 9145 + }, + { + "epoch": 1.062561719430729, + "grad_norm": 0.5227320790290833, + "learning_rate": 0.0001, + "loss": 1.5676, + "step": 9146 + }, + { + "epoch": 1.0626778971826896, + "grad_norm": 0.5471850037574768, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 9147 + }, + { + "epoch": 1.06279407493465, + "grad_norm": 0.5257613658905029, + "learning_rate": 0.0001, + "loss": 1.4821, + "step": 9148 + }, + { + "epoch": 1.0629102526866105, + "grad_norm": 0.5216736793518066, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 9149 + }, + { + "epoch": 1.063026430438571, + "grad_norm": 0.5594833493232727, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 9150 + }, + { + "epoch": 1.0631426081905315, + "grad_norm": 0.49669763445854187, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 9151 + }, + { + "epoch": 1.063258785942492, + "grad_norm": 0.5265761613845825, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 9152 + }, + { + "epoch": 1.0633749636944525, + "grad_norm": 0.5300150513648987, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 9153 + }, + { + "epoch": 1.063491141446413, + "grad_norm": 0.5201780796051025, + "learning_rate": 0.0001, + "loss": 1.3244, + "step": 9154 + }, + { + "epoch": 1.0636073191983735, + "grad_norm": 0.6005831360816956, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 9155 + }, + { + "epoch": 1.063723496950334, + "grad_norm": 0.5342938303947449, + "learning_rate": 0.0001, + "loss": 1.2851, + "step": 9156 + }, + { + "epoch": 1.0638396747022945, + "grad_norm": 0.5873286128044128, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 9157 + }, + { + "epoch": 1.063955852454255, + "grad_norm": 0.5283979177474976, + "learning_rate": 0.0001, + "loss": 1.3008, + "step": 9158 + }, + { + "epoch": 1.0640720302062154, + "grad_norm": 0.5319207310676575, + "learning_rate": 0.0001, + "loss": 1.3622, + "step": 9159 + }, + { + "epoch": 1.064188207958176, + "grad_norm": 0.526910662651062, + "learning_rate": 0.0001, + "loss": 1.3385, + "step": 9160 + }, + { + "epoch": 1.0643043857101364, + "grad_norm": 0.5416330099105835, + "learning_rate": 0.0001, + "loss": 1.369, + "step": 9161 + }, + { + "epoch": 1.0644205634620971, + "grad_norm": 0.5400428175926208, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 9162 + }, + { + "epoch": 1.0645367412140576, + "grad_norm": 0.5297672152519226, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 9163 + }, + { + "epoch": 1.064652918966018, + "grad_norm": 0.5449459552764893, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 9164 + }, + { + "epoch": 1.0647690967179786, + "grad_norm": 0.5715656876564026, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 9165 + }, + { + "epoch": 1.064885274469939, + "grad_norm": 0.5295259952545166, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 9166 + }, + { + "epoch": 1.0650014522218996, + "grad_norm": 0.5438884496688843, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 9167 + }, + { + "epoch": 1.06511762997386, + "grad_norm": 0.542167603969574, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 9168 + }, + { + "epoch": 1.0652338077258205, + "grad_norm": 0.5348635315895081, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 9169 + }, + { + "epoch": 1.065349985477781, + "grad_norm": 0.5906919836997986, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 9170 + }, + { + "epoch": 1.0654661632297415, + "grad_norm": 0.5702545642852783, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 9171 + }, + { + "epoch": 1.065582340981702, + "grad_norm": 0.5129088759422302, + "learning_rate": 0.0001, + "loss": 1.2603, + "step": 9172 + }, + { + "epoch": 1.0656985187336625, + "grad_norm": 0.5603011250495911, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 9173 + }, + { + "epoch": 1.065814696485623, + "grad_norm": 0.6020777821540833, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 9174 + }, + { + "epoch": 1.0659308742375835, + "grad_norm": 0.5032897591590881, + "learning_rate": 0.0001, + "loss": 1.3745, + "step": 9175 + }, + { + "epoch": 1.066047051989544, + "grad_norm": 0.5128781199455261, + "learning_rate": 0.0001, + "loss": 1.2593, + "step": 9176 + }, + { + "epoch": 1.0661632297415045, + "grad_norm": 0.5254261493682861, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 9177 + }, + { + "epoch": 1.066279407493465, + "grad_norm": 0.508584201335907, + "learning_rate": 0.0001, + "loss": 1.4429, + "step": 9178 + }, + { + "epoch": 1.0663955852454254, + "grad_norm": 0.5458223223686218, + "learning_rate": 0.0001, + "loss": 1.487, + "step": 9179 + }, + { + "epoch": 1.066511762997386, + "grad_norm": 0.5139407515525818, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 9180 + }, + { + "epoch": 1.0666279407493464, + "grad_norm": 0.505296528339386, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 9181 + }, + { + "epoch": 1.066744118501307, + "grad_norm": 0.5002397298812866, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 9182 + }, + { + "epoch": 1.0668602962532674, + "grad_norm": 0.551529049873352, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 9183 + }, + { + "epoch": 1.0669764740052279, + "grad_norm": 0.578542947769165, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 9184 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 0.531636655330658, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 9185 + }, + { + "epoch": 1.067208829509149, + "grad_norm": 0.5241780281066895, + "learning_rate": 0.0001, + "loss": 1.3676, + "step": 9186 + }, + { + "epoch": 1.0673250072611096, + "grad_norm": 0.5631434321403503, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 9187 + }, + { + "epoch": 1.06744118501307, + "grad_norm": 0.5912564992904663, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 9188 + }, + { + "epoch": 1.0675573627650305, + "grad_norm": 0.4988497197628021, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 9189 + }, + { + "epoch": 1.067673540516991, + "grad_norm": 0.5904607772827148, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 9190 + }, + { + "epoch": 1.0677897182689515, + "grad_norm": 0.5441171526908875, + "learning_rate": 0.0001, + "loss": 1.3963, + "step": 9191 + }, + { + "epoch": 1.067905896020912, + "grad_norm": 0.541077196598053, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 9192 + }, + { + "epoch": 1.0680220737728725, + "grad_norm": 0.53225177526474, + "learning_rate": 0.0001, + "loss": 1.4918, + "step": 9193 + }, + { + "epoch": 1.068138251524833, + "grad_norm": 0.543021559715271, + "learning_rate": 0.0001, + "loss": 1.3748, + "step": 9194 + }, + { + "epoch": 1.0682544292767935, + "grad_norm": 0.6031702756881714, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 9195 + }, + { + "epoch": 1.068370607028754, + "grad_norm": 0.571552574634552, + "learning_rate": 0.0001, + "loss": 1.3105, + "step": 9196 + }, + { + "epoch": 1.0684867847807145, + "grad_norm": 0.5475126504898071, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 9197 + }, + { + "epoch": 1.068602962532675, + "grad_norm": 0.5954582691192627, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 9198 + }, + { + "epoch": 1.0687191402846354, + "grad_norm": 0.5108276605606079, + "learning_rate": 0.0001, + "loss": 1.3587, + "step": 9199 + }, + { + "epoch": 1.068835318036596, + "grad_norm": 0.5748639106750488, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 9200 + }, + { + "epoch": 1.0689514957885564, + "grad_norm": 0.5265009999275208, + "learning_rate": 0.0001, + "loss": 1.3955, + "step": 9201 + }, + { + "epoch": 1.069067673540517, + "grad_norm": 0.5206406116485596, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 9202 + }, + { + "epoch": 1.0691838512924774, + "grad_norm": 0.5493237972259521, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 9203 + }, + { + "epoch": 1.069300029044438, + "grad_norm": 0.5620342493057251, + "learning_rate": 0.0001, + "loss": 1.4971, + "step": 9204 + }, + { + "epoch": 1.0694162067963986, + "grad_norm": 0.5706990361213684, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 9205 + }, + { + "epoch": 1.069532384548359, + "grad_norm": 0.5248960852622986, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 9206 + }, + { + "epoch": 1.0696485623003196, + "grad_norm": 0.5431904196739197, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 9207 + }, + { + "epoch": 1.06976474005228, + "grad_norm": 0.5573443174362183, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 9208 + }, + { + "epoch": 1.0698809178042405, + "grad_norm": 0.5166314840316772, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 9209 + }, + { + "epoch": 1.069997095556201, + "grad_norm": 0.5510638356208801, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 9210 + }, + { + "epoch": 1.0701132733081615, + "grad_norm": 0.5057435035705566, + "learning_rate": 0.0001, + "loss": 1.3959, + "step": 9211 + }, + { + "epoch": 1.070229451060122, + "grad_norm": 0.5617602467536926, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 9212 + }, + { + "epoch": 1.0703456288120825, + "grad_norm": 0.5629894733428955, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 9213 + }, + { + "epoch": 1.070461806564043, + "grad_norm": 0.5455237030982971, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 9214 + }, + { + "epoch": 1.0705779843160035, + "grad_norm": 0.5817468762397766, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 9215 + }, + { + "epoch": 1.070694162067964, + "grad_norm": 0.5639060735702515, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 9216 + }, + { + "epoch": 1.0708103398199245, + "grad_norm": 0.5920657515525818, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 9217 + }, + { + "epoch": 1.070926517571885, + "grad_norm": 0.513184666633606, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 9218 + }, + { + "epoch": 1.0710426953238454, + "grad_norm": 0.5242840647697449, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 9219 + }, + { + "epoch": 1.071158873075806, + "grad_norm": 0.5487340688705444, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 9220 + }, + { + "epoch": 1.0712750508277664, + "grad_norm": 0.7941916584968567, + "learning_rate": 0.0001, + "loss": 1.7697, + "step": 9221 + }, + { + "epoch": 1.071391228579727, + "grad_norm": 0.5666763782501221, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 9222 + }, + { + "epoch": 1.0715074063316874, + "grad_norm": 0.5537146925926208, + "learning_rate": 0.0001, + "loss": 1.3735, + "step": 9223 + }, + { + "epoch": 1.0716235840836479, + "grad_norm": 0.5601913928985596, + "learning_rate": 0.0001, + "loss": 1.3579, + "step": 9224 + }, + { + "epoch": 1.0717397618356084, + "grad_norm": 0.5344251990318298, + "learning_rate": 0.0001, + "loss": 1.5254, + "step": 9225 + }, + { + "epoch": 1.071855939587569, + "grad_norm": 0.5359842777252197, + "learning_rate": 0.0001, + "loss": 1.3922, + "step": 9226 + }, + { + "epoch": 1.0719721173395296, + "grad_norm": 0.5405870079994202, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 9227 + }, + { + "epoch": 1.07208829509149, + "grad_norm": 0.51329505443573, + "learning_rate": 0.0001, + "loss": 1.4123, + "step": 9228 + }, + { + "epoch": 1.0722044728434506, + "grad_norm": 0.5659348368644714, + "learning_rate": 0.0001, + "loss": 1.3418, + "step": 9229 + }, + { + "epoch": 1.072320650595411, + "grad_norm": 0.5718405842781067, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 9230 + }, + { + "epoch": 1.0724368283473715, + "grad_norm": 0.5486005544662476, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 9231 + }, + { + "epoch": 1.072553006099332, + "grad_norm": 0.5066012740135193, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 9232 + }, + { + "epoch": 1.0726691838512925, + "grad_norm": 0.5650283098220825, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 9233 + }, + { + "epoch": 1.072785361603253, + "grad_norm": 0.5359314680099487, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 9234 + }, + { + "epoch": 1.0729015393552135, + "grad_norm": 0.5751736164093018, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 9235 + }, + { + "epoch": 1.073017717107174, + "grad_norm": 0.5163309574127197, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 9236 + }, + { + "epoch": 1.0731338948591345, + "grad_norm": 0.550383448600769, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 9237 + }, + { + "epoch": 1.073250072611095, + "grad_norm": 0.5238305330276489, + "learning_rate": 0.0001, + "loss": 1.4432, + "step": 9238 + }, + { + "epoch": 1.0733662503630554, + "grad_norm": 0.49950873851776123, + "learning_rate": 0.0001, + "loss": 1.3584, + "step": 9239 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 0.508315920829773, + "learning_rate": 0.0001, + "loss": 1.3408, + "step": 9240 + }, + { + "epoch": 1.0735986058669764, + "grad_norm": 0.5786048173904419, + "learning_rate": 0.0001, + "loss": 1.27, + "step": 9241 + }, + { + "epoch": 1.073714783618937, + "grad_norm": 0.5773483514785767, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 9242 + }, + { + "epoch": 1.0738309613708974, + "grad_norm": 0.5315750241279602, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 9243 + }, + { + "epoch": 1.0739471391228579, + "grad_norm": 0.6098319292068481, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 9244 + }, + { + "epoch": 1.0740633168748184, + "grad_norm": 0.5994076132774353, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 9245 + }, + { + "epoch": 1.074179494626779, + "grad_norm": 0.5081837177276611, + "learning_rate": 0.0001, + "loss": 1.3348, + "step": 9246 + }, + { + "epoch": 1.0742956723787396, + "grad_norm": 0.5301439166069031, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 9247 + }, + { + "epoch": 1.0744118501307, + "grad_norm": 0.5718616247177124, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 9248 + }, + { + "epoch": 1.0745280278826606, + "grad_norm": 0.5080563426017761, + "learning_rate": 0.0001, + "loss": 1.3807, + "step": 9249 + }, + { + "epoch": 1.074644205634621, + "grad_norm": 0.5610292553901672, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 9250 + }, + { + "epoch": 1.0747603833865815, + "grad_norm": 0.5332043766975403, + "learning_rate": 0.0001, + "loss": 1.4046, + "step": 9251 + }, + { + "epoch": 1.074876561138542, + "grad_norm": 0.5250877737998962, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 9252 + }, + { + "epoch": 1.0749927388905025, + "grad_norm": 0.5090211629867554, + "learning_rate": 0.0001, + "loss": 1.3705, + "step": 9253 + }, + { + "epoch": 1.075108916642463, + "grad_norm": 0.5132327079772949, + "learning_rate": 0.0001, + "loss": 1.4105, + "step": 9254 + }, + { + "epoch": 1.0752250943944235, + "grad_norm": 0.5518538951873779, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 9255 + }, + { + "epoch": 1.075341272146384, + "grad_norm": 0.5492732524871826, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 9256 + }, + { + "epoch": 1.0754574498983445, + "grad_norm": 0.5491774678230286, + "learning_rate": 0.0001, + "loss": 1.4346, + "step": 9257 + }, + { + "epoch": 1.075573627650305, + "grad_norm": 0.5792922377586365, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 9258 + }, + { + "epoch": 1.0756898054022654, + "grad_norm": 0.5162052512168884, + "learning_rate": 0.0001, + "loss": 1.3871, + "step": 9259 + }, + { + "epoch": 1.075805983154226, + "grad_norm": 0.5442473292350769, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 9260 + }, + { + "epoch": 1.0759221609061864, + "grad_norm": 0.5718018412590027, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 9261 + }, + { + "epoch": 1.076038338658147, + "grad_norm": 0.53264319896698, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 9262 + }, + { + "epoch": 1.0761545164101074, + "grad_norm": 0.5195859670639038, + "learning_rate": 0.0001, + "loss": 1.3004, + "step": 9263 + }, + { + "epoch": 1.0762706941620679, + "grad_norm": 0.5994012951850891, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 9264 + }, + { + "epoch": 1.0763868719140284, + "grad_norm": 0.5237147212028503, + "learning_rate": 0.0001, + "loss": 1.3221, + "step": 9265 + }, + { + "epoch": 1.0765030496659889, + "grad_norm": 0.5110024809837341, + "learning_rate": 0.0001, + "loss": 1.2827, + "step": 9266 + }, + { + "epoch": 1.0766192274179494, + "grad_norm": 0.5228458642959595, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 9267 + }, + { + "epoch": 1.07673540516991, + "grad_norm": 0.5706833004951477, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 9268 + }, + { + "epoch": 1.0768515829218706, + "grad_norm": 0.5496780276298523, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 9269 + }, + { + "epoch": 1.076967760673831, + "grad_norm": 0.5509944558143616, + "learning_rate": 0.0001, + "loss": 1.3909, + "step": 9270 + }, + { + "epoch": 1.0770839384257915, + "grad_norm": 0.5543323755264282, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 9271 + }, + { + "epoch": 1.077200116177752, + "grad_norm": 0.5567429661750793, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 9272 + }, + { + "epoch": 1.0773162939297125, + "grad_norm": 0.5358352661132812, + "learning_rate": 0.0001, + "loss": 1.3017, + "step": 9273 + }, + { + "epoch": 1.077432471681673, + "grad_norm": 0.5601693987846375, + "learning_rate": 0.0001, + "loss": 1.5659, + "step": 9274 + }, + { + "epoch": 1.0775486494336335, + "grad_norm": 0.5503692626953125, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 9275 + }, + { + "epoch": 1.077664827185594, + "grad_norm": 0.5987486839294434, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 9276 + }, + { + "epoch": 1.0777810049375545, + "grad_norm": 0.6052455902099609, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 9277 + }, + { + "epoch": 1.077897182689515, + "grad_norm": 0.5339061617851257, + "learning_rate": 0.0001, + "loss": 1.4148, + "step": 9278 + }, + { + "epoch": 1.0780133604414754, + "grad_norm": 0.5292863845825195, + "learning_rate": 0.0001, + "loss": 1.5569, + "step": 9279 + }, + { + "epoch": 1.078129538193436, + "grad_norm": 0.5491698980331421, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 9280 + }, + { + "epoch": 1.0782457159453964, + "grad_norm": 0.5501977205276489, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 9281 + }, + { + "epoch": 1.078361893697357, + "grad_norm": 0.5370213389396667, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 9282 + }, + { + "epoch": 1.0784780714493174, + "grad_norm": 0.5482035279273987, + "learning_rate": 0.0001, + "loss": 1.239, + "step": 9283 + }, + { + "epoch": 1.0785942492012779, + "grad_norm": 0.5296024680137634, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 9284 + }, + { + "epoch": 1.0787104269532384, + "grad_norm": 0.4946173131465912, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 9285 + }, + { + "epoch": 1.0788266047051989, + "grad_norm": 0.5261144042015076, + "learning_rate": 0.0001, + "loss": 1.4101, + "step": 9286 + }, + { + "epoch": 1.0789427824571594, + "grad_norm": 0.5421222448348999, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 9287 + }, + { + "epoch": 1.07905896020912, + "grad_norm": 0.5617155432701111, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 9288 + }, + { + "epoch": 1.0791751379610806, + "grad_norm": 0.5525049567222595, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 9289 + }, + { + "epoch": 1.079291315713041, + "grad_norm": 0.5454285740852356, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 9290 + }, + { + "epoch": 1.0794074934650015, + "grad_norm": 0.5999751091003418, + "learning_rate": 0.0001, + "loss": 1.3304, + "step": 9291 + }, + { + "epoch": 1.079523671216962, + "grad_norm": 0.5454568862915039, + "learning_rate": 0.0001, + "loss": 1.375, + "step": 9292 + }, + { + "epoch": 1.0796398489689225, + "grad_norm": 0.5416858792304993, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 9293 + }, + { + "epoch": 1.079756026720883, + "grad_norm": 0.5695158243179321, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 9294 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 0.5237754583358765, + "learning_rate": 0.0001, + "loss": 1.2665, + "step": 9295 + }, + { + "epoch": 1.079988382224804, + "grad_norm": 0.5280129909515381, + "learning_rate": 0.0001, + "loss": 1.3654, + "step": 9296 + }, + { + "epoch": 1.0801045599767645, + "grad_norm": 0.5281645655632019, + "learning_rate": 0.0001, + "loss": 1.2907, + "step": 9297 + }, + { + "epoch": 1.080220737728725, + "grad_norm": 0.5463537573814392, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 9298 + }, + { + "epoch": 1.0803369154806854, + "grad_norm": 0.5508739948272705, + "learning_rate": 0.0001, + "loss": 1.304, + "step": 9299 + }, + { + "epoch": 1.080453093232646, + "grad_norm": 0.6003327369689941, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 9300 + }, + { + "epoch": 1.0805692709846064, + "grad_norm": 0.5155467987060547, + "learning_rate": 0.0001, + "loss": 1.3432, + "step": 9301 + }, + { + "epoch": 1.080685448736567, + "grad_norm": 0.5434906482696533, + "learning_rate": 0.0001, + "loss": 1.4054, + "step": 9302 + }, + { + "epoch": 1.0808016264885274, + "grad_norm": 0.5202093124389648, + "learning_rate": 0.0001, + "loss": 1.3913, + "step": 9303 + }, + { + "epoch": 1.080917804240488, + "grad_norm": 0.5816706418991089, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 9304 + }, + { + "epoch": 1.0810339819924484, + "grad_norm": 0.5725724101066589, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 9305 + }, + { + "epoch": 1.0811501597444089, + "grad_norm": 0.5437831282615662, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 9306 + }, + { + "epoch": 1.0812663374963694, + "grad_norm": 0.5659807920455933, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 9307 + }, + { + "epoch": 1.0813825152483298, + "grad_norm": 0.5198776125907898, + "learning_rate": 0.0001, + "loss": 1.4128, + "step": 9308 + }, + { + "epoch": 1.0814986930002903, + "grad_norm": 0.6664596199989319, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 9309 + }, + { + "epoch": 1.081614870752251, + "grad_norm": 0.5590860247612, + "learning_rate": 0.0001, + "loss": 1.4313, + "step": 9310 + }, + { + "epoch": 1.0817310485042115, + "grad_norm": 0.5522528290748596, + "learning_rate": 0.0001, + "loss": 1.378, + "step": 9311 + }, + { + "epoch": 1.081847226256172, + "grad_norm": 0.5644780397415161, + "learning_rate": 0.0001, + "loss": 1.3321, + "step": 9312 + }, + { + "epoch": 1.0819634040081325, + "grad_norm": 0.5319089889526367, + "learning_rate": 0.0001, + "loss": 1.2627, + "step": 9313 + }, + { + "epoch": 1.082079581760093, + "grad_norm": 0.5518312454223633, + "learning_rate": 0.0001, + "loss": 1.3252, + "step": 9314 + }, + { + "epoch": 1.0821957595120535, + "grad_norm": 0.5743319392204285, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 9315 + }, + { + "epoch": 1.082311937264014, + "grad_norm": 0.5341470241546631, + "learning_rate": 0.0001, + "loss": 1.348, + "step": 9316 + }, + { + "epoch": 1.0824281150159745, + "grad_norm": 0.5499274134635925, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 9317 + }, + { + "epoch": 1.082544292767935, + "grad_norm": 0.5466471910476685, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 9318 + }, + { + "epoch": 1.0826604705198954, + "grad_norm": 0.5436463356018066, + "learning_rate": 0.0001, + "loss": 1.3873, + "step": 9319 + }, + { + "epoch": 1.082776648271856, + "grad_norm": 0.5741965770721436, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 9320 + }, + { + "epoch": 1.0828928260238164, + "grad_norm": 0.5448581576347351, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 9321 + }, + { + "epoch": 1.083009003775777, + "grad_norm": 0.5822571516036987, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 9322 + }, + { + "epoch": 1.0831251815277374, + "grad_norm": 0.6465715765953064, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 9323 + }, + { + "epoch": 1.083241359279698, + "grad_norm": 0.5584543943405151, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 9324 + }, + { + "epoch": 1.0833575370316584, + "grad_norm": 0.5714010000228882, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 9325 + }, + { + "epoch": 1.0834737147836189, + "grad_norm": 0.5465094447135925, + "learning_rate": 0.0001, + "loss": 1.4766, + "step": 9326 + }, + { + "epoch": 1.0835898925355794, + "grad_norm": 0.5097503066062927, + "learning_rate": 0.0001, + "loss": 1.3755, + "step": 9327 + }, + { + "epoch": 1.0837060702875398, + "grad_norm": 0.5056722164154053, + "learning_rate": 0.0001, + "loss": 1.2346, + "step": 9328 + }, + { + "epoch": 1.0838222480395003, + "grad_norm": 0.565323531627655, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 9329 + }, + { + "epoch": 1.083938425791461, + "grad_norm": 0.5437831878662109, + "learning_rate": 0.0001, + "loss": 1.3076, + "step": 9330 + }, + { + "epoch": 1.0840546035434215, + "grad_norm": 0.5402218103408813, + "learning_rate": 0.0001, + "loss": 1.3891, + "step": 9331 + }, + { + "epoch": 1.084170781295382, + "grad_norm": 0.5713097453117371, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 9332 + }, + { + "epoch": 1.0842869590473425, + "grad_norm": 0.5773684978485107, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 9333 + }, + { + "epoch": 1.084403136799303, + "grad_norm": 0.4951843023300171, + "learning_rate": 0.0001, + "loss": 1.1599, + "step": 9334 + }, + { + "epoch": 1.0845193145512635, + "grad_norm": 0.5116522312164307, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 9335 + }, + { + "epoch": 1.084635492303224, + "grad_norm": 0.5548305511474609, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 9336 + }, + { + "epoch": 1.0847516700551845, + "grad_norm": 0.5454692840576172, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 9337 + }, + { + "epoch": 1.084867847807145, + "grad_norm": 0.5281533598899841, + "learning_rate": 0.0001, + "loss": 1.385, + "step": 9338 + }, + { + "epoch": 1.0849840255591054, + "grad_norm": 0.5260929465293884, + "learning_rate": 0.0001, + "loss": 1.3408, + "step": 9339 + }, + { + "epoch": 1.085100203311066, + "grad_norm": 0.5246114134788513, + "learning_rate": 0.0001, + "loss": 1.3622, + "step": 9340 + }, + { + "epoch": 1.0852163810630264, + "grad_norm": 0.5710988640785217, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 9341 + }, + { + "epoch": 1.085332558814987, + "grad_norm": 0.5941675901412964, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 9342 + }, + { + "epoch": 1.0854487365669474, + "grad_norm": 0.5492114424705505, + "learning_rate": 0.0001, + "loss": 1.4177, + "step": 9343 + }, + { + "epoch": 1.085564914318908, + "grad_norm": 0.5595629215240479, + "learning_rate": 0.0001, + "loss": 1.4012, + "step": 9344 + }, + { + "epoch": 1.0856810920708684, + "grad_norm": 0.5756628513336182, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 9345 + }, + { + "epoch": 1.0857972698228289, + "grad_norm": 0.5529462099075317, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 9346 + }, + { + "epoch": 1.0859134475747894, + "grad_norm": 0.5427260398864746, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 9347 + }, + { + "epoch": 1.0860296253267498, + "grad_norm": 0.49472326040267944, + "learning_rate": 0.0001, + "loss": 1.3941, + "step": 9348 + }, + { + "epoch": 1.0861458030787103, + "grad_norm": 0.5269715189933777, + "learning_rate": 0.0001, + "loss": 1.4096, + "step": 9349 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 0.5339666604995728, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 9350 + }, + { + "epoch": 1.0863781585826313, + "grad_norm": 0.5161392688751221, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 9351 + }, + { + "epoch": 1.086494336334592, + "grad_norm": 0.5255089998245239, + "learning_rate": 0.0001, + "loss": 1.3307, + "step": 9352 + }, + { + "epoch": 1.0866105140865525, + "grad_norm": 0.5742126107215881, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 9353 + }, + { + "epoch": 1.086726691838513, + "grad_norm": 0.5712875723838806, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 9354 + }, + { + "epoch": 1.0868428695904735, + "grad_norm": 0.5453868508338928, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 9355 + }, + { + "epoch": 1.086959047342434, + "grad_norm": 0.5706861019134521, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 9356 + }, + { + "epoch": 1.0870752250943945, + "grad_norm": 0.5392447710037231, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 9357 + }, + { + "epoch": 1.087191402846355, + "grad_norm": 0.5431585907936096, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 9358 + }, + { + "epoch": 1.0873075805983154, + "grad_norm": 0.6064168810844421, + "learning_rate": 0.0001, + "loss": 1.596, + "step": 9359 + }, + { + "epoch": 1.087423758350276, + "grad_norm": 0.6154705882072449, + "learning_rate": 0.0001, + "loss": 1.4216, + "step": 9360 + }, + { + "epoch": 1.0875399361022364, + "grad_norm": 0.5736680030822754, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 9361 + }, + { + "epoch": 1.087656113854197, + "grad_norm": 0.5389428734779358, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 9362 + }, + { + "epoch": 1.0877722916061574, + "grad_norm": 0.5451179146766663, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 9363 + }, + { + "epoch": 1.087888469358118, + "grad_norm": 0.5412275791168213, + "learning_rate": 0.0001, + "loss": 1.4005, + "step": 9364 + }, + { + "epoch": 1.0880046471100784, + "grad_norm": 0.5797693729400635, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 9365 + }, + { + "epoch": 1.0881208248620389, + "grad_norm": 0.5813042521476746, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 9366 + }, + { + "epoch": 1.0882370026139994, + "grad_norm": 0.5206052660942078, + "learning_rate": 0.0001, + "loss": 1.3434, + "step": 9367 + }, + { + "epoch": 1.0883531803659598, + "grad_norm": 0.5384072661399841, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 9368 + }, + { + "epoch": 1.0884693581179203, + "grad_norm": 0.49630725383758545, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 9369 + }, + { + "epoch": 1.0885855358698808, + "grad_norm": 0.5630286335945129, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 9370 + }, + { + "epoch": 1.0887017136218413, + "grad_norm": 0.5383244156837463, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 9371 + }, + { + "epoch": 1.088817891373802, + "grad_norm": 0.6035181879997253, + "learning_rate": 0.0001, + "loss": 1.489, + "step": 9372 + }, + { + "epoch": 1.0889340691257625, + "grad_norm": 0.5531001687049866, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 9373 + }, + { + "epoch": 1.089050246877723, + "grad_norm": 0.5467984676361084, + "learning_rate": 0.0001, + "loss": 1.3719, + "step": 9374 + }, + { + "epoch": 1.0891664246296835, + "grad_norm": 0.5076682567596436, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 9375 + }, + { + "epoch": 1.089282602381644, + "grad_norm": 0.5291927456855774, + "learning_rate": 0.0001, + "loss": 1.3413, + "step": 9376 + }, + { + "epoch": 1.0893987801336045, + "grad_norm": 0.4952625632286072, + "learning_rate": 0.0001, + "loss": 1.3823, + "step": 9377 + }, + { + "epoch": 1.089514957885565, + "grad_norm": 0.585818350315094, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 9378 + }, + { + "epoch": 1.0896311356375254, + "grad_norm": 0.6007453799247742, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 9379 + }, + { + "epoch": 1.089747313389486, + "grad_norm": 0.5968008637428284, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 9380 + }, + { + "epoch": 1.0898634911414464, + "grad_norm": 0.5476697087287903, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 9381 + }, + { + "epoch": 1.089979668893407, + "grad_norm": 0.5096556544303894, + "learning_rate": 0.0001, + "loss": 1.2194, + "step": 9382 + }, + { + "epoch": 1.0900958466453674, + "grad_norm": 0.5080416798591614, + "learning_rate": 0.0001, + "loss": 1.23, + "step": 9383 + }, + { + "epoch": 1.090212024397328, + "grad_norm": 0.5420169234275818, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 9384 + }, + { + "epoch": 1.0903282021492884, + "grad_norm": 0.543263852596283, + "learning_rate": 0.0001, + "loss": 1.3452, + "step": 9385 + }, + { + "epoch": 1.0904443799012489, + "grad_norm": 0.500427782535553, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 9386 + }, + { + "epoch": 1.0905605576532094, + "grad_norm": 0.5219041109085083, + "learning_rate": 0.0001, + "loss": 1.2534, + "step": 9387 + }, + { + "epoch": 1.0906767354051699, + "grad_norm": 0.5618671774864197, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 9388 + }, + { + "epoch": 1.0907929131571303, + "grad_norm": 0.567960262298584, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 9389 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.5592353940010071, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 9390 + }, + { + "epoch": 1.0910252686610513, + "grad_norm": 0.5943126082420349, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 9391 + }, + { + "epoch": 1.0911414464130118, + "grad_norm": 0.5825600028038025, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 9392 + }, + { + "epoch": 1.0912576241649723, + "grad_norm": 0.5191078186035156, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 9393 + }, + { + "epoch": 1.091373801916933, + "grad_norm": 0.5243180394172668, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 9394 + }, + { + "epoch": 1.0914899796688935, + "grad_norm": 0.548667311668396, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 9395 + }, + { + "epoch": 1.091606157420854, + "grad_norm": 0.5515474081039429, + "learning_rate": 0.0001, + "loss": 1.3246, + "step": 9396 + }, + { + "epoch": 1.0917223351728145, + "grad_norm": 0.5582205057144165, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 9397 + }, + { + "epoch": 1.091838512924775, + "grad_norm": 0.5735296607017517, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 9398 + }, + { + "epoch": 1.0919546906767355, + "grad_norm": 0.5381777286529541, + "learning_rate": 0.0001, + "loss": 1.4056, + "step": 9399 + }, + { + "epoch": 1.092070868428696, + "grad_norm": 0.506504237651825, + "learning_rate": 0.0001, + "loss": 1.3763, + "step": 9400 + }, + { + "epoch": 1.0921870461806564, + "grad_norm": 0.5787509679794312, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 9401 + }, + { + "epoch": 1.092303223932617, + "grad_norm": 0.543312132358551, + "learning_rate": 0.0001, + "loss": 1.3438, + "step": 9402 + }, + { + "epoch": 1.0924194016845774, + "grad_norm": 0.5725640058517456, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 9403 + }, + { + "epoch": 1.092535579436538, + "grad_norm": 0.5353484153747559, + "learning_rate": 0.0001, + "loss": 1.2541, + "step": 9404 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 0.5454126596450806, + "learning_rate": 0.0001, + "loss": 1.3389, + "step": 9405 + }, + { + "epoch": 1.0927679349404589, + "grad_norm": 0.549069881439209, + "learning_rate": 0.0001, + "loss": 1.5135, + "step": 9406 + }, + { + "epoch": 1.0928841126924194, + "grad_norm": 0.572575032711029, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 9407 + }, + { + "epoch": 1.0930002904443799, + "grad_norm": 0.5477794408798218, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 9408 + }, + { + "epoch": 1.0931164681963403, + "grad_norm": 0.5575615167617798, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 9409 + }, + { + "epoch": 1.0932326459483008, + "grad_norm": 0.502451479434967, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 9410 + }, + { + "epoch": 1.0933488237002613, + "grad_norm": 0.5695579051971436, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 9411 + }, + { + "epoch": 1.0934650014522218, + "grad_norm": 0.5555123686790466, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 9412 + }, + { + "epoch": 1.0935811792041823, + "grad_norm": 0.5110344886779785, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 9413 + }, + { + "epoch": 1.093697356956143, + "grad_norm": 0.514350414276123, + "learning_rate": 0.0001, + "loss": 1.3294, + "step": 9414 + }, + { + "epoch": 1.0938135347081035, + "grad_norm": 0.570740818977356, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 9415 + }, + { + "epoch": 1.093929712460064, + "grad_norm": 0.5143060684204102, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 9416 + }, + { + "epoch": 1.0940458902120245, + "grad_norm": 0.5782046914100647, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 9417 + }, + { + "epoch": 1.094162067963985, + "grad_norm": 0.5496023893356323, + "learning_rate": 0.0001, + "loss": 1.3474, + "step": 9418 + }, + { + "epoch": 1.0942782457159455, + "grad_norm": 0.4826601445674896, + "learning_rate": 0.0001, + "loss": 1.1978, + "step": 9419 + }, + { + "epoch": 1.094394423467906, + "grad_norm": 0.5562894940376282, + "learning_rate": 0.0001, + "loss": 1.4771, + "step": 9420 + }, + { + "epoch": 1.0945106012198664, + "grad_norm": 0.5985347032546997, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 9421 + }, + { + "epoch": 1.094626778971827, + "grad_norm": 0.5639268755912781, + "learning_rate": 0.0001, + "loss": 1.359, + "step": 9422 + }, + { + "epoch": 1.0947429567237874, + "grad_norm": 0.6072770357131958, + "learning_rate": 0.0001, + "loss": 1.7112, + "step": 9423 + }, + { + "epoch": 1.094859134475748, + "grad_norm": 0.5154184103012085, + "learning_rate": 0.0001, + "loss": 1.1247, + "step": 9424 + }, + { + "epoch": 1.0949753122277084, + "grad_norm": 0.6177548170089722, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 9425 + }, + { + "epoch": 1.0950914899796689, + "grad_norm": 0.5776795148849487, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 9426 + }, + { + "epoch": 1.0952076677316294, + "grad_norm": 0.5506322383880615, + "learning_rate": 0.0001, + "loss": 1.4099, + "step": 9427 + }, + { + "epoch": 1.0953238454835899, + "grad_norm": 0.5430233478546143, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 9428 + }, + { + "epoch": 1.0954400232355503, + "grad_norm": 0.5830442309379578, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 9429 + }, + { + "epoch": 1.0955562009875108, + "grad_norm": 0.541569173336029, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 9430 + }, + { + "epoch": 1.0956723787394713, + "grad_norm": 0.556121289730072, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 9431 + }, + { + "epoch": 1.0957885564914318, + "grad_norm": 0.5274482369422913, + "learning_rate": 0.0001, + "loss": 1.4293, + "step": 9432 + }, + { + "epoch": 1.0959047342433923, + "grad_norm": 0.5375184416770935, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 9433 + }, + { + "epoch": 1.0960209119953528, + "grad_norm": 0.5496751666069031, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 9434 + }, + { + "epoch": 1.0961370897473133, + "grad_norm": 0.5520657896995544, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 9435 + }, + { + "epoch": 1.096253267499274, + "grad_norm": 0.5120051503181458, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 9436 + }, + { + "epoch": 1.0963694452512345, + "grad_norm": 0.5237596035003662, + "learning_rate": 0.0001, + "loss": 1.4054, + "step": 9437 + }, + { + "epoch": 1.096485623003195, + "grad_norm": 0.5413674116134644, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 9438 + }, + { + "epoch": 1.0966018007551555, + "grad_norm": 0.5916610360145569, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 9439 + }, + { + "epoch": 1.096717978507116, + "grad_norm": 0.6033625602722168, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 9440 + }, + { + "epoch": 1.0968341562590764, + "grad_norm": 0.549626886844635, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 9441 + }, + { + "epoch": 1.096950334011037, + "grad_norm": 0.566194474697113, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 9442 + }, + { + "epoch": 1.0970665117629974, + "grad_norm": 0.5590158700942993, + "learning_rate": 0.0001, + "loss": 1.4561, + "step": 9443 + }, + { + "epoch": 1.097182689514958, + "grad_norm": 0.5413280129432678, + "learning_rate": 0.0001, + "loss": 1.4254, + "step": 9444 + }, + { + "epoch": 1.0972988672669184, + "grad_norm": 0.5623098611831665, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 9445 + }, + { + "epoch": 1.0974150450188789, + "grad_norm": 0.6405941843986511, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 9446 + }, + { + "epoch": 1.0975312227708394, + "grad_norm": 0.5057178735733032, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 9447 + }, + { + "epoch": 1.0976474005227999, + "grad_norm": 0.5987020134925842, + "learning_rate": 0.0001, + "loss": 1.308, + "step": 9448 + }, + { + "epoch": 1.0977635782747603, + "grad_norm": 0.5348968505859375, + "learning_rate": 0.0001, + "loss": 1.344, + "step": 9449 + }, + { + "epoch": 1.0978797560267208, + "grad_norm": 0.5361587405204773, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 9450 + }, + { + "epoch": 1.0979959337786813, + "grad_norm": 0.549802839756012, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 9451 + }, + { + "epoch": 1.0981121115306418, + "grad_norm": 0.5512414574623108, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 9452 + }, + { + "epoch": 1.0982282892826023, + "grad_norm": 0.5330852270126343, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 9453 + }, + { + "epoch": 1.0983444670345628, + "grad_norm": 0.5517197847366333, + "learning_rate": 0.0001, + "loss": 1.3646, + "step": 9454 + }, + { + "epoch": 1.0984606447865233, + "grad_norm": 0.5206728577613831, + "learning_rate": 0.0001, + "loss": 1.3936, + "step": 9455 + }, + { + "epoch": 1.098576822538484, + "grad_norm": 0.5265716910362244, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 9456 + }, + { + "epoch": 1.0986930002904445, + "grad_norm": 0.5415132641792297, + "learning_rate": 0.0001, + "loss": 1.385, + "step": 9457 + }, + { + "epoch": 1.098809178042405, + "grad_norm": 0.5541356801986694, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 9458 + }, + { + "epoch": 1.0989253557943655, + "grad_norm": 0.5190122723579407, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 9459 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 0.554965615272522, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 9460 + }, + { + "epoch": 1.0991577112982864, + "grad_norm": 0.5586269497871399, + "learning_rate": 0.0001, + "loss": 1.4455, + "step": 9461 + }, + { + "epoch": 1.099273889050247, + "grad_norm": 0.5546631217002869, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 9462 + }, + { + "epoch": 1.0993900668022074, + "grad_norm": 0.5459680557250977, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 9463 + }, + { + "epoch": 1.099506244554168, + "grad_norm": 0.568166196346283, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 9464 + }, + { + "epoch": 1.0996224223061284, + "grad_norm": 0.5397794842720032, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 9465 + }, + { + "epoch": 1.0997386000580889, + "grad_norm": 0.5615697503089905, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 9466 + }, + { + "epoch": 1.0998547778100494, + "grad_norm": 0.6064543128013611, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 9467 + }, + { + "epoch": 1.0999709555620099, + "grad_norm": 0.5744174718856812, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 9468 + }, + { + "epoch": 1.1000871333139703, + "grad_norm": 0.5579218864440918, + "learning_rate": 0.0001, + "loss": 1.4867, + "step": 9469 + }, + { + "epoch": 1.1002033110659308, + "grad_norm": 0.5771361589431763, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 9470 + }, + { + "epoch": 1.1003194888178913, + "grad_norm": 0.575631856918335, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 9471 + }, + { + "epoch": 1.1004356665698518, + "grad_norm": 0.5344403982162476, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 9472 + }, + { + "epoch": 1.1005518443218123, + "grad_norm": 0.5184001922607422, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 9473 + }, + { + "epoch": 1.1006680220737728, + "grad_norm": 0.5264585018157959, + "learning_rate": 0.0001, + "loss": 1.3588, + "step": 9474 + }, + { + "epoch": 1.1007841998257333, + "grad_norm": 0.5590055584907532, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 9475 + }, + { + "epoch": 1.1009003775776938, + "grad_norm": 0.5235846638679504, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 9476 + }, + { + "epoch": 1.1010165553296543, + "grad_norm": 0.5595501661300659, + "learning_rate": 0.0001, + "loss": 1.4029, + "step": 9477 + }, + { + "epoch": 1.101132733081615, + "grad_norm": 0.5435571074485779, + "learning_rate": 0.0001, + "loss": 1.4213, + "step": 9478 + }, + { + "epoch": 1.1012489108335755, + "grad_norm": 0.5264965295791626, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 9479 + }, + { + "epoch": 1.101365088585536, + "grad_norm": 0.529292643070221, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 9480 + }, + { + "epoch": 1.1014812663374964, + "grad_norm": 0.5540322661399841, + "learning_rate": 0.0001, + "loss": 1.5225, + "step": 9481 + }, + { + "epoch": 1.101597444089457, + "grad_norm": 0.5076307058334351, + "learning_rate": 0.0001, + "loss": 1.3656, + "step": 9482 + }, + { + "epoch": 1.1017136218414174, + "grad_norm": 0.5783825516700745, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 9483 + }, + { + "epoch": 1.101829799593378, + "grad_norm": 0.5533871650695801, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 9484 + }, + { + "epoch": 1.1019459773453384, + "grad_norm": 0.5388155579566956, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 9485 + }, + { + "epoch": 1.1020621550972989, + "grad_norm": 0.5467915534973145, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 9486 + }, + { + "epoch": 1.1021783328492594, + "grad_norm": 0.6151425242424011, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 9487 + }, + { + "epoch": 1.1022945106012199, + "grad_norm": 0.5877282023429871, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 9488 + }, + { + "epoch": 1.1024106883531803, + "grad_norm": 0.5632818937301636, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 9489 + }, + { + "epoch": 1.1025268661051408, + "grad_norm": 0.5703235268592834, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 9490 + }, + { + "epoch": 1.1026430438571013, + "grad_norm": 0.563475489616394, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 9491 + }, + { + "epoch": 1.1027592216090618, + "grad_norm": 0.5732678174972534, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 9492 + }, + { + "epoch": 1.1028753993610223, + "grad_norm": 0.5850802659988403, + "learning_rate": 0.0001, + "loss": 1.4957, + "step": 9493 + }, + { + "epoch": 1.1029915771129828, + "grad_norm": 0.5793954133987427, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 9494 + }, + { + "epoch": 1.1031077548649433, + "grad_norm": 0.5984706878662109, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 9495 + }, + { + "epoch": 1.1032239326169038, + "grad_norm": 0.5244345664978027, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 9496 + }, + { + "epoch": 1.1033401103688645, + "grad_norm": 0.5405702590942383, + "learning_rate": 0.0001, + "loss": 1.359, + "step": 9497 + }, + { + "epoch": 1.103456288120825, + "grad_norm": 0.5445573329925537, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 9498 + }, + { + "epoch": 1.1035724658727855, + "grad_norm": 0.5632995367050171, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 9499 + }, + { + "epoch": 1.103688643624746, + "grad_norm": 0.5281904935836792, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 9500 + }, + { + "epoch": 1.1038048213767064, + "grad_norm": 0.5330088138580322, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 9501 + }, + { + "epoch": 1.103920999128667, + "grad_norm": 0.514532744884491, + "learning_rate": 0.0001, + "loss": 1.4414, + "step": 9502 + }, + { + "epoch": 1.1040371768806274, + "grad_norm": 0.548172652721405, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 9503 + }, + { + "epoch": 1.104153354632588, + "grad_norm": 0.5549656748771667, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 9504 + }, + { + "epoch": 1.1042695323845484, + "grad_norm": 0.5625692009925842, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 9505 + }, + { + "epoch": 1.1043857101365089, + "grad_norm": 0.5380663871765137, + "learning_rate": 0.0001, + "loss": 1.3388, + "step": 9506 + }, + { + "epoch": 1.1045018878884694, + "grad_norm": 0.530598521232605, + "learning_rate": 0.0001, + "loss": 1.3424, + "step": 9507 + }, + { + "epoch": 1.1046180656404299, + "grad_norm": 0.5374849438667297, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 9508 + }, + { + "epoch": 1.1047342433923903, + "grad_norm": 0.5946294665336609, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 9509 + }, + { + "epoch": 1.1048504211443508, + "grad_norm": 0.5669423937797546, + "learning_rate": 0.0001, + "loss": 1.3185, + "step": 9510 + }, + { + "epoch": 1.1049665988963113, + "grad_norm": 0.6064764261245728, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 9511 + }, + { + "epoch": 1.1050827766482718, + "grad_norm": 0.5889172554016113, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 9512 + }, + { + "epoch": 1.1051989544002323, + "grad_norm": 0.5890141725540161, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 9513 + }, + { + "epoch": 1.1053151321521928, + "grad_norm": 0.5455361604690552, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 9514 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 0.5481176972389221, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 9515 + }, + { + "epoch": 1.1055474876561138, + "grad_norm": 0.602695882320404, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 9516 + }, + { + "epoch": 1.1056636654080743, + "grad_norm": 0.5367211103439331, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 9517 + }, + { + "epoch": 1.1057798431600347, + "grad_norm": 0.5153917670249939, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 9518 + }, + { + "epoch": 1.1058960209119952, + "grad_norm": 0.523540735244751, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 9519 + }, + { + "epoch": 1.106012198663956, + "grad_norm": 0.5269156098365784, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 9520 + }, + { + "epoch": 1.1061283764159164, + "grad_norm": 0.5302053689956665, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 9521 + }, + { + "epoch": 1.106244554167877, + "grad_norm": 0.5808258652687073, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 9522 + }, + { + "epoch": 1.1063607319198374, + "grad_norm": 0.6308403611183167, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 9523 + }, + { + "epoch": 1.106476909671798, + "grad_norm": 0.6031569838523865, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 9524 + }, + { + "epoch": 1.1065930874237584, + "grad_norm": 0.5299091339111328, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 9525 + }, + { + "epoch": 1.1067092651757189, + "grad_norm": 0.5627480745315552, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 9526 + }, + { + "epoch": 1.1068254429276794, + "grad_norm": 0.5670896172523499, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 9527 + }, + { + "epoch": 1.1069416206796399, + "grad_norm": 0.5727272629737854, + "learning_rate": 0.0001, + "loss": 1.3688, + "step": 9528 + }, + { + "epoch": 1.1070577984316003, + "grad_norm": 0.4948303699493408, + "learning_rate": 0.0001, + "loss": 1.3057, + "step": 9529 + }, + { + "epoch": 1.1071739761835608, + "grad_norm": 0.5203784108161926, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 9530 + }, + { + "epoch": 1.1072901539355213, + "grad_norm": 0.5920235514640808, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 9531 + }, + { + "epoch": 1.1074063316874818, + "grad_norm": 0.5513145923614502, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 9532 + }, + { + "epoch": 1.1075225094394423, + "grad_norm": 0.5774796605110168, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 9533 + }, + { + "epoch": 1.1076386871914028, + "grad_norm": 0.5905603170394897, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 9534 + }, + { + "epoch": 1.1077548649433633, + "grad_norm": 0.5975148677825928, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 9535 + }, + { + "epoch": 1.1078710426953238, + "grad_norm": 0.5394390821456909, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 9536 + }, + { + "epoch": 1.1079872204472843, + "grad_norm": 0.5816001296043396, + "learning_rate": 0.0001, + "loss": 1.4818, + "step": 9537 + }, + { + "epoch": 1.1081033981992447, + "grad_norm": 0.5599253177642822, + "learning_rate": 0.0001, + "loss": 1.3033, + "step": 9538 + }, + { + "epoch": 1.1082195759512055, + "grad_norm": 0.5270270109176636, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 9539 + }, + { + "epoch": 1.108335753703166, + "grad_norm": 0.5689064264297485, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 9540 + }, + { + "epoch": 1.1084519314551264, + "grad_norm": 0.5355569124221802, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 9541 + }, + { + "epoch": 1.108568109207087, + "grad_norm": 0.5691352486610413, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 9542 + }, + { + "epoch": 1.1086842869590474, + "grad_norm": 0.5562692284584045, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 9543 + }, + { + "epoch": 1.108800464711008, + "grad_norm": 0.5737292766571045, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 9544 + }, + { + "epoch": 1.1089166424629684, + "grad_norm": 0.5629547238349915, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 9545 + }, + { + "epoch": 1.1090328202149289, + "grad_norm": 0.5484119057655334, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 9546 + }, + { + "epoch": 1.1091489979668894, + "grad_norm": 0.5538215041160583, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 9547 + }, + { + "epoch": 1.1092651757188499, + "grad_norm": 0.5706968903541565, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 9548 + }, + { + "epoch": 1.1093813534708103, + "grad_norm": 0.5173110961914062, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 9549 + }, + { + "epoch": 1.1094975312227708, + "grad_norm": 0.5889958143234253, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 9550 + }, + { + "epoch": 1.1096137089747313, + "grad_norm": 0.5318642258644104, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 9551 + }, + { + "epoch": 1.1097298867266918, + "grad_norm": 0.5898750424385071, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 9552 + }, + { + "epoch": 1.1098460644786523, + "grad_norm": 0.5130350589752197, + "learning_rate": 0.0001, + "loss": 1.3356, + "step": 9553 + }, + { + "epoch": 1.1099622422306128, + "grad_norm": 0.5386868119239807, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 9554 + }, + { + "epoch": 1.1100784199825733, + "grad_norm": 0.5684298872947693, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 9555 + }, + { + "epoch": 1.1101945977345338, + "grad_norm": 0.5544883012771606, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 9556 + }, + { + "epoch": 1.1103107754864943, + "grad_norm": 0.5304974913597107, + "learning_rate": 0.0001, + "loss": 1.3058, + "step": 9557 + }, + { + "epoch": 1.1104269532384548, + "grad_norm": 0.5397143363952637, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 9558 + }, + { + "epoch": 1.1105431309904152, + "grad_norm": 0.5635391473770142, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 9559 + }, + { + "epoch": 1.1106593087423757, + "grad_norm": 0.5330328345298767, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 9560 + }, + { + "epoch": 1.1107754864943362, + "grad_norm": 0.5222686529159546, + "learning_rate": 0.0001, + "loss": 1.2798, + "step": 9561 + }, + { + "epoch": 1.110891664246297, + "grad_norm": 0.5562769770622253, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 9562 + }, + { + "epoch": 1.1110078419982574, + "grad_norm": 0.5504674911499023, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 9563 + }, + { + "epoch": 1.111124019750218, + "grad_norm": 0.5468124747276306, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 9564 + }, + { + "epoch": 1.1112401975021784, + "grad_norm": 0.5254167914390564, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 9565 + }, + { + "epoch": 1.1113563752541389, + "grad_norm": 0.5691385269165039, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 9566 + }, + { + "epoch": 1.1114725530060994, + "grad_norm": 0.570458710193634, + "learning_rate": 0.0001, + "loss": 1.4127, + "step": 9567 + }, + { + "epoch": 1.1115887307580599, + "grad_norm": 0.5755772590637207, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 9568 + }, + { + "epoch": 1.1117049085100204, + "grad_norm": 0.5539082288742065, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 9569 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 0.5218854546546936, + "learning_rate": 0.0001, + "loss": 1.2754, + "step": 9570 + }, + { + "epoch": 1.1119372640139413, + "grad_norm": 0.5608935952186584, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 9571 + }, + { + "epoch": 1.1120534417659018, + "grad_norm": 0.5075331330299377, + "learning_rate": 0.0001, + "loss": 1.2668, + "step": 9572 + }, + { + "epoch": 1.1121696195178623, + "grad_norm": 0.5797237753868103, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 9573 + }, + { + "epoch": 1.1122857972698228, + "grad_norm": 0.5733375549316406, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 9574 + }, + { + "epoch": 1.1124019750217833, + "grad_norm": 0.5502446293830872, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 9575 + }, + { + "epoch": 1.1125181527737438, + "grad_norm": 0.5844667553901672, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 9576 + }, + { + "epoch": 1.1126343305257043, + "grad_norm": 0.5160120129585266, + "learning_rate": 0.0001, + "loss": 1.3696, + "step": 9577 + }, + { + "epoch": 1.1127505082776648, + "grad_norm": 0.5747503042221069, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 9578 + }, + { + "epoch": 1.1128666860296252, + "grad_norm": 0.5804192423820496, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 9579 + }, + { + "epoch": 1.1129828637815857, + "grad_norm": 0.5319089293479919, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 9580 + }, + { + "epoch": 1.1130990415335464, + "grad_norm": 0.5397303104400635, + "learning_rate": 0.0001, + "loss": 1.3097, + "step": 9581 + }, + { + "epoch": 1.113215219285507, + "grad_norm": 0.5700457692146301, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 9582 + }, + { + "epoch": 1.1133313970374674, + "grad_norm": 0.5417009592056274, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 9583 + }, + { + "epoch": 1.113447574789428, + "grad_norm": 0.573021650314331, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 9584 + }, + { + "epoch": 1.1135637525413884, + "grad_norm": 0.5706760883331299, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 9585 + }, + { + "epoch": 1.1136799302933489, + "grad_norm": 0.5758902430534363, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 9586 + }, + { + "epoch": 1.1137961080453094, + "grad_norm": 0.541764497756958, + "learning_rate": 0.0001, + "loss": 1.3593, + "step": 9587 + }, + { + "epoch": 1.1139122857972699, + "grad_norm": 0.5544206500053406, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 9588 + }, + { + "epoch": 1.1140284635492304, + "grad_norm": 0.6012476086616516, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 9589 + }, + { + "epoch": 1.1141446413011908, + "grad_norm": 0.5391044020652771, + "learning_rate": 0.0001, + "loss": 1.2987, + "step": 9590 + }, + { + "epoch": 1.1142608190531513, + "grad_norm": 0.547203540802002, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 9591 + }, + { + "epoch": 1.1143769968051118, + "grad_norm": 0.5447909832000732, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 9592 + }, + { + "epoch": 1.1144931745570723, + "grad_norm": 0.5598756074905396, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 9593 + }, + { + "epoch": 1.1146093523090328, + "grad_norm": 0.5283433198928833, + "learning_rate": 0.0001, + "loss": 1.2927, + "step": 9594 + }, + { + "epoch": 1.1147255300609933, + "grad_norm": 0.5749019384384155, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 9595 + }, + { + "epoch": 1.1148417078129538, + "grad_norm": 0.5759902000427246, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 9596 + }, + { + "epoch": 1.1149578855649143, + "grad_norm": 0.5656595230102539, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 9597 + }, + { + "epoch": 1.1150740633168748, + "grad_norm": 0.5748841762542725, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 9598 + }, + { + "epoch": 1.1151902410688352, + "grad_norm": 0.4959535002708435, + "learning_rate": 0.0001, + "loss": 1.3178, + "step": 9599 + }, + { + "epoch": 1.1153064188207957, + "grad_norm": 0.5196616053581238, + "learning_rate": 0.0001, + "loss": 1.36, + "step": 9600 + }, + { + "epoch": 1.1154225965727562, + "grad_norm": 0.5188265442848206, + "learning_rate": 0.0001, + "loss": 1.4426, + "step": 9601 + }, + { + "epoch": 1.1155387743247167, + "grad_norm": 0.5687859058380127, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 9602 + }, + { + "epoch": 1.1156549520766774, + "grad_norm": 0.5094379186630249, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 9603 + }, + { + "epoch": 1.115771129828638, + "grad_norm": 0.5415772199630737, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 9604 + }, + { + "epoch": 1.1158873075805984, + "grad_norm": 0.5665364861488342, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 9605 + }, + { + "epoch": 1.1160034853325589, + "grad_norm": 0.5534007549285889, + "learning_rate": 0.0001, + "loss": 1.4372, + "step": 9606 + }, + { + "epoch": 1.1161196630845194, + "grad_norm": 0.5435113310813904, + "learning_rate": 0.0001, + "loss": 1.3756, + "step": 9607 + }, + { + "epoch": 1.1162358408364799, + "grad_norm": 0.5332193374633789, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 9608 + }, + { + "epoch": 1.1163520185884404, + "grad_norm": 0.5340940952301025, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 9609 + }, + { + "epoch": 1.1164681963404008, + "grad_norm": 0.5510147213935852, + "learning_rate": 0.0001, + "loss": 1.4416, + "step": 9610 + }, + { + "epoch": 1.1165843740923613, + "grad_norm": 0.5450196266174316, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 9611 + }, + { + "epoch": 1.1167005518443218, + "grad_norm": 0.5567142367362976, + "learning_rate": 0.0001, + "loss": 1.5802, + "step": 9612 + }, + { + "epoch": 1.1168167295962823, + "grad_norm": 0.5469493865966797, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 9613 + }, + { + "epoch": 1.1169329073482428, + "grad_norm": 0.5908190011978149, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 9614 + }, + { + "epoch": 1.1170490851002033, + "grad_norm": 0.5396865010261536, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 9615 + }, + { + "epoch": 1.1171652628521638, + "grad_norm": 0.5347968339920044, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 9616 + }, + { + "epoch": 1.1172814406041243, + "grad_norm": 0.5435789823532104, + "learning_rate": 0.0001, + "loss": 1.4568, + "step": 9617 + }, + { + "epoch": 1.1173976183560848, + "grad_norm": 0.530580461025238, + "learning_rate": 0.0001, + "loss": 1.3363, + "step": 9618 + }, + { + "epoch": 1.1175137961080452, + "grad_norm": 0.5703923106193542, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 9619 + }, + { + "epoch": 1.1176299738600057, + "grad_norm": 0.5795918703079224, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 9620 + }, + { + "epoch": 1.1177461516119662, + "grad_norm": 0.5287982225418091, + "learning_rate": 0.0001, + "loss": 1.4183, + "step": 9621 + }, + { + "epoch": 1.1178623293639267, + "grad_norm": 0.5755120515823364, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 9622 + }, + { + "epoch": 1.1179785071158874, + "grad_norm": 0.569776713848114, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 9623 + }, + { + "epoch": 1.118094684867848, + "grad_norm": 0.5526320338249207, + "learning_rate": 0.0001, + "loss": 1.4253, + "step": 9624 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.5705094337463379, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 9625 + }, + { + "epoch": 1.1183270403717689, + "grad_norm": 0.5555620789527893, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 9626 + }, + { + "epoch": 1.1184432181237294, + "grad_norm": 0.5216465592384338, + "learning_rate": 0.0001, + "loss": 1.3455, + "step": 9627 + }, + { + "epoch": 1.1185593958756899, + "grad_norm": 0.5284774899482727, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 9628 + }, + { + "epoch": 1.1186755736276504, + "grad_norm": 0.47694408893585205, + "learning_rate": 0.0001, + "loss": 1.2213, + "step": 9629 + }, + { + "epoch": 1.1187917513796108, + "grad_norm": 0.547134280204773, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 9630 + }, + { + "epoch": 1.1189079291315713, + "grad_norm": 0.5496186017990112, + "learning_rate": 0.0001, + "loss": 1.4448, + "step": 9631 + }, + { + "epoch": 1.1190241068835318, + "grad_norm": 0.6156900525093079, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 9632 + }, + { + "epoch": 1.1191402846354923, + "grad_norm": 0.6084433794021606, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 9633 + }, + { + "epoch": 1.1192564623874528, + "grad_norm": 0.5293398499488831, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 9634 + }, + { + "epoch": 1.1193726401394133, + "grad_norm": 0.550264835357666, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 9635 + }, + { + "epoch": 1.1194888178913738, + "grad_norm": 0.5736042261123657, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 9636 + }, + { + "epoch": 1.1196049956433343, + "grad_norm": 0.5628027319908142, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 9637 + }, + { + "epoch": 1.1197211733952948, + "grad_norm": 0.5793375372886658, + "learning_rate": 0.0001, + "loss": 1.351, + "step": 9638 + }, + { + "epoch": 1.1198373511472552, + "grad_norm": 0.5381454229354858, + "learning_rate": 0.0001, + "loss": 1.4153, + "step": 9639 + }, + { + "epoch": 1.1199535288992157, + "grad_norm": 0.48907560110092163, + "learning_rate": 0.0001, + "loss": 1.3104, + "step": 9640 + }, + { + "epoch": 1.1200697066511762, + "grad_norm": 0.5359677672386169, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 9641 + }, + { + "epoch": 1.1201858844031367, + "grad_norm": 0.5884495377540588, + "learning_rate": 0.0001, + "loss": 1.4382, + "step": 9642 + }, + { + "epoch": 1.1203020621550972, + "grad_norm": 0.5695892572402954, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 9643 + }, + { + "epoch": 1.1204182399070577, + "grad_norm": 0.5442866683006287, + "learning_rate": 0.0001, + "loss": 1.2676, + "step": 9644 + }, + { + "epoch": 1.1205344176590184, + "grad_norm": 0.5650526881217957, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 9645 + }, + { + "epoch": 1.120650595410979, + "grad_norm": 0.6149001717567444, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 9646 + }, + { + "epoch": 1.1207667731629394, + "grad_norm": 0.5208737254142761, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 9647 + }, + { + "epoch": 1.1208829509148999, + "grad_norm": 0.5633329749107361, + "learning_rate": 0.0001, + "loss": 1.4319, + "step": 9648 + }, + { + "epoch": 1.1209991286668604, + "grad_norm": 0.524625301361084, + "learning_rate": 0.0001, + "loss": 1.3997, + "step": 9649 + }, + { + "epoch": 1.1211153064188208, + "grad_norm": 0.5944722890853882, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 9650 + }, + { + "epoch": 1.1212314841707813, + "grad_norm": 0.5723394155502319, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 9651 + }, + { + "epoch": 1.1213476619227418, + "grad_norm": 0.5389914512634277, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 9652 + }, + { + "epoch": 1.1214638396747023, + "grad_norm": 0.5923259854316711, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 9653 + }, + { + "epoch": 1.1215800174266628, + "grad_norm": 0.562023401260376, + "learning_rate": 0.0001, + "loss": 1.5501, + "step": 9654 + }, + { + "epoch": 1.1216961951786233, + "grad_norm": 0.5436960458755493, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 9655 + }, + { + "epoch": 1.1218123729305838, + "grad_norm": 0.5693610310554504, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 9656 + }, + { + "epoch": 1.1219285506825443, + "grad_norm": 0.5483703017234802, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 9657 + }, + { + "epoch": 1.1220447284345048, + "grad_norm": 0.5604730844497681, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 9658 + }, + { + "epoch": 1.1221609061864652, + "grad_norm": 0.5464431643486023, + "learning_rate": 0.0001, + "loss": 1.3452, + "step": 9659 + }, + { + "epoch": 1.1222770839384257, + "grad_norm": 0.6087496280670166, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 9660 + }, + { + "epoch": 1.1223932616903862, + "grad_norm": 0.5438372492790222, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 9661 + }, + { + "epoch": 1.1225094394423467, + "grad_norm": 0.5792588591575623, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 9662 + }, + { + "epoch": 1.1226256171943072, + "grad_norm": 0.5307050347328186, + "learning_rate": 0.0001, + "loss": 1.3508, + "step": 9663 + }, + { + "epoch": 1.1227417949462677, + "grad_norm": 0.5792557001113892, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 9664 + }, + { + "epoch": 1.1228579726982284, + "grad_norm": 0.568598747253418, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 9665 + }, + { + "epoch": 1.122974150450189, + "grad_norm": 0.5365720987319946, + "learning_rate": 0.0001, + "loss": 1.3632, + "step": 9666 + }, + { + "epoch": 1.1230903282021494, + "grad_norm": 0.5844710469245911, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 9667 + }, + { + "epoch": 1.1232065059541099, + "grad_norm": 0.5613291263580322, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 9668 + }, + { + "epoch": 1.1233226837060704, + "grad_norm": 0.5521247386932373, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 9669 + }, + { + "epoch": 1.1234388614580308, + "grad_norm": 0.5522918701171875, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 9670 + }, + { + "epoch": 1.1235550392099913, + "grad_norm": 0.5793119072914124, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 9671 + }, + { + "epoch": 1.1236712169619518, + "grad_norm": 0.5977482199668884, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 9672 + }, + { + "epoch": 1.1237873947139123, + "grad_norm": 0.5508487224578857, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 9673 + }, + { + "epoch": 1.1239035724658728, + "grad_norm": 0.5501763820648193, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 9674 + }, + { + "epoch": 1.1240197502178333, + "grad_norm": 0.6087480187416077, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 9675 + }, + { + "epoch": 1.1241359279697938, + "grad_norm": 0.580289900302887, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 9676 + }, + { + "epoch": 1.1242521057217543, + "grad_norm": 0.5434087514877319, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 9677 + }, + { + "epoch": 1.1243682834737148, + "grad_norm": 0.5863969922065735, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 9678 + }, + { + "epoch": 1.1244844612256752, + "grad_norm": 0.5805991291999817, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 9679 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 0.562703013420105, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 9680 + }, + { + "epoch": 1.1247168167295962, + "grad_norm": 0.5464785695075989, + "learning_rate": 0.0001, + "loss": 1.3219, + "step": 9681 + }, + { + "epoch": 1.1248329944815567, + "grad_norm": 0.5555753707885742, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 9682 + }, + { + "epoch": 1.1249491722335172, + "grad_norm": 0.5941157937049866, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 9683 + }, + { + "epoch": 1.1250653499854777, + "grad_norm": 0.6653990745544434, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 9684 + }, + { + "epoch": 1.1251815277374382, + "grad_norm": 0.5832642912864685, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 9685 + }, + { + "epoch": 1.1252977054893987, + "grad_norm": 0.5274986028671265, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 9686 + }, + { + "epoch": 1.1254138832413592, + "grad_norm": 0.5797598361968994, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 9687 + }, + { + "epoch": 1.1255300609933199, + "grad_norm": 0.5291623473167419, + "learning_rate": 0.0001, + "loss": 1.4156, + "step": 9688 + }, + { + "epoch": 1.1256462387452804, + "grad_norm": 0.5337206125259399, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 9689 + }, + { + "epoch": 1.1257624164972408, + "grad_norm": 0.5710740685462952, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 9690 + }, + { + "epoch": 1.1258785942492013, + "grad_norm": 0.5532470345497131, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 9691 + }, + { + "epoch": 1.1259947720011618, + "grad_norm": 0.5095158219337463, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 9692 + }, + { + "epoch": 1.1261109497531223, + "grad_norm": 0.5526217222213745, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 9693 + }, + { + "epoch": 1.1262271275050828, + "grad_norm": 0.5839323997497559, + "learning_rate": 0.0001, + "loss": 1.6062, + "step": 9694 + }, + { + "epoch": 1.1263433052570433, + "grad_norm": 0.5310125350952148, + "learning_rate": 0.0001, + "loss": 1.273, + "step": 9695 + }, + { + "epoch": 1.1264594830090038, + "grad_norm": 0.5890069007873535, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 9696 + }, + { + "epoch": 1.1265756607609643, + "grad_norm": 0.5359507203102112, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 9697 + }, + { + "epoch": 1.1266918385129248, + "grad_norm": 0.5606240630149841, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 9698 + }, + { + "epoch": 1.1268080162648852, + "grad_norm": 0.534092128276825, + "learning_rate": 0.0001, + "loss": 1.367, + "step": 9699 + }, + { + "epoch": 1.1269241940168457, + "grad_norm": 0.553125262260437, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 9700 + }, + { + "epoch": 1.1270403717688062, + "grad_norm": 0.5301721096038818, + "learning_rate": 0.0001, + "loss": 1.3242, + "step": 9701 + }, + { + "epoch": 1.1271565495207667, + "grad_norm": 0.6657614707946777, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 9702 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 0.541265606880188, + "learning_rate": 0.0001, + "loss": 1.3735, + "step": 9703 + }, + { + "epoch": 1.1273889050246877, + "grad_norm": 0.5840449333190918, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 9704 + }, + { + "epoch": 1.1275050827766482, + "grad_norm": 0.5938639044761658, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 9705 + }, + { + "epoch": 1.127621260528609, + "grad_norm": 0.5215795040130615, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 9706 + }, + { + "epoch": 1.1277374382805694, + "grad_norm": 0.537142276763916, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 9707 + }, + { + "epoch": 1.1278536160325299, + "grad_norm": 0.6039302349090576, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 9708 + }, + { + "epoch": 1.1279697937844904, + "grad_norm": 0.510535478591919, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 9709 + }, + { + "epoch": 1.1280859715364508, + "grad_norm": 0.5227552056312561, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 9710 + }, + { + "epoch": 1.1282021492884113, + "grad_norm": 0.5117825865745544, + "learning_rate": 0.0001, + "loss": 1.386, + "step": 9711 + }, + { + "epoch": 1.1283183270403718, + "grad_norm": 0.5713513493537903, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 9712 + }, + { + "epoch": 1.1284345047923323, + "grad_norm": 0.6017709374427795, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 9713 + }, + { + "epoch": 1.1285506825442928, + "grad_norm": 0.5840389728546143, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 9714 + }, + { + "epoch": 1.1286668602962533, + "grad_norm": 0.5689877867698669, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 9715 + }, + { + "epoch": 1.1287830380482138, + "grad_norm": 0.603010356426239, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 9716 + }, + { + "epoch": 1.1288992158001743, + "grad_norm": 0.5397545695304871, + "learning_rate": 0.0001, + "loss": 1.407, + "step": 9717 + }, + { + "epoch": 1.1290153935521348, + "grad_norm": 0.5655140280723572, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 9718 + }, + { + "epoch": 1.1291315713040952, + "grad_norm": 0.5146525502204895, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 9719 + }, + { + "epoch": 1.1292477490560557, + "grad_norm": 0.6142404675483704, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 9720 + }, + { + "epoch": 1.1293639268080162, + "grad_norm": 0.5134833455085754, + "learning_rate": 0.0001, + "loss": 1.3542, + "step": 9721 + }, + { + "epoch": 1.1294801045599767, + "grad_norm": 0.5315486788749695, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 9722 + }, + { + "epoch": 1.1295962823119372, + "grad_norm": 0.5732195973396301, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 9723 + }, + { + "epoch": 1.1297124600638977, + "grad_norm": 0.5358231067657471, + "learning_rate": 0.0001, + "loss": 1.4426, + "step": 9724 + }, + { + "epoch": 1.1298286378158582, + "grad_norm": 0.5462802648544312, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 9725 + }, + { + "epoch": 1.1299448155678187, + "grad_norm": 0.5840174555778503, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 9726 + }, + { + "epoch": 1.1300609933197792, + "grad_norm": 0.536839485168457, + "learning_rate": 0.0001, + "loss": 1.2596, + "step": 9727 + }, + { + "epoch": 1.1301771710717397, + "grad_norm": 0.5742023587226868, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 9728 + }, + { + "epoch": 1.1302933488237001, + "grad_norm": 0.5591979026794434, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 9729 + }, + { + "epoch": 1.1304095265756609, + "grad_norm": 0.5407907366752625, + "learning_rate": 0.0001, + "loss": 1.368, + "step": 9730 + }, + { + "epoch": 1.1305257043276213, + "grad_norm": 0.5862224102020264, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 9731 + }, + { + "epoch": 1.1306418820795818, + "grad_norm": 0.5518715977668762, + "learning_rate": 0.0001, + "loss": 1.421, + "step": 9732 + }, + { + "epoch": 1.1307580598315423, + "grad_norm": 0.5652369260787964, + "learning_rate": 0.0001, + "loss": 1.4673, + "step": 9733 + }, + { + "epoch": 1.1308742375835028, + "grad_norm": 0.5239549279212952, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 9734 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 0.5143555998802185, + "learning_rate": 0.0001, + "loss": 1.3138, + "step": 9735 + }, + { + "epoch": 1.1311065930874238, + "grad_norm": 0.5065417289733887, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 9736 + }, + { + "epoch": 1.1312227708393843, + "grad_norm": 0.5721327662467957, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 9737 + }, + { + "epoch": 1.1313389485913448, + "grad_norm": 0.553887665271759, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 9738 + }, + { + "epoch": 1.1314551263433053, + "grad_norm": 0.5835604667663574, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 9739 + }, + { + "epoch": 1.1315713040952657, + "grad_norm": 0.6082079410552979, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 9740 + }, + { + "epoch": 1.1316874818472262, + "grad_norm": 0.5478276014328003, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 9741 + }, + { + "epoch": 1.1318036595991867, + "grad_norm": 0.5668119192123413, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 9742 + }, + { + "epoch": 1.1319198373511472, + "grad_norm": 0.5409606695175171, + "learning_rate": 0.0001, + "loss": 1.2484, + "step": 9743 + }, + { + "epoch": 1.1320360151031077, + "grad_norm": 0.5933104157447815, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 9744 + }, + { + "epoch": 1.1321521928550682, + "grad_norm": 0.5886265635490417, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 9745 + }, + { + "epoch": 1.1322683706070287, + "grad_norm": 0.534992516040802, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 9746 + }, + { + "epoch": 1.1323845483589892, + "grad_norm": 0.5970652103424072, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 9747 + }, + { + "epoch": 1.1325007261109499, + "grad_norm": 0.5775313377380371, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 9748 + }, + { + "epoch": 1.1326169038629104, + "grad_norm": 0.5551691055297852, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 9749 + }, + { + "epoch": 1.1327330816148709, + "grad_norm": 0.57322096824646, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 9750 + }, + { + "epoch": 1.1328492593668313, + "grad_norm": 0.5926565527915955, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 9751 + }, + { + "epoch": 1.1329654371187918, + "grad_norm": 0.5369096398353577, + "learning_rate": 0.0001, + "loss": 1.3547, + "step": 9752 + }, + { + "epoch": 1.1330816148707523, + "grad_norm": 0.5695880651473999, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 9753 + }, + { + "epoch": 1.1331977926227128, + "grad_norm": 0.5742299556732178, + "learning_rate": 0.0001, + "loss": 1.5644, + "step": 9754 + }, + { + "epoch": 1.1333139703746733, + "grad_norm": 0.5906009674072266, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 9755 + }, + { + "epoch": 1.1334301481266338, + "grad_norm": 0.5348914265632629, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 9756 + }, + { + "epoch": 1.1335463258785943, + "grad_norm": 0.5486701130867004, + "learning_rate": 0.0001, + "loss": 1.5705, + "step": 9757 + }, + { + "epoch": 1.1336625036305548, + "grad_norm": 0.5349100828170776, + "learning_rate": 0.0001, + "loss": 1.3754, + "step": 9758 + }, + { + "epoch": 1.1337786813825153, + "grad_norm": 0.6003459692001343, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 9759 + }, + { + "epoch": 1.1338948591344757, + "grad_norm": 0.545295774936676, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 9760 + }, + { + "epoch": 1.1340110368864362, + "grad_norm": 0.5949148535728455, + "learning_rate": 0.0001, + "loss": 1.4104, + "step": 9761 + }, + { + "epoch": 1.1341272146383967, + "grad_norm": 0.6306147575378418, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 9762 + }, + { + "epoch": 1.1342433923903572, + "grad_norm": 0.5710545182228088, + "learning_rate": 0.0001, + "loss": 1.3827, + "step": 9763 + }, + { + "epoch": 1.1343595701423177, + "grad_norm": 0.5881208777427673, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 9764 + }, + { + "epoch": 1.1344757478942782, + "grad_norm": 0.5546616315841675, + "learning_rate": 0.0001, + "loss": 1.3955, + "step": 9765 + }, + { + "epoch": 1.1345919256462387, + "grad_norm": 0.5268201231956482, + "learning_rate": 0.0001, + "loss": 1.373, + "step": 9766 + }, + { + "epoch": 1.1347081033981992, + "grad_norm": 0.5580306053161621, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 9767 + }, + { + "epoch": 1.1348242811501597, + "grad_norm": 0.6127236485481262, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 9768 + }, + { + "epoch": 1.1349404589021201, + "grad_norm": 0.5953782200813293, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 9769 + }, + { + "epoch": 1.1350566366540806, + "grad_norm": 0.5282092690467834, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 9770 + }, + { + "epoch": 1.1351728144060411, + "grad_norm": 0.5678205490112305, + "learning_rate": 0.0001, + "loss": 1.4748, + "step": 9771 + }, + { + "epoch": 1.1352889921580018, + "grad_norm": 0.522504448890686, + "learning_rate": 0.0001, + "loss": 1.4129, + "step": 9772 + }, + { + "epoch": 1.1354051699099623, + "grad_norm": 0.5682795643806458, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 9773 + }, + { + "epoch": 1.1355213476619228, + "grad_norm": 0.5516985654830933, + "learning_rate": 0.0001, + "loss": 1.3844, + "step": 9774 + }, + { + "epoch": 1.1356375254138833, + "grad_norm": 0.5454441905021667, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 9775 + }, + { + "epoch": 1.1357537031658438, + "grad_norm": 0.5516277551651001, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 9776 + }, + { + "epoch": 1.1358698809178043, + "grad_norm": 0.5397228002548218, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 9777 + }, + { + "epoch": 1.1359860586697648, + "grad_norm": 0.56315678358078, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 9778 + }, + { + "epoch": 1.1361022364217253, + "grad_norm": 0.5425570607185364, + "learning_rate": 0.0001, + "loss": 1.425, + "step": 9779 + }, + { + "epoch": 1.1362184141736857, + "grad_norm": 0.5313009023666382, + "learning_rate": 0.0001, + "loss": 1.4678, + "step": 9780 + }, + { + "epoch": 1.1363345919256462, + "grad_norm": 0.5546362996101379, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 9781 + }, + { + "epoch": 1.1364507696776067, + "grad_norm": 0.5482527017593384, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 9782 + }, + { + "epoch": 1.1365669474295672, + "grad_norm": 0.5520925521850586, + "learning_rate": 0.0001, + "loss": 1.358, + "step": 9783 + }, + { + "epoch": 1.1366831251815277, + "grad_norm": 0.5753809809684753, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 9784 + }, + { + "epoch": 1.1367993029334882, + "grad_norm": 0.5345529317855835, + "learning_rate": 0.0001, + "loss": 1.3329, + "step": 9785 + }, + { + "epoch": 1.1369154806854487, + "grad_norm": 0.5629556775093079, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 9786 + }, + { + "epoch": 1.1370316584374092, + "grad_norm": 0.524359405040741, + "learning_rate": 0.0001, + "loss": 1.1784, + "step": 9787 + }, + { + "epoch": 1.1371478361893697, + "grad_norm": 0.5655604004859924, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 9788 + }, + { + "epoch": 1.1372640139413301, + "grad_norm": 0.5480789542198181, + "learning_rate": 0.0001, + "loss": 1.3745, + "step": 9789 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 0.568131685256958, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 9790 + }, + { + "epoch": 1.1374963694452513, + "grad_norm": 0.5922342538833618, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 9791 + }, + { + "epoch": 1.1376125471972118, + "grad_norm": 0.601453423500061, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 9792 + }, + { + "epoch": 1.1377287249491723, + "grad_norm": 0.5655685663223267, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 9793 + }, + { + "epoch": 1.1378449027011328, + "grad_norm": 0.5532330274581909, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 9794 + }, + { + "epoch": 1.1379610804530933, + "grad_norm": 0.5482434034347534, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 9795 + }, + { + "epoch": 1.1380772582050538, + "grad_norm": 0.5250821113586426, + "learning_rate": 0.0001, + "loss": 1.3726, + "step": 9796 + }, + { + "epoch": 1.1381934359570143, + "grad_norm": 0.5487037897109985, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 9797 + }, + { + "epoch": 1.1383096137089748, + "grad_norm": 0.5330802798271179, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 9798 + }, + { + "epoch": 1.1384257914609353, + "grad_norm": 0.5245431065559387, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 9799 + }, + { + "epoch": 1.1385419692128957, + "grad_norm": 0.5072425007820129, + "learning_rate": 0.0001, + "loss": 1.3497, + "step": 9800 + }, + { + "epoch": 1.1386581469648562, + "grad_norm": 0.5386658310890198, + "learning_rate": 0.0001, + "loss": 1.4285, + "step": 9801 + }, + { + "epoch": 1.1387743247168167, + "grad_norm": 0.5451638698577881, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 9802 + }, + { + "epoch": 1.1388905024687772, + "grad_norm": 0.5327491760253906, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 9803 + }, + { + "epoch": 1.1390066802207377, + "grad_norm": 0.5449811220169067, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 9804 + }, + { + "epoch": 1.1391228579726982, + "grad_norm": 0.5945097208023071, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 9805 + }, + { + "epoch": 1.1392390357246587, + "grad_norm": 0.5785495042800903, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 9806 + }, + { + "epoch": 1.1393552134766192, + "grad_norm": 0.5066744089126587, + "learning_rate": 0.0001, + "loss": 1.3782, + "step": 9807 + }, + { + "epoch": 1.1394713912285797, + "grad_norm": 0.6268693208694458, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 9808 + }, + { + "epoch": 1.1395875689805401, + "grad_norm": 0.5436332821846008, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 9809 + }, + { + "epoch": 1.1397037467325006, + "grad_norm": 0.5843521356582642, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 9810 + }, + { + "epoch": 1.1398199244844611, + "grad_norm": 0.5359347462654114, + "learning_rate": 0.0001, + "loss": 1.4952, + "step": 9811 + }, + { + "epoch": 1.1399361022364216, + "grad_norm": 0.5501775145530701, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 9812 + }, + { + "epoch": 1.140052279988382, + "grad_norm": 0.5738726258277893, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 9813 + }, + { + "epoch": 1.1401684577403428, + "grad_norm": 0.6035877466201782, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 9814 + }, + { + "epoch": 1.1402846354923033, + "grad_norm": 0.5534920692443848, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 9815 + }, + { + "epoch": 1.1404008132442638, + "grad_norm": 0.5403549075126648, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 9816 + }, + { + "epoch": 1.1405169909962243, + "grad_norm": 0.5253798961639404, + "learning_rate": 0.0001, + "loss": 1.3346, + "step": 9817 + }, + { + "epoch": 1.1406331687481848, + "grad_norm": 0.5829864144325256, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 9818 + }, + { + "epoch": 1.1407493465001453, + "grad_norm": 0.5337379574775696, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 9819 + }, + { + "epoch": 1.1408655242521057, + "grad_norm": 0.5743491053581238, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 9820 + }, + { + "epoch": 1.1409817020040662, + "grad_norm": 0.524262011051178, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 9821 + }, + { + "epoch": 1.1410978797560267, + "grad_norm": 0.5501481890678406, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 9822 + }, + { + "epoch": 1.1412140575079872, + "grad_norm": 0.5878981351852417, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 9823 + }, + { + "epoch": 1.1413302352599477, + "grad_norm": 0.5892753601074219, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 9824 + }, + { + "epoch": 1.1414464130119082, + "grad_norm": 0.5729379653930664, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 9825 + }, + { + "epoch": 1.1415625907638687, + "grad_norm": 0.5200401544570923, + "learning_rate": 0.0001, + "loss": 1.3968, + "step": 9826 + }, + { + "epoch": 1.1416787685158292, + "grad_norm": 0.5553308725357056, + "learning_rate": 0.0001, + "loss": 1.3769, + "step": 9827 + }, + { + "epoch": 1.1417949462677897, + "grad_norm": 0.5183868408203125, + "learning_rate": 0.0001, + "loss": 1.2838, + "step": 9828 + }, + { + "epoch": 1.1419111240197501, + "grad_norm": 0.5820907950401306, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 9829 + }, + { + "epoch": 1.1420273017717106, + "grad_norm": 0.589948296546936, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 9830 + }, + { + "epoch": 1.1421434795236711, + "grad_norm": 0.5357127785682678, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 9831 + }, + { + "epoch": 1.1422596572756318, + "grad_norm": 0.5246970057487488, + "learning_rate": 0.0001, + "loss": 1.3825, + "step": 9832 + }, + { + "epoch": 1.1423758350275923, + "grad_norm": 0.5680531859397888, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 9833 + }, + { + "epoch": 1.1424920127795528, + "grad_norm": 0.5650409460067749, + "learning_rate": 0.0001, + "loss": 1.4552, + "step": 9834 + }, + { + "epoch": 1.1426081905315133, + "grad_norm": 0.5513635873794556, + "learning_rate": 0.0001, + "loss": 1.471, + "step": 9835 + }, + { + "epoch": 1.1427243682834738, + "grad_norm": 0.5147875547409058, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 9836 + }, + { + "epoch": 1.1428405460354343, + "grad_norm": 0.5659693479537964, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 9837 + }, + { + "epoch": 1.1429567237873948, + "grad_norm": 0.5677375793457031, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 9838 + }, + { + "epoch": 1.1430729015393553, + "grad_norm": 0.5909645557403564, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 9839 + }, + { + "epoch": 1.1431890792913157, + "grad_norm": 0.603756308555603, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 9840 + }, + { + "epoch": 1.1433052570432762, + "grad_norm": 0.6034809350967407, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 9841 + }, + { + "epoch": 1.1434214347952367, + "grad_norm": 0.5629401803016663, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 9842 + }, + { + "epoch": 1.1435376125471972, + "grad_norm": 0.5638735890388489, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 9843 + }, + { + "epoch": 1.1436537902991577, + "grad_norm": 0.572892427444458, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 9844 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 0.5544632077217102, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 9845 + }, + { + "epoch": 1.1438861458030787, + "grad_norm": 0.596784770488739, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 9846 + }, + { + "epoch": 1.1440023235550392, + "grad_norm": 0.5610263347625732, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 9847 + }, + { + "epoch": 1.1441185013069997, + "grad_norm": 0.5021182298660278, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 9848 + }, + { + "epoch": 1.1442346790589601, + "grad_norm": 0.5500348210334778, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 9849 + }, + { + "epoch": 1.1443508568109206, + "grad_norm": 0.5434955358505249, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 9850 + }, + { + "epoch": 1.1444670345628811, + "grad_norm": 0.5625091195106506, + "learning_rate": 0.0001, + "loss": 1.4034, + "step": 9851 + }, + { + "epoch": 1.1445832123148416, + "grad_norm": 0.5443983674049377, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 9852 + }, + { + "epoch": 1.144699390066802, + "grad_norm": 0.5581114888191223, + "learning_rate": 0.0001, + "loss": 1.5058, + "step": 9853 + }, + { + "epoch": 1.1448155678187626, + "grad_norm": 0.5418434143066406, + "learning_rate": 0.0001, + "loss": 1.4448, + "step": 9854 + }, + { + "epoch": 1.1449317455707233, + "grad_norm": 0.5469671487808228, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 9855 + }, + { + "epoch": 1.1450479233226838, + "grad_norm": 0.5789541602134705, + "learning_rate": 0.0001, + "loss": 1.4371, + "step": 9856 + }, + { + "epoch": 1.1451641010746443, + "grad_norm": 0.5320298075675964, + "learning_rate": 0.0001, + "loss": 1.3405, + "step": 9857 + }, + { + "epoch": 1.1452802788266048, + "grad_norm": 0.563693642616272, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 9858 + }, + { + "epoch": 1.1453964565785653, + "grad_norm": 0.5295918583869934, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 9859 + }, + { + "epoch": 1.1455126343305257, + "grad_norm": 0.558912456035614, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 9860 + }, + { + "epoch": 1.1456288120824862, + "grad_norm": 0.5343863368034363, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 9861 + }, + { + "epoch": 1.1457449898344467, + "grad_norm": 0.543212354183197, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 9862 + }, + { + "epoch": 1.1458611675864072, + "grad_norm": 0.5438005924224854, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 9863 + }, + { + "epoch": 1.1459773453383677, + "grad_norm": 0.525090217590332, + "learning_rate": 0.0001, + "loss": 1.258, + "step": 9864 + }, + { + "epoch": 1.1460935230903282, + "grad_norm": 0.5800353288650513, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 9865 + }, + { + "epoch": 1.1462097008422887, + "grad_norm": 0.565473198890686, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 9866 + }, + { + "epoch": 1.1463258785942492, + "grad_norm": 0.5632652640342712, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 9867 + }, + { + "epoch": 1.1464420563462097, + "grad_norm": 0.5769978165626526, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 9868 + }, + { + "epoch": 1.1465582340981701, + "grad_norm": 0.6273400187492371, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 9869 + }, + { + "epoch": 1.1466744118501306, + "grad_norm": 0.6044973134994507, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 9870 + }, + { + "epoch": 1.1467905896020911, + "grad_norm": 0.5586897730827332, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 9871 + }, + { + "epoch": 1.1469067673540516, + "grad_norm": 0.5836507081985474, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 9872 + }, + { + "epoch": 1.147022945106012, + "grad_norm": 0.5953487753868103, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 9873 + }, + { + "epoch": 1.1471391228579728, + "grad_norm": 0.5075315833091736, + "learning_rate": 0.0001, + "loss": 1.336, + "step": 9874 + }, + { + "epoch": 1.1472553006099333, + "grad_norm": 0.5457214713096619, + "learning_rate": 0.0001, + "loss": 1.3285, + "step": 9875 + }, + { + "epoch": 1.1473714783618938, + "grad_norm": 0.5651188492774963, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 9876 + }, + { + "epoch": 1.1474876561138543, + "grad_norm": 0.6411771774291992, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 9877 + }, + { + "epoch": 1.1476038338658148, + "grad_norm": 0.516982913017273, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 9878 + }, + { + "epoch": 1.1477200116177753, + "grad_norm": 0.538935661315918, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 9879 + }, + { + "epoch": 1.1478361893697357, + "grad_norm": 0.6248571872711182, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 9880 + }, + { + "epoch": 1.1479523671216962, + "grad_norm": 0.5510977506637573, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 9881 + }, + { + "epoch": 1.1480685448736567, + "grad_norm": 0.5983824133872986, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 9882 + }, + { + "epoch": 1.1481847226256172, + "grad_norm": 0.5636187195777893, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 9883 + }, + { + "epoch": 1.1483009003775777, + "grad_norm": 0.5564061999320984, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 9884 + }, + { + "epoch": 1.1484170781295382, + "grad_norm": 0.5623925924301147, + "learning_rate": 0.0001, + "loss": 1.3934, + "step": 9885 + }, + { + "epoch": 1.1485332558814987, + "grad_norm": 0.5958746075630188, + "learning_rate": 0.0001, + "loss": 1.391, + "step": 9886 + }, + { + "epoch": 1.1486494336334592, + "grad_norm": 0.6206726431846619, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 9887 + }, + { + "epoch": 1.1487656113854197, + "grad_norm": 0.5640969276428223, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 9888 + }, + { + "epoch": 1.1488817891373801, + "grad_norm": 0.5939497947692871, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 9889 + }, + { + "epoch": 1.1489979668893406, + "grad_norm": 0.5671570897102356, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 9890 + }, + { + "epoch": 1.1491141446413011, + "grad_norm": 0.5769670605659485, + "learning_rate": 0.0001, + "loss": 1.4192, + "step": 9891 + }, + { + "epoch": 1.1492303223932616, + "grad_norm": 0.5393675565719604, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 9892 + }, + { + "epoch": 1.149346500145222, + "grad_norm": 0.5987977385520935, + "learning_rate": 0.0001, + "loss": 1.482, + "step": 9893 + }, + { + "epoch": 1.1494626778971826, + "grad_norm": 0.5398848652839661, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 9894 + }, + { + "epoch": 1.149578855649143, + "grad_norm": 0.5538201332092285, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 9895 + }, + { + "epoch": 1.1496950334011036, + "grad_norm": 0.533630907535553, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 9896 + }, + { + "epoch": 1.1498112111530643, + "grad_norm": 0.5726608037948608, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 9897 + }, + { + "epoch": 1.1499273889050248, + "grad_norm": 0.5075186491012573, + "learning_rate": 0.0001, + "loss": 1.362, + "step": 9898 + }, + { + "epoch": 1.1500435666569853, + "grad_norm": 0.539199948310852, + "learning_rate": 0.0001, + "loss": 1.324, + "step": 9899 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 0.5326648950576782, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 9900 + }, + { + "epoch": 1.1502759221609062, + "grad_norm": 0.5526677966117859, + "learning_rate": 0.0001, + "loss": 1.4872, + "step": 9901 + }, + { + "epoch": 1.1503920999128667, + "grad_norm": 0.5454682111740112, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 9902 + }, + { + "epoch": 1.1505082776648272, + "grad_norm": 0.5236324071884155, + "learning_rate": 0.0001, + "loss": 1.3648, + "step": 9903 + }, + { + "epoch": 1.1506244554167877, + "grad_norm": 0.5452289581298828, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 9904 + }, + { + "epoch": 1.1507406331687482, + "grad_norm": 0.5598399639129639, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 9905 + }, + { + "epoch": 1.1508568109207087, + "grad_norm": 0.5843247771263123, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 9906 + }, + { + "epoch": 1.1509729886726692, + "grad_norm": 0.5665671229362488, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 9907 + }, + { + "epoch": 1.1510891664246297, + "grad_norm": 0.515862226486206, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 9908 + }, + { + "epoch": 1.1512053441765902, + "grad_norm": 0.542131781578064, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 9909 + }, + { + "epoch": 1.1513215219285506, + "grad_norm": 0.5564214587211609, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 9910 + }, + { + "epoch": 1.1514376996805111, + "grad_norm": 0.5730974674224854, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 9911 + }, + { + "epoch": 1.1515538774324716, + "grad_norm": 0.5575627088546753, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 9912 + }, + { + "epoch": 1.151670055184432, + "grad_norm": 0.5514092445373535, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 9913 + }, + { + "epoch": 1.1517862329363926, + "grad_norm": 0.5962027311325073, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 9914 + }, + { + "epoch": 1.151902410688353, + "grad_norm": 0.5560566186904907, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 9915 + }, + { + "epoch": 1.1520185884403138, + "grad_norm": 0.5740779042243958, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 9916 + }, + { + "epoch": 1.1521347661922743, + "grad_norm": 0.5669369101524353, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 9917 + }, + { + "epoch": 1.1522509439442348, + "grad_norm": 0.5397534370422363, + "learning_rate": 0.0001, + "loss": 1.3715, + "step": 9918 + }, + { + "epoch": 1.1523671216961953, + "grad_norm": 0.5681161284446716, + "learning_rate": 0.0001, + "loss": 1.3601, + "step": 9919 + }, + { + "epoch": 1.1524832994481558, + "grad_norm": 0.5549933910369873, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 9920 + }, + { + "epoch": 1.1525994772001162, + "grad_norm": 0.5869593024253845, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 9921 + }, + { + "epoch": 1.1527156549520767, + "grad_norm": 0.5578819513320923, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 9922 + }, + { + "epoch": 1.1528318327040372, + "grad_norm": 0.5640743970870972, + "learning_rate": 0.0001, + "loss": 1.4265, + "step": 9923 + }, + { + "epoch": 1.1529480104559977, + "grad_norm": 0.5549669861793518, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 9924 + }, + { + "epoch": 1.1530641882079582, + "grad_norm": 0.5610499978065491, + "learning_rate": 0.0001, + "loss": 1.4156, + "step": 9925 + }, + { + "epoch": 1.1531803659599187, + "grad_norm": 0.5248560905456543, + "learning_rate": 0.0001, + "loss": 1.4177, + "step": 9926 + }, + { + "epoch": 1.1532965437118792, + "grad_norm": 0.5581945180892944, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 9927 + }, + { + "epoch": 1.1534127214638397, + "grad_norm": 0.5940243601799011, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 9928 + }, + { + "epoch": 1.1535288992158002, + "grad_norm": 0.5478717088699341, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 9929 + }, + { + "epoch": 1.1536450769677606, + "grad_norm": 0.6184346079826355, + "learning_rate": 0.0001, + "loss": 1.4189, + "step": 9930 + }, + { + "epoch": 1.1537612547197211, + "grad_norm": 0.5686115622520447, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 9931 + }, + { + "epoch": 1.1538774324716816, + "grad_norm": 0.5752643942832947, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 9932 + }, + { + "epoch": 1.153993610223642, + "grad_norm": 0.5206838250160217, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 9933 + }, + { + "epoch": 1.1541097879756026, + "grad_norm": 0.5340604186058044, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 9934 + }, + { + "epoch": 1.154225965727563, + "grad_norm": 0.5354512929916382, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 9935 + }, + { + "epoch": 1.1543421434795236, + "grad_norm": 0.5514809489250183, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 9936 + }, + { + "epoch": 1.154458321231484, + "grad_norm": 0.547510027885437, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 9937 + }, + { + "epoch": 1.1545744989834446, + "grad_norm": 0.5750606060028076, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 9938 + }, + { + "epoch": 1.1546906767354053, + "grad_norm": 0.5391731858253479, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 9939 + }, + { + "epoch": 1.1548068544873658, + "grad_norm": 0.5582312345504761, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 9940 + }, + { + "epoch": 1.1549230322393262, + "grad_norm": 0.5333447456359863, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 9941 + }, + { + "epoch": 1.1550392099912867, + "grad_norm": 0.5500908493995667, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 9942 + }, + { + "epoch": 1.1551553877432472, + "grad_norm": 0.5635502934455872, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 9943 + }, + { + "epoch": 1.1552715654952077, + "grad_norm": 0.5841420292854309, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 9944 + }, + { + "epoch": 1.1553877432471682, + "grad_norm": 0.5658759474754333, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 9945 + }, + { + "epoch": 1.1555039209991287, + "grad_norm": 0.5532236099243164, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 9946 + }, + { + "epoch": 1.1556200987510892, + "grad_norm": 0.5682868361473083, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 9947 + }, + { + "epoch": 1.1557362765030497, + "grad_norm": 0.5833743810653687, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 9948 + }, + { + "epoch": 1.1558524542550102, + "grad_norm": 0.5212302803993225, + "learning_rate": 0.0001, + "loss": 1.2786, + "step": 9949 + }, + { + "epoch": 1.1559686320069706, + "grad_norm": 0.6027308702468872, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 9950 + }, + { + "epoch": 1.1560848097589311, + "grad_norm": 0.5450575351715088, + "learning_rate": 0.0001, + "loss": 1.4678, + "step": 9951 + }, + { + "epoch": 1.1562009875108916, + "grad_norm": 0.5215844511985779, + "learning_rate": 0.0001, + "loss": 1.4646, + "step": 9952 + }, + { + "epoch": 1.156317165262852, + "grad_norm": 0.5741782188415527, + "learning_rate": 0.0001, + "loss": 1.4504, + "step": 9953 + }, + { + "epoch": 1.1564333430148126, + "grad_norm": 0.5755845904350281, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 9954 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 0.593981921672821, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 9955 + }, + { + "epoch": 1.1566656985187336, + "grad_norm": 0.5386795401573181, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 9956 + }, + { + "epoch": 1.156781876270694, + "grad_norm": 0.5577713847160339, + "learning_rate": 0.0001, + "loss": 1.3742, + "step": 9957 + }, + { + "epoch": 1.1568980540226548, + "grad_norm": 0.5421655178070068, + "learning_rate": 0.0001, + "loss": 1.3991, + "step": 9958 + }, + { + "epoch": 1.1570142317746153, + "grad_norm": 0.5427201986312866, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 9959 + }, + { + "epoch": 1.1571304095265758, + "grad_norm": 0.5640281438827515, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 9960 + }, + { + "epoch": 1.1572465872785362, + "grad_norm": 0.5602498054504395, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 9961 + }, + { + "epoch": 1.1573627650304967, + "grad_norm": 0.5524298548698425, + "learning_rate": 0.0001, + "loss": 1.4721, + "step": 9962 + }, + { + "epoch": 1.1574789427824572, + "grad_norm": 0.5706125497817993, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 9963 + }, + { + "epoch": 1.1575951205344177, + "grad_norm": 0.5506826639175415, + "learning_rate": 0.0001, + "loss": 1.3571, + "step": 9964 + }, + { + "epoch": 1.1577112982863782, + "grad_norm": 0.5594316720962524, + "learning_rate": 0.0001, + "loss": 1.482, + "step": 9965 + }, + { + "epoch": 1.1578274760383387, + "grad_norm": 0.5849363207817078, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 9966 + }, + { + "epoch": 1.1579436537902992, + "grad_norm": 0.5251225233078003, + "learning_rate": 0.0001, + "loss": 1.2302, + "step": 9967 + }, + { + "epoch": 1.1580598315422597, + "grad_norm": 0.6071436405181885, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 9968 + }, + { + "epoch": 1.1581760092942202, + "grad_norm": 0.5688941478729248, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 9969 + }, + { + "epoch": 1.1582921870461806, + "grad_norm": 0.5586535930633545, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 9970 + }, + { + "epoch": 1.1584083647981411, + "grad_norm": 0.6454538702964783, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 9971 + }, + { + "epoch": 1.1585245425501016, + "grad_norm": 0.5616946816444397, + "learning_rate": 0.0001, + "loss": 1.389, + "step": 9972 + }, + { + "epoch": 1.158640720302062, + "grad_norm": 0.6290552616119385, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 9973 + }, + { + "epoch": 1.1587568980540226, + "grad_norm": 0.5599773526191711, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 9974 + }, + { + "epoch": 1.158873075805983, + "grad_norm": 0.5668715834617615, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 9975 + }, + { + "epoch": 1.1589892535579436, + "grad_norm": 0.576244056224823, + "learning_rate": 0.0001, + "loss": 1.5943, + "step": 9976 + }, + { + "epoch": 1.159105431309904, + "grad_norm": 0.5568404793739319, + "learning_rate": 0.0001, + "loss": 1.3909, + "step": 9977 + }, + { + "epoch": 1.1592216090618646, + "grad_norm": 0.5190873146057129, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 9978 + }, + { + "epoch": 1.159337786813825, + "grad_norm": 0.580040693283081, + "learning_rate": 0.0001, + "loss": 1.3702, + "step": 9979 + }, + { + "epoch": 1.1594539645657855, + "grad_norm": 0.5603786110877991, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 9980 + }, + { + "epoch": 1.1595701423177462, + "grad_norm": 0.5624469518661499, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 9981 + }, + { + "epoch": 1.1596863200697067, + "grad_norm": 0.5951324701309204, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 9982 + }, + { + "epoch": 1.1598024978216672, + "grad_norm": 0.547521710395813, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 9983 + }, + { + "epoch": 1.1599186755736277, + "grad_norm": 0.5596818923950195, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 9984 + }, + { + "epoch": 1.1600348533255882, + "grad_norm": 0.5726192593574524, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 9985 + }, + { + "epoch": 1.1601510310775487, + "grad_norm": 0.5832332968711853, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 9986 + }, + { + "epoch": 1.1602672088295092, + "grad_norm": 0.5925440192222595, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 9987 + }, + { + "epoch": 1.1603833865814697, + "grad_norm": 0.6479099988937378, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 9988 + }, + { + "epoch": 1.1604995643334302, + "grad_norm": 0.5257347226142883, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 9989 + }, + { + "epoch": 1.1606157420853906, + "grad_norm": 0.5414207577705383, + "learning_rate": 0.0001, + "loss": 1.4025, + "step": 9990 + }, + { + "epoch": 1.1607319198373511, + "grad_norm": 0.5247325897216797, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 9991 + }, + { + "epoch": 1.1608480975893116, + "grad_norm": 0.5498883724212646, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 9992 + }, + { + "epoch": 1.1609642753412721, + "grad_norm": 0.5376471877098083, + "learning_rate": 0.0001, + "loss": 1.4189, + "step": 9993 + }, + { + "epoch": 1.1610804530932326, + "grad_norm": 0.5790148377418518, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 9994 + }, + { + "epoch": 1.161196630845193, + "grad_norm": 0.5751370191574097, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 9995 + }, + { + "epoch": 1.1613128085971536, + "grad_norm": 0.5805469751358032, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 9996 + }, + { + "epoch": 1.161428986349114, + "grad_norm": 0.5531786680221558, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 9997 + }, + { + "epoch": 1.1615451641010746, + "grad_norm": 0.5185563564300537, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 9998 + }, + { + "epoch": 1.1616613418530353, + "grad_norm": 0.5276340246200562, + "learning_rate": 0.0001, + "loss": 1.3336, + "step": 9999 + }, + { + "epoch": 1.1617775196049958, + "grad_norm": 0.5624109506607056, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 10000 + }, + { + "epoch": 1.1618936973569562, + "grad_norm": 0.5590581297874451, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 10001 + }, + { + "epoch": 1.1620098751089167, + "grad_norm": 0.60872483253479, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 10002 + }, + { + "epoch": 1.1621260528608772, + "grad_norm": 0.5246327519416809, + "learning_rate": 0.0001, + "loss": 1.3447, + "step": 10003 + }, + { + "epoch": 1.1622422306128377, + "grad_norm": 0.5634382963180542, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 10004 + }, + { + "epoch": 1.1623584083647982, + "grad_norm": 0.55689537525177, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 10005 + }, + { + "epoch": 1.1624745861167587, + "grad_norm": 0.5736417174339294, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 10006 + }, + { + "epoch": 1.1625907638687192, + "grad_norm": 0.5619802474975586, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 10007 + }, + { + "epoch": 1.1627069416206797, + "grad_norm": 0.5841330885887146, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 10008 + }, + { + "epoch": 1.1628231193726402, + "grad_norm": 0.5336185693740845, + "learning_rate": 0.0001, + "loss": 1.2244, + "step": 10009 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 0.5390673875808716, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 10010 + }, + { + "epoch": 1.1630554748765611, + "grad_norm": 0.5778940320014954, + "learning_rate": 0.0001, + "loss": 1.2823, + "step": 10011 + }, + { + "epoch": 1.1631716526285216, + "grad_norm": 0.5761231184005737, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 10012 + }, + { + "epoch": 1.1632878303804821, + "grad_norm": 0.5847422480583191, + "learning_rate": 0.0001, + "loss": 1.5658, + "step": 10013 + }, + { + "epoch": 1.1634040081324426, + "grad_norm": 0.6390685439109802, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 10014 + }, + { + "epoch": 1.163520185884403, + "grad_norm": 0.5886343121528625, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 10015 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 0.581598699092865, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 10016 + }, + { + "epoch": 1.163752541388324, + "grad_norm": 0.6227822303771973, + "learning_rate": 0.0001, + "loss": 1.4497, + "step": 10017 + }, + { + "epoch": 1.1638687191402846, + "grad_norm": 0.5321307182312012, + "learning_rate": 0.0001, + "loss": 1.4031, + "step": 10018 + }, + { + "epoch": 1.163984896892245, + "grad_norm": 0.6062250137329102, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 10019 + }, + { + "epoch": 1.1641010746442055, + "grad_norm": 0.5998744964599609, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 10020 + }, + { + "epoch": 1.164217252396166, + "grad_norm": 0.6060172319412231, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 10021 + }, + { + "epoch": 1.1643334301481265, + "grad_norm": 0.5388416051864624, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 10022 + }, + { + "epoch": 1.1644496079000872, + "grad_norm": 0.5970954895019531, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 10023 + }, + { + "epoch": 1.1645657856520477, + "grad_norm": 0.5507951974868774, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 10024 + }, + { + "epoch": 1.1646819634040082, + "grad_norm": 0.5912706255912781, + "learning_rate": 0.0001, + "loss": 1.4366, + "step": 10025 + }, + { + "epoch": 1.1647981411559687, + "grad_norm": 0.5806775093078613, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 10026 + }, + { + "epoch": 1.1649143189079292, + "grad_norm": 0.6133882403373718, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 10027 + }, + { + "epoch": 1.1650304966598897, + "grad_norm": 0.5739568471908569, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 10028 + }, + { + "epoch": 1.1651466744118502, + "grad_norm": 0.5220316052436829, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 10029 + }, + { + "epoch": 1.1652628521638106, + "grad_norm": 0.5443993806838989, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 10030 + }, + { + "epoch": 1.1653790299157711, + "grad_norm": 0.5931938290596008, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 10031 + }, + { + "epoch": 1.1654952076677316, + "grad_norm": 0.5669753551483154, + "learning_rate": 0.0001, + "loss": 1.3503, + "step": 10032 + }, + { + "epoch": 1.1656113854196921, + "grad_norm": 0.5309962034225464, + "learning_rate": 0.0001, + "loss": 1.3674, + "step": 10033 + }, + { + "epoch": 1.1657275631716526, + "grad_norm": 0.5499364137649536, + "learning_rate": 0.0001, + "loss": 1.3246, + "step": 10034 + }, + { + "epoch": 1.165843740923613, + "grad_norm": 0.5379160046577454, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 10035 + }, + { + "epoch": 1.1659599186755736, + "grad_norm": 0.605657696723938, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 10036 + }, + { + "epoch": 1.166076096427534, + "grad_norm": 0.5854047536849976, + "learning_rate": 0.0001, + "loss": 1.4527, + "step": 10037 + }, + { + "epoch": 1.1661922741794946, + "grad_norm": 0.6003736853599548, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 10038 + }, + { + "epoch": 1.166308451931455, + "grad_norm": 0.5782053470611572, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 10039 + }, + { + "epoch": 1.1664246296834155, + "grad_norm": 0.5899875164031982, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 10040 + }, + { + "epoch": 1.1665408074353762, + "grad_norm": 0.580557107925415, + "learning_rate": 0.0001, + "loss": 1.5173, + "step": 10041 + }, + { + "epoch": 1.1666569851873367, + "grad_norm": 0.5655171275138855, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 10042 + }, + { + "epoch": 1.1667731629392972, + "grad_norm": 0.5810794830322266, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 10043 + }, + { + "epoch": 1.1668893406912577, + "grad_norm": 0.5902233123779297, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 10044 + }, + { + "epoch": 1.1670055184432182, + "grad_norm": 0.5441036820411682, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 10045 + }, + { + "epoch": 1.1671216961951787, + "grad_norm": 0.5872811675071716, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 10046 + }, + { + "epoch": 1.1672378739471392, + "grad_norm": 0.5635554194450378, + "learning_rate": 0.0001, + "loss": 1.3447, + "step": 10047 + }, + { + "epoch": 1.1673540516990997, + "grad_norm": 0.5794597864151001, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 10048 + }, + { + "epoch": 1.1674702294510602, + "grad_norm": 0.5093443393707275, + "learning_rate": 0.0001, + "loss": 1.3911, + "step": 10049 + }, + { + "epoch": 1.1675864072030206, + "grad_norm": 0.5254181623458862, + "learning_rate": 0.0001, + "loss": 1.415, + "step": 10050 + }, + { + "epoch": 1.1677025849549811, + "grad_norm": 0.5460498929023743, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 10051 + }, + { + "epoch": 1.1678187627069416, + "grad_norm": 0.5802798867225647, + "learning_rate": 0.0001, + "loss": 1.4191, + "step": 10052 + }, + { + "epoch": 1.1679349404589021, + "grad_norm": 0.5423253178596497, + "learning_rate": 0.0001, + "loss": 1.4365, + "step": 10053 + }, + { + "epoch": 1.1680511182108626, + "grad_norm": 0.5617722272872925, + "learning_rate": 0.0001, + "loss": 1.2869, + "step": 10054 + }, + { + "epoch": 1.168167295962823, + "grad_norm": 0.542448103427887, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 10055 + }, + { + "epoch": 1.1682834737147836, + "grad_norm": 0.5292364954948425, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 10056 + }, + { + "epoch": 1.168399651466744, + "grad_norm": 0.6243206858634949, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 10057 + }, + { + "epoch": 1.1685158292187046, + "grad_norm": 0.5304319858551025, + "learning_rate": 0.0001, + "loss": 1.36, + "step": 10058 + }, + { + "epoch": 1.168632006970665, + "grad_norm": 0.5566378831863403, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 10059 + }, + { + "epoch": 1.1687481847226255, + "grad_norm": 0.6295537352561951, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 10060 + }, + { + "epoch": 1.168864362474586, + "grad_norm": 0.5838227868080139, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 10061 + }, + { + "epoch": 1.1689805402265465, + "grad_norm": 0.5756407380104065, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 10062 + }, + { + "epoch": 1.169096717978507, + "grad_norm": 0.5596350431442261, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 10063 + }, + { + "epoch": 1.1692128957304675, + "grad_norm": 0.5502869486808777, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 10064 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 0.5689765214920044, + "learning_rate": 0.0001, + "loss": 1.3957, + "step": 10065 + }, + { + "epoch": 1.1694452512343887, + "grad_norm": 0.5550212264060974, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 10066 + }, + { + "epoch": 1.1695614289863492, + "grad_norm": 0.5285496711730957, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 10067 + }, + { + "epoch": 1.1696776067383097, + "grad_norm": 0.5546186566352844, + "learning_rate": 0.0001, + "loss": 1.3966, + "step": 10068 + }, + { + "epoch": 1.1697937844902702, + "grad_norm": 0.5387147068977356, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 10069 + }, + { + "epoch": 1.1699099622422306, + "grad_norm": 0.6305792927742004, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 10070 + }, + { + "epoch": 1.1700261399941911, + "grad_norm": 0.5712059736251831, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 10071 + }, + { + "epoch": 1.1701423177461516, + "grad_norm": 0.5652614831924438, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 10072 + }, + { + "epoch": 1.1702584954981121, + "grad_norm": 0.6206264495849609, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 10073 + }, + { + "epoch": 1.1703746732500726, + "grad_norm": 0.5684471726417542, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 10074 + }, + { + "epoch": 1.170490851002033, + "grad_norm": 0.5581884980201721, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 10075 + }, + { + "epoch": 1.1706070287539936, + "grad_norm": 0.568840742111206, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 10076 + }, + { + "epoch": 1.170723206505954, + "grad_norm": 0.5445674061775208, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 10077 + }, + { + "epoch": 1.1708393842579146, + "grad_norm": 0.52695631980896, + "learning_rate": 0.0001, + "loss": 1.332, + "step": 10078 + }, + { + "epoch": 1.170955562009875, + "grad_norm": 0.5562939643859863, + "learning_rate": 0.0001, + "loss": 1.5677, + "step": 10079 + }, + { + "epoch": 1.1710717397618355, + "grad_norm": 0.5368567705154419, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 10080 + }, + { + "epoch": 1.171187917513796, + "grad_norm": 0.569368839263916, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 10081 + }, + { + "epoch": 1.1713040952657565, + "grad_norm": 0.5831530690193176, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 10082 + }, + { + "epoch": 1.1714202730177172, + "grad_norm": 0.5623541474342346, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 10083 + }, + { + "epoch": 1.1715364507696777, + "grad_norm": 0.5392382144927979, + "learning_rate": 0.0001, + "loss": 1.3762, + "step": 10084 + }, + { + "epoch": 1.1716526285216382, + "grad_norm": 0.5557898283004761, + "learning_rate": 0.0001, + "loss": 1.2674, + "step": 10085 + }, + { + "epoch": 1.1717688062735987, + "grad_norm": 0.5454236268997192, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 10086 + }, + { + "epoch": 1.1718849840255592, + "grad_norm": 0.5669856071472168, + "learning_rate": 0.0001, + "loss": 1.365, + "step": 10087 + }, + { + "epoch": 1.1720011617775197, + "grad_norm": 0.5392054319381714, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 10088 + }, + { + "epoch": 1.1721173395294802, + "grad_norm": 0.544360876083374, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 10089 + }, + { + "epoch": 1.1722335172814407, + "grad_norm": 0.5895025730133057, + "learning_rate": 0.0001, + "loss": 1.5437, + "step": 10090 + }, + { + "epoch": 1.1723496950334011, + "grad_norm": 0.504751443862915, + "learning_rate": 0.0001, + "loss": 1.308, + "step": 10091 + }, + { + "epoch": 1.1724658727853616, + "grad_norm": 0.5912155508995056, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 10092 + }, + { + "epoch": 1.1725820505373221, + "grad_norm": 0.5704768300056458, + "learning_rate": 0.0001, + "loss": 1.397, + "step": 10093 + }, + { + "epoch": 1.1726982282892826, + "grad_norm": 0.6013458371162415, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 10094 + }, + { + "epoch": 1.172814406041243, + "grad_norm": 0.5588251352310181, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 10095 + }, + { + "epoch": 1.1729305837932036, + "grad_norm": 0.5580037832260132, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 10096 + }, + { + "epoch": 1.173046761545164, + "grad_norm": 0.5618078708648682, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 10097 + }, + { + "epoch": 1.1731629392971246, + "grad_norm": 0.5638139247894287, + "learning_rate": 0.0001, + "loss": 1.392, + "step": 10098 + }, + { + "epoch": 1.173279117049085, + "grad_norm": 0.49057668447494507, + "learning_rate": 0.0001, + "loss": 1.3168, + "step": 10099 + }, + { + "epoch": 1.1733952948010455, + "grad_norm": 0.5525259971618652, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 10100 + }, + { + "epoch": 1.173511472553006, + "grad_norm": 0.5700230598449707, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 10101 + }, + { + "epoch": 1.1736276503049665, + "grad_norm": 0.517629861831665, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 10102 + }, + { + "epoch": 1.173743828056927, + "grad_norm": 0.5485448241233826, + "learning_rate": 0.0001, + "loss": 1.4147, + "step": 10103 + }, + { + "epoch": 1.1738600058088875, + "grad_norm": 0.5651414394378662, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 10104 + }, + { + "epoch": 1.173976183560848, + "grad_norm": 0.5873313546180725, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 10105 + }, + { + "epoch": 1.1740923613128085, + "grad_norm": 0.5807081460952759, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 10106 + }, + { + "epoch": 1.1742085390647692, + "grad_norm": 0.5531933903694153, + "learning_rate": 0.0001, + "loss": 1.4888, + "step": 10107 + }, + { + "epoch": 1.1743247168167297, + "grad_norm": 0.6018572449684143, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 10108 + }, + { + "epoch": 1.1744408945686902, + "grad_norm": 0.5930827260017395, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 10109 + }, + { + "epoch": 1.1745570723206507, + "grad_norm": 0.5503697991371155, + "learning_rate": 0.0001, + "loss": 1.3225, + "step": 10110 + }, + { + "epoch": 1.1746732500726111, + "grad_norm": 0.5934546589851379, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 10111 + }, + { + "epoch": 1.1747894278245716, + "grad_norm": 0.6462099552154541, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 10112 + }, + { + "epoch": 1.1749056055765321, + "grad_norm": 0.5270904302597046, + "learning_rate": 0.0001, + "loss": 1.3637, + "step": 10113 + }, + { + "epoch": 1.1750217833284926, + "grad_norm": 0.6062031984329224, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 10114 + }, + { + "epoch": 1.175137961080453, + "grad_norm": 0.6364587545394897, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 10115 + }, + { + "epoch": 1.1752541388324136, + "grad_norm": 0.6208913326263428, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 10116 + }, + { + "epoch": 1.175370316584374, + "grad_norm": 0.5763484835624695, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 10117 + }, + { + "epoch": 1.1754864943363346, + "grad_norm": 0.6294147968292236, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 10118 + }, + { + "epoch": 1.175602672088295, + "grad_norm": 0.5596899390220642, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 10119 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 0.6140156984329224, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 10120 + }, + { + "epoch": 1.175835027592216, + "grad_norm": 0.5578920841217041, + "learning_rate": 0.0001, + "loss": 1.4892, + "step": 10121 + }, + { + "epoch": 1.1759512053441765, + "grad_norm": 0.5545052886009216, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 10122 + }, + { + "epoch": 1.176067383096137, + "grad_norm": 0.5487355589866638, + "learning_rate": 0.0001, + "loss": 1.3392, + "step": 10123 + }, + { + "epoch": 1.1761835608480975, + "grad_norm": 0.5947970747947693, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 10124 + }, + { + "epoch": 1.1762997386000582, + "grad_norm": 0.5524084568023682, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 10125 + }, + { + "epoch": 1.1764159163520187, + "grad_norm": 0.5855798125267029, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 10126 + }, + { + "epoch": 1.1765320941039792, + "grad_norm": 0.5738104581832886, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 10127 + }, + { + "epoch": 1.1766482718559397, + "grad_norm": 0.6277796030044556, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 10128 + }, + { + "epoch": 1.1767644496079002, + "grad_norm": 0.558943510055542, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 10129 + }, + { + "epoch": 1.1768806273598607, + "grad_norm": 0.5805367231369019, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 10130 + }, + { + "epoch": 1.1769968051118211, + "grad_norm": 0.5456610918045044, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 10131 + }, + { + "epoch": 1.1771129828637816, + "grad_norm": 0.5442765355110168, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 10132 + }, + { + "epoch": 1.1772291606157421, + "grad_norm": 0.5121324062347412, + "learning_rate": 0.0001, + "loss": 1.3845, + "step": 10133 + }, + { + "epoch": 1.1773453383677026, + "grad_norm": 0.5197672843933105, + "learning_rate": 0.0001, + "loss": 1.3748, + "step": 10134 + }, + { + "epoch": 1.177461516119663, + "grad_norm": 0.5549630522727966, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 10135 + }, + { + "epoch": 1.1775776938716236, + "grad_norm": 0.6009778380393982, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 10136 + }, + { + "epoch": 1.177693871623584, + "grad_norm": 0.6478327512741089, + "learning_rate": 0.0001, + "loss": 1.3931, + "step": 10137 + }, + { + "epoch": 1.1778100493755446, + "grad_norm": 0.5748621821403503, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 10138 + }, + { + "epoch": 1.177926227127505, + "grad_norm": 0.5634380578994751, + "learning_rate": 0.0001, + "loss": 1.4188, + "step": 10139 + }, + { + "epoch": 1.1780424048794655, + "grad_norm": 0.5881836414337158, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 10140 + }, + { + "epoch": 1.178158582631426, + "grad_norm": 0.5679641962051392, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 10141 + }, + { + "epoch": 1.1782747603833865, + "grad_norm": 0.5754004120826721, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 10142 + }, + { + "epoch": 1.178390938135347, + "grad_norm": 0.6078845858573914, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 10143 + }, + { + "epoch": 1.1785071158873075, + "grad_norm": 0.5382782220840454, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 10144 + }, + { + "epoch": 1.178623293639268, + "grad_norm": 0.565054178237915, + "learning_rate": 0.0001, + "loss": 1.4706, + "step": 10145 + }, + { + "epoch": 1.1787394713912285, + "grad_norm": 0.6017858982086182, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 10146 + }, + { + "epoch": 1.178855649143189, + "grad_norm": 0.533458948135376, + "learning_rate": 0.0001, + "loss": 1.2963, + "step": 10147 + }, + { + "epoch": 1.1789718268951495, + "grad_norm": 0.6092103719711304, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 10148 + }, + { + "epoch": 1.1790880046471102, + "grad_norm": 0.5930465459823608, + "learning_rate": 0.0001, + "loss": 1.476, + "step": 10149 + }, + { + "epoch": 1.1792041823990707, + "grad_norm": 0.5392512083053589, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 10150 + }, + { + "epoch": 1.1793203601510311, + "grad_norm": 0.5717263221740723, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 10151 + }, + { + "epoch": 1.1794365379029916, + "grad_norm": 0.5188798308372498, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 10152 + }, + { + "epoch": 1.1795527156549521, + "grad_norm": 0.5673980712890625, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 10153 + }, + { + "epoch": 1.1796688934069126, + "grad_norm": 0.5817063450813293, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 10154 + }, + { + "epoch": 1.179785071158873, + "grad_norm": 0.5734279751777649, + "learning_rate": 0.0001, + "loss": 1.5461, + "step": 10155 + }, + { + "epoch": 1.1799012489108336, + "grad_norm": 0.540127158164978, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 10156 + }, + { + "epoch": 1.180017426662794, + "grad_norm": 0.6068228483200073, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 10157 + }, + { + "epoch": 1.1801336044147546, + "grad_norm": 0.5281007289886475, + "learning_rate": 0.0001, + "loss": 1.3727, + "step": 10158 + }, + { + "epoch": 1.180249782166715, + "grad_norm": 0.57302325963974, + "learning_rate": 0.0001, + "loss": 1.4104, + "step": 10159 + }, + { + "epoch": 1.1803659599186755, + "grad_norm": 0.5550638437271118, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 10160 + }, + { + "epoch": 1.180482137670636, + "grad_norm": 0.559135377407074, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 10161 + }, + { + "epoch": 1.1805983154225965, + "grad_norm": 0.5715136528015137, + "learning_rate": 0.0001, + "loss": 1.481, + "step": 10162 + }, + { + "epoch": 1.180714493174557, + "grad_norm": 0.5593242645263672, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 10163 + }, + { + "epoch": 1.1808306709265175, + "grad_norm": 0.5487082004547119, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 10164 + }, + { + "epoch": 1.180946848678478, + "grad_norm": 0.5988155603408813, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 10165 + }, + { + "epoch": 1.1810630264304385, + "grad_norm": 0.5767937302589417, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 10166 + }, + { + "epoch": 1.1811792041823992, + "grad_norm": 0.6116029620170593, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 10167 + }, + { + "epoch": 1.1812953819343597, + "grad_norm": 0.5582594275474548, + "learning_rate": 0.0001, + "loss": 1.361, + "step": 10168 + }, + { + "epoch": 1.1814115596863202, + "grad_norm": 0.5252476930618286, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 10169 + }, + { + "epoch": 1.1815277374382807, + "grad_norm": 0.5445083379745483, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 10170 + }, + { + "epoch": 1.1816439151902411, + "grad_norm": 0.5257118344306946, + "learning_rate": 0.0001, + "loss": 1.3504, + "step": 10171 + }, + { + "epoch": 1.1817600929422016, + "grad_norm": 0.5568935871124268, + "learning_rate": 0.0001, + "loss": 1.4852, + "step": 10172 + }, + { + "epoch": 1.1818762706941621, + "grad_norm": 0.5395621061325073, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 10173 + }, + { + "epoch": 1.1819924484461226, + "grad_norm": 0.5763841867446899, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 10174 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 0.5146093964576721, + "learning_rate": 0.0001, + "loss": 1.3141, + "step": 10175 + }, + { + "epoch": 1.1822248039500436, + "grad_norm": 0.5548669099807739, + "learning_rate": 0.0001, + "loss": 1.4279, + "step": 10176 + }, + { + "epoch": 1.182340981702004, + "grad_norm": 0.5692338347434998, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 10177 + }, + { + "epoch": 1.1824571594539646, + "grad_norm": 0.5822345614433289, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 10178 + }, + { + "epoch": 1.182573337205925, + "grad_norm": 0.5448129177093506, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 10179 + }, + { + "epoch": 1.1826895149578855, + "grad_norm": 0.6466269493103027, + "learning_rate": 0.0001, + "loss": 1.4509, + "step": 10180 + }, + { + "epoch": 1.182805692709846, + "grad_norm": 0.6030110120773315, + "learning_rate": 0.0001, + "loss": 1.4385, + "step": 10181 + }, + { + "epoch": 1.1829218704618065, + "grad_norm": 0.5867081880569458, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 10182 + }, + { + "epoch": 1.183038048213767, + "grad_norm": 0.543514609336853, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 10183 + }, + { + "epoch": 1.1831542259657275, + "grad_norm": 0.585824728012085, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 10184 + }, + { + "epoch": 1.183270403717688, + "grad_norm": 0.5779645442962646, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 10185 + }, + { + "epoch": 1.1833865814696485, + "grad_norm": 0.5890458226203918, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 10186 + }, + { + "epoch": 1.183502759221609, + "grad_norm": 0.5767694711685181, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 10187 + }, + { + "epoch": 1.1836189369735695, + "grad_norm": 0.5827814936637878, + "learning_rate": 0.0001, + "loss": 1.4696, + "step": 10188 + }, + { + "epoch": 1.18373511472553, + "grad_norm": 0.5493532419204712, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 10189 + }, + { + "epoch": 1.1838512924774907, + "grad_norm": 0.5599081516265869, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 10190 + }, + { + "epoch": 1.1839674702294511, + "grad_norm": 0.5541647672653198, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 10191 + }, + { + "epoch": 1.1840836479814116, + "grad_norm": 0.5816530585289001, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 10192 + }, + { + "epoch": 1.1841998257333721, + "grad_norm": 0.6219608187675476, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 10193 + }, + { + "epoch": 1.1843160034853326, + "grad_norm": 0.5906521081924438, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 10194 + }, + { + "epoch": 1.184432181237293, + "grad_norm": 0.5859937071800232, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 10195 + }, + { + "epoch": 1.1845483589892536, + "grad_norm": 0.5565850138664246, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 10196 + }, + { + "epoch": 1.184664536741214, + "grad_norm": 0.5286614298820496, + "learning_rate": 0.0001, + "loss": 1.2829, + "step": 10197 + }, + { + "epoch": 1.1847807144931746, + "grad_norm": 0.5622779726982117, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 10198 + }, + { + "epoch": 1.184896892245135, + "grad_norm": 0.5423121452331543, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 10199 + }, + { + "epoch": 1.1850130699970955, + "grad_norm": 0.564444899559021, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 10200 + }, + { + "epoch": 1.185129247749056, + "grad_norm": 0.5337464213371277, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 10201 + }, + { + "epoch": 1.1852454255010165, + "grad_norm": 0.5334920287132263, + "learning_rate": 0.0001, + "loss": 1.369, + "step": 10202 + }, + { + "epoch": 1.185361603252977, + "grad_norm": 0.5582081079483032, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 10203 + }, + { + "epoch": 1.1854777810049375, + "grad_norm": 0.5935722589492798, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 10204 + }, + { + "epoch": 1.185593958756898, + "grad_norm": 0.606367290019989, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 10205 + }, + { + "epoch": 1.1857101365088585, + "grad_norm": 0.5262031555175781, + "learning_rate": 0.0001, + "loss": 1.3391, + "step": 10206 + }, + { + "epoch": 1.185826314260819, + "grad_norm": 0.5637447237968445, + "learning_rate": 0.0001, + "loss": 1.2968, + "step": 10207 + }, + { + "epoch": 1.1859424920127795, + "grad_norm": 0.5710691213607788, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 10208 + }, + { + "epoch": 1.1860586697647402, + "grad_norm": 0.5453031063079834, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 10209 + }, + { + "epoch": 1.1861748475167007, + "grad_norm": 0.530640721321106, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 10210 + }, + { + "epoch": 1.1862910252686611, + "grad_norm": 0.5612127184867859, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 10211 + }, + { + "epoch": 1.1864072030206216, + "grad_norm": 0.5393279194831848, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 10212 + }, + { + "epoch": 1.1865233807725821, + "grad_norm": 0.5792052149772644, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 10213 + }, + { + "epoch": 1.1866395585245426, + "grad_norm": 0.5454745292663574, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 10214 + }, + { + "epoch": 1.186755736276503, + "grad_norm": 0.574486255645752, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 10215 + }, + { + "epoch": 1.1868719140284636, + "grad_norm": 0.5667810440063477, + "learning_rate": 0.0001, + "loss": 1.4863, + "step": 10216 + }, + { + "epoch": 1.186988091780424, + "grad_norm": 0.540420651435852, + "learning_rate": 0.0001, + "loss": 1.3731, + "step": 10217 + }, + { + "epoch": 1.1871042695323846, + "grad_norm": 0.5797957181930542, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 10218 + }, + { + "epoch": 1.187220447284345, + "grad_norm": 0.5845776200294495, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 10219 + }, + { + "epoch": 1.1873366250363055, + "grad_norm": 0.5893095135688782, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 10220 + }, + { + "epoch": 1.187452802788266, + "grad_norm": 0.5685760378837585, + "learning_rate": 0.0001, + "loss": 1.4096, + "step": 10221 + }, + { + "epoch": 1.1875689805402265, + "grad_norm": 0.5455953478813171, + "learning_rate": 0.0001, + "loss": 1.3178, + "step": 10222 + }, + { + "epoch": 1.187685158292187, + "grad_norm": 0.5952999591827393, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 10223 + }, + { + "epoch": 1.1878013360441475, + "grad_norm": 0.5808957815170288, + "learning_rate": 0.0001, + "loss": 1.529, + "step": 10224 + }, + { + "epoch": 1.187917513796108, + "grad_norm": 0.5412543416023254, + "learning_rate": 0.0001, + "loss": 1.444, + "step": 10225 + }, + { + "epoch": 1.1880336915480685, + "grad_norm": 0.5611621141433716, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 10226 + }, + { + "epoch": 1.188149869300029, + "grad_norm": 0.5732444524765015, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 10227 + }, + { + "epoch": 1.1882660470519895, + "grad_norm": 0.6303038597106934, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 10228 + }, + { + "epoch": 1.18838222480395, + "grad_norm": 0.5532310009002686, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 10229 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 0.5555628538131714, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 10230 + }, + { + "epoch": 1.188614580307871, + "grad_norm": 0.5487105846405029, + "learning_rate": 0.0001, + "loss": 1.3955, + "step": 10231 + }, + { + "epoch": 1.1887307580598316, + "grad_norm": 0.5543938279151917, + "learning_rate": 0.0001, + "loss": 1.3153, + "step": 10232 + }, + { + "epoch": 1.1888469358117921, + "grad_norm": 0.5894563794136047, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 10233 + }, + { + "epoch": 1.1889631135637526, + "grad_norm": 0.5444560647010803, + "learning_rate": 0.0001, + "loss": 1.5692, + "step": 10234 + }, + { + "epoch": 1.189079291315713, + "grad_norm": 0.578478991985321, + "learning_rate": 0.0001, + "loss": 1.4049, + "step": 10235 + }, + { + "epoch": 1.1891954690676736, + "grad_norm": 0.5442149639129639, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 10236 + }, + { + "epoch": 1.189311646819634, + "grad_norm": 0.5239068269729614, + "learning_rate": 0.0001, + "loss": 1.2212, + "step": 10237 + }, + { + "epoch": 1.1894278245715946, + "grad_norm": 0.59259033203125, + "learning_rate": 0.0001, + "loss": 1.2868, + "step": 10238 + }, + { + "epoch": 1.189544002323555, + "grad_norm": 0.536356508731842, + "learning_rate": 0.0001, + "loss": 1.3534, + "step": 10239 + }, + { + "epoch": 1.1896601800755155, + "grad_norm": 0.6062889099121094, + "learning_rate": 0.0001, + "loss": 1.5112, + "step": 10240 + }, + { + "epoch": 1.189776357827476, + "grad_norm": 0.5797139406204224, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 10241 + }, + { + "epoch": 1.1898925355794365, + "grad_norm": 0.6242355704307556, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 10242 + }, + { + "epoch": 1.190008713331397, + "grad_norm": 0.5886520147323608, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 10243 + }, + { + "epoch": 1.1901248910833575, + "grad_norm": 0.6011394262313843, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 10244 + }, + { + "epoch": 1.190241068835318, + "grad_norm": 0.5653864145278931, + "learning_rate": 0.0001, + "loss": 1.5659, + "step": 10245 + }, + { + "epoch": 1.1903572465872785, + "grad_norm": 0.5808043479919434, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 10246 + }, + { + "epoch": 1.190473424339239, + "grad_norm": 0.5999670028686523, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 10247 + }, + { + "epoch": 1.1905896020911995, + "grad_norm": 0.6177708506584167, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 10248 + }, + { + "epoch": 1.19070577984316, + "grad_norm": 0.6043134331703186, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 10249 + }, + { + "epoch": 1.1908219575951204, + "grad_norm": 0.5593743324279785, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 10250 + }, + { + "epoch": 1.1909381353470812, + "grad_norm": 0.5556023716926575, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 10251 + }, + { + "epoch": 1.1910543130990416, + "grad_norm": 0.5386412143707275, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 10252 + }, + { + "epoch": 1.1911704908510021, + "grad_norm": 0.5457038879394531, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 10253 + }, + { + "epoch": 1.1912866686029626, + "grad_norm": 0.5371884107589722, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 10254 + }, + { + "epoch": 1.191402846354923, + "grad_norm": 0.5815069675445557, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 10255 + }, + { + "epoch": 1.1915190241068836, + "grad_norm": 0.5567863583564758, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 10256 + }, + { + "epoch": 1.191635201858844, + "grad_norm": 0.5386295318603516, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 10257 + }, + { + "epoch": 1.1917513796108046, + "grad_norm": 0.5735299587249756, + "learning_rate": 0.0001, + "loss": 1.4414, + "step": 10258 + }, + { + "epoch": 1.191867557362765, + "grad_norm": 0.5478100776672363, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 10259 + }, + { + "epoch": 1.1919837351147256, + "grad_norm": 0.5614447593688965, + "learning_rate": 0.0001, + "loss": 1.4083, + "step": 10260 + }, + { + "epoch": 1.192099912866686, + "grad_norm": 0.5918525457382202, + "learning_rate": 0.0001, + "loss": 1.3954, + "step": 10261 + }, + { + "epoch": 1.1922160906186465, + "grad_norm": 0.5843601822853088, + "learning_rate": 0.0001, + "loss": 1.4277, + "step": 10262 + }, + { + "epoch": 1.192332268370607, + "grad_norm": 0.5581550598144531, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 10263 + }, + { + "epoch": 1.1924484461225675, + "grad_norm": 0.6755667328834534, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 10264 + }, + { + "epoch": 1.192564623874528, + "grad_norm": 0.6215943694114685, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 10265 + }, + { + "epoch": 1.1926808016264885, + "grad_norm": 0.6235942840576172, + "learning_rate": 0.0001, + "loss": 1.5156, + "step": 10266 + }, + { + "epoch": 1.192796979378449, + "grad_norm": 0.5801181197166443, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 10267 + }, + { + "epoch": 1.1929131571304095, + "grad_norm": 0.5831395983695984, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 10268 + }, + { + "epoch": 1.19302933488237, + "grad_norm": 0.5884434580802917, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 10269 + }, + { + "epoch": 1.1931455126343304, + "grad_norm": 0.6119523048400879, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 10270 + }, + { + "epoch": 1.193261690386291, + "grad_norm": 0.5671009421348572, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 10271 + }, + { + "epoch": 1.1933778681382514, + "grad_norm": 0.5740747451782227, + "learning_rate": 0.0001, + "loss": 1.2502, + "step": 10272 + }, + { + "epoch": 1.193494045890212, + "grad_norm": 0.5850660800933838, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 10273 + }, + { + "epoch": 1.1936102236421726, + "grad_norm": 0.5821871757507324, + "learning_rate": 0.0001, + "loss": 1.3578, + "step": 10274 + }, + { + "epoch": 1.193726401394133, + "grad_norm": 0.5659012198448181, + "learning_rate": 0.0001, + "loss": 1.418, + "step": 10275 + }, + { + "epoch": 1.1938425791460936, + "grad_norm": 0.6060782074928284, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 10276 + }, + { + "epoch": 1.193958756898054, + "grad_norm": 0.6207935214042664, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 10277 + }, + { + "epoch": 1.1940749346500146, + "grad_norm": 0.549321711063385, + "learning_rate": 0.0001, + "loss": 1.3837, + "step": 10278 + }, + { + "epoch": 1.194191112401975, + "grad_norm": 0.5631564259529114, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 10279 + }, + { + "epoch": 1.1943072901539356, + "grad_norm": 0.5477903485298157, + "learning_rate": 0.0001, + "loss": 1.4314, + "step": 10280 + }, + { + "epoch": 1.194423467905896, + "grad_norm": 0.6051657199859619, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 10281 + }, + { + "epoch": 1.1945396456578565, + "grad_norm": 0.6054665446281433, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 10282 + }, + { + "epoch": 1.194655823409817, + "grad_norm": 0.57652747631073, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 10283 + }, + { + "epoch": 1.1947720011617775, + "grad_norm": 0.5573504567146301, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 10284 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 0.5924124121665955, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 10285 + }, + { + "epoch": 1.1950043566656985, + "grad_norm": 0.5285975337028503, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 10286 + }, + { + "epoch": 1.195120534417659, + "grad_norm": 0.5311710834503174, + "learning_rate": 0.0001, + "loss": 1.4012, + "step": 10287 + }, + { + "epoch": 1.1952367121696195, + "grad_norm": 0.5347359776496887, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 10288 + }, + { + "epoch": 1.19535288992158, + "grad_norm": 0.563667356967926, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 10289 + }, + { + "epoch": 1.1954690676735404, + "grad_norm": 0.568260669708252, + "learning_rate": 0.0001, + "loss": 1.3454, + "step": 10290 + }, + { + "epoch": 1.195585245425501, + "grad_norm": 0.638574481010437, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 10291 + }, + { + "epoch": 1.1957014231774614, + "grad_norm": 0.6492049098014832, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 10292 + }, + { + "epoch": 1.1958176009294221, + "grad_norm": 0.5210595726966858, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 10293 + }, + { + "epoch": 1.1959337786813826, + "grad_norm": 0.5696560144424438, + "learning_rate": 0.0001, + "loss": 1.5132, + "step": 10294 + }, + { + "epoch": 1.196049956433343, + "grad_norm": 0.5875056982040405, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 10295 + }, + { + "epoch": 1.1961661341853036, + "grad_norm": 0.5168067216873169, + "learning_rate": 0.0001, + "loss": 1.436, + "step": 10296 + }, + { + "epoch": 1.196282311937264, + "grad_norm": 0.5388482213020325, + "learning_rate": 0.0001, + "loss": 1.3848, + "step": 10297 + }, + { + "epoch": 1.1963984896892246, + "grad_norm": 0.595585823059082, + "learning_rate": 0.0001, + "loss": 1.3237, + "step": 10298 + }, + { + "epoch": 1.196514667441185, + "grad_norm": 0.5622440576553345, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 10299 + }, + { + "epoch": 1.1966308451931456, + "grad_norm": 0.6017595529556274, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 10300 + }, + { + "epoch": 1.196747022945106, + "grad_norm": 0.5587907433509827, + "learning_rate": 0.0001, + "loss": 1.419, + "step": 10301 + }, + { + "epoch": 1.1968632006970665, + "grad_norm": 0.5632277727127075, + "learning_rate": 0.0001, + "loss": 1.3717, + "step": 10302 + }, + { + "epoch": 1.196979378449027, + "grad_norm": 0.5504304766654968, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 10303 + }, + { + "epoch": 1.1970955562009875, + "grad_norm": 0.5563431978225708, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 10304 + }, + { + "epoch": 1.197211733952948, + "grad_norm": 0.5547022223472595, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 10305 + }, + { + "epoch": 1.1973279117049085, + "grad_norm": 0.5395157337188721, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 10306 + }, + { + "epoch": 1.197444089456869, + "grad_norm": 0.6372838020324707, + "learning_rate": 0.0001, + "loss": 1.757, + "step": 10307 + }, + { + "epoch": 1.1975602672088295, + "grad_norm": 0.5683894157409668, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 10308 + }, + { + "epoch": 1.19767644496079, + "grad_norm": 0.5659453868865967, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 10309 + }, + { + "epoch": 1.1977926227127504, + "grad_norm": 0.5578935146331787, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 10310 + }, + { + "epoch": 1.197908800464711, + "grad_norm": 0.5382188558578491, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 10311 + }, + { + "epoch": 1.1980249782166714, + "grad_norm": 0.5252891182899475, + "learning_rate": 0.0001, + "loss": 1.3211, + "step": 10312 + }, + { + "epoch": 1.198141155968632, + "grad_norm": 0.545369029045105, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 10313 + }, + { + "epoch": 1.1982573337205924, + "grad_norm": 0.5860430598258972, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 10314 + }, + { + "epoch": 1.1983735114725529, + "grad_norm": 0.5563281774520874, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 10315 + }, + { + "epoch": 1.1984896892245136, + "grad_norm": 0.5620051622390747, + "learning_rate": 0.0001, + "loss": 1.3899, + "step": 10316 + }, + { + "epoch": 1.198605866976474, + "grad_norm": 0.5942341685295105, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 10317 + }, + { + "epoch": 1.1987220447284346, + "grad_norm": 0.6117276549339294, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 10318 + }, + { + "epoch": 1.198838222480395, + "grad_norm": 0.5562477707862854, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 10319 + }, + { + "epoch": 1.1989544002323556, + "grad_norm": 0.5761712193489075, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 10320 + }, + { + "epoch": 1.199070577984316, + "grad_norm": 0.552450954914093, + "learning_rate": 0.0001, + "loss": 1.3354, + "step": 10321 + }, + { + "epoch": 1.1991867557362765, + "grad_norm": 0.5395271182060242, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 10322 + }, + { + "epoch": 1.199302933488237, + "grad_norm": 0.555846631526947, + "learning_rate": 0.0001, + "loss": 1.4032, + "step": 10323 + }, + { + "epoch": 1.1994191112401975, + "grad_norm": 0.5601608157157898, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 10324 + }, + { + "epoch": 1.199535288992158, + "grad_norm": 0.5598324537277222, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 10325 + }, + { + "epoch": 1.1996514667441185, + "grad_norm": 0.5127769112586975, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 10326 + }, + { + "epoch": 1.199767644496079, + "grad_norm": 0.5928462147712708, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 10327 + }, + { + "epoch": 1.1998838222480395, + "grad_norm": 0.5600936412811279, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 10328 + }, + { + "epoch": 1.2, + "grad_norm": 0.5994905233383179, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 10329 + }, + { + "epoch": 1.2001161777519604, + "grad_norm": 0.5742284059524536, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 10330 + }, + { + "epoch": 1.200232355503921, + "grad_norm": 0.5460923314094543, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 10331 + }, + { + "epoch": 1.2003485332558814, + "grad_norm": 0.5468488931655884, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 10332 + }, + { + "epoch": 1.200464711007842, + "grad_norm": 0.5500593781471252, + "learning_rate": 0.0001, + "loss": 1.2928, + "step": 10333 + }, + { + "epoch": 1.2005808887598026, + "grad_norm": 0.5604000091552734, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 10334 + }, + { + "epoch": 1.2006970665117631, + "grad_norm": 0.5383157730102539, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 10335 + }, + { + "epoch": 1.2008132442637236, + "grad_norm": 0.6212254166603088, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 10336 + }, + { + "epoch": 1.200929422015684, + "grad_norm": 0.5845767259597778, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 10337 + }, + { + "epoch": 1.2010455997676446, + "grad_norm": 0.517145574092865, + "learning_rate": 0.0001, + "loss": 1.2527, + "step": 10338 + }, + { + "epoch": 1.201161777519605, + "grad_norm": 0.617876410484314, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 10339 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 0.5485627055168152, + "learning_rate": 0.0001, + "loss": 1.3546, + "step": 10340 + }, + { + "epoch": 1.201394133023526, + "grad_norm": 0.5494995713233948, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 10341 + }, + { + "epoch": 1.2015103107754865, + "grad_norm": 0.5931583642959595, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 10342 + }, + { + "epoch": 1.201626488527447, + "grad_norm": 0.57831871509552, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 10343 + }, + { + "epoch": 1.2017426662794075, + "grad_norm": 0.5546940565109253, + "learning_rate": 0.0001, + "loss": 1.2925, + "step": 10344 + }, + { + "epoch": 1.201858844031368, + "grad_norm": 0.5821717381477356, + "learning_rate": 0.0001, + "loss": 1.4944, + "step": 10345 + }, + { + "epoch": 1.2019750217833285, + "grad_norm": 0.5914159417152405, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 10346 + }, + { + "epoch": 1.202091199535289, + "grad_norm": 0.5691181421279907, + "learning_rate": 0.0001, + "loss": 1.3892, + "step": 10347 + }, + { + "epoch": 1.2022073772872495, + "grad_norm": 0.5852946639060974, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 10348 + }, + { + "epoch": 1.20232355503921, + "grad_norm": 0.5942548513412476, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 10349 + }, + { + "epoch": 1.2024397327911704, + "grad_norm": 0.595831036567688, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 10350 + }, + { + "epoch": 1.202555910543131, + "grad_norm": 0.5792760848999023, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 10351 + }, + { + "epoch": 1.2026720882950914, + "grad_norm": 0.5801286101341248, + "learning_rate": 0.0001, + "loss": 1.3312, + "step": 10352 + }, + { + "epoch": 1.202788266047052, + "grad_norm": 0.6192400455474854, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 10353 + }, + { + "epoch": 1.2029044437990124, + "grad_norm": 0.6081662774085999, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 10354 + }, + { + "epoch": 1.203020621550973, + "grad_norm": 0.6102173924446106, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 10355 + }, + { + "epoch": 1.2031367993029334, + "grad_norm": 0.5657384395599365, + "learning_rate": 0.0001, + "loss": 1.5184, + "step": 10356 + }, + { + "epoch": 1.2032529770548939, + "grad_norm": 0.5585178732872009, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 10357 + }, + { + "epoch": 1.2033691548068546, + "grad_norm": 0.5775259137153625, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 10358 + }, + { + "epoch": 1.203485332558815, + "grad_norm": 0.557350754737854, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 10359 + }, + { + "epoch": 1.2036015103107756, + "grad_norm": 0.5576322078704834, + "learning_rate": 0.0001, + "loss": 1.7029, + "step": 10360 + }, + { + "epoch": 1.203717688062736, + "grad_norm": 0.5494878888130188, + "learning_rate": 0.0001, + "loss": 1.4551, + "step": 10361 + }, + { + "epoch": 1.2038338658146965, + "grad_norm": 0.580441951751709, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 10362 + }, + { + "epoch": 1.203950043566657, + "grad_norm": 0.5665770173072815, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 10363 + }, + { + "epoch": 1.2040662213186175, + "grad_norm": 0.5588644742965698, + "learning_rate": 0.0001, + "loss": 1.3927, + "step": 10364 + }, + { + "epoch": 1.204182399070578, + "grad_norm": 0.6142786741256714, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 10365 + }, + { + "epoch": 1.2042985768225385, + "grad_norm": 0.6669145822525024, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 10366 + }, + { + "epoch": 1.204414754574499, + "grad_norm": 0.5337054133415222, + "learning_rate": 0.0001, + "loss": 1.3741, + "step": 10367 + }, + { + "epoch": 1.2045309323264595, + "grad_norm": 0.5432060956954956, + "learning_rate": 0.0001, + "loss": 1.3621, + "step": 10368 + }, + { + "epoch": 1.20464711007842, + "grad_norm": 0.6041423678398132, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 10369 + }, + { + "epoch": 1.2047632878303804, + "grad_norm": 0.5839071869850159, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 10370 + }, + { + "epoch": 1.204879465582341, + "grad_norm": 0.5747912526130676, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 10371 + }, + { + "epoch": 1.2049956433343014, + "grad_norm": 0.5971471667289734, + "learning_rate": 0.0001, + "loss": 1.5796, + "step": 10372 + }, + { + "epoch": 1.205111821086262, + "grad_norm": 0.5386415123939514, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 10373 + }, + { + "epoch": 1.2052279988382224, + "grad_norm": 0.5704327821731567, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 10374 + }, + { + "epoch": 1.205344176590183, + "grad_norm": 0.5206299424171448, + "learning_rate": 0.0001, + "loss": 1.4509, + "step": 10375 + }, + { + "epoch": 1.2054603543421436, + "grad_norm": 0.5865551829338074, + "learning_rate": 0.0001, + "loss": 1.342, + "step": 10376 + }, + { + "epoch": 1.205576532094104, + "grad_norm": 0.5369005799293518, + "learning_rate": 0.0001, + "loss": 1.487, + "step": 10377 + }, + { + "epoch": 1.2056927098460646, + "grad_norm": 0.5041610598564148, + "learning_rate": 0.0001, + "loss": 1.1966, + "step": 10378 + }, + { + "epoch": 1.205808887598025, + "grad_norm": 0.5628383755683899, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 10379 + }, + { + "epoch": 1.2059250653499856, + "grad_norm": 0.5555375218391418, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 10380 + }, + { + "epoch": 1.206041243101946, + "grad_norm": 0.563068687915802, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 10381 + }, + { + "epoch": 1.2061574208539065, + "grad_norm": 0.5873087644577026, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 10382 + }, + { + "epoch": 1.206273598605867, + "grad_norm": 0.5575566291809082, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 10383 + }, + { + "epoch": 1.2063897763578275, + "grad_norm": 0.58903968334198, + "learning_rate": 0.0001, + "loss": 1.3605, + "step": 10384 + }, + { + "epoch": 1.206505954109788, + "grad_norm": 0.5939505696296692, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 10385 + }, + { + "epoch": 1.2066221318617485, + "grad_norm": 0.5417733192443848, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 10386 + }, + { + "epoch": 1.206738309613709, + "grad_norm": 0.52766352891922, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 10387 + }, + { + "epoch": 1.2068544873656695, + "grad_norm": 0.5897431373596191, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 10388 + }, + { + "epoch": 1.20697066511763, + "grad_norm": 0.6240753531455994, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 10389 + }, + { + "epoch": 1.2070868428695904, + "grad_norm": 0.6030203104019165, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 10390 + }, + { + "epoch": 1.207203020621551, + "grad_norm": 0.6183759570121765, + "learning_rate": 0.0001, + "loss": 1.4371, + "step": 10391 + }, + { + "epoch": 1.2073191983735114, + "grad_norm": 0.5684889554977417, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 10392 + }, + { + "epoch": 1.207435376125472, + "grad_norm": 0.5397784113883972, + "learning_rate": 0.0001, + "loss": 1.3721, + "step": 10393 + }, + { + "epoch": 1.2075515538774324, + "grad_norm": 0.5187193751335144, + "learning_rate": 0.0001, + "loss": 1.3977, + "step": 10394 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 0.5413998961448669, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 10395 + }, + { + "epoch": 1.2077839093813534, + "grad_norm": 0.545616626739502, + "learning_rate": 0.0001, + "loss": 1.3606, + "step": 10396 + }, + { + "epoch": 1.2079000871333139, + "grad_norm": 0.5437708497047424, + "learning_rate": 0.0001, + "loss": 1.4803, + "step": 10397 + }, + { + "epoch": 1.2080162648852744, + "grad_norm": 0.56759113073349, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 10398 + }, + { + "epoch": 1.2081324426372348, + "grad_norm": 0.5678495764732361, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 10399 + }, + { + "epoch": 1.2082486203891956, + "grad_norm": 0.5646939277648926, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 10400 + }, + { + "epoch": 1.208364798141156, + "grad_norm": 0.554522693157196, + "learning_rate": 0.0001, + "loss": 1.4276, + "step": 10401 + }, + { + "epoch": 1.2084809758931165, + "grad_norm": 0.5794949531555176, + "learning_rate": 0.0001, + "loss": 1.438, + "step": 10402 + }, + { + "epoch": 1.208597153645077, + "grad_norm": 0.5517016053199768, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 10403 + }, + { + "epoch": 1.2087133313970375, + "grad_norm": 0.5888990759849548, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 10404 + }, + { + "epoch": 1.208829509148998, + "grad_norm": 0.5803018808364868, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 10405 + }, + { + "epoch": 1.2089456869009585, + "grad_norm": 0.5619463324546814, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 10406 + }, + { + "epoch": 1.209061864652919, + "grad_norm": 0.5671004056930542, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 10407 + }, + { + "epoch": 1.2091780424048795, + "grad_norm": 0.5472593307495117, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 10408 + }, + { + "epoch": 1.20929422015684, + "grad_norm": 0.5378280878067017, + "learning_rate": 0.0001, + "loss": 1.4031, + "step": 10409 + }, + { + "epoch": 1.2094103979088004, + "grad_norm": 0.5641921758651733, + "learning_rate": 0.0001, + "loss": 1.3906, + "step": 10410 + }, + { + "epoch": 1.209526575660761, + "grad_norm": 0.5625640749931335, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 10411 + }, + { + "epoch": 1.2096427534127214, + "grad_norm": 0.5496149063110352, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 10412 + }, + { + "epoch": 1.209758931164682, + "grad_norm": 0.5828267931938171, + "learning_rate": 0.0001, + "loss": 1.6947, + "step": 10413 + }, + { + "epoch": 1.2098751089166424, + "grad_norm": 0.5525361895561218, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 10414 + }, + { + "epoch": 1.209991286668603, + "grad_norm": 0.524874746799469, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 10415 + }, + { + "epoch": 1.2101074644205634, + "grad_norm": 0.6062628626823425, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 10416 + }, + { + "epoch": 1.2102236421725239, + "grad_norm": 0.5272751450538635, + "learning_rate": 0.0001, + "loss": 1.4494, + "step": 10417 + }, + { + "epoch": 1.2103398199244846, + "grad_norm": 0.5667693614959717, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 10418 + }, + { + "epoch": 1.210455997676445, + "grad_norm": 0.5765136480331421, + "learning_rate": 0.0001, + "loss": 1.3549, + "step": 10419 + }, + { + "epoch": 1.2105721754284056, + "grad_norm": 0.5195456743240356, + "learning_rate": 0.0001, + "loss": 1.286, + "step": 10420 + }, + { + "epoch": 1.210688353180366, + "grad_norm": 0.6368436813354492, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 10421 + }, + { + "epoch": 1.2108045309323265, + "grad_norm": 0.5609405040740967, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 10422 + }, + { + "epoch": 1.210920708684287, + "grad_norm": 0.6603508591651917, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 10423 + }, + { + "epoch": 1.2110368864362475, + "grad_norm": 0.5625258684158325, + "learning_rate": 0.0001, + "loss": 1.4226, + "step": 10424 + }, + { + "epoch": 1.211153064188208, + "grad_norm": 0.5961827039718628, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 10425 + }, + { + "epoch": 1.2112692419401685, + "grad_norm": 0.5945589542388916, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 10426 + }, + { + "epoch": 1.211385419692129, + "grad_norm": 0.547545850276947, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 10427 + }, + { + "epoch": 1.2115015974440895, + "grad_norm": 0.5815989971160889, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 10428 + }, + { + "epoch": 1.21161777519605, + "grad_norm": 0.5638428926467896, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 10429 + }, + { + "epoch": 1.2117339529480105, + "grad_norm": 0.5702305436134338, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 10430 + }, + { + "epoch": 1.211850130699971, + "grad_norm": 0.5992822051048279, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 10431 + }, + { + "epoch": 1.2119663084519314, + "grad_norm": 0.5825338363647461, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 10432 + }, + { + "epoch": 1.212082486203892, + "grad_norm": 0.5452916622161865, + "learning_rate": 0.0001, + "loss": 1.3638, + "step": 10433 + }, + { + "epoch": 1.2121986639558524, + "grad_norm": 0.6069163084030151, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 10434 + }, + { + "epoch": 1.212314841707813, + "grad_norm": 0.6051025390625, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 10435 + }, + { + "epoch": 1.2124310194597734, + "grad_norm": 0.5363568663597107, + "learning_rate": 0.0001, + "loss": 1.274, + "step": 10436 + }, + { + "epoch": 1.2125471972117339, + "grad_norm": 0.5586710572242737, + "learning_rate": 0.0001, + "loss": 1.3582, + "step": 10437 + }, + { + "epoch": 1.2126633749636944, + "grad_norm": 0.5546153783798218, + "learning_rate": 0.0001, + "loss": 1.4534, + "step": 10438 + }, + { + "epoch": 1.2127795527156549, + "grad_norm": 0.5305795073509216, + "learning_rate": 0.0001, + "loss": 1.4127, + "step": 10439 + }, + { + "epoch": 1.2128957304676153, + "grad_norm": 0.5443617701530457, + "learning_rate": 0.0001, + "loss": 1.453, + "step": 10440 + }, + { + "epoch": 1.2130119082195758, + "grad_norm": 0.6291701197624207, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 10441 + }, + { + "epoch": 1.2131280859715365, + "grad_norm": 0.6024119853973389, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 10442 + }, + { + "epoch": 1.213244263723497, + "grad_norm": 0.566421389579773, + "learning_rate": 0.0001, + "loss": 1.3462, + "step": 10443 + }, + { + "epoch": 1.2133604414754575, + "grad_norm": 0.6556349992752075, + "learning_rate": 0.0001, + "loss": 1.4306, + "step": 10444 + }, + { + "epoch": 1.213476619227418, + "grad_norm": 0.5445832014083862, + "learning_rate": 0.0001, + "loss": 1.3331, + "step": 10445 + }, + { + "epoch": 1.2135927969793785, + "grad_norm": 0.5676811933517456, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 10446 + }, + { + "epoch": 1.213708974731339, + "grad_norm": 0.5820676684379578, + "learning_rate": 0.0001, + "loss": 1.4923, + "step": 10447 + }, + { + "epoch": 1.2138251524832995, + "grad_norm": 0.5882712602615356, + "learning_rate": 0.0001, + "loss": 1.4838, + "step": 10448 + }, + { + "epoch": 1.21394133023526, + "grad_norm": 0.5337011218070984, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 10449 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 0.6516126394271851, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 10450 + }, + { + "epoch": 1.214173685739181, + "grad_norm": 0.537398099899292, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 10451 + }, + { + "epoch": 1.2142898634911414, + "grad_norm": 0.5600173473358154, + "learning_rate": 0.0001, + "loss": 1.2863, + "step": 10452 + }, + { + "epoch": 1.214406041243102, + "grad_norm": 0.5451828241348267, + "learning_rate": 0.0001, + "loss": 1.3837, + "step": 10453 + }, + { + "epoch": 1.2145222189950624, + "grad_norm": 0.6120473742485046, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 10454 + }, + { + "epoch": 1.214638396747023, + "grad_norm": 0.5406019687652588, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 10455 + }, + { + "epoch": 1.2147545744989834, + "grad_norm": 0.49340879917144775, + "learning_rate": 0.0001, + "loss": 1.2085, + "step": 10456 + }, + { + "epoch": 1.2148707522509439, + "grad_norm": 0.5855630040168762, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 10457 + }, + { + "epoch": 1.2149869300029044, + "grad_norm": 0.5757850408554077, + "learning_rate": 0.0001, + "loss": 1.4777, + "step": 10458 + }, + { + "epoch": 1.2151031077548649, + "grad_norm": 0.6049354076385498, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 10459 + }, + { + "epoch": 1.2152192855068256, + "grad_norm": 0.5614561438560486, + "learning_rate": 0.0001, + "loss": 1.1913, + "step": 10460 + }, + { + "epoch": 1.215335463258786, + "grad_norm": 0.58977210521698, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 10461 + }, + { + "epoch": 1.2154516410107465, + "grad_norm": 0.5945138335227966, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 10462 + }, + { + "epoch": 1.215567818762707, + "grad_norm": 0.5707786083221436, + "learning_rate": 0.0001, + "loss": 1.4646, + "step": 10463 + }, + { + "epoch": 1.2156839965146675, + "grad_norm": 0.6250419020652771, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 10464 + }, + { + "epoch": 1.215800174266628, + "grad_norm": 0.5555652379989624, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 10465 + }, + { + "epoch": 1.2159163520185885, + "grad_norm": 0.5932562947273254, + "learning_rate": 0.0001, + "loss": 1.3049, + "step": 10466 + }, + { + "epoch": 1.216032529770549, + "grad_norm": 0.5786982178688049, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 10467 + }, + { + "epoch": 1.2161487075225095, + "grad_norm": 0.5523450970649719, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 10468 + }, + { + "epoch": 1.21626488527447, + "grad_norm": 0.5957953333854675, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 10469 + }, + { + "epoch": 1.2163810630264305, + "grad_norm": 0.6025353074073792, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 10470 + }, + { + "epoch": 1.216497240778391, + "grad_norm": 0.6158897280693054, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 10471 + }, + { + "epoch": 1.2166134185303514, + "grad_norm": 0.5769653916358948, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 10472 + }, + { + "epoch": 1.216729596282312, + "grad_norm": 0.5845011472702026, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 10473 + }, + { + "epoch": 1.2168457740342724, + "grad_norm": 0.5649383068084717, + "learning_rate": 0.0001, + "loss": 1.3627, + "step": 10474 + }, + { + "epoch": 1.216961951786233, + "grad_norm": 0.5390584468841553, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 10475 + }, + { + "epoch": 1.2170781295381934, + "grad_norm": 0.5726324319839478, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 10476 + }, + { + "epoch": 1.2171943072901539, + "grad_norm": 0.5737690329551697, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 10477 + }, + { + "epoch": 1.2173104850421144, + "grad_norm": 0.5977112650871277, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 10478 + }, + { + "epoch": 1.2174266627940749, + "grad_norm": 0.5659692287445068, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 10479 + }, + { + "epoch": 1.2175428405460353, + "grad_norm": 0.6409531831741333, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 10480 + }, + { + "epoch": 1.2176590182979958, + "grad_norm": 0.5651158094406128, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 10481 + }, + { + "epoch": 1.2177751960499563, + "grad_norm": 0.5669698119163513, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 10482 + }, + { + "epoch": 1.2178913738019168, + "grad_norm": 0.5658094882965088, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 10483 + }, + { + "epoch": 1.2180075515538775, + "grad_norm": 0.5667807459831238, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 10484 + }, + { + "epoch": 1.218123729305838, + "grad_norm": 0.5489072203636169, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 10485 + }, + { + "epoch": 1.2182399070577985, + "grad_norm": 0.6018260717391968, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 10486 + }, + { + "epoch": 1.218356084809759, + "grad_norm": 0.5693395137786865, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 10487 + }, + { + "epoch": 1.2184722625617195, + "grad_norm": 0.5397615432739258, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 10488 + }, + { + "epoch": 1.21858844031368, + "grad_norm": 0.5639338493347168, + "learning_rate": 0.0001, + "loss": 1.3537, + "step": 10489 + }, + { + "epoch": 1.2187046180656405, + "grad_norm": 0.5436851382255554, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 10490 + }, + { + "epoch": 1.218820795817601, + "grad_norm": 0.5400302410125732, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 10491 + }, + { + "epoch": 1.2189369735695614, + "grad_norm": 0.5486143231391907, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 10492 + }, + { + "epoch": 1.219053151321522, + "grad_norm": 0.6082191467285156, + "learning_rate": 0.0001, + "loss": 1.4852, + "step": 10493 + }, + { + "epoch": 1.2191693290734824, + "grad_norm": 0.5857893228530884, + "learning_rate": 0.0001, + "loss": 1.3528, + "step": 10494 + }, + { + "epoch": 1.219285506825443, + "grad_norm": 0.592745304107666, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 10495 + }, + { + "epoch": 1.2194016845774034, + "grad_norm": 0.5825994610786438, + "learning_rate": 0.0001, + "loss": 1.5337, + "step": 10496 + }, + { + "epoch": 1.2195178623293639, + "grad_norm": 0.5833573341369629, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 10497 + }, + { + "epoch": 1.2196340400813244, + "grad_norm": 0.5565202236175537, + "learning_rate": 0.0001, + "loss": 1.3335, + "step": 10498 + }, + { + "epoch": 1.2197502178332849, + "grad_norm": 0.5850576758384705, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 10499 + }, + { + "epoch": 1.2198663955852453, + "grad_norm": 0.5984856486320496, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 10500 + }, + { + "epoch": 1.2199825733372058, + "grad_norm": 0.5830519795417786, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 10501 + }, + { + "epoch": 1.2200987510891665, + "grad_norm": 0.5867604613304138, + "learning_rate": 0.0001, + "loss": 1.2365, + "step": 10502 + }, + { + "epoch": 1.220214928841127, + "grad_norm": 0.5673847198486328, + "learning_rate": 0.0001, + "loss": 1.3835, + "step": 10503 + }, + { + "epoch": 1.2203311065930875, + "grad_norm": 0.5929017066955566, + "learning_rate": 0.0001, + "loss": 1.3362, + "step": 10504 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 0.5458045601844788, + "learning_rate": 0.0001, + "loss": 1.356, + "step": 10505 + }, + { + "epoch": 1.2205634620970085, + "grad_norm": 0.6023081541061401, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 10506 + }, + { + "epoch": 1.220679639848969, + "grad_norm": 0.6090680956840515, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 10507 + }, + { + "epoch": 1.2207958176009295, + "grad_norm": 0.5901305079460144, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 10508 + }, + { + "epoch": 1.22091199535289, + "grad_norm": 0.6401813626289368, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 10509 + }, + { + "epoch": 1.2210281731048505, + "grad_norm": 0.6004911661148071, + "learning_rate": 0.0001, + "loss": 1.4363, + "step": 10510 + }, + { + "epoch": 1.221144350856811, + "grad_norm": 0.5764592289924622, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 10511 + }, + { + "epoch": 1.2212605286087714, + "grad_norm": 0.5585955381393433, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 10512 + }, + { + "epoch": 1.221376706360732, + "grad_norm": 0.5347375273704529, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 10513 + }, + { + "epoch": 1.2214928841126924, + "grad_norm": 0.5225241780281067, + "learning_rate": 0.0001, + "loss": 1.35, + "step": 10514 + }, + { + "epoch": 1.221609061864653, + "grad_norm": 0.5799035429954529, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 10515 + }, + { + "epoch": 1.2217252396166134, + "grad_norm": 0.5327029824256897, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 10516 + }, + { + "epoch": 1.2218414173685739, + "grad_norm": 0.5418916344642639, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 10517 + }, + { + "epoch": 1.2219575951205344, + "grad_norm": 0.5544294714927673, + "learning_rate": 0.0001, + "loss": 1.3398, + "step": 10518 + }, + { + "epoch": 1.2220737728724949, + "grad_norm": 0.6825329065322876, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 10519 + }, + { + "epoch": 1.2221899506244553, + "grad_norm": 0.5987062454223633, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 10520 + }, + { + "epoch": 1.2223061283764158, + "grad_norm": 0.6384679675102234, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 10521 + }, + { + "epoch": 1.2224223061283763, + "grad_norm": 0.5685727000236511, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 10522 + }, + { + "epoch": 1.2225384838803368, + "grad_norm": 0.5603922009468079, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 10523 + }, + { + "epoch": 1.2226546616322973, + "grad_norm": 0.5447299480438232, + "learning_rate": 0.0001, + "loss": 1.2081, + "step": 10524 + }, + { + "epoch": 1.2227708393842578, + "grad_norm": 0.6237144470214844, + "learning_rate": 0.0001, + "loss": 1.7241, + "step": 10525 + }, + { + "epoch": 1.2228870171362185, + "grad_norm": 0.567017674446106, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 10526 + }, + { + "epoch": 1.223003194888179, + "grad_norm": 0.5752435326576233, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 10527 + }, + { + "epoch": 1.2231193726401395, + "grad_norm": 0.5610300898551941, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 10528 + }, + { + "epoch": 1.2232355503921, + "grad_norm": 0.5971804857254028, + "learning_rate": 0.0001, + "loss": 1.3758, + "step": 10529 + }, + { + "epoch": 1.2233517281440605, + "grad_norm": 0.5715692639350891, + "learning_rate": 0.0001, + "loss": 1.4178, + "step": 10530 + }, + { + "epoch": 1.223467905896021, + "grad_norm": 0.5297221541404724, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 10531 + }, + { + "epoch": 1.2235840836479814, + "grad_norm": 0.5569010376930237, + "learning_rate": 0.0001, + "loss": 1.3424, + "step": 10532 + }, + { + "epoch": 1.223700261399942, + "grad_norm": 0.5953126549720764, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 10533 + }, + { + "epoch": 1.2238164391519024, + "grad_norm": 0.6054908037185669, + "learning_rate": 0.0001, + "loss": 1.4562, + "step": 10534 + }, + { + "epoch": 1.223932616903863, + "grad_norm": 0.5700466632843018, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 10535 + }, + { + "epoch": 1.2240487946558234, + "grad_norm": 0.5606374144554138, + "learning_rate": 0.0001, + "loss": 1.4028, + "step": 10536 + }, + { + "epoch": 1.2241649724077839, + "grad_norm": 0.5463014245033264, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 10537 + }, + { + "epoch": 1.2242811501597444, + "grad_norm": 0.5794979333877563, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 10538 + }, + { + "epoch": 1.2243973279117049, + "grad_norm": 0.5675046443939209, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 10539 + }, + { + "epoch": 1.2245135056636653, + "grad_norm": 0.5605546236038208, + "learning_rate": 0.0001, + "loss": 1.4852, + "step": 10540 + }, + { + "epoch": 1.2246296834156258, + "grad_norm": 0.5566608905792236, + "learning_rate": 0.0001, + "loss": 1.3657, + "step": 10541 + }, + { + "epoch": 1.2247458611675863, + "grad_norm": 0.60628741979599, + "learning_rate": 0.0001, + "loss": 1.6062, + "step": 10542 + }, + { + "epoch": 1.2248620389195468, + "grad_norm": 0.5989949107170105, + "learning_rate": 0.0001, + "loss": 1.4082, + "step": 10543 + }, + { + "epoch": 1.2249782166715075, + "grad_norm": 0.601296067237854, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 10544 + }, + { + "epoch": 1.225094394423468, + "grad_norm": 0.6421616673469543, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 10545 + }, + { + "epoch": 1.2252105721754285, + "grad_norm": 0.5911616086959839, + "learning_rate": 0.0001, + "loss": 1.7276, + "step": 10546 + }, + { + "epoch": 1.225326749927389, + "grad_norm": 0.6583091020584106, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 10547 + }, + { + "epoch": 1.2254429276793495, + "grad_norm": 0.575006365776062, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 10548 + }, + { + "epoch": 1.22555910543131, + "grad_norm": 0.5844583511352539, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 10549 + }, + { + "epoch": 1.2256752831832705, + "grad_norm": 0.5701775550842285, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 10550 + }, + { + "epoch": 1.225791460935231, + "grad_norm": 0.5439273118972778, + "learning_rate": 0.0001, + "loss": 1.3318, + "step": 10551 + }, + { + "epoch": 1.2259076386871914, + "grad_norm": 0.5806834697723389, + "learning_rate": 0.0001, + "loss": 1.3744, + "step": 10552 + }, + { + "epoch": 1.226023816439152, + "grad_norm": 0.5335421562194824, + "learning_rate": 0.0001, + "loss": 1.1911, + "step": 10553 + }, + { + "epoch": 1.2261399941911124, + "grad_norm": 0.5800783038139343, + "learning_rate": 0.0001, + "loss": 1.4885, + "step": 10554 + }, + { + "epoch": 1.226256171943073, + "grad_norm": 0.5973320007324219, + "learning_rate": 0.0001, + "loss": 1.3504, + "step": 10555 + }, + { + "epoch": 1.2263723496950334, + "grad_norm": 0.5879946351051331, + "learning_rate": 0.0001, + "loss": 1.4086, + "step": 10556 + }, + { + "epoch": 1.2264885274469939, + "grad_norm": 0.621277928352356, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 10557 + }, + { + "epoch": 1.2266047051989544, + "grad_norm": 0.5970613360404968, + "learning_rate": 0.0001, + "loss": 1.2918, + "step": 10558 + }, + { + "epoch": 1.2267208829509149, + "grad_norm": 0.593627393245697, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 10559 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 0.6126559972763062, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 10560 + }, + { + "epoch": 1.2269532384548358, + "grad_norm": 0.5869115591049194, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 10561 + }, + { + "epoch": 1.2270694162067963, + "grad_norm": 0.587081789970398, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 10562 + }, + { + "epoch": 1.2271855939587568, + "grad_norm": 0.5886223316192627, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 10563 + }, + { + "epoch": 1.2273017717107173, + "grad_norm": 0.5749689936637878, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 10564 + }, + { + "epoch": 1.2274179494626778, + "grad_norm": 0.5669975876808167, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 10565 + }, + { + "epoch": 1.2275341272146383, + "grad_norm": 0.5260981917381287, + "learning_rate": 0.0001, + "loss": 1.4386, + "step": 10566 + }, + { + "epoch": 1.227650304966599, + "grad_norm": 0.5275533199310303, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 10567 + }, + { + "epoch": 1.2277664827185595, + "grad_norm": 0.5393403768539429, + "learning_rate": 0.0001, + "loss": 1.37, + "step": 10568 + }, + { + "epoch": 1.22788266047052, + "grad_norm": 0.5790442824363708, + "learning_rate": 0.0001, + "loss": 1.4346, + "step": 10569 + }, + { + "epoch": 1.2279988382224805, + "grad_norm": 0.579831063747406, + "learning_rate": 0.0001, + "loss": 1.3978, + "step": 10570 + }, + { + "epoch": 1.228115015974441, + "grad_norm": 0.5463832020759583, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 10571 + }, + { + "epoch": 1.2282311937264014, + "grad_norm": 0.551559567451477, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 10572 + }, + { + "epoch": 1.228347371478362, + "grad_norm": 0.544151782989502, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 10573 + }, + { + "epoch": 1.2284635492303224, + "grad_norm": 0.5910764932632446, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 10574 + }, + { + "epoch": 1.228579726982283, + "grad_norm": 0.5585051774978638, + "learning_rate": 0.0001, + "loss": 1.376, + "step": 10575 + }, + { + "epoch": 1.2286959047342434, + "grad_norm": 0.5787076354026794, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 10576 + }, + { + "epoch": 1.2288120824862039, + "grad_norm": 0.5803587436676025, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 10577 + }, + { + "epoch": 1.2289282602381644, + "grad_norm": 0.5648260116577148, + "learning_rate": 0.0001, + "loss": 1.4385, + "step": 10578 + }, + { + "epoch": 1.2290444379901249, + "grad_norm": 0.5887954235076904, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 10579 + }, + { + "epoch": 1.2291606157420853, + "grad_norm": 0.6301035284996033, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 10580 + }, + { + "epoch": 1.2292767934940458, + "grad_norm": 0.5514262914657593, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 10581 + }, + { + "epoch": 1.2293929712460063, + "grad_norm": 0.5427731275558472, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 10582 + }, + { + "epoch": 1.2295091489979668, + "grad_norm": 0.5385409593582153, + "learning_rate": 0.0001, + "loss": 1.3763, + "step": 10583 + }, + { + "epoch": 1.2296253267499273, + "grad_norm": 0.5767699480056763, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 10584 + }, + { + "epoch": 1.2297415045018878, + "grad_norm": 0.5255317687988281, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 10585 + }, + { + "epoch": 1.2298576822538485, + "grad_norm": 0.5520825982093811, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 10586 + }, + { + "epoch": 1.229973860005809, + "grad_norm": 0.5681281089782715, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 10587 + }, + { + "epoch": 1.2300900377577695, + "grad_norm": 0.5493902564048767, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 10588 + }, + { + "epoch": 1.23020621550973, + "grad_norm": 0.5706425309181213, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 10589 + }, + { + "epoch": 1.2303223932616905, + "grad_norm": 0.5630264282226562, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 10590 + }, + { + "epoch": 1.230438571013651, + "grad_norm": 0.5919108986854553, + "learning_rate": 0.0001, + "loss": 1.4042, + "step": 10591 + }, + { + "epoch": 1.2305547487656114, + "grad_norm": 0.5534734129905701, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 10592 + }, + { + "epoch": 1.230670926517572, + "grad_norm": 0.5363836884498596, + "learning_rate": 0.0001, + "loss": 1.336, + "step": 10593 + }, + { + "epoch": 1.2307871042695324, + "grad_norm": 0.5639822483062744, + "learning_rate": 0.0001, + "loss": 1.3161, + "step": 10594 + }, + { + "epoch": 1.230903282021493, + "grad_norm": 0.5711953639984131, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 10595 + }, + { + "epoch": 1.2310194597734534, + "grad_norm": 0.5398626327514648, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 10596 + }, + { + "epoch": 1.2311356375254139, + "grad_norm": 0.5533974766731262, + "learning_rate": 0.0001, + "loss": 1.3355, + "step": 10597 + }, + { + "epoch": 1.2312518152773744, + "grad_norm": 0.5976875424385071, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 10598 + }, + { + "epoch": 1.2313679930293349, + "grad_norm": 0.5598598122596741, + "learning_rate": 0.0001, + "loss": 1.3329, + "step": 10599 + }, + { + "epoch": 1.2314841707812954, + "grad_norm": 0.5962227582931519, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 10600 + }, + { + "epoch": 1.2316003485332558, + "grad_norm": 0.523517370223999, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 10601 + }, + { + "epoch": 1.2317165262852163, + "grad_norm": 0.5865228772163391, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 10602 + }, + { + "epoch": 1.2318327040371768, + "grad_norm": 0.5959430932998657, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 10603 + }, + { + "epoch": 1.2319488817891373, + "grad_norm": 0.5698263645172119, + "learning_rate": 0.0001, + "loss": 1.3024, + "step": 10604 + }, + { + "epoch": 1.2320650595410978, + "grad_norm": 0.5440550446510315, + "learning_rate": 0.0001, + "loss": 1.3271, + "step": 10605 + }, + { + "epoch": 1.2321812372930583, + "grad_norm": 0.5678781270980835, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 10606 + }, + { + "epoch": 1.2322974150450188, + "grad_norm": 0.6232162714004517, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 10607 + }, + { + "epoch": 1.2324135927969793, + "grad_norm": 0.5675930380821228, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 10608 + }, + { + "epoch": 1.23252977054894, + "grad_norm": 0.5585689544677734, + "learning_rate": 0.0001, + "loss": 1.379, + "step": 10609 + }, + { + "epoch": 1.2326459483009005, + "grad_norm": 0.5632004141807556, + "learning_rate": 0.0001, + "loss": 1.3636, + "step": 10610 + }, + { + "epoch": 1.232762126052861, + "grad_norm": 0.5808938145637512, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 10611 + }, + { + "epoch": 1.2328783038048214, + "grad_norm": 0.5879512429237366, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 10612 + }, + { + "epoch": 1.232994481556782, + "grad_norm": 0.5673100352287292, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 10613 + }, + { + "epoch": 1.2331106593087424, + "grad_norm": 0.6437278389930725, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 10614 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 0.554803192615509, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 10615 + }, + { + "epoch": 1.2333430148126634, + "grad_norm": 0.5712867379188538, + "learning_rate": 0.0001, + "loss": 1.3592, + "step": 10616 + }, + { + "epoch": 1.2334591925646239, + "grad_norm": 0.5592646598815918, + "learning_rate": 0.0001, + "loss": 1.4562, + "step": 10617 + }, + { + "epoch": 1.2335753703165844, + "grad_norm": 0.5389765501022339, + "learning_rate": 0.0001, + "loss": 1.3461, + "step": 10618 + }, + { + "epoch": 1.2336915480685449, + "grad_norm": 0.622361958026886, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 10619 + }, + { + "epoch": 1.2338077258205054, + "grad_norm": 0.5598737597465515, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 10620 + }, + { + "epoch": 1.2339239035724658, + "grad_norm": 0.5794302821159363, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 10621 + }, + { + "epoch": 1.2340400813244263, + "grad_norm": 0.5385465025901794, + "learning_rate": 0.0001, + "loss": 1.3243, + "step": 10622 + }, + { + "epoch": 1.2341562590763868, + "grad_norm": 0.5621716380119324, + "learning_rate": 0.0001, + "loss": 1.4385, + "step": 10623 + }, + { + "epoch": 1.2342724368283473, + "grad_norm": 0.5806053876876831, + "learning_rate": 0.0001, + "loss": 1.6138, + "step": 10624 + }, + { + "epoch": 1.2343886145803078, + "grad_norm": 0.5949336290359497, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 10625 + }, + { + "epoch": 1.2345047923322683, + "grad_norm": 0.6420254707336426, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 10626 + }, + { + "epoch": 1.2346209700842288, + "grad_norm": 0.5900982618331909, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 10627 + }, + { + "epoch": 1.2347371478361895, + "grad_norm": 0.5828454494476318, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 10628 + }, + { + "epoch": 1.23485332558815, + "grad_norm": 0.553706705570221, + "learning_rate": 0.0001, + "loss": 1.3842, + "step": 10629 + }, + { + "epoch": 1.2349695033401105, + "grad_norm": 0.5648255944252014, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 10630 + }, + { + "epoch": 1.235085681092071, + "grad_norm": 0.5571692585945129, + "learning_rate": 0.0001, + "loss": 1.4299, + "step": 10631 + }, + { + "epoch": 1.2352018588440314, + "grad_norm": 0.5914157032966614, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 10632 + }, + { + "epoch": 1.235318036595992, + "grad_norm": 0.5305378437042236, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 10633 + }, + { + "epoch": 1.2354342143479524, + "grad_norm": 0.5221390724182129, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 10634 + }, + { + "epoch": 1.235550392099913, + "grad_norm": 0.5526520013809204, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 10635 + }, + { + "epoch": 1.2356665698518734, + "grad_norm": 0.552923321723938, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 10636 + }, + { + "epoch": 1.2357827476038339, + "grad_norm": 0.5690024495124817, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 10637 + }, + { + "epoch": 1.2358989253557944, + "grad_norm": 0.5636914372444153, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 10638 + }, + { + "epoch": 1.2360151031077549, + "grad_norm": 0.6583706736564636, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 10639 + }, + { + "epoch": 1.2361312808597154, + "grad_norm": 0.600277304649353, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 10640 + }, + { + "epoch": 1.2362474586116758, + "grad_norm": 0.6001637578010559, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 10641 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 0.5948476195335388, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 10642 + }, + { + "epoch": 1.2364798141155968, + "grad_norm": 0.6298750638961792, + "learning_rate": 0.0001, + "loss": 1.4664, + "step": 10643 + }, + { + "epoch": 1.2365959918675573, + "grad_norm": 0.578194797039032, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 10644 + }, + { + "epoch": 1.2367121696195178, + "grad_norm": 0.5624285340309143, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 10645 + }, + { + "epoch": 1.2368283473714783, + "grad_norm": 0.5956478714942932, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 10646 + }, + { + "epoch": 1.2369445251234388, + "grad_norm": 0.5751087665557861, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 10647 + }, + { + "epoch": 1.2370607028753993, + "grad_norm": 0.5814287066459656, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 10648 + }, + { + "epoch": 1.2371768806273598, + "grad_norm": 0.5077279210090637, + "learning_rate": 0.0001, + "loss": 1.2642, + "step": 10649 + }, + { + "epoch": 1.2372930583793202, + "grad_norm": 0.579255223274231, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 10650 + }, + { + "epoch": 1.237409236131281, + "grad_norm": 0.6207264065742493, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 10651 + }, + { + "epoch": 1.2375254138832414, + "grad_norm": 0.6084131598472595, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 10652 + }, + { + "epoch": 1.237641591635202, + "grad_norm": 0.5942187905311584, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 10653 + }, + { + "epoch": 1.2377577693871624, + "grad_norm": 0.5793184638023376, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 10654 + }, + { + "epoch": 1.237873947139123, + "grad_norm": 0.5800839066505432, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 10655 + }, + { + "epoch": 1.2379901248910834, + "grad_norm": 0.5806957483291626, + "learning_rate": 0.0001, + "loss": 1.4971, + "step": 10656 + }, + { + "epoch": 1.2381063026430439, + "grad_norm": 0.5428292751312256, + "learning_rate": 0.0001, + "loss": 1.2147, + "step": 10657 + }, + { + "epoch": 1.2382224803950044, + "grad_norm": 0.5477049350738525, + "learning_rate": 0.0001, + "loss": 1.3976, + "step": 10658 + }, + { + "epoch": 1.2383386581469649, + "grad_norm": 0.5499922037124634, + "learning_rate": 0.0001, + "loss": 1.3248, + "step": 10659 + }, + { + "epoch": 1.2384548358989254, + "grad_norm": 0.5535095930099487, + "learning_rate": 0.0001, + "loss": 1.312, + "step": 10660 + }, + { + "epoch": 1.2385710136508858, + "grad_norm": 0.5951287150382996, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 10661 + }, + { + "epoch": 1.2386871914028463, + "grad_norm": 0.5869619846343994, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 10662 + }, + { + "epoch": 1.2388033691548068, + "grad_norm": 0.5614014267921448, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 10663 + }, + { + "epoch": 1.2389195469067673, + "grad_norm": 0.5796566605567932, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 10664 + }, + { + "epoch": 1.2390357246587278, + "grad_norm": 0.5910658240318298, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 10665 + }, + { + "epoch": 1.2391519024106883, + "grad_norm": 0.564470112323761, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 10666 + }, + { + "epoch": 1.2392680801626488, + "grad_norm": 0.5840427279472351, + "learning_rate": 0.0001, + "loss": 1.3944, + "step": 10667 + }, + { + "epoch": 1.2393842579146093, + "grad_norm": 0.5382651090621948, + "learning_rate": 0.0001, + "loss": 1.2162, + "step": 10668 + }, + { + "epoch": 1.2395004356665698, + "grad_norm": 0.5443084836006165, + "learning_rate": 0.0001, + "loss": 1.3367, + "step": 10669 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 0.5765810012817383, + "learning_rate": 0.0001, + "loss": 1.3894, + "step": 10670 + }, + { + "epoch": 1.239732791170491, + "grad_norm": 0.5465205907821655, + "learning_rate": 0.0001, + "loss": 1.3926, + "step": 10671 + }, + { + "epoch": 1.2398489689224514, + "grad_norm": 0.5790244936943054, + "learning_rate": 0.0001, + "loss": 1.3868, + "step": 10672 + }, + { + "epoch": 1.239965146674412, + "grad_norm": 0.5863944888114929, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 10673 + }, + { + "epoch": 1.2400813244263724, + "grad_norm": 0.5954928994178772, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 10674 + }, + { + "epoch": 1.240197502178333, + "grad_norm": 0.6234539747238159, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 10675 + }, + { + "epoch": 1.2403136799302934, + "grad_norm": 0.5883687734603882, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 10676 + }, + { + "epoch": 1.240429857682254, + "grad_norm": 0.5818089842796326, + "learning_rate": 0.0001, + "loss": 1.4462, + "step": 10677 + }, + { + "epoch": 1.2405460354342144, + "grad_norm": 0.6155632138252258, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 10678 + }, + { + "epoch": 1.2406622131861749, + "grad_norm": 0.5792834162712097, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 10679 + }, + { + "epoch": 1.2407783909381354, + "grad_norm": 0.5829952359199524, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 10680 + }, + { + "epoch": 1.2408945686900958, + "grad_norm": 0.5667127370834351, + "learning_rate": 0.0001, + "loss": 1.4372, + "step": 10681 + }, + { + "epoch": 1.2410107464420563, + "grad_norm": 0.5373808145523071, + "learning_rate": 0.0001, + "loss": 1.364, + "step": 10682 + }, + { + "epoch": 1.2411269241940168, + "grad_norm": 0.5681789517402649, + "learning_rate": 0.0001, + "loss": 1.3148, + "step": 10683 + }, + { + "epoch": 1.2412431019459773, + "grad_norm": 0.5821676254272461, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 10684 + }, + { + "epoch": 1.2413592796979378, + "grad_norm": 0.583202600479126, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 10685 + }, + { + "epoch": 1.2414754574498983, + "grad_norm": 0.5483458638191223, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 10686 + }, + { + "epoch": 1.2415916352018588, + "grad_norm": 0.5675621032714844, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 10687 + }, + { + "epoch": 1.2417078129538193, + "grad_norm": 0.5624716877937317, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 10688 + }, + { + "epoch": 1.2418239907057798, + "grad_norm": 0.6353493332862854, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 10689 + }, + { + "epoch": 1.2419401684577402, + "grad_norm": 0.5259159803390503, + "learning_rate": 0.0001, + "loss": 1.3333, + "step": 10690 + }, + { + "epoch": 1.2420563462097007, + "grad_norm": 0.546409547328949, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 10691 + }, + { + "epoch": 1.2421725239616612, + "grad_norm": 0.5584643483161926, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 10692 + }, + { + "epoch": 1.242288701713622, + "grad_norm": 0.5916255116462708, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 10693 + }, + { + "epoch": 1.2424048794655824, + "grad_norm": 0.6383016705513, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 10694 + }, + { + "epoch": 1.242521057217543, + "grad_norm": 0.5840632319450378, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 10695 + }, + { + "epoch": 1.2426372349695034, + "grad_norm": 0.58888179063797, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 10696 + }, + { + "epoch": 1.242753412721464, + "grad_norm": 0.5815372467041016, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 10697 + }, + { + "epoch": 1.2428695904734244, + "grad_norm": 0.5951624512672424, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 10698 + }, + { + "epoch": 1.2429857682253849, + "grad_norm": 0.611923098564148, + "learning_rate": 0.0001, + "loss": 1.3961, + "step": 10699 + }, + { + "epoch": 1.2431019459773454, + "grad_norm": 0.5687365531921387, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 10700 + }, + { + "epoch": 1.2432181237293058, + "grad_norm": 0.5626431703567505, + "learning_rate": 0.0001, + "loss": 1.5305, + "step": 10701 + }, + { + "epoch": 1.2433343014812663, + "grad_norm": 0.5979207754135132, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 10702 + }, + { + "epoch": 1.2434504792332268, + "grad_norm": 0.5764636993408203, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 10703 + }, + { + "epoch": 1.2435666569851873, + "grad_norm": 0.6030092835426331, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 10704 + }, + { + "epoch": 1.2436828347371478, + "grad_norm": 0.5411743521690369, + "learning_rate": 0.0001, + "loss": 1.3816, + "step": 10705 + }, + { + "epoch": 1.2437990124891083, + "grad_norm": 0.5814303755760193, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 10706 + }, + { + "epoch": 1.2439151902410688, + "grad_norm": 0.5599769949913025, + "learning_rate": 0.0001, + "loss": 1.3897, + "step": 10707 + }, + { + "epoch": 1.2440313679930293, + "grad_norm": 0.5339173674583435, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 10708 + }, + { + "epoch": 1.2441475457449898, + "grad_norm": 0.5466998815536499, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 10709 + }, + { + "epoch": 1.2442637234969502, + "grad_norm": 0.6054586172103882, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 10710 + }, + { + "epoch": 1.244379901248911, + "grad_norm": 0.5586958527565002, + "learning_rate": 0.0001, + "loss": 1.4248, + "step": 10711 + }, + { + "epoch": 1.2444960790008714, + "grad_norm": 0.5811334252357483, + "learning_rate": 0.0001, + "loss": 1.4595, + "step": 10712 + }, + { + "epoch": 1.244612256752832, + "grad_norm": 0.5410207509994507, + "learning_rate": 0.0001, + "loss": 1.418, + "step": 10713 + }, + { + "epoch": 1.2447284345047924, + "grad_norm": 0.5457951426506042, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 10714 + }, + { + "epoch": 1.244844612256753, + "grad_norm": 0.6239075660705566, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 10715 + }, + { + "epoch": 1.2449607900087134, + "grad_norm": 0.6004078984260559, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 10716 + }, + { + "epoch": 1.245076967760674, + "grad_norm": 0.5737530589103699, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 10717 + }, + { + "epoch": 1.2451931455126344, + "grad_norm": 0.5727450847625732, + "learning_rate": 0.0001, + "loss": 1.6573, + "step": 10718 + }, + { + "epoch": 1.2453093232645949, + "grad_norm": 0.5446699261665344, + "learning_rate": 0.0001, + "loss": 1.2936, + "step": 10719 + }, + { + "epoch": 1.2454255010165554, + "grad_norm": 0.5467056035995483, + "learning_rate": 0.0001, + "loss": 1.3091, + "step": 10720 + }, + { + "epoch": 1.2455416787685158, + "grad_norm": 0.5639151334762573, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 10721 + }, + { + "epoch": 1.2456578565204763, + "grad_norm": 0.6179391741752625, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 10722 + }, + { + "epoch": 1.2457740342724368, + "grad_norm": 0.5786750912666321, + "learning_rate": 0.0001, + "loss": 1.409, + "step": 10723 + }, + { + "epoch": 1.2458902120243973, + "grad_norm": 0.5844369530677795, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 10724 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 0.5636518597602844, + "learning_rate": 0.0001, + "loss": 1.3696, + "step": 10725 + }, + { + "epoch": 1.2461225675283183, + "grad_norm": 0.5851801633834839, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 10726 + }, + { + "epoch": 1.2462387452802788, + "grad_norm": 0.5837157368659973, + "learning_rate": 0.0001, + "loss": 1.2587, + "step": 10727 + }, + { + "epoch": 1.2463549230322393, + "grad_norm": 0.5497305393218994, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 10728 + }, + { + "epoch": 1.2464711007841998, + "grad_norm": 0.6165003180503845, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 10729 + }, + { + "epoch": 1.2465872785361602, + "grad_norm": 0.595329999923706, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 10730 + }, + { + "epoch": 1.2467034562881207, + "grad_norm": 0.5779715180397034, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 10731 + }, + { + "epoch": 1.2468196340400812, + "grad_norm": 0.6294297575950623, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 10732 + }, + { + "epoch": 1.2469358117920417, + "grad_norm": 0.5893142223358154, + "learning_rate": 0.0001, + "loss": 1.4439, + "step": 10733 + }, + { + "epoch": 1.2470519895440022, + "grad_norm": 0.5959643721580505, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 10734 + }, + { + "epoch": 1.247168167295963, + "grad_norm": 0.6068616509437561, + "learning_rate": 0.0001, + "loss": 1.4337, + "step": 10735 + }, + { + "epoch": 1.2472843450479234, + "grad_norm": 0.5508027076721191, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 10736 + }, + { + "epoch": 1.247400522799884, + "grad_norm": 0.5569624900817871, + "learning_rate": 0.0001, + "loss": 1.4259, + "step": 10737 + }, + { + "epoch": 1.2475167005518444, + "grad_norm": 0.5653142333030701, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 10738 + }, + { + "epoch": 1.2476328783038049, + "grad_norm": 0.6183950304985046, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 10739 + }, + { + "epoch": 1.2477490560557654, + "grad_norm": 0.5955910086631775, + "learning_rate": 0.0001, + "loss": 1.3331, + "step": 10740 + }, + { + "epoch": 1.2478652338077258, + "grad_norm": 0.5446209907531738, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 10741 + }, + { + "epoch": 1.2479814115596863, + "grad_norm": 0.5655883550643921, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 10742 + }, + { + "epoch": 1.2480975893116468, + "grad_norm": 0.5753975510597229, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 10743 + }, + { + "epoch": 1.2482137670636073, + "grad_norm": 0.5425572991371155, + "learning_rate": 0.0001, + "loss": 1.4076, + "step": 10744 + }, + { + "epoch": 1.2483299448155678, + "grad_norm": 0.6025961637496948, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 10745 + }, + { + "epoch": 1.2484461225675283, + "grad_norm": 0.5763283967971802, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 10746 + }, + { + "epoch": 1.2485623003194888, + "grad_norm": 0.5742851495742798, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 10747 + }, + { + "epoch": 1.2486784780714493, + "grad_norm": 0.5632321834564209, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 10748 + }, + { + "epoch": 1.2487946558234098, + "grad_norm": 0.5582688450813293, + "learning_rate": 0.0001, + "loss": 1.4634, + "step": 10749 + }, + { + "epoch": 1.2489108335753702, + "grad_norm": 0.5706313848495483, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 10750 + }, + { + "epoch": 1.2490270113273307, + "grad_norm": 0.5375720262527466, + "learning_rate": 0.0001, + "loss": 1.2735, + "step": 10751 + }, + { + "epoch": 1.2491431890792912, + "grad_norm": 0.5515146851539612, + "learning_rate": 0.0001, + "loss": 1.4086, + "step": 10752 + }, + { + "epoch": 1.249259366831252, + "grad_norm": 0.5534106492996216, + "learning_rate": 0.0001, + "loss": 1.284, + "step": 10753 + }, + { + "epoch": 1.2493755445832124, + "grad_norm": 0.5403925776481628, + "learning_rate": 0.0001, + "loss": 1.2556, + "step": 10754 + }, + { + "epoch": 1.249491722335173, + "grad_norm": 0.6206772923469543, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 10755 + }, + { + "epoch": 1.2496079000871334, + "grad_norm": 0.6456174850463867, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 10756 + }, + { + "epoch": 1.249724077839094, + "grad_norm": 0.5558960437774658, + "learning_rate": 0.0001, + "loss": 1.3112, + "step": 10757 + }, + { + "epoch": 1.2498402555910544, + "grad_norm": 0.5899320244789124, + "learning_rate": 0.0001, + "loss": 1.4726, + "step": 10758 + }, + { + "epoch": 1.2499564333430149, + "grad_norm": 0.5711237192153931, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 10759 + }, + { + "epoch": 1.2500726110949754, + "grad_norm": 0.5839224457740784, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 10760 + }, + { + "epoch": 1.2501887888469359, + "grad_norm": 0.5762811899185181, + "learning_rate": 0.0001, + "loss": 1.4382, + "step": 10761 + }, + { + "epoch": 1.2503049665988963, + "grad_norm": 0.5910693407058716, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 10762 + }, + { + "epoch": 1.2504211443508568, + "grad_norm": 0.5522376298904419, + "learning_rate": 0.0001, + "loss": 1.1839, + "step": 10763 + }, + { + "epoch": 1.2505373221028173, + "grad_norm": 0.6215783357620239, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 10764 + }, + { + "epoch": 1.2506534998547778, + "grad_norm": 0.5663979649543762, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 10765 + }, + { + "epoch": 1.2507696776067383, + "grad_norm": 0.5686525702476501, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 10766 + }, + { + "epoch": 1.2508858553586988, + "grad_norm": 0.5687610507011414, + "learning_rate": 0.0001, + "loss": 1.3635, + "step": 10767 + }, + { + "epoch": 1.2510020331106593, + "grad_norm": 0.5779849290847778, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 10768 + }, + { + "epoch": 1.2511182108626198, + "grad_norm": 0.5521991848945618, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 10769 + }, + { + "epoch": 1.2512343886145803, + "grad_norm": 0.5512406229972839, + "learning_rate": 0.0001, + "loss": 1.3389, + "step": 10770 + }, + { + "epoch": 1.2513505663665407, + "grad_norm": 0.5759303569793701, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 10771 + }, + { + "epoch": 1.2514667441185012, + "grad_norm": 0.5932089686393738, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 10772 + }, + { + "epoch": 1.2515829218704617, + "grad_norm": 0.5830219388008118, + "learning_rate": 0.0001, + "loss": 1.3805, + "step": 10773 + }, + { + "epoch": 1.2516990996224222, + "grad_norm": 0.6111432909965515, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 10774 + }, + { + "epoch": 1.2518152773743827, + "grad_norm": 0.554973840713501, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 10775 + }, + { + "epoch": 1.2519314551263432, + "grad_norm": 0.5329660177230835, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 10776 + }, + { + "epoch": 1.2520476328783037, + "grad_norm": 0.5556033849716187, + "learning_rate": 0.0001, + "loss": 1.3319, + "step": 10777 + }, + { + "epoch": 1.2521638106302644, + "grad_norm": 0.5581437349319458, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 10778 + }, + { + "epoch": 1.2522799883822249, + "grad_norm": 0.5687176585197449, + "learning_rate": 0.0001, + "loss": 1.4248, + "step": 10779 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 0.6239218711853027, + "learning_rate": 0.0001, + "loss": 1.5807, + "step": 10780 + }, + { + "epoch": 1.2525123438861459, + "grad_norm": 0.542938768863678, + "learning_rate": 0.0001, + "loss": 1.3693, + "step": 10781 + }, + { + "epoch": 1.2526285216381063, + "grad_norm": 0.5767273902893066, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 10782 + }, + { + "epoch": 1.2527446993900668, + "grad_norm": 0.5635040998458862, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 10783 + }, + { + "epoch": 1.2528608771420273, + "grad_norm": 0.5268775820732117, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 10784 + }, + { + "epoch": 1.2529770548939878, + "grad_norm": 0.5350583791732788, + "learning_rate": 0.0001, + "loss": 1.4114, + "step": 10785 + }, + { + "epoch": 1.2530932326459483, + "grad_norm": 0.5941740870475769, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 10786 + }, + { + "epoch": 1.2532094103979088, + "grad_norm": 0.5417436361312866, + "learning_rate": 0.0001, + "loss": 1.2903, + "step": 10787 + }, + { + "epoch": 1.2533255881498693, + "grad_norm": 0.5705372095108032, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 10788 + }, + { + "epoch": 1.2534417659018298, + "grad_norm": 0.5509761571884155, + "learning_rate": 0.0001, + "loss": 1.4277, + "step": 10789 + }, + { + "epoch": 1.2535579436537903, + "grad_norm": 0.6663389205932617, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 10790 + }, + { + "epoch": 1.2536741214057507, + "grad_norm": 0.6134310960769653, + "learning_rate": 0.0001, + "loss": 1.4366, + "step": 10791 + }, + { + "epoch": 1.2537902991577112, + "grad_norm": 0.5742259621620178, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 10792 + }, + { + "epoch": 1.2539064769096717, + "grad_norm": 0.5614108443260193, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 10793 + }, + { + "epoch": 1.2540226546616324, + "grad_norm": 0.5708277821540833, + "learning_rate": 0.0001, + "loss": 1.4222, + "step": 10794 + }, + { + "epoch": 1.254138832413593, + "grad_norm": 0.5575094819068909, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 10795 + }, + { + "epoch": 1.2542550101655534, + "grad_norm": 0.6430823802947998, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 10796 + }, + { + "epoch": 1.254371187917514, + "grad_norm": 0.6255773901939392, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 10797 + }, + { + "epoch": 1.2544873656694744, + "grad_norm": 0.5493278503417969, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 10798 + }, + { + "epoch": 1.2546035434214349, + "grad_norm": 0.5868274569511414, + "learning_rate": 0.0001, + "loss": 1.4117, + "step": 10799 + }, + { + "epoch": 1.2547197211733954, + "grad_norm": 0.5614247918128967, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 10800 + }, + { + "epoch": 1.2548358989253559, + "grad_norm": 0.6078110933303833, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 10801 + }, + { + "epoch": 1.2549520766773163, + "grad_norm": 0.5670056939125061, + "learning_rate": 0.0001, + "loss": 1.3722, + "step": 10802 + }, + { + "epoch": 1.2550682544292768, + "grad_norm": 0.5822281837463379, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 10803 + }, + { + "epoch": 1.2551844321812373, + "grad_norm": 0.5852466821670532, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 10804 + }, + { + "epoch": 1.2553006099331978, + "grad_norm": 0.5783720016479492, + "learning_rate": 0.0001, + "loss": 1.2861, + "step": 10805 + }, + { + "epoch": 1.2554167876851583, + "grad_norm": 0.5860381722450256, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 10806 + }, + { + "epoch": 1.2555329654371188, + "grad_norm": 0.5932544469833374, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 10807 + }, + { + "epoch": 1.2556491431890793, + "grad_norm": 0.6420629024505615, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 10808 + }, + { + "epoch": 1.2557653209410398, + "grad_norm": 0.6060267090797424, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 10809 + }, + { + "epoch": 1.2558814986930003, + "grad_norm": 0.5844810605049133, + "learning_rate": 0.0001, + "loss": 1.4504, + "step": 10810 + }, + { + "epoch": 1.2559976764449607, + "grad_norm": 0.6256417632102966, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 10811 + }, + { + "epoch": 1.2561138541969212, + "grad_norm": 0.5611162185668945, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 10812 + }, + { + "epoch": 1.2562300319488817, + "grad_norm": 0.5997201204299927, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 10813 + }, + { + "epoch": 1.2563462097008422, + "grad_norm": 0.580735981464386, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 10814 + }, + { + "epoch": 1.2564623874528027, + "grad_norm": 0.5708926916122437, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 10815 + }, + { + "epoch": 1.2565785652047632, + "grad_norm": 0.5596855282783508, + "learning_rate": 0.0001, + "loss": 1.3001, + "step": 10816 + }, + { + "epoch": 1.2566947429567237, + "grad_norm": 0.6057003736495972, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 10817 + }, + { + "epoch": 1.2568109207086842, + "grad_norm": 0.6439718008041382, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 10818 + }, + { + "epoch": 1.2569270984606447, + "grad_norm": 0.5926083326339722, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 10819 + }, + { + "epoch": 1.2570432762126054, + "grad_norm": 0.5531743764877319, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 10820 + }, + { + "epoch": 1.2571594539645659, + "grad_norm": 0.5597687363624573, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 10821 + }, + { + "epoch": 1.2572756317165263, + "grad_norm": 0.5524884462356567, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 10822 + }, + { + "epoch": 1.2573918094684868, + "grad_norm": 0.6109902262687683, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 10823 + }, + { + "epoch": 1.2575079872204473, + "grad_norm": 0.5356048345565796, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 10824 + }, + { + "epoch": 1.2576241649724078, + "grad_norm": 0.5408940315246582, + "learning_rate": 0.0001, + "loss": 1.2335, + "step": 10825 + }, + { + "epoch": 1.2577403427243683, + "grad_norm": 0.5846429467201233, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 10826 + }, + { + "epoch": 1.2578565204763288, + "grad_norm": 0.5699112415313721, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 10827 + }, + { + "epoch": 1.2579726982282893, + "grad_norm": 0.6659561395645142, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 10828 + }, + { + "epoch": 1.2580888759802498, + "grad_norm": 0.6590319871902466, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 10829 + }, + { + "epoch": 1.2582050537322103, + "grad_norm": 0.633607804775238, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 10830 + }, + { + "epoch": 1.2583212314841707, + "grad_norm": 0.5824370980262756, + "learning_rate": 0.0001, + "loss": 1.4908, + "step": 10831 + }, + { + "epoch": 1.2584374092361312, + "grad_norm": 0.6036174893379211, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 10832 + }, + { + "epoch": 1.2585535869880917, + "grad_norm": 0.5827155709266663, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 10833 + }, + { + "epoch": 1.2586697647400522, + "grad_norm": 0.579399585723877, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 10834 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 0.5795319080352783, + "learning_rate": 0.0001, + "loss": 1.5249, + "step": 10835 + }, + { + "epoch": 1.2589021202439734, + "grad_norm": 0.5639935731887817, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 10836 + }, + { + "epoch": 1.259018297995934, + "grad_norm": 0.5532496571540833, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 10837 + }, + { + "epoch": 1.2591344757478944, + "grad_norm": 0.5931538343429565, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 10838 + }, + { + "epoch": 1.2592506534998549, + "grad_norm": 0.6140592098236084, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 10839 + }, + { + "epoch": 1.2593668312518154, + "grad_norm": 0.5485202074050903, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 10840 + }, + { + "epoch": 1.2594830090037759, + "grad_norm": 0.5648629069328308, + "learning_rate": 0.0001, + "loss": 1.3829, + "step": 10841 + }, + { + "epoch": 1.2595991867557363, + "grad_norm": 0.5815029740333557, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 10842 + }, + { + "epoch": 1.2597153645076968, + "grad_norm": 0.5790690183639526, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 10843 + }, + { + "epoch": 1.2598315422596573, + "grad_norm": 0.6052435636520386, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 10844 + }, + { + "epoch": 1.2599477200116178, + "grad_norm": 0.5813129544258118, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 10845 + }, + { + "epoch": 1.2600638977635783, + "grad_norm": 0.5918418169021606, + "learning_rate": 0.0001, + "loss": 1.7113, + "step": 10846 + }, + { + "epoch": 1.2601800755155388, + "grad_norm": 0.5554955005645752, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 10847 + }, + { + "epoch": 1.2602962532674993, + "grad_norm": 0.540164589881897, + "learning_rate": 0.0001, + "loss": 1.3184, + "step": 10848 + }, + { + "epoch": 1.2604124310194598, + "grad_norm": 0.6040856242179871, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 10849 + }, + { + "epoch": 1.2605286087714203, + "grad_norm": 0.5804588794708252, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 10850 + }, + { + "epoch": 1.2606447865233807, + "grad_norm": 0.5707676410675049, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 10851 + }, + { + "epoch": 1.2607609642753412, + "grad_norm": 0.647530734539032, + "learning_rate": 0.0001, + "loss": 1.5698, + "step": 10852 + }, + { + "epoch": 1.2608771420273017, + "grad_norm": 0.5723570585250854, + "learning_rate": 0.0001, + "loss": 1.3976, + "step": 10853 + }, + { + "epoch": 1.2609933197792622, + "grad_norm": 0.5460187196731567, + "learning_rate": 0.0001, + "loss": 1.3676, + "step": 10854 + }, + { + "epoch": 1.2611094975312227, + "grad_norm": 0.5793501138687134, + "learning_rate": 0.0001, + "loss": 1.4138, + "step": 10855 + }, + { + "epoch": 1.2612256752831832, + "grad_norm": 0.6033415198326111, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 10856 + }, + { + "epoch": 1.2613418530351437, + "grad_norm": 0.5997538566589355, + "learning_rate": 0.0001, + "loss": 1.2872, + "step": 10857 + }, + { + "epoch": 1.2614580307871042, + "grad_norm": 0.5751636624336243, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 10858 + }, + { + "epoch": 1.2615742085390647, + "grad_norm": 0.6361991167068481, + "learning_rate": 0.0001, + "loss": 1.4256, + "step": 10859 + }, + { + "epoch": 1.2616903862910251, + "grad_norm": 0.5734949707984924, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 10860 + }, + { + "epoch": 1.2618065640429856, + "grad_norm": 0.6365008354187012, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 10861 + }, + { + "epoch": 1.2619227417949463, + "grad_norm": 0.6318557858467102, + "learning_rate": 0.0001, + "loss": 1.715, + "step": 10862 + }, + { + "epoch": 1.2620389195469068, + "grad_norm": 0.6001318693161011, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 10863 + }, + { + "epoch": 1.2621550972988673, + "grad_norm": 0.6317021250724792, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 10864 + }, + { + "epoch": 1.2622712750508278, + "grad_norm": 0.5643987059593201, + "learning_rate": 0.0001, + "loss": 1.4726, + "step": 10865 + }, + { + "epoch": 1.2623874528027883, + "grad_norm": 0.5744731426239014, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 10866 + }, + { + "epoch": 1.2625036305547488, + "grad_norm": 0.605496883392334, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 10867 + }, + { + "epoch": 1.2626198083067093, + "grad_norm": 0.5679904222488403, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 10868 + }, + { + "epoch": 1.2627359860586698, + "grad_norm": 0.5534711480140686, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 10869 + }, + { + "epoch": 1.2628521638106303, + "grad_norm": 0.5767245292663574, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 10870 + }, + { + "epoch": 1.2629683415625907, + "grad_norm": 0.5895627737045288, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 10871 + }, + { + "epoch": 1.2630845193145512, + "grad_norm": 0.5530607104301453, + "learning_rate": 0.0001, + "loss": 1.3382, + "step": 10872 + }, + { + "epoch": 1.2632006970665117, + "grad_norm": 0.5866522789001465, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 10873 + }, + { + "epoch": 1.2633168748184722, + "grad_norm": 0.5559065937995911, + "learning_rate": 0.0001, + "loss": 1.3051, + "step": 10874 + }, + { + "epoch": 1.2634330525704327, + "grad_norm": 0.5616282820701599, + "learning_rate": 0.0001, + "loss": 1.2669, + "step": 10875 + }, + { + "epoch": 1.2635492303223932, + "grad_norm": 0.5756585597991943, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 10876 + }, + { + "epoch": 1.2636654080743537, + "grad_norm": 0.6078899502754211, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 10877 + }, + { + "epoch": 1.2637815858263144, + "grad_norm": 0.5421556830406189, + "learning_rate": 0.0001, + "loss": 1.3957, + "step": 10878 + }, + { + "epoch": 1.2638977635782749, + "grad_norm": 0.6021136045455933, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 10879 + }, + { + "epoch": 1.2640139413302354, + "grad_norm": 0.6049302220344543, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 10880 + }, + { + "epoch": 1.2641301190821959, + "grad_norm": 0.5303308963775635, + "learning_rate": 0.0001, + "loss": 1.2202, + "step": 10881 + }, + { + "epoch": 1.2642462968341563, + "grad_norm": 0.5799589157104492, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 10882 + }, + { + "epoch": 1.2643624745861168, + "grad_norm": 0.5389856696128845, + "learning_rate": 0.0001, + "loss": 1.2409, + "step": 10883 + }, + { + "epoch": 1.2644786523380773, + "grad_norm": 0.5793135166168213, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 10884 + }, + { + "epoch": 1.2645948300900378, + "grad_norm": 0.665398120880127, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 10885 + }, + { + "epoch": 1.2647110078419983, + "grad_norm": 0.6195827126502991, + "learning_rate": 0.0001, + "loss": 1.3291, + "step": 10886 + }, + { + "epoch": 1.2648271855939588, + "grad_norm": 0.603444516658783, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 10887 + }, + { + "epoch": 1.2649433633459193, + "grad_norm": 0.6044740080833435, + "learning_rate": 0.0001, + "loss": 1.2983, + "step": 10888 + }, + { + "epoch": 1.2650595410978798, + "grad_norm": 0.5764860510826111, + "learning_rate": 0.0001, + "loss": 1.4351, + "step": 10889 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 0.5791444182395935, + "learning_rate": 0.0001, + "loss": 1.3424, + "step": 10890 + }, + { + "epoch": 1.2652918966018007, + "grad_norm": 0.6098731160163879, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 10891 + }, + { + "epoch": 1.2654080743537612, + "grad_norm": 0.5369520783424377, + "learning_rate": 0.0001, + "loss": 1.3948, + "step": 10892 + }, + { + "epoch": 1.2655242521057217, + "grad_norm": 0.6197044849395752, + "learning_rate": 0.0001, + "loss": 1.368, + "step": 10893 + }, + { + "epoch": 1.2656404298576822, + "grad_norm": 0.588814377784729, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 10894 + }, + { + "epoch": 1.2657566076096427, + "grad_norm": 0.5804957747459412, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 10895 + }, + { + "epoch": 1.2658727853616032, + "grad_norm": 0.6161913275718689, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 10896 + }, + { + "epoch": 1.2659889631135637, + "grad_norm": 0.6082305312156677, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 10897 + }, + { + "epoch": 1.2661051408655242, + "grad_norm": 0.5396662950515747, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 10898 + }, + { + "epoch": 1.2662213186174847, + "grad_norm": 0.5761926770210266, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 10899 + }, + { + "epoch": 1.2663374963694451, + "grad_norm": 0.5800683498382568, + "learning_rate": 0.0001, + "loss": 1.4611, + "step": 10900 + }, + { + "epoch": 1.2664536741214056, + "grad_norm": 0.5463969111442566, + "learning_rate": 0.0001, + "loss": 1.4545, + "step": 10901 + }, + { + "epoch": 1.2665698518733661, + "grad_norm": 0.5822511315345764, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 10902 + }, + { + "epoch": 1.2666860296253266, + "grad_norm": 0.6074179410934448, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 10903 + }, + { + "epoch": 1.2668022073772873, + "grad_norm": 0.6279671788215637, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 10904 + }, + { + "epoch": 1.2669183851292478, + "grad_norm": 0.6327800750732422, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 10905 + }, + { + "epoch": 1.2670345628812083, + "grad_norm": 0.6204458475112915, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 10906 + }, + { + "epoch": 1.2671507406331688, + "grad_norm": 0.5944911241531372, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 10907 + }, + { + "epoch": 1.2672669183851293, + "grad_norm": 0.5533126592636108, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 10908 + }, + { + "epoch": 1.2673830961370898, + "grad_norm": 0.5814772844314575, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 10909 + }, + { + "epoch": 1.2674992738890503, + "grad_norm": 0.6086983680725098, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 10910 + }, + { + "epoch": 1.2676154516410107, + "grad_norm": 0.6263343691825867, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 10911 + }, + { + "epoch": 1.2677316293929712, + "grad_norm": 0.5537508130073547, + "learning_rate": 0.0001, + "loss": 1.3299, + "step": 10912 + }, + { + "epoch": 1.2678478071449317, + "grad_norm": 0.5584046840667725, + "learning_rate": 0.0001, + "loss": 1.3079, + "step": 10913 + }, + { + "epoch": 1.2679639848968922, + "grad_norm": 0.576492428779602, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 10914 + }, + { + "epoch": 1.2680801626488527, + "grad_norm": 0.5651898384094238, + "learning_rate": 0.0001, + "loss": 1.2658, + "step": 10915 + }, + { + "epoch": 1.2681963404008132, + "grad_norm": 0.5675121545791626, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 10916 + }, + { + "epoch": 1.2683125181527737, + "grad_norm": 0.612667977809906, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 10917 + }, + { + "epoch": 1.2684286959047342, + "grad_norm": 0.6280263066291809, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 10918 + }, + { + "epoch": 1.2685448736566947, + "grad_norm": 0.5767539739608765, + "learning_rate": 0.0001, + "loss": 1.4455, + "step": 10919 + }, + { + "epoch": 1.2686610514086554, + "grad_norm": 0.543782114982605, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 10920 + }, + { + "epoch": 1.2687772291606159, + "grad_norm": 0.5753729939460754, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 10921 + }, + { + "epoch": 1.2688934069125763, + "grad_norm": 0.5873640775680542, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 10922 + }, + { + "epoch": 1.2690095846645368, + "grad_norm": 0.556169867515564, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 10923 + }, + { + "epoch": 1.2691257624164973, + "grad_norm": 0.588584303855896, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 10924 + }, + { + "epoch": 1.2692419401684578, + "grad_norm": 0.5426774621009827, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 10925 + }, + { + "epoch": 1.2693581179204183, + "grad_norm": 0.5747091174125671, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 10926 + }, + { + "epoch": 1.2694742956723788, + "grad_norm": 0.5184979438781738, + "learning_rate": 0.0001, + "loss": 1.308, + "step": 10927 + }, + { + "epoch": 1.2695904734243393, + "grad_norm": 0.6157345175743103, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 10928 + }, + { + "epoch": 1.2697066511762998, + "grad_norm": 0.6227047443389893, + "learning_rate": 0.0001, + "loss": 1.4771, + "step": 10929 + }, + { + "epoch": 1.2698228289282603, + "grad_norm": 0.5665978193283081, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 10930 + }, + { + "epoch": 1.2699390066802207, + "grad_norm": 0.5821638703346252, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 10931 + }, + { + "epoch": 1.2700551844321812, + "grad_norm": 0.5837578773498535, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 10932 + }, + { + "epoch": 1.2701713621841417, + "grad_norm": 0.5726567506790161, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 10933 + }, + { + "epoch": 1.2702875399361022, + "grad_norm": 0.5908870697021484, + "learning_rate": 0.0001, + "loss": 1.5283, + "step": 10934 + }, + { + "epoch": 1.2704037176880627, + "grad_norm": 0.5780557990074158, + "learning_rate": 0.0001, + "loss": 1.539, + "step": 10935 + }, + { + "epoch": 1.2705198954400232, + "grad_norm": 0.5335168838500977, + "learning_rate": 0.0001, + "loss": 1.3698, + "step": 10936 + }, + { + "epoch": 1.2706360731919837, + "grad_norm": 0.5760794281959534, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 10937 + }, + { + "epoch": 1.2707522509439442, + "grad_norm": 0.5667716264724731, + "learning_rate": 0.0001, + "loss": 1.4142, + "step": 10938 + }, + { + "epoch": 1.2708684286959047, + "grad_norm": 0.5758441686630249, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 10939 + }, + { + "epoch": 1.2709846064478652, + "grad_norm": 0.5902236104011536, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 10940 + }, + { + "epoch": 1.2711007841998256, + "grad_norm": 0.587958037853241, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 10941 + }, + { + "epoch": 1.2712169619517861, + "grad_norm": 0.6102663278579712, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 10942 + }, + { + "epoch": 1.2713331397037466, + "grad_norm": 0.5977857708930969, + "learning_rate": 0.0001, + "loss": 1.3835, + "step": 10943 + }, + { + "epoch": 1.271449317455707, + "grad_norm": 0.5918571352958679, + "learning_rate": 0.0001, + "loss": 1.5698, + "step": 10944 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 0.5774389505386353, + "learning_rate": 0.0001, + "loss": 1.3861, + "step": 10945 + }, + { + "epoch": 1.2716816729596283, + "grad_norm": 0.6010609865188599, + "learning_rate": 0.0001, + "loss": 1.4712, + "step": 10946 + }, + { + "epoch": 1.2717978507115888, + "grad_norm": 0.5825393795967102, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 10947 + }, + { + "epoch": 1.2719140284635493, + "grad_norm": 0.6239941120147705, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 10948 + }, + { + "epoch": 1.2720302062155098, + "grad_norm": 0.5607348680496216, + "learning_rate": 0.0001, + "loss": 1.3118, + "step": 10949 + }, + { + "epoch": 1.2721463839674703, + "grad_norm": 0.6177657842636108, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 10950 + }, + { + "epoch": 1.2722625617194308, + "grad_norm": 0.5948476791381836, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 10951 + }, + { + "epoch": 1.2723787394713912, + "grad_norm": 0.5765991806983948, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 10952 + }, + { + "epoch": 1.2724949172233517, + "grad_norm": 0.5915812253952026, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 10953 + }, + { + "epoch": 1.2726110949753122, + "grad_norm": 0.5970553755760193, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 10954 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.5600100755691528, + "learning_rate": 0.0001, + "loss": 1.3926, + "step": 10955 + }, + { + "epoch": 1.2728434504792332, + "grad_norm": 0.5822739005088806, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 10956 + }, + { + "epoch": 1.2729596282311937, + "grad_norm": 0.6493605971336365, + "learning_rate": 0.0001, + "loss": 1.3364, + "step": 10957 + }, + { + "epoch": 1.2730758059831542, + "grad_norm": 0.5897579193115234, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 10958 + }, + { + "epoch": 1.2731919837351147, + "grad_norm": 0.5744937062263489, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 10959 + }, + { + "epoch": 1.2733081614870752, + "grad_norm": 0.5749931335449219, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 10960 + }, + { + "epoch": 1.2734243392390359, + "grad_norm": 0.5711995363235474, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 10961 + }, + { + "epoch": 1.2735405169909964, + "grad_norm": 0.572600781917572, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 10962 + }, + { + "epoch": 1.2736566947429568, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 10963 + }, + { + "epoch": 1.2737728724949173, + "grad_norm": 0.6050378084182739, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 10964 + }, + { + "epoch": 1.2738890502468778, + "grad_norm": 0.5755563974380493, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 10965 + }, + { + "epoch": 1.2740052279988383, + "grad_norm": 0.5697473287582397, + "learning_rate": 0.0001, + "loss": 1.3886, + "step": 10966 + }, + { + "epoch": 1.2741214057507988, + "grad_norm": 0.5516510605812073, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 10967 + }, + { + "epoch": 1.2742375835027593, + "grad_norm": 0.5390398502349854, + "learning_rate": 0.0001, + "loss": 1.2317, + "step": 10968 + }, + { + "epoch": 1.2743537612547198, + "grad_norm": 0.5673388838768005, + "learning_rate": 0.0001, + "loss": 1.3808, + "step": 10969 + }, + { + "epoch": 1.2744699390066803, + "grad_norm": 0.5935032367706299, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 10970 + }, + { + "epoch": 1.2745861167586408, + "grad_norm": 0.604706883430481, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 10971 + }, + { + "epoch": 1.2747022945106012, + "grad_norm": 0.643751859664917, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 10972 + }, + { + "epoch": 1.2748184722625617, + "grad_norm": 0.6240712404251099, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 10973 + }, + { + "epoch": 1.2749346500145222, + "grad_norm": 0.5518888831138611, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 10974 + }, + { + "epoch": 1.2750508277664827, + "grad_norm": 0.6049218773841858, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 10975 + }, + { + "epoch": 1.2751670055184432, + "grad_norm": 0.6087144017219543, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 10976 + }, + { + "epoch": 1.2752831832704037, + "grad_norm": 0.5613987445831299, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 10977 + }, + { + "epoch": 1.2753993610223642, + "grad_norm": 0.6147012710571289, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 10978 + }, + { + "epoch": 1.2755155387743247, + "grad_norm": 0.6045247316360474, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 10979 + }, + { + "epoch": 1.2756317165262852, + "grad_norm": 0.5130501389503479, + "learning_rate": 0.0001, + "loss": 1.2315, + "step": 10980 + }, + { + "epoch": 1.2757478942782456, + "grad_norm": 0.5664576888084412, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 10981 + }, + { + "epoch": 1.2758640720302061, + "grad_norm": 0.5801703929901123, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 10982 + }, + { + "epoch": 1.2759802497821666, + "grad_norm": 0.6036249399185181, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 10983 + }, + { + "epoch": 1.276096427534127, + "grad_norm": 0.6023778915405273, + "learning_rate": 0.0001, + "loss": 1.4279, + "step": 10984 + }, + { + "epoch": 1.2762126052860876, + "grad_norm": 0.5702974200248718, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 10985 + }, + { + "epoch": 1.276328783038048, + "grad_norm": 0.5740309357643127, + "learning_rate": 0.0001, + "loss": 1.2492, + "step": 10986 + }, + { + "epoch": 1.2764449607900086, + "grad_norm": 0.6035803556442261, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 10987 + }, + { + "epoch": 1.2765611385419693, + "grad_norm": 0.6096132397651672, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 10988 + }, + { + "epoch": 1.2766773162939298, + "grad_norm": 0.5646277070045471, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 10989 + }, + { + "epoch": 1.2767934940458903, + "grad_norm": 0.572085440158844, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 10990 + }, + { + "epoch": 1.2769096717978508, + "grad_norm": 0.5817086696624756, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 10991 + }, + { + "epoch": 1.2770258495498112, + "grad_norm": 0.5460660457611084, + "learning_rate": 0.0001, + "loss": 1.3342, + "step": 10992 + }, + { + "epoch": 1.2771420273017717, + "grad_norm": 0.622367262840271, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 10993 + }, + { + "epoch": 1.2772582050537322, + "grad_norm": 0.5549526214599609, + "learning_rate": 0.0001, + "loss": 1.2616, + "step": 10994 + }, + { + "epoch": 1.2773743828056927, + "grad_norm": 0.5860145688056946, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 10995 + }, + { + "epoch": 1.2774905605576532, + "grad_norm": 0.6013427972793579, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 10996 + }, + { + "epoch": 1.2776067383096137, + "grad_norm": 0.6159816980361938, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 10997 + }, + { + "epoch": 1.2777229160615742, + "grad_norm": 0.5887131690979004, + "learning_rate": 0.0001, + "loss": 1.3728, + "step": 10998 + }, + { + "epoch": 1.2778390938135347, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0001, + "loss": 1.4287, + "step": 10999 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.589248538017273, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 11000 + }, + { + "epoch": 1.2780714493174556, + "grad_norm": 0.5850374698638916, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 11001 + }, + { + "epoch": 1.2781876270694161, + "grad_norm": 0.5467734932899475, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 11002 + }, + { + "epoch": 1.2783038048213768, + "grad_norm": 0.54944908618927, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 11003 + }, + { + "epoch": 1.2784199825733373, + "grad_norm": 0.5763708353042603, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 11004 + }, + { + "epoch": 1.2785361603252978, + "grad_norm": 0.5769256353378296, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 11005 + }, + { + "epoch": 1.2786523380772583, + "grad_norm": 0.5926720499992371, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 11006 + }, + { + "epoch": 1.2787685158292188, + "grad_norm": 0.5912777781486511, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 11007 + }, + { + "epoch": 1.2788846935811793, + "grad_norm": 0.545050859451294, + "learning_rate": 0.0001, + "loss": 1.3467, + "step": 11008 + }, + { + "epoch": 1.2790008713331398, + "grad_norm": 0.5366930365562439, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 11009 + }, + { + "epoch": 1.2791170490851003, + "grad_norm": 0.5812363028526306, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 11010 + }, + { + "epoch": 1.2792332268370608, + "grad_norm": 0.6034533381462097, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 11011 + }, + { + "epoch": 1.2793494045890212, + "grad_norm": 0.5460754632949829, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 11012 + }, + { + "epoch": 1.2794655823409817, + "grad_norm": 0.5772432088851929, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 11013 + }, + { + "epoch": 1.2795817600929422, + "grad_norm": 0.5885570049285889, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 11014 + }, + { + "epoch": 1.2796979378449027, + "grad_norm": 0.6416093707084656, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 11015 + }, + { + "epoch": 1.2798141155968632, + "grad_norm": 0.576856791973114, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 11016 + }, + { + "epoch": 1.2799302933488237, + "grad_norm": 0.5491908192634583, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 11017 + }, + { + "epoch": 1.2800464711007842, + "grad_norm": 0.5981287956237793, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 11018 + }, + { + "epoch": 1.2801626488527447, + "grad_norm": 0.5656360387802124, + "learning_rate": 0.0001, + "loss": 1.3962, + "step": 11019 + }, + { + "epoch": 1.2802788266047052, + "grad_norm": 0.52436763048172, + "learning_rate": 0.0001, + "loss": 1.3104, + "step": 11020 + }, + { + "epoch": 1.2803950043566656, + "grad_norm": 0.5909287929534912, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 11021 + }, + { + "epoch": 1.2805111821086261, + "grad_norm": 0.5850455164909363, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 11022 + }, + { + "epoch": 1.2806273598605866, + "grad_norm": 0.6200546026229858, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 11023 + }, + { + "epoch": 1.2807435376125471, + "grad_norm": 0.5266997218132019, + "learning_rate": 0.0001, + "loss": 1.3889, + "step": 11024 + }, + { + "epoch": 1.2808597153645076, + "grad_norm": 0.5400038957595825, + "learning_rate": 0.0001, + "loss": 1.3844, + "step": 11025 + }, + { + "epoch": 1.280975893116468, + "grad_norm": 0.5605980157852173, + "learning_rate": 0.0001, + "loss": 1.4657, + "step": 11026 + }, + { + "epoch": 1.2810920708684286, + "grad_norm": 0.5961278080940247, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 11027 + }, + { + "epoch": 1.281208248620389, + "grad_norm": 0.5594688653945923, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 11028 + }, + { + "epoch": 1.2813244263723496, + "grad_norm": 0.5864664912223816, + "learning_rate": 0.0001, + "loss": 1.2833, + "step": 11029 + }, + { + "epoch": 1.2814406041243103, + "grad_norm": 0.579258382320404, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 11030 + }, + { + "epoch": 1.2815567818762708, + "grad_norm": 0.5354270339012146, + "learning_rate": 0.0001, + "loss": 1.3188, + "step": 11031 + }, + { + "epoch": 1.2816729596282312, + "grad_norm": 0.5614170432090759, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 11032 + }, + { + "epoch": 1.2817891373801917, + "grad_norm": 0.5893466472625732, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 11033 + }, + { + "epoch": 1.2819053151321522, + "grad_norm": 0.5984696745872498, + "learning_rate": 0.0001, + "loss": 1.4392, + "step": 11034 + }, + { + "epoch": 1.2820214928841127, + "grad_norm": 0.5932953357696533, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 11035 + }, + { + "epoch": 1.2821376706360732, + "grad_norm": 0.5950168371200562, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 11036 + }, + { + "epoch": 1.2822538483880337, + "grad_norm": 0.5833539962768555, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 11037 + }, + { + "epoch": 1.2823700261399942, + "grad_norm": 0.6518958210945129, + "learning_rate": 0.0001, + "loss": 1.276, + "step": 11038 + }, + { + "epoch": 1.2824862038919547, + "grad_norm": 0.5316762328147888, + "learning_rate": 0.0001, + "loss": 1.4414, + "step": 11039 + }, + { + "epoch": 1.2826023816439152, + "grad_norm": 0.6396839022636414, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 11040 + }, + { + "epoch": 1.2827185593958756, + "grad_norm": 0.574507474899292, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 11041 + }, + { + "epoch": 1.2828347371478361, + "grad_norm": 0.5620124936103821, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 11042 + }, + { + "epoch": 1.2829509148997966, + "grad_norm": 0.5690659880638123, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 11043 + }, + { + "epoch": 1.2830670926517571, + "grad_norm": 0.5415745973587036, + "learning_rate": 0.0001, + "loss": 1.3145, + "step": 11044 + }, + { + "epoch": 1.2831832704037178, + "grad_norm": 0.5873833298683167, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 11045 + }, + { + "epoch": 1.2832994481556783, + "grad_norm": 0.5984528064727783, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 11046 + }, + { + "epoch": 1.2834156259076388, + "grad_norm": 0.6160796284675598, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 11047 + }, + { + "epoch": 1.2835318036595993, + "grad_norm": 0.604532778263092, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 11048 + }, + { + "epoch": 1.2836479814115598, + "grad_norm": 0.5491237044334412, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 11049 + }, + { + "epoch": 1.2837641591635203, + "grad_norm": 0.5465075969696045, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 11050 + }, + { + "epoch": 1.2838803369154808, + "grad_norm": 0.6033710837364197, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 11051 + }, + { + "epoch": 1.2839965146674412, + "grad_norm": 0.5667850375175476, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 11052 + }, + { + "epoch": 1.2841126924194017, + "grad_norm": 0.5517511367797852, + "learning_rate": 0.0001, + "loss": 1.389, + "step": 11053 + }, + { + "epoch": 1.2842288701713622, + "grad_norm": 0.5938916802406311, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 11054 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 0.5698368549346924, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 11055 + }, + { + "epoch": 1.2844612256752832, + "grad_norm": 0.5910446643829346, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 11056 + }, + { + "epoch": 1.2845774034272437, + "grad_norm": 0.5896273255348206, + "learning_rate": 0.0001, + "loss": 1.3875, + "step": 11057 + }, + { + "epoch": 1.2846935811792042, + "grad_norm": 0.5363755226135254, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 11058 + }, + { + "epoch": 1.2848097589311647, + "grad_norm": 0.6186480522155762, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 11059 + }, + { + "epoch": 1.2849259366831252, + "grad_norm": 0.5805513858795166, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 11060 + }, + { + "epoch": 1.2850421144350856, + "grad_norm": 0.6029322743415833, + "learning_rate": 0.0001, + "loss": 1.3933, + "step": 11061 + }, + { + "epoch": 1.2851582921870461, + "grad_norm": 0.5690828561782837, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 11062 + }, + { + "epoch": 1.2852744699390066, + "grad_norm": 0.5746386051177979, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 11063 + }, + { + "epoch": 1.2853906476909671, + "grad_norm": 0.5679745674133301, + "learning_rate": 0.0001, + "loss": 1.3696, + "step": 11064 + }, + { + "epoch": 1.2855068254429276, + "grad_norm": 0.6062312126159668, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 11065 + }, + { + "epoch": 1.285623003194888, + "grad_norm": 0.5830647945404053, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 11066 + }, + { + "epoch": 1.2857391809468486, + "grad_norm": 0.608407199382782, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 11067 + }, + { + "epoch": 1.285855358698809, + "grad_norm": 0.5620405673980713, + "learning_rate": 0.0001, + "loss": 1.4291, + "step": 11068 + }, + { + "epoch": 1.2859715364507696, + "grad_norm": 0.5535077452659607, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 11069 + }, + { + "epoch": 1.28608771420273, + "grad_norm": 0.5676767230033875, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 11070 + }, + { + "epoch": 1.2862038919546908, + "grad_norm": 0.572475790977478, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 11071 + }, + { + "epoch": 1.2863200697066512, + "grad_norm": 0.5898261666297913, + "learning_rate": 0.0001, + "loss": 1.3499, + "step": 11072 + }, + { + "epoch": 1.2864362474586117, + "grad_norm": 0.5925431251525879, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 11073 + }, + { + "epoch": 1.2865524252105722, + "grad_norm": 0.6155528426170349, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 11074 + }, + { + "epoch": 1.2866686029625327, + "grad_norm": 0.5945338010787964, + "learning_rate": 0.0001, + "loss": 1.3821, + "step": 11075 + }, + { + "epoch": 1.2867847807144932, + "grad_norm": 0.5834474563598633, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 11076 + }, + { + "epoch": 1.2869009584664537, + "grad_norm": 0.5647174715995789, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 11077 + }, + { + "epoch": 1.2870171362184142, + "grad_norm": 0.6176016926765442, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 11078 + }, + { + "epoch": 1.2871333139703747, + "grad_norm": 0.5899996161460876, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 11079 + }, + { + "epoch": 1.2872494917223352, + "grad_norm": 0.605802595615387, + "learning_rate": 0.0001, + "loss": 1.4564, + "step": 11080 + }, + { + "epoch": 1.2873656694742956, + "grad_norm": 0.6284874677658081, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 11081 + }, + { + "epoch": 1.2874818472262561, + "grad_norm": 0.5752767324447632, + "learning_rate": 0.0001, + "loss": 1.3874, + "step": 11082 + }, + { + "epoch": 1.2875980249782166, + "grad_norm": 0.5632016062736511, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 11083 + }, + { + "epoch": 1.2877142027301771, + "grad_norm": 0.5655538439750671, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 11084 + }, + { + "epoch": 1.2878303804821376, + "grad_norm": 0.5820789337158203, + "learning_rate": 0.0001, + "loss": 1.4042, + "step": 11085 + }, + { + "epoch": 1.287946558234098, + "grad_norm": 0.6027318835258484, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 11086 + }, + { + "epoch": 1.2880627359860588, + "grad_norm": 0.6038578152656555, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 11087 + }, + { + "epoch": 1.2881789137380193, + "grad_norm": 0.5578043460845947, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 11088 + }, + { + "epoch": 1.2882950914899798, + "grad_norm": 0.568867564201355, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 11089 + }, + { + "epoch": 1.2884112692419403, + "grad_norm": 0.6275894641876221, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 11090 + }, + { + "epoch": 1.2885274469939008, + "grad_norm": 0.5560985207557678, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 11091 + }, + { + "epoch": 1.2886436247458612, + "grad_norm": 0.5904410481452942, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 11092 + }, + { + "epoch": 1.2887598024978217, + "grad_norm": 0.5531237125396729, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 11093 + }, + { + "epoch": 1.2888759802497822, + "grad_norm": 0.6044683456420898, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 11094 + }, + { + "epoch": 1.2889921580017427, + "grad_norm": 0.5353182554244995, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 11095 + }, + { + "epoch": 1.2891083357537032, + "grad_norm": 0.5463939905166626, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 11096 + }, + { + "epoch": 1.2892245135056637, + "grad_norm": 0.6040199995040894, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 11097 + }, + { + "epoch": 1.2893406912576242, + "grad_norm": 0.567561686038971, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 11098 + }, + { + "epoch": 1.2894568690095847, + "grad_norm": 0.5605993866920471, + "learning_rate": 0.0001, + "loss": 1.298, + "step": 11099 + }, + { + "epoch": 1.2895730467615452, + "grad_norm": 0.5997190475463867, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 11100 + }, + { + "epoch": 1.2896892245135056, + "grad_norm": 0.5995270013809204, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 11101 + }, + { + "epoch": 1.2898054022654661, + "grad_norm": 0.5469571948051453, + "learning_rate": 0.0001, + "loss": 1.3763, + "step": 11102 + }, + { + "epoch": 1.2899215800174266, + "grad_norm": 0.5535038709640503, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 11103 + }, + { + "epoch": 1.2900377577693871, + "grad_norm": 0.5944443941116333, + "learning_rate": 0.0001, + "loss": 1.3665, + "step": 11104 + }, + { + "epoch": 1.2901539355213476, + "grad_norm": 0.5781884789466858, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 11105 + }, + { + "epoch": 1.290270113273308, + "grad_norm": 0.5380309820175171, + "learning_rate": 0.0001, + "loss": 1.3409, + "step": 11106 + }, + { + "epoch": 1.2903862910252686, + "grad_norm": 0.6192278861999512, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 11107 + }, + { + "epoch": 1.290502468777229, + "grad_norm": 0.5789281129837036, + "learning_rate": 0.0001, + "loss": 1.4082, + "step": 11108 + }, + { + "epoch": 1.2906186465291896, + "grad_norm": 0.53936368227005, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 11109 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 0.5656925439834595, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 11110 + }, + { + "epoch": 1.2908510020331105, + "grad_norm": 0.5613973736763, + "learning_rate": 0.0001, + "loss": 1.4213, + "step": 11111 + }, + { + "epoch": 1.290967179785071, + "grad_norm": 0.5758323073387146, + "learning_rate": 0.0001, + "loss": 1.435, + "step": 11112 + }, + { + "epoch": 1.2910833575370317, + "grad_norm": 0.5894516706466675, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 11113 + }, + { + "epoch": 1.2911995352889922, + "grad_norm": 0.6146057844161987, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 11114 + }, + { + "epoch": 1.2913157130409527, + "grad_norm": 0.5563119649887085, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 11115 + }, + { + "epoch": 1.2914318907929132, + "grad_norm": 0.5988980531692505, + "learning_rate": 0.0001, + "loss": 1.3723, + "step": 11116 + }, + { + "epoch": 1.2915480685448737, + "grad_norm": 0.5595650672912598, + "learning_rate": 0.0001, + "loss": 1.3951, + "step": 11117 + }, + { + "epoch": 1.2916642462968342, + "grad_norm": 0.6587527990341187, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 11118 + }, + { + "epoch": 1.2917804240487947, + "grad_norm": 0.5808438062667847, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 11119 + }, + { + "epoch": 1.2918966018007552, + "grad_norm": 0.5954582095146179, + "learning_rate": 0.0001, + "loss": 1.3649, + "step": 11120 + }, + { + "epoch": 1.2920127795527157, + "grad_norm": 0.5856751799583435, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 11121 + }, + { + "epoch": 1.2921289573046761, + "grad_norm": 0.634069561958313, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 11122 + }, + { + "epoch": 1.2922451350566366, + "grad_norm": 0.5883323550224304, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 11123 + }, + { + "epoch": 1.2923613128085971, + "grad_norm": 0.5847913026809692, + "learning_rate": 0.0001, + "loss": 1.3908, + "step": 11124 + }, + { + "epoch": 1.2924774905605576, + "grad_norm": 0.6062279939651489, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 11125 + }, + { + "epoch": 1.292593668312518, + "grad_norm": 0.5873641967773438, + "learning_rate": 0.0001, + "loss": 1.3793, + "step": 11126 + }, + { + "epoch": 1.2927098460644786, + "grad_norm": 0.5468490719795227, + "learning_rate": 0.0001, + "loss": 1.4905, + "step": 11127 + }, + { + "epoch": 1.292826023816439, + "grad_norm": 0.6617196798324585, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 11128 + }, + { + "epoch": 1.2929422015683998, + "grad_norm": 0.6220079660415649, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 11129 + }, + { + "epoch": 1.2930583793203603, + "grad_norm": 0.5805248022079468, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 11130 + }, + { + "epoch": 1.2931745570723208, + "grad_norm": 0.5914593935012817, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 11131 + }, + { + "epoch": 1.2932907348242813, + "grad_norm": 0.5885531306266785, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 11132 + }, + { + "epoch": 1.2934069125762417, + "grad_norm": 0.5579025149345398, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 11133 + }, + { + "epoch": 1.2935230903282022, + "grad_norm": 0.6001786589622498, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 11134 + }, + { + "epoch": 1.2936392680801627, + "grad_norm": 0.5882461071014404, + "learning_rate": 0.0001, + "loss": 1.4804, + "step": 11135 + }, + { + "epoch": 1.2937554458321232, + "grad_norm": 0.5694596767425537, + "learning_rate": 0.0001, + "loss": 1.3329, + "step": 11136 + }, + { + "epoch": 1.2938716235840837, + "grad_norm": 0.5779051780700684, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 11137 + }, + { + "epoch": 1.2939878013360442, + "grad_norm": 0.6211315393447876, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 11138 + }, + { + "epoch": 1.2941039790880047, + "grad_norm": 0.5611022710800171, + "learning_rate": 0.0001, + "loss": 1.4081, + "step": 11139 + }, + { + "epoch": 1.2942201568399652, + "grad_norm": 0.5747177004814148, + "learning_rate": 0.0001, + "loss": 1.4622, + "step": 11140 + }, + { + "epoch": 1.2943363345919257, + "grad_norm": 0.5457985401153564, + "learning_rate": 0.0001, + "loss": 1.2923, + "step": 11141 + }, + { + "epoch": 1.2944525123438861, + "grad_norm": 0.5791705846786499, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 11142 + }, + { + "epoch": 1.2945686900958466, + "grad_norm": 0.5527358055114746, + "learning_rate": 0.0001, + "loss": 1.3722, + "step": 11143 + }, + { + "epoch": 1.2946848678478071, + "grad_norm": 0.5855079889297485, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 11144 + }, + { + "epoch": 1.2948010455997676, + "grad_norm": 0.5779101252555847, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 11145 + }, + { + "epoch": 1.294917223351728, + "grad_norm": 0.6360836029052734, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 11146 + }, + { + "epoch": 1.2950334011036886, + "grad_norm": 0.5778235197067261, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 11147 + }, + { + "epoch": 1.295149578855649, + "grad_norm": 0.5182598233222961, + "learning_rate": 0.0001, + "loss": 1.2329, + "step": 11148 + }, + { + "epoch": 1.2952657566076096, + "grad_norm": 0.5818975567817688, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 11149 + }, + { + "epoch": 1.29538193435957, + "grad_norm": 0.565624475479126, + "learning_rate": 0.0001, + "loss": 1.3899, + "step": 11150 + }, + { + "epoch": 1.2954981121115305, + "grad_norm": 0.5845359563827515, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 11151 + }, + { + "epoch": 1.295614289863491, + "grad_norm": 0.5842732787132263, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 11152 + }, + { + "epoch": 1.2957304676154515, + "grad_norm": 0.5768247842788696, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 11153 + }, + { + "epoch": 1.295846645367412, + "grad_norm": 0.6080706119537354, + "learning_rate": 0.0001, + "loss": 1.4585, + "step": 11154 + }, + { + "epoch": 1.2959628231193727, + "grad_norm": 0.6032418608665466, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 11155 + }, + { + "epoch": 1.2960790008713332, + "grad_norm": 0.5942699909210205, + "learning_rate": 0.0001, + "loss": 1.5165, + "step": 11156 + }, + { + "epoch": 1.2961951786232937, + "grad_norm": 0.6410212516784668, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 11157 + }, + { + "epoch": 1.2963113563752542, + "grad_norm": 0.576407253742218, + "learning_rate": 0.0001, + "loss": 1.4459, + "step": 11158 + }, + { + "epoch": 1.2964275341272147, + "grad_norm": 0.5596346855163574, + "learning_rate": 0.0001, + "loss": 1.293, + "step": 11159 + }, + { + "epoch": 1.2965437118791752, + "grad_norm": 0.5900881886482239, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 11160 + }, + { + "epoch": 1.2966598896311357, + "grad_norm": 0.622340977191925, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 11161 + }, + { + "epoch": 1.2967760673830961, + "grad_norm": 0.5715858936309814, + "learning_rate": 0.0001, + "loss": 1.2327, + "step": 11162 + }, + { + "epoch": 1.2968922451350566, + "grad_norm": 0.5746508836746216, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 11163 + }, + { + "epoch": 1.2970084228870171, + "grad_norm": 0.5897271037101746, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 11164 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 0.607289731502533, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 11165 + }, + { + "epoch": 1.297240778390938, + "grad_norm": 0.5573462843894958, + "learning_rate": 0.0001, + "loss": 1.2812, + "step": 11166 + }, + { + "epoch": 1.2973569561428986, + "grad_norm": 0.5533374547958374, + "learning_rate": 0.0001, + "loss": 1.3657, + "step": 11167 + }, + { + "epoch": 1.297473133894859, + "grad_norm": 0.5837119221687317, + "learning_rate": 0.0001, + "loss": 1.4336, + "step": 11168 + }, + { + "epoch": 1.2975893116468196, + "grad_norm": 0.6015486121177673, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 11169 + }, + { + "epoch": 1.29770548939878, + "grad_norm": 0.5753357410430908, + "learning_rate": 0.0001, + "loss": 1.4456, + "step": 11170 + }, + { + "epoch": 1.2978216671507408, + "grad_norm": 0.578898549079895, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 11171 + }, + { + "epoch": 1.2979378449027013, + "grad_norm": 0.5891339778900146, + "learning_rate": 0.0001, + "loss": 1.3499, + "step": 11172 + }, + { + "epoch": 1.2980540226546617, + "grad_norm": 0.6010352373123169, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 11173 + }, + { + "epoch": 1.2981702004066222, + "grad_norm": 0.5747548937797546, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 11174 + }, + { + "epoch": 1.2982863781585827, + "grad_norm": 0.5667596459388733, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 11175 + }, + { + "epoch": 1.2984025559105432, + "grad_norm": 0.611515462398529, + "learning_rate": 0.0001, + "loss": 1.4193, + "step": 11176 + }, + { + "epoch": 1.2985187336625037, + "grad_norm": 0.6042912602424622, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 11177 + }, + { + "epoch": 1.2986349114144642, + "grad_norm": 0.5766490697860718, + "learning_rate": 0.0001, + "loss": 1.4842, + "step": 11178 + }, + { + "epoch": 1.2987510891664247, + "grad_norm": 0.5529983043670654, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 11179 + }, + { + "epoch": 1.2988672669183852, + "grad_norm": 0.5562178492546082, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 11180 + }, + { + "epoch": 1.2989834446703457, + "grad_norm": 0.6044286489486694, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 11181 + }, + { + "epoch": 1.2990996224223061, + "grad_norm": 0.606543779373169, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 11182 + }, + { + "epoch": 1.2992158001742666, + "grad_norm": 0.562369704246521, + "learning_rate": 0.0001, + "loss": 1.3999, + "step": 11183 + }, + { + "epoch": 1.2993319779262271, + "grad_norm": 0.5866766571998596, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 11184 + }, + { + "epoch": 1.2994481556781876, + "grad_norm": 0.5652108192443848, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 11185 + }, + { + "epoch": 1.299564333430148, + "grad_norm": 0.5331071615219116, + "learning_rate": 0.0001, + "loss": 1.3049, + "step": 11186 + }, + { + "epoch": 1.2996805111821086, + "grad_norm": 0.5862208604812622, + "learning_rate": 0.0001, + "loss": 1.395, + "step": 11187 + }, + { + "epoch": 1.299796688934069, + "grad_norm": 0.5830875635147095, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 11188 + }, + { + "epoch": 1.2999128666860296, + "grad_norm": 0.5540904402732849, + "learning_rate": 0.0001, + "loss": 1.3349, + "step": 11189 + }, + { + "epoch": 1.30002904443799, + "grad_norm": 0.5949186086654663, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 11190 + }, + { + "epoch": 1.3001452221899505, + "grad_norm": 0.6121503710746765, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 11191 + }, + { + "epoch": 1.300261399941911, + "grad_norm": 0.5854485034942627, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 11192 + }, + { + "epoch": 1.3003775776938715, + "grad_norm": 0.5966970920562744, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 11193 + }, + { + "epoch": 1.300493755445832, + "grad_norm": 0.567981481552124, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 11194 + }, + { + "epoch": 1.3006099331977925, + "grad_norm": 0.568736732006073, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 11195 + }, + { + "epoch": 1.300726110949753, + "grad_norm": 0.5789809823036194, + "learning_rate": 0.0001, + "loss": 1.4117, + "step": 11196 + }, + { + "epoch": 1.3008422887017137, + "grad_norm": 0.592512845993042, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 11197 + }, + { + "epoch": 1.3009584664536742, + "grad_norm": 0.564712405204773, + "learning_rate": 0.0001, + "loss": 1.355, + "step": 11198 + }, + { + "epoch": 1.3010746442056347, + "grad_norm": 0.5966187715530396, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 11199 + }, + { + "epoch": 1.3011908219575952, + "grad_norm": 0.602142870426178, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 11200 + }, + { + "epoch": 1.3013069997095557, + "grad_norm": 0.5634971857070923, + "learning_rate": 0.0001, + "loss": 1.338, + "step": 11201 + }, + { + "epoch": 1.3014231774615161, + "grad_norm": 0.549117922782898, + "learning_rate": 0.0001, + "loss": 1.4056, + "step": 11202 + }, + { + "epoch": 1.3015393552134766, + "grad_norm": 0.5619381070137024, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 11203 + }, + { + "epoch": 1.3016555329654371, + "grad_norm": 0.5681606531143188, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 11204 + }, + { + "epoch": 1.3017717107173976, + "grad_norm": 0.5451623797416687, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 11205 + }, + { + "epoch": 1.301887888469358, + "grad_norm": 0.550369381904602, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 11206 + }, + { + "epoch": 1.3020040662213186, + "grad_norm": 0.5932458639144897, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 11207 + }, + { + "epoch": 1.302120243973279, + "grad_norm": 0.529190719127655, + "learning_rate": 0.0001, + "loss": 1.3753, + "step": 11208 + }, + { + "epoch": 1.3022364217252396, + "grad_norm": 0.584304928779602, + "learning_rate": 0.0001, + "loss": 1.3129, + "step": 11209 + }, + { + "epoch": 1.3023525994772, + "grad_norm": 0.6115213632583618, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 11210 + }, + { + "epoch": 1.3024687772291605, + "grad_norm": 0.5591283440589905, + "learning_rate": 0.0001, + "loss": 1.4011, + "step": 11211 + }, + { + "epoch": 1.302584954981121, + "grad_norm": 0.6138569116592407, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 11212 + }, + { + "epoch": 1.3027011327330817, + "grad_norm": 0.5965496897697449, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 11213 + }, + { + "epoch": 1.3028173104850422, + "grad_norm": 0.5974039435386658, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 11214 + }, + { + "epoch": 1.3029334882370027, + "grad_norm": 0.5517904162406921, + "learning_rate": 0.0001, + "loss": 1.3946, + "step": 11215 + }, + { + "epoch": 1.3030496659889632, + "grad_norm": 0.6423289775848389, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 11216 + }, + { + "epoch": 1.3031658437409237, + "grad_norm": 0.5717339515686035, + "learning_rate": 0.0001, + "loss": 1.4246, + "step": 11217 + }, + { + "epoch": 1.3032820214928842, + "grad_norm": 0.5621806383132935, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 11218 + }, + { + "epoch": 1.3033981992448447, + "grad_norm": 0.5442423224449158, + "learning_rate": 0.0001, + "loss": 1.3835, + "step": 11219 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.6146840453147888, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 11220 + }, + { + "epoch": 1.3036305547487657, + "grad_norm": 0.54494708776474, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 11221 + }, + { + "epoch": 1.3037467325007261, + "grad_norm": 0.5648915767669678, + "learning_rate": 0.0001, + "loss": 1.346, + "step": 11222 + }, + { + "epoch": 1.3038629102526866, + "grad_norm": 0.5836517810821533, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 11223 + }, + { + "epoch": 1.3039790880046471, + "grad_norm": 0.6305371522903442, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 11224 + }, + { + "epoch": 1.3040952657566076, + "grad_norm": 0.5962154865264893, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 11225 + }, + { + "epoch": 1.304211443508568, + "grad_norm": 0.5581847429275513, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 11226 + }, + { + "epoch": 1.3043276212605286, + "grad_norm": 0.5733591318130493, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 11227 + }, + { + "epoch": 1.304443799012489, + "grad_norm": 0.5814220309257507, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 11228 + }, + { + "epoch": 1.3045599767644496, + "grad_norm": 0.5474358201026917, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 11229 + }, + { + "epoch": 1.30467615451641, + "grad_norm": 0.5901806354522705, + "learning_rate": 0.0001, + "loss": 1.4194, + "step": 11230 + }, + { + "epoch": 1.3047923322683705, + "grad_norm": 0.6499736309051514, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 11231 + }, + { + "epoch": 1.304908510020331, + "grad_norm": 0.5674834251403809, + "learning_rate": 0.0001, + "loss": 1.2315, + "step": 11232 + }, + { + "epoch": 1.3050246877722915, + "grad_norm": 0.5817140340805054, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 11233 + }, + { + "epoch": 1.305140865524252, + "grad_norm": 0.5742542147636414, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 11234 + }, + { + "epoch": 1.3052570432762125, + "grad_norm": 0.5343637466430664, + "learning_rate": 0.0001, + "loss": 1.3944, + "step": 11235 + }, + { + "epoch": 1.305373221028173, + "grad_norm": 0.623912513256073, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 11236 + }, + { + "epoch": 1.3054893987801335, + "grad_norm": 0.5666230916976929, + "learning_rate": 0.0001, + "loss": 1.3874, + "step": 11237 + }, + { + "epoch": 1.305605576532094, + "grad_norm": 0.5873108506202698, + "learning_rate": 0.0001, + "loss": 1.5621, + "step": 11238 + }, + { + "epoch": 1.3057217542840547, + "grad_norm": 0.5752112865447998, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 11239 + }, + { + "epoch": 1.3058379320360152, + "grad_norm": 0.528434693813324, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 11240 + }, + { + "epoch": 1.3059541097879757, + "grad_norm": 0.5661198496818542, + "learning_rate": 0.0001, + "loss": 1.3099, + "step": 11241 + }, + { + "epoch": 1.3060702875399361, + "grad_norm": 0.5762214064598083, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 11242 + }, + { + "epoch": 1.3061864652918966, + "grad_norm": 0.5601356029510498, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 11243 + }, + { + "epoch": 1.3063026430438571, + "grad_norm": 0.5520750284194946, + "learning_rate": 0.0001, + "loss": 1.3123, + "step": 11244 + }, + { + "epoch": 1.3064188207958176, + "grad_norm": 0.5668884515762329, + "learning_rate": 0.0001, + "loss": 1.3559, + "step": 11245 + }, + { + "epoch": 1.306534998547778, + "grad_norm": 0.5902964472770691, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 11246 + }, + { + "epoch": 1.3066511762997386, + "grad_norm": 0.6026374697685242, + "learning_rate": 0.0001, + "loss": 1.3018, + "step": 11247 + }, + { + "epoch": 1.306767354051699, + "grad_norm": 0.5816320776939392, + "learning_rate": 0.0001, + "loss": 1.2819, + "step": 11248 + }, + { + "epoch": 1.3068835318036596, + "grad_norm": 0.5546746850013733, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 11249 + }, + { + "epoch": 1.30699970955562, + "grad_norm": 0.657646119594574, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 11250 + }, + { + "epoch": 1.3071158873075805, + "grad_norm": 0.5970759391784668, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 11251 + }, + { + "epoch": 1.307232065059541, + "grad_norm": 0.6167639493942261, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 11252 + }, + { + "epoch": 1.3073482428115015, + "grad_norm": 0.5807110667228699, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 11253 + }, + { + "epoch": 1.307464420563462, + "grad_norm": 0.5864242315292358, + "learning_rate": 0.0001, + "loss": 1.4656, + "step": 11254 + }, + { + "epoch": 1.3075805983154227, + "grad_norm": 0.5699091553688049, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 11255 + }, + { + "epoch": 1.3076967760673832, + "grad_norm": 0.5962982177734375, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 11256 + }, + { + "epoch": 1.3078129538193437, + "grad_norm": 0.5681212544441223, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 11257 + }, + { + "epoch": 1.3079291315713042, + "grad_norm": 0.6166642308235168, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 11258 + }, + { + "epoch": 1.3080453093232647, + "grad_norm": 0.5670874714851379, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 11259 + }, + { + "epoch": 1.3081614870752252, + "grad_norm": 0.5790867209434509, + "learning_rate": 0.0001, + "loss": 1.4262, + "step": 11260 + }, + { + "epoch": 1.3082776648271857, + "grad_norm": 0.612891674041748, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 11261 + }, + { + "epoch": 1.3083938425791461, + "grad_norm": 0.5698578357696533, + "learning_rate": 0.0001, + "loss": 1.424, + "step": 11262 + }, + { + "epoch": 1.3085100203311066, + "grad_norm": 0.5675930380821228, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 11263 + }, + { + "epoch": 1.3086261980830671, + "grad_norm": 0.5446262359619141, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 11264 + }, + { + "epoch": 1.3087423758350276, + "grad_norm": 0.5941632986068726, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 11265 + }, + { + "epoch": 1.308858553586988, + "grad_norm": 0.6017045378684998, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 11266 + }, + { + "epoch": 1.3089747313389486, + "grad_norm": 0.6072211265563965, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 11267 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 0.5675247311592102, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 11268 + }, + { + "epoch": 1.3092070868428696, + "grad_norm": 0.6296728849411011, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 11269 + }, + { + "epoch": 1.30932326459483, + "grad_norm": 0.5690588355064392, + "learning_rate": 0.0001, + "loss": 1.3506, + "step": 11270 + }, + { + "epoch": 1.3094394423467905, + "grad_norm": 0.5606512427330017, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 11271 + }, + { + "epoch": 1.309555620098751, + "grad_norm": 0.6174575686454773, + "learning_rate": 0.0001, + "loss": 1.4779, + "step": 11272 + }, + { + "epoch": 1.3096717978507115, + "grad_norm": 0.5738336443901062, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 11273 + }, + { + "epoch": 1.309787975602672, + "grad_norm": 0.6263105869293213, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 11274 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 0.5497341156005859, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 11275 + }, + { + "epoch": 1.310020331106593, + "grad_norm": 0.5970504879951477, + "learning_rate": 0.0001, + "loss": 1.3028, + "step": 11276 + }, + { + "epoch": 1.3101365088585535, + "grad_norm": 0.5593211054801941, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 11277 + }, + { + "epoch": 1.310252686610514, + "grad_norm": 0.5131186246871948, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 11278 + }, + { + "epoch": 1.3103688643624745, + "grad_norm": 0.5773268938064575, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 11279 + }, + { + "epoch": 1.310485042114435, + "grad_norm": 0.564262330532074, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 11280 + }, + { + "epoch": 1.3106012198663957, + "grad_norm": 0.5280408263206482, + "learning_rate": 0.0001, + "loss": 1.4061, + "step": 11281 + }, + { + "epoch": 1.3107173976183562, + "grad_norm": 0.5752106308937073, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 11282 + }, + { + "epoch": 1.3108335753703166, + "grad_norm": 0.5846773982048035, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 11283 + }, + { + "epoch": 1.3109497531222771, + "grad_norm": 0.594781756401062, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 11284 + }, + { + "epoch": 1.3110659308742376, + "grad_norm": 0.5700458288192749, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 11285 + }, + { + "epoch": 1.311182108626198, + "grad_norm": 0.550640344619751, + "learning_rate": 0.0001, + "loss": 1.2759, + "step": 11286 + }, + { + "epoch": 1.3112982863781586, + "grad_norm": 0.5917354822158813, + "learning_rate": 0.0001, + "loss": 1.5185, + "step": 11287 + }, + { + "epoch": 1.311414464130119, + "grad_norm": 0.6752248406410217, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 11288 + }, + { + "epoch": 1.3115306418820796, + "grad_norm": 0.6515881419181824, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 11289 + }, + { + "epoch": 1.31164681963404, + "grad_norm": 0.6039806008338928, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 11290 + }, + { + "epoch": 1.3117629973860006, + "grad_norm": 0.6334456205368042, + "learning_rate": 0.0001, + "loss": 1.3471, + "step": 11291 + }, + { + "epoch": 1.311879175137961, + "grad_norm": 0.574119508266449, + "learning_rate": 0.0001, + "loss": 1.3821, + "step": 11292 + }, + { + "epoch": 1.3119953528899215, + "grad_norm": 0.5512613654136658, + "learning_rate": 0.0001, + "loss": 1.3874, + "step": 11293 + }, + { + "epoch": 1.312111530641882, + "grad_norm": 0.5976804494857788, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 11294 + }, + { + "epoch": 1.3122277083938425, + "grad_norm": 0.6032594442367554, + "learning_rate": 0.0001, + "loss": 1.7967, + "step": 11295 + }, + { + "epoch": 1.312343886145803, + "grad_norm": 0.5688466429710388, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 11296 + }, + { + "epoch": 1.3124600638977637, + "grad_norm": 0.5861960053443909, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 11297 + }, + { + "epoch": 1.3125762416497242, + "grad_norm": 0.5523131489753723, + "learning_rate": 0.0001, + "loss": 1.3868, + "step": 11298 + }, + { + "epoch": 1.3126924194016847, + "grad_norm": 0.5910392999649048, + "learning_rate": 0.0001, + "loss": 1.3825, + "step": 11299 + }, + { + "epoch": 1.3128085971536452, + "grad_norm": 0.574103832244873, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 11300 + }, + { + "epoch": 1.3129247749056057, + "grad_norm": 0.5659858584403992, + "learning_rate": 0.0001, + "loss": 1.3085, + "step": 11301 + }, + { + "epoch": 1.3130409526575662, + "grad_norm": 0.5777335166931152, + "learning_rate": 0.0001, + "loss": 1.3941, + "step": 11302 + }, + { + "epoch": 1.3131571304095266, + "grad_norm": 0.6105539202690125, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 11303 + }, + { + "epoch": 1.3132733081614871, + "grad_norm": 0.605769693851471, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 11304 + }, + { + "epoch": 1.3133894859134476, + "grad_norm": 0.5957057476043701, + "learning_rate": 0.0001, + "loss": 1.412, + "step": 11305 + }, + { + "epoch": 1.313505663665408, + "grad_norm": 0.5396880507469177, + "learning_rate": 0.0001, + "loss": 1.2279, + "step": 11306 + }, + { + "epoch": 1.3136218414173686, + "grad_norm": 0.6096924543380737, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 11307 + }, + { + "epoch": 1.313738019169329, + "grad_norm": 0.5338277816772461, + "learning_rate": 0.0001, + "loss": 1.3593, + "step": 11308 + }, + { + "epoch": 1.3138541969212896, + "grad_norm": 0.5803439617156982, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 11309 + }, + { + "epoch": 1.31397037467325, + "grad_norm": 0.624612033367157, + "learning_rate": 0.0001, + "loss": 1.4565, + "step": 11310 + }, + { + "epoch": 1.3140865524252106, + "grad_norm": 0.6359199285507202, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 11311 + }, + { + "epoch": 1.314202730177171, + "grad_norm": 0.5825530290603638, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 11312 + }, + { + "epoch": 1.3143189079291315, + "grad_norm": 0.5994022488594055, + "learning_rate": 0.0001, + "loss": 1.3063, + "step": 11313 + }, + { + "epoch": 1.314435085681092, + "grad_norm": 0.5629523992538452, + "learning_rate": 0.0001, + "loss": 1.2914, + "step": 11314 + }, + { + "epoch": 1.3145512634330525, + "grad_norm": 0.5352174639701843, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 11315 + }, + { + "epoch": 1.314667441185013, + "grad_norm": 0.5785360336303711, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 11316 + }, + { + "epoch": 1.3147836189369735, + "grad_norm": 0.5326957106590271, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 11317 + }, + { + "epoch": 1.314899796688934, + "grad_norm": 0.5812473297119141, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 11318 + }, + { + "epoch": 1.3150159744408945, + "grad_norm": 0.567494809627533, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 11319 + }, + { + "epoch": 1.315132152192855, + "grad_norm": 0.6070596575737, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 11320 + }, + { + "epoch": 1.3152483299448154, + "grad_norm": 0.6242623925209045, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 11321 + }, + { + "epoch": 1.315364507696776, + "grad_norm": 0.5908821821212769, + "learning_rate": 0.0001, + "loss": 1.1826, + "step": 11322 + }, + { + "epoch": 1.3154806854487366, + "grad_norm": 0.6052210927009583, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 11323 + }, + { + "epoch": 1.3155968632006971, + "grad_norm": 0.5847655534744263, + "learning_rate": 0.0001, + "loss": 1.3685, + "step": 11324 + }, + { + "epoch": 1.3157130409526576, + "grad_norm": 0.5557252168655396, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 11325 + }, + { + "epoch": 1.315829218704618, + "grad_norm": 0.6291106939315796, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 11326 + }, + { + "epoch": 1.3159453964565786, + "grad_norm": 0.5765290260314941, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 11327 + }, + { + "epoch": 1.316061574208539, + "grad_norm": 0.5529724359512329, + "learning_rate": 0.0001, + "loss": 1.3843, + "step": 11328 + }, + { + "epoch": 1.3161777519604996, + "grad_norm": 0.5492607355117798, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 11329 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 0.6425765156745911, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 11330 + }, + { + "epoch": 1.3164101074644206, + "grad_norm": 0.5800068974494934, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 11331 + }, + { + "epoch": 1.316526285216381, + "grad_norm": 0.5752959251403809, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 11332 + }, + { + "epoch": 1.3166424629683415, + "grad_norm": 0.5906381607055664, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 11333 + }, + { + "epoch": 1.316758640720302, + "grad_norm": 0.6053020358085632, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 11334 + }, + { + "epoch": 1.3168748184722625, + "grad_norm": 0.578601598739624, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 11335 + }, + { + "epoch": 1.316990996224223, + "grad_norm": 0.5814728736877441, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 11336 + }, + { + "epoch": 1.3171071739761835, + "grad_norm": 0.5758288502693176, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 11337 + }, + { + "epoch": 1.3172233517281442, + "grad_norm": 0.5635868906974792, + "learning_rate": 0.0001, + "loss": 1.4216, + "step": 11338 + }, + { + "epoch": 1.3173395294801047, + "grad_norm": 0.5569899082183838, + "learning_rate": 0.0001, + "loss": 1.3649, + "step": 11339 + }, + { + "epoch": 1.3174557072320652, + "grad_norm": 0.5895545482635498, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 11340 + }, + { + "epoch": 1.3175718849840257, + "grad_norm": 0.5705216526985168, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 11341 + }, + { + "epoch": 1.3176880627359862, + "grad_norm": 0.5997105240821838, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 11342 + }, + { + "epoch": 1.3178042404879466, + "grad_norm": 0.566246509552002, + "learning_rate": 0.0001, + "loss": 1.3959, + "step": 11343 + }, + { + "epoch": 1.3179204182399071, + "grad_norm": 0.5906617641448975, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 11344 + }, + { + "epoch": 1.3180365959918676, + "grad_norm": 0.6158729791641235, + "learning_rate": 0.0001, + "loss": 1.4863, + "step": 11345 + }, + { + "epoch": 1.318152773743828, + "grad_norm": 0.6189556121826172, + "learning_rate": 0.0001, + "loss": 1.2961, + "step": 11346 + }, + { + "epoch": 1.3182689514957886, + "grad_norm": 0.6094973683357239, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 11347 + }, + { + "epoch": 1.318385129247749, + "grad_norm": 0.579227864742279, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 11348 + }, + { + "epoch": 1.3185013069997096, + "grad_norm": 0.5919841527938843, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 11349 + }, + { + "epoch": 1.31861748475167, + "grad_norm": 0.6161413192749023, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 11350 + }, + { + "epoch": 1.3187336625036306, + "grad_norm": 0.6276363730430603, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 11351 + }, + { + "epoch": 1.318849840255591, + "grad_norm": 0.5806869864463806, + "learning_rate": 0.0001, + "loss": 1.4119, + "step": 11352 + }, + { + "epoch": 1.3189660180075515, + "grad_norm": 0.5905710458755493, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 11353 + }, + { + "epoch": 1.319082195759512, + "grad_norm": 0.6194556355476379, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 11354 + }, + { + "epoch": 1.3191983735114725, + "grad_norm": 0.6072413921356201, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 11355 + }, + { + "epoch": 1.319314551263433, + "grad_norm": 0.6501614451408386, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 11356 + }, + { + "epoch": 1.3194307290153935, + "grad_norm": 0.6481568217277527, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 11357 + }, + { + "epoch": 1.319546906767354, + "grad_norm": 0.5640507340431213, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 11358 + }, + { + "epoch": 1.3196630845193145, + "grad_norm": 0.5995705127716064, + "learning_rate": 0.0001, + "loss": 1.4165, + "step": 11359 + }, + { + "epoch": 1.319779262271275, + "grad_norm": 0.561647355556488, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 11360 + }, + { + "epoch": 1.3198954400232354, + "grad_norm": 0.5589419603347778, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 11361 + }, + { + "epoch": 1.320011617775196, + "grad_norm": 0.5721113085746765, + "learning_rate": 0.0001, + "loss": 1.3061, + "step": 11362 + }, + { + "epoch": 1.3201277955271564, + "grad_norm": 0.6529408097267151, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 11363 + }, + { + "epoch": 1.320243973279117, + "grad_norm": 0.5891768932342529, + "learning_rate": 0.0001, + "loss": 1.4214, + "step": 11364 + }, + { + "epoch": 1.3203601510310776, + "grad_norm": 0.5529371500015259, + "learning_rate": 0.0001, + "loss": 1.2607, + "step": 11365 + }, + { + "epoch": 1.3204763287830381, + "grad_norm": 0.6112219095230103, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 11366 + }, + { + "epoch": 1.3205925065349986, + "grad_norm": 0.559587299823761, + "learning_rate": 0.0001, + "loss": 1.3874, + "step": 11367 + }, + { + "epoch": 1.320708684286959, + "grad_norm": 0.5865263938903809, + "learning_rate": 0.0001, + "loss": 1.4663, + "step": 11368 + }, + { + "epoch": 1.3208248620389196, + "grad_norm": 0.5650449991226196, + "learning_rate": 0.0001, + "loss": 1.19, + "step": 11369 + }, + { + "epoch": 1.32094103979088, + "grad_norm": 0.5587924122810364, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 11370 + }, + { + "epoch": 1.3210572175428406, + "grad_norm": 0.6297246217727661, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 11371 + }, + { + "epoch": 1.321173395294801, + "grad_norm": 0.59564608335495, + "learning_rate": 0.0001, + "loss": 1.3343, + "step": 11372 + }, + { + "epoch": 1.3212895730467615, + "grad_norm": 0.5825412273406982, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 11373 + }, + { + "epoch": 1.321405750798722, + "grad_norm": 0.6541181206703186, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 11374 + }, + { + "epoch": 1.3215219285506825, + "grad_norm": 0.5667305588722229, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 11375 + }, + { + "epoch": 1.321638106302643, + "grad_norm": 0.6084561347961426, + "learning_rate": 0.0001, + "loss": 1.4976, + "step": 11376 + }, + { + "epoch": 1.3217542840546035, + "grad_norm": 0.6270078420639038, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 11377 + }, + { + "epoch": 1.321870461806564, + "grad_norm": 0.576206386089325, + "learning_rate": 0.0001, + "loss": 1.4318, + "step": 11378 + }, + { + "epoch": 1.3219866395585245, + "grad_norm": 0.6498734354972839, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 11379 + }, + { + "epoch": 1.3221028173104852, + "grad_norm": 0.5948904752731323, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 11380 + }, + { + "epoch": 1.3222189950624457, + "grad_norm": 0.5957643389701843, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 11381 + }, + { + "epoch": 1.3223351728144062, + "grad_norm": 0.5807846784591675, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 11382 + }, + { + "epoch": 1.3224513505663666, + "grad_norm": 0.5830207467079163, + "learning_rate": 0.0001, + "loss": 1.4322, + "step": 11383 + }, + { + "epoch": 1.3225675283183271, + "grad_norm": 0.5690310001373291, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 11384 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 0.5671119689941406, + "learning_rate": 0.0001, + "loss": 1.445, + "step": 11385 + }, + { + "epoch": 1.3227998838222481, + "grad_norm": 0.5723904967308044, + "learning_rate": 0.0001, + "loss": 1.3052, + "step": 11386 + }, + { + "epoch": 1.3229160615742086, + "grad_norm": 0.5799258947372437, + "learning_rate": 0.0001, + "loss": 1.3406, + "step": 11387 + }, + { + "epoch": 1.323032239326169, + "grad_norm": 0.6347507238388062, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 11388 + }, + { + "epoch": 1.3231484170781296, + "grad_norm": 0.6038704514503479, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 11389 + }, + { + "epoch": 1.32326459483009, + "grad_norm": 0.5528449416160583, + "learning_rate": 0.0001, + "loss": 1.4312, + "step": 11390 + }, + { + "epoch": 1.3233807725820506, + "grad_norm": 0.6076419949531555, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 11391 + }, + { + "epoch": 1.323496950334011, + "grad_norm": 0.5536520481109619, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 11392 + }, + { + "epoch": 1.3236131280859715, + "grad_norm": 0.58294278383255, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 11393 + }, + { + "epoch": 1.323729305837932, + "grad_norm": 0.6274382472038269, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 11394 + }, + { + "epoch": 1.3238454835898925, + "grad_norm": 0.5923705101013184, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 11395 + }, + { + "epoch": 1.323961661341853, + "grad_norm": 0.6048977375030518, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 11396 + }, + { + "epoch": 1.3240778390938135, + "grad_norm": 0.5796459913253784, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 11397 + }, + { + "epoch": 1.324194016845774, + "grad_norm": 0.632941484451294, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 11398 + }, + { + "epoch": 1.3243101945977345, + "grad_norm": 0.582012951374054, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 11399 + }, + { + "epoch": 1.324426372349695, + "grad_norm": 0.5736321210861206, + "learning_rate": 0.0001, + "loss": 1.333, + "step": 11400 + }, + { + "epoch": 1.3245425501016554, + "grad_norm": 0.5718238949775696, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 11401 + }, + { + "epoch": 1.324658727853616, + "grad_norm": 0.6074345707893372, + "learning_rate": 0.0001, + "loss": 1.526, + "step": 11402 + }, + { + "epoch": 1.3247749056055764, + "grad_norm": 0.6050236225128174, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 11403 + }, + { + "epoch": 1.324891083357537, + "grad_norm": 0.5850708484649658, + "learning_rate": 0.0001, + "loss": 1.422, + "step": 11404 + }, + { + "epoch": 1.3250072611094974, + "grad_norm": 0.6051907539367676, + "learning_rate": 0.0001, + "loss": 1.2887, + "step": 11405 + }, + { + "epoch": 1.3251234388614581, + "grad_norm": 0.5894731283187866, + "learning_rate": 0.0001, + "loss": 1.2812, + "step": 11406 + }, + { + "epoch": 1.3252396166134186, + "grad_norm": 0.5793226957321167, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 11407 + }, + { + "epoch": 1.325355794365379, + "grad_norm": 0.564224898815155, + "learning_rate": 0.0001, + "loss": 1.2043, + "step": 11408 + }, + { + "epoch": 1.3254719721173396, + "grad_norm": 0.5819063782691956, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 11409 + }, + { + "epoch": 1.3255881498693, + "grad_norm": 0.539932370185852, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 11410 + }, + { + "epoch": 1.3257043276212606, + "grad_norm": 0.5646186470985413, + "learning_rate": 0.0001, + "loss": 1.2979, + "step": 11411 + }, + { + "epoch": 1.325820505373221, + "grad_norm": 0.5890675783157349, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 11412 + }, + { + "epoch": 1.3259366831251815, + "grad_norm": 0.5870331525802612, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 11413 + }, + { + "epoch": 1.326052860877142, + "grad_norm": 0.5778072476387024, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 11414 + }, + { + "epoch": 1.3261690386291025, + "grad_norm": 0.5781219601631165, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 11415 + }, + { + "epoch": 1.326285216381063, + "grad_norm": 0.6099265217781067, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 11416 + }, + { + "epoch": 1.3264013941330235, + "grad_norm": 0.5225432515144348, + "learning_rate": 0.0001, + "loss": 1.2422, + "step": 11417 + }, + { + "epoch": 1.326517571884984, + "grad_norm": 0.630908191204071, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 11418 + }, + { + "epoch": 1.3266337496369445, + "grad_norm": 0.5389137268066406, + "learning_rate": 0.0001, + "loss": 1.2754, + "step": 11419 + }, + { + "epoch": 1.326749927388905, + "grad_norm": 0.6575531959533691, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 11420 + }, + { + "epoch": 1.3268661051408654, + "grad_norm": 0.6499255299568176, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 11421 + }, + { + "epoch": 1.3269822828928262, + "grad_norm": 0.5644387006759644, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 11422 + }, + { + "epoch": 1.3270984606447866, + "grad_norm": 0.6073735356330872, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 11423 + }, + { + "epoch": 1.3272146383967471, + "grad_norm": 0.6502189636230469, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 11424 + }, + { + "epoch": 1.3273308161487076, + "grad_norm": 0.6316292881965637, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 11425 + }, + { + "epoch": 1.3274469939006681, + "grad_norm": 0.5228843092918396, + "learning_rate": 0.0001, + "loss": 1.3864, + "step": 11426 + }, + { + "epoch": 1.3275631716526286, + "grad_norm": 0.5909907221794128, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 11427 + }, + { + "epoch": 1.327679349404589, + "grad_norm": 0.6318853497505188, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 11428 + }, + { + "epoch": 1.3277955271565496, + "grad_norm": 0.5925455689430237, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 11429 + }, + { + "epoch": 1.32791170490851, + "grad_norm": 0.6002850532531738, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 11430 + }, + { + "epoch": 1.3280278826604706, + "grad_norm": 0.6215521693229675, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 11431 + }, + { + "epoch": 1.328144060412431, + "grad_norm": 0.6084332466125488, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 11432 + }, + { + "epoch": 1.3282602381643915, + "grad_norm": 0.6313285231590271, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 11433 + }, + { + "epoch": 1.328376415916352, + "grad_norm": 0.5687292814254761, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 11434 + }, + { + "epoch": 1.3284925936683125, + "grad_norm": 0.6608239412307739, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 11435 + }, + { + "epoch": 1.328608771420273, + "grad_norm": 0.6085755825042725, + "learning_rate": 0.0001, + "loss": 1.4592, + "step": 11436 + }, + { + "epoch": 1.3287249491722335, + "grad_norm": 0.62114417552948, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 11437 + }, + { + "epoch": 1.328841126924194, + "grad_norm": 0.5876638889312744, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 11438 + }, + { + "epoch": 1.3289573046761545, + "grad_norm": 0.5959676504135132, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 11439 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.569511890411377, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 11440 + }, + { + "epoch": 1.3291896601800754, + "grad_norm": 0.5644763112068176, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 11441 + }, + { + "epoch": 1.329305837932036, + "grad_norm": 0.5564660429954529, + "learning_rate": 0.0001, + "loss": 1.5077, + "step": 11442 + }, + { + "epoch": 1.3294220156839964, + "grad_norm": 0.5818626284599304, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 11443 + }, + { + "epoch": 1.329538193435957, + "grad_norm": 0.6205581426620483, + "learning_rate": 0.0001, + "loss": 1.3026, + "step": 11444 + }, + { + "epoch": 1.3296543711879174, + "grad_norm": 0.5764113068580627, + "learning_rate": 0.0001, + "loss": 1.414, + "step": 11445 + }, + { + "epoch": 1.329770548939878, + "grad_norm": 0.5862987637519836, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 11446 + }, + { + "epoch": 1.3298867266918384, + "grad_norm": 0.5654096007347107, + "learning_rate": 0.0001, + "loss": 1.2999, + "step": 11447 + }, + { + "epoch": 1.330002904443799, + "grad_norm": 0.6210341453552246, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 11448 + }, + { + "epoch": 1.3301190821957596, + "grad_norm": 0.6105040311813354, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 11449 + }, + { + "epoch": 1.33023525994772, + "grad_norm": 0.5692093372344971, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 11450 + }, + { + "epoch": 1.3303514376996806, + "grad_norm": 0.5587416887283325, + "learning_rate": 0.0001, + "loss": 1.3777, + "step": 11451 + }, + { + "epoch": 1.330467615451641, + "grad_norm": 0.5672914981842041, + "learning_rate": 0.0001, + "loss": 1.3449, + "step": 11452 + }, + { + "epoch": 1.3305837932036015, + "grad_norm": 0.5590872764587402, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 11453 + }, + { + "epoch": 1.330699970955562, + "grad_norm": 0.5552334189414978, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 11454 + }, + { + "epoch": 1.3308161487075225, + "grad_norm": 0.6139841079711914, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 11455 + }, + { + "epoch": 1.330932326459483, + "grad_norm": 0.5881022810935974, + "learning_rate": 0.0001, + "loss": 1.3873, + "step": 11456 + }, + { + "epoch": 1.3310485042114435, + "grad_norm": 0.5796955227851868, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 11457 + }, + { + "epoch": 1.331164681963404, + "grad_norm": 0.5995549559593201, + "learning_rate": 0.0001, + "loss": 1.437, + "step": 11458 + }, + { + "epoch": 1.3312808597153645, + "grad_norm": 0.5792880654335022, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 11459 + }, + { + "epoch": 1.331397037467325, + "grad_norm": 0.5754793882369995, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 11460 + }, + { + "epoch": 1.3315132152192855, + "grad_norm": 0.593312680721283, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 11461 + }, + { + "epoch": 1.331629392971246, + "grad_norm": 0.5615260601043701, + "learning_rate": 0.0001, + "loss": 1.3415, + "step": 11462 + }, + { + "epoch": 1.3317455707232064, + "grad_norm": 0.5913121104240417, + "learning_rate": 0.0001, + "loss": 1.2549, + "step": 11463 + }, + { + "epoch": 1.3318617484751671, + "grad_norm": 0.55987948179245, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 11464 + }, + { + "epoch": 1.3319779262271276, + "grad_norm": 0.6145794987678528, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 11465 + }, + { + "epoch": 1.3320941039790881, + "grad_norm": 0.5672405362129211, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 11466 + }, + { + "epoch": 1.3322102817310486, + "grad_norm": 0.6223773956298828, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 11467 + }, + { + "epoch": 1.332326459483009, + "grad_norm": 0.5505383014678955, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 11468 + }, + { + "epoch": 1.3324426372349696, + "grad_norm": 0.6319714784622192, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 11469 + }, + { + "epoch": 1.33255881498693, + "grad_norm": 0.5813085436820984, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 11470 + }, + { + "epoch": 1.3326749927388906, + "grad_norm": 0.5962019562721252, + "learning_rate": 0.0001, + "loss": 1.3574, + "step": 11471 + }, + { + "epoch": 1.332791170490851, + "grad_norm": 0.5934303402900696, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 11472 + }, + { + "epoch": 1.3329073482428115, + "grad_norm": 0.6068841218948364, + "learning_rate": 0.0001, + "loss": 1.3101, + "step": 11473 + }, + { + "epoch": 1.333023525994772, + "grad_norm": 0.5842049717903137, + "learning_rate": 0.0001, + "loss": 1.522, + "step": 11474 + }, + { + "epoch": 1.3331397037467325, + "grad_norm": 0.6370680928230286, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 11475 + }, + { + "epoch": 1.333255881498693, + "grad_norm": 0.6403306722640991, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 11476 + }, + { + "epoch": 1.3333720592506535, + "grad_norm": 0.5705578327178955, + "learning_rate": 0.0001, + "loss": 1.387, + "step": 11477 + }, + { + "epoch": 1.333488237002614, + "grad_norm": 0.5864787101745605, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 11478 + }, + { + "epoch": 1.3336044147545745, + "grad_norm": 0.5673004388809204, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 11479 + }, + { + "epoch": 1.333720592506535, + "grad_norm": 0.5407872200012207, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 11480 + }, + { + "epoch": 1.3338367702584955, + "grad_norm": 0.53462815284729, + "learning_rate": 0.0001, + "loss": 1.3134, + "step": 11481 + }, + { + "epoch": 1.333952948010456, + "grad_norm": 0.5789211392402649, + "learning_rate": 0.0001, + "loss": 1.3455, + "step": 11482 + }, + { + "epoch": 1.3340691257624164, + "grad_norm": 0.6079903244972229, + "learning_rate": 0.0001, + "loss": 1.3866, + "step": 11483 + }, + { + "epoch": 1.334185303514377, + "grad_norm": 0.5825889110565186, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 11484 + }, + { + "epoch": 1.3343014812663374, + "grad_norm": 0.565239429473877, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 11485 + }, + { + "epoch": 1.334417659018298, + "grad_norm": 0.5922324061393738, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 11486 + }, + { + "epoch": 1.3345338367702584, + "grad_norm": 0.5655604600906372, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 11487 + }, + { + "epoch": 1.3346500145222189, + "grad_norm": 0.6273969411849976, + "learning_rate": 0.0001, + "loss": 1.7001, + "step": 11488 + }, + { + "epoch": 1.3347661922741794, + "grad_norm": 0.6035481691360474, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 11489 + }, + { + "epoch": 1.33488237002614, + "grad_norm": 0.5946334004402161, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 11490 + }, + { + "epoch": 1.3349985477781006, + "grad_norm": 0.587891697883606, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 11491 + }, + { + "epoch": 1.335114725530061, + "grad_norm": 0.6165737509727478, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 11492 + }, + { + "epoch": 1.3352309032820215, + "grad_norm": 0.6015834808349609, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 11493 + }, + { + "epoch": 1.335347081033982, + "grad_norm": 0.5921751856803894, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 11494 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 0.602714478969574, + "learning_rate": 0.0001, + "loss": 1.2711, + "step": 11495 + }, + { + "epoch": 1.335579436537903, + "grad_norm": 0.5740492939949036, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 11496 + }, + { + "epoch": 1.3356956142898635, + "grad_norm": 0.5856906771659851, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 11497 + }, + { + "epoch": 1.335811792041824, + "grad_norm": 0.5492340922355652, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 11498 + }, + { + "epoch": 1.3359279697937845, + "grad_norm": 0.6142228245735168, + "learning_rate": 0.0001, + "loss": 1.4071, + "step": 11499 + }, + { + "epoch": 1.336044147545745, + "grad_norm": 0.5769044160842896, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 11500 + }, + { + "epoch": 1.3361603252977055, + "grad_norm": 0.6063992381095886, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 11501 + }, + { + "epoch": 1.336276503049666, + "grad_norm": 0.5918049812316895, + "learning_rate": 0.0001, + "loss": 1.4288, + "step": 11502 + }, + { + "epoch": 1.3363926808016264, + "grad_norm": 0.6081083416938782, + "learning_rate": 0.0001, + "loss": 1.4468, + "step": 11503 + }, + { + "epoch": 1.336508858553587, + "grad_norm": 0.567497730255127, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 11504 + }, + { + "epoch": 1.3366250363055474, + "grad_norm": 0.5840210318565369, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 11505 + }, + { + "epoch": 1.3367412140575081, + "grad_norm": 0.5692864656448364, + "learning_rate": 0.0001, + "loss": 1.3308, + "step": 11506 + }, + { + "epoch": 1.3368573918094686, + "grad_norm": 0.567046046257019, + "learning_rate": 0.0001, + "loss": 1.4432, + "step": 11507 + }, + { + "epoch": 1.336973569561429, + "grad_norm": 0.5424807667732239, + "learning_rate": 0.0001, + "loss": 1.3719, + "step": 11508 + }, + { + "epoch": 1.3370897473133896, + "grad_norm": 0.5703045725822449, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 11509 + }, + { + "epoch": 1.33720592506535, + "grad_norm": 0.5481063723564148, + "learning_rate": 0.0001, + "loss": 1.3803, + "step": 11510 + }, + { + "epoch": 1.3373221028173106, + "grad_norm": 0.5333367586135864, + "learning_rate": 0.0001, + "loss": 1.3449, + "step": 11511 + }, + { + "epoch": 1.337438280569271, + "grad_norm": 0.6329618096351624, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 11512 + }, + { + "epoch": 1.3375544583212315, + "grad_norm": 0.5826229453086853, + "learning_rate": 0.0001, + "loss": 1.3026, + "step": 11513 + }, + { + "epoch": 1.337670636073192, + "grad_norm": 0.5669860243797302, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 11514 + }, + { + "epoch": 1.3377868138251525, + "grad_norm": 0.5839524865150452, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 11515 + }, + { + "epoch": 1.337902991577113, + "grad_norm": 0.6274355053901672, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 11516 + }, + { + "epoch": 1.3380191693290735, + "grad_norm": 0.5699935555458069, + "learning_rate": 0.0001, + "loss": 1.3266, + "step": 11517 + }, + { + "epoch": 1.338135347081034, + "grad_norm": 0.5473787188529968, + "learning_rate": 0.0001, + "loss": 1.3531, + "step": 11518 + }, + { + "epoch": 1.3382515248329945, + "grad_norm": 0.605440080165863, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 11519 + }, + { + "epoch": 1.338367702584955, + "grad_norm": 0.558005690574646, + "learning_rate": 0.0001, + "loss": 1.3049, + "step": 11520 + }, + { + "epoch": 1.3384838803369155, + "grad_norm": 0.6115478873252869, + "learning_rate": 0.0001, + "loss": 1.5498, + "step": 11521 + }, + { + "epoch": 1.338600058088876, + "grad_norm": 0.590194821357727, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 11522 + }, + { + "epoch": 1.3387162358408364, + "grad_norm": 0.5935150980949402, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 11523 + }, + { + "epoch": 1.338832413592797, + "grad_norm": 0.6085209846496582, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 11524 + }, + { + "epoch": 1.3389485913447574, + "grad_norm": 0.5541259050369263, + "learning_rate": 0.0001, + "loss": 1.332, + "step": 11525 + }, + { + "epoch": 1.339064769096718, + "grad_norm": 0.5870988368988037, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 11526 + }, + { + "epoch": 1.3391809468486784, + "grad_norm": 0.5646886825561523, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 11527 + }, + { + "epoch": 1.3392971246006389, + "grad_norm": 0.5684593319892883, + "learning_rate": 0.0001, + "loss": 1.4385, + "step": 11528 + }, + { + "epoch": 1.3394133023525994, + "grad_norm": 0.6310387849807739, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 11529 + }, + { + "epoch": 1.3395294801045599, + "grad_norm": 0.6121505498886108, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 11530 + }, + { + "epoch": 1.3396456578565203, + "grad_norm": 0.5295522212982178, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 11531 + }, + { + "epoch": 1.339761835608481, + "grad_norm": 0.5718926191329956, + "learning_rate": 0.0001, + "loss": 1.3464, + "step": 11532 + }, + { + "epoch": 1.3398780133604415, + "grad_norm": 0.598277747631073, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 11533 + }, + { + "epoch": 1.339994191112402, + "grad_norm": 0.5410280227661133, + "learning_rate": 0.0001, + "loss": 1.3229, + "step": 11534 + }, + { + "epoch": 1.3401103688643625, + "grad_norm": 0.545367956161499, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 11535 + }, + { + "epoch": 1.340226546616323, + "grad_norm": 0.5809006690979004, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 11536 + }, + { + "epoch": 1.3403427243682835, + "grad_norm": 0.6075916886329651, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 11537 + }, + { + "epoch": 1.340458902120244, + "grad_norm": 0.5859774351119995, + "learning_rate": 0.0001, + "loss": 1.214, + "step": 11538 + }, + { + "epoch": 1.3405750798722045, + "grad_norm": 0.5794239640235901, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 11539 + }, + { + "epoch": 1.340691257624165, + "grad_norm": 0.5825338959693909, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 11540 + }, + { + "epoch": 1.3408074353761255, + "grad_norm": 0.5686221718788147, + "learning_rate": 0.0001, + "loss": 1.2162, + "step": 11541 + }, + { + "epoch": 1.340923613128086, + "grad_norm": 0.5766289234161377, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 11542 + }, + { + "epoch": 1.3410397908800464, + "grad_norm": 0.6284175515174866, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 11543 + }, + { + "epoch": 1.341155968632007, + "grad_norm": 0.6178475022315979, + "learning_rate": 0.0001, + "loss": 1.261, + "step": 11544 + }, + { + "epoch": 1.3412721463839674, + "grad_norm": 0.6055488586425781, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 11545 + }, + { + "epoch": 1.341388324135928, + "grad_norm": 0.6022990345954895, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 11546 + }, + { + "epoch": 1.3415045018878884, + "grad_norm": 0.6357380151748657, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 11547 + }, + { + "epoch": 1.341620679639849, + "grad_norm": 0.582905650138855, + "learning_rate": 0.0001, + "loss": 1.405, + "step": 11548 + }, + { + "epoch": 1.3417368573918096, + "grad_norm": 0.5897303223609924, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 11549 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 0.6201091408729553, + "learning_rate": 0.0001, + "loss": 1.305, + "step": 11550 + }, + { + "epoch": 1.3419692128957306, + "grad_norm": 0.5758824348449707, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 11551 + }, + { + "epoch": 1.342085390647691, + "grad_norm": 0.5709869861602783, + "learning_rate": 0.0001, + "loss": 1.3256, + "step": 11552 + }, + { + "epoch": 1.3422015683996515, + "grad_norm": 0.6379136443138123, + "learning_rate": 0.0001, + "loss": 1.4303, + "step": 11553 + }, + { + "epoch": 1.342317746151612, + "grad_norm": 0.5988776087760925, + "learning_rate": 0.0001, + "loss": 1.4281, + "step": 11554 + }, + { + "epoch": 1.3424339239035725, + "grad_norm": 0.597923994064331, + "learning_rate": 0.0001, + "loss": 1.4695, + "step": 11555 + }, + { + "epoch": 1.342550101655533, + "grad_norm": 0.578001081943512, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 11556 + }, + { + "epoch": 1.3426662794074935, + "grad_norm": 0.5900245308876038, + "learning_rate": 0.0001, + "loss": 1.5779, + "step": 11557 + }, + { + "epoch": 1.342782457159454, + "grad_norm": 0.5991559028625488, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 11558 + }, + { + "epoch": 1.3428986349114145, + "grad_norm": 0.5901554822921753, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 11559 + }, + { + "epoch": 1.343014812663375, + "grad_norm": 0.5958110094070435, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 11560 + }, + { + "epoch": 1.3431309904153355, + "grad_norm": 0.6428405046463013, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 11561 + }, + { + "epoch": 1.343247168167296, + "grad_norm": 0.6413325071334839, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 11562 + }, + { + "epoch": 1.3433633459192564, + "grad_norm": 0.596393346786499, + "learning_rate": 0.0001, + "loss": 1.4102, + "step": 11563 + }, + { + "epoch": 1.343479523671217, + "grad_norm": 0.6022195219993591, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 11564 + }, + { + "epoch": 1.3435957014231774, + "grad_norm": 0.5690363049507141, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 11565 + }, + { + "epoch": 1.343711879175138, + "grad_norm": 0.5736088156700134, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 11566 + }, + { + "epoch": 1.3438280569270984, + "grad_norm": 0.551696240901947, + "learning_rate": 0.0001, + "loss": 1.2572, + "step": 11567 + }, + { + "epoch": 1.3439442346790589, + "grad_norm": 0.5662961602210999, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 11568 + }, + { + "epoch": 1.3440604124310194, + "grad_norm": 0.5956161022186279, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 11569 + }, + { + "epoch": 1.3441765901829799, + "grad_norm": 0.5635676383972168, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 11570 + }, + { + "epoch": 1.3442927679349403, + "grad_norm": 0.5952593684196472, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 11571 + }, + { + "epoch": 1.3444089456869008, + "grad_norm": 0.5866329073905945, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 11572 + }, + { + "epoch": 1.3445251234388613, + "grad_norm": 0.6334074139595032, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 11573 + }, + { + "epoch": 1.344641301190822, + "grad_norm": 0.5864160656929016, + "learning_rate": 0.0001, + "loss": 1.3639, + "step": 11574 + }, + { + "epoch": 1.3447574789427825, + "grad_norm": 0.5770210027694702, + "learning_rate": 0.0001, + "loss": 1.3357, + "step": 11575 + }, + { + "epoch": 1.344873656694743, + "grad_norm": 0.5954274535179138, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 11576 + }, + { + "epoch": 1.3449898344467035, + "grad_norm": 0.6258078813552856, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 11577 + }, + { + "epoch": 1.345106012198664, + "grad_norm": 0.62962406873703, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 11578 + }, + { + "epoch": 1.3452221899506245, + "grad_norm": 0.572242259979248, + "learning_rate": 0.0001, + "loss": 1.2881, + "step": 11579 + }, + { + "epoch": 1.345338367702585, + "grad_norm": 0.5981161594390869, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 11580 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 0.5858927369117737, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 11581 + }, + { + "epoch": 1.345570723206506, + "grad_norm": 0.5862258672714233, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 11582 + }, + { + "epoch": 1.3456869009584664, + "grad_norm": 0.5800897479057312, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 11583 + }, + { + "epoch": 1.345803078710427, + "grad_norm": 0.5962756276130676, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 11584 + }, + { + "epoch": 1.3459192564623874, + "grad_norm": 0.5521575808525085, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 11585 + }, + { + "epoch": 1.346035434214348, + "grad_norm": 0.5704815983772278, + "learning_rate": 0.0001, + "loss": 1.4965, + "step": 11586 + }, + { + "epoch": 1.3461516119663084, + "grad_norm": 0.55373615026474, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 11587 + }, + { + "epoch": 1.3462677897182689, + "grad_norm": 0.5447104573249817, + "learning_rate": 0.0001, + "loss": 1.3546, + "step": 11588 + }, + { + "epoch": 1.3463839674702294, + "grad_norm": 0.6218578815460205, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 11589 + }, + { + "epoch": 1.34650014522219, + "grad_norm": 0.5952721238136292, + "learning_rate": 0.0001, + "loss": 1.3838, + "step": 11590 + }, + { + "epoch": 1.3466163229741506, + "grad_norm": 0.5739622712135315, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 11591 + }, + { + "epoch": 1.346732500726111, + "grad_norm": 0.5628431439399719, + "learning_rate": 0.0001, + "loss": 1.3305, + "step": 11592 + }, + { + "epoch": 1.3468486784780715, + "grad_norm": 0.5847616195678711, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 11593 + }, + { + "epoch": 1.346964856230032, + "grad_norm": 0.5959730744361877, + "learning_rate": 0.0001, + "loss": 1.4444, + "step": 11594 + }, + { + "epoch": 1.3470810339819925, + "grad_norm": 0.5944980382919312, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 11595 + }, + { + "epoch": 1.347197211733953, + "grad_norm": 0.6205131411552429, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 11596 + }, + { + "epoch": 1.3473133894859135, + "grad_norm": 0.6303896903991699, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 11597 + }, + { + "epoch": 1.347429567237874, + "grad_norm": 0.5928091406822205, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 11598 + }, + { + "epoch": 1.3475457449898345, + "grad_norm": 0.6262045502662659, + "learning_rate": 0.0001, + "loss": 1.5543, + "step": 11599 + }, + { + "epoch": 1.347661922741795, + "grad_norm": 0.6027948260307312, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 11600 + }, + { + "epoch": 1.3477781004937555, + "grad_norm": 0.5992240905761719, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 11601 + }, + { + "epoch": 1.347894278245716, + "grad_norm": 0.5964482426643372, + "learning_rate": 0.0001, + "loss": 1.4014, + "step": 11602 + }, + { + "epoch": 1.3480104559976764, + "grad_norm": 0.5847633481025696, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 11603 + }, + { + "epoch": 1.348126633749637, + "grad_norm": 0.5737441778182983, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 11604 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 0.5500698685646057, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 11605 + }, + { + "epoch": 1.348358989253558, + "grad_norm": 0.5695959329605103, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 11606 + }, + { + "epoch": 1.3484751670055184, + "grad_norm": 0.5738762617111206, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 11607 + }, + { + "epoch": 1.3485913447574789, + "grad_norm": 0.5387170314788818, + "learning_rate": 0.0001, + "loss": 1.402, + "step": 11608 + }, + { + "epoch": 1.3487075225094394, + "grad_norm": 0.5509867072105408, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 11609 + }, + { + "epoch": 1.3488237002613999, + "grad_norm": 0.5982716679573059, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 11610 + }, + { + "epoch": 1.3489398780133603, + "grad_norm": 0.5490851402282715, + "learning_rate": 0.0001, + "loss": 1.4278, + "step": 11611 + }, + { + "epoch": 1.3490560557653208, + "grad_norm": 0.5994747877120972, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 11612 + }, + { + "epoch": 1.3491722335172813, + "grad_norm": 0.6025500297546387, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 11613 + }, + { + "epoch": 1.3492884112692418, + "grad_norm": 0.5859069228172302, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 11614 + }, + { + "epoch": 1.3494045890212023, + "grad_norm": 0.6145164370536804, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 11615 + }, + { + "epoch": 1.349520766773163, + "grad_norm": 0.6078627109527588, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 11616 + }, + { + "epoch": 1.3496369445251235, + "grad_norm": 0.5715606808662415, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 11617 + }, + { + "epoch": 1.349753122277084, + "grad_norm": 0.5815566778182983, + "learning_rate": 0.0001, + "loss": 1.403, + "step": 11618 + }, + { + "epoch": 1.3498693000290445, + "grad_norm": 0.5756155252456665, + "learning_rate": 0.0001, + "loss": 1.3829, + "step": 11619 + }, + { + "epoch": 1.349985477781005, + "grad_norm": 0.6013368368148804, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 11620 + }, + { + "epoch": 1.3501016555329655, + "grad_norm": 0.5688734650611877, + "learning_rate": 0.0001, + "loss": 1.3319, + "step": 11621 + }, + { + "epoch": 1.350217833284926, + "grad_norm": 0.5575860738754272, + "learning_rate": 0.0001, + "loss": 1.3737, + "step": 11622 + }, + { + "epoch": 1.3503340110368864, + "grad_norm": 0.6282293200492859, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 11623 + }, + { + "epoch": 1.350450188788847, + "grad_norm": 0.5837210416793823, + "learning_rate": 0.0001, + "loss": 1.3822, + "step": 11624 + }, + { + "epoch": 1.3505663665408074, + "grad_norm": 0.6083855032920837, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 11625 + }, + { + "epoch": 1.350682544292768, + "grad_norm": 0.5760353803634644, + "learning_rate": 0.0001, + "loss": 1.4134, + "step": 11626 + }, + { + "epoch": 1.3507987220447284, + "grad_norm": 0.5956183075904846, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 11627 + }, + { + "epoch": 1.3509148997966889, + "grad_norm": 0.5973549485206604, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 11628 + }, + { + "epoch": 1.3510310775486494, + "grad_norm": 0.5945565104484558, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 11629 + }, + { + "epoch": 1.3511472553006099, + "grad_norm": 0.5747148394584656, + "learning_rate": 0.0001, + "loss": 1.383, + "step": 11630 + }, + { + "epoch": 1.3512634330525704, + "grad_norm": 0.6089474558830261, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 11631 + }, + { + "epoch": 1.351379610804531, + "grad_norm": 0.5638538599014282, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 11632 + }, + { + "epoch": 1.3514957885564916, + "grad_norm": 0.5719699263572693, + "learning_rate": 0.0001, + "loss": 1.3883, + "step": 11633 + }, + { + "epoch": 1.351611966308452, + "grad_norm": 0.6159690618515015, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 11634 + }, + { + "epoch": 1.3517281440604125, + "grad_norm": 0.6014747023582458, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 11635 + }, + { + "epoch": 1.351844321812373, + "grad_norm": 0.5920547246932983, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 11636 + }, + { + "epoch": 1.3519604995643335, + "grad_norm": 0.596462607383728, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 11637 + }, + { + "epoch": 1.352076677316294, + "grad_norm": 0.5985187888145447, + "learning_rate": 0.0001, + "loss": 1.4105, + "step": 11638 + }, + { + "epoch": 1.3521928550682545, + "grad_norm": 0.556800901889801, + "learning_rate": 0.0001, + "loss": 1.31, + "step": 11639 + }, + { + "epoch": 1.352309032820215, + "grad_norm": 0.5883356332778931, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 11640 + }, + { + "epoch": 1.3524252105721755, + "grad_norm": 0.5282686352729797, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 11641 + }, + { + "epoch": 1.352541388324136, + "grad_norm": 0.5937612056732178, + "learning_rate": 0.0001, + "loss": 1.4411, + "step": 11642 + }, + { + "epoch": 1.3526575660760964, + "grad_norm": 0.6212623715400696, + "learning_rate": 0.0001, + "loss": 1.5084, + "step": 11643 + }, + { + "epoch": 1.352773743828057, + "grad_norm": 0.5802718997001648, + "learning_rate": 0.0001, + "loss": 1.3355, + "step": 11644 + }, + { + "epoch": 1.3528899215800174, + "grad_norm": 0.5913172364234924, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 11645 + }, + { + "epoch": 1.353006099331978, + "grad_norm": 0.5834310054779053, + "learning_rate": 0.0001, + "loss": 1.4286, + "step": 11646 + }, + { + "epoch": 1.3531222770839384, + "grad_norm": 0.5490809679031372, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 11647 + }, + { + "epoch": 1.3532384548358989, + "grad_norm": 0.5814580321311951, + "learning_rate": 0.0001, + "loss": 1.304, + "step": 11648 + }, + { + "epoch": 1.3533546325878594, + "grad_norm": 0.6068845391273499, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 11649 + }, + { + "epoch": 1.3534708103398199, + "grad_norm": 0.6316379308700562, + "learning_rate": 0.0001, + "loss": 1.2805, + "step": 11650 + }, + { + "epoch": 1.3535869880917804, + "grad_norm": 0.5487270951271057, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 11651 + }, + { + "epoch": 1.3537031658437408, + "grad_norm": 0.6253399848937988, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 11652 + }, + { + "epoch": 1.3538193435957013, + "grad_norm": 0.5893374085426331, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 11653 + }, + { + "epoch": 1.3539355213476618, + "grad_norm": 0.5669167637825012, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 11654 + }, + { + "epoch": 1.3540516990996223, + "grad_norm": 0.601037323474884, + "learning_rate": 0.0001, + "loss": 1.3515, + "step": 11655 + }, + { + "epoch": 1.3541678768515828, + "grad_norm": 0.5730093717575073, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 11656 + }, + { + "epoch": 1.3542840546035433, + "grad_norm": 0.5829418301582336, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 11657 + }, + { + "epoch": 1.354400232355504, + "grad_norm": 0.5747588872909546, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 11658 + }, + { + "epoch": 1.3545164101074645, + "grad_norm": 0.6172161102294922, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 11659 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 0.5914645195007324, + "learning_rate": 0.0001, + "loss": 1.37, + "step": 11660 + }, + { + "epoch": 1.3547487656113855, + "grad_norm": 0.6524379253387451, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 11661 + }, + { + "epoch": 1.354864943363346, + "grad_norm": 0.588579535484314, + "learning_rate": 0.0001, + "loss": 1.38, + "step": 11662 + }, + { + "epoch": 1.3549811211153064, + "grad_norm": 0.5805348753929138, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 11663 + }, + { + "epoch": 1.355097298867267, + "grad_norm": 0.5591554641723633, + "learning_rate": 0.0001, + "loss": 1.4325, + "step": 11664 + }, + { + "epoch": 1.3552134766192274, + "grad_norm": 0.545655369758606, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 11665 + }, + { + "epoch": 1.355329654371188, + "grad_norm": 0.5984359383583069, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 11666 + }, + { + "epoch": 1.3554458321231484, + "grad_norm": 0.5707201361656189, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 11667 + }, + { + "epoch": 1.3555620098751089, + "grad_norm": 0.6246615648269653, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 11668 + }, + { + "epoch": 1.3556781876270694, + "grad_norm": 0.5760742425918579, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 11669 + }, + { + "epoch": 1.3557943653790299, + "grad_norm": 0.5490787029266357, + "learning_rate": 0.0001, + "loss": 1.2988, + "step": 11670 + }, + { + "epoch": 1.3559105431309904, + "grad_norm": 0.5468339920043945, + "learning_rate": 0.0001, + "loss": 1.2894, + "step": 11671 + }, + { + "epoch": 1.3560267208829508, + "grad_norm": 0.6399328708648682, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 11672 + }, + { + "epoch": 1.3561428986349113, + "grad_norm": 0.6269949078559875, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 11673 + }, + { + "epoch": 1.356259076386872, + "grad_norm": 0.5965935587882996, + "learning_rate": 0.0001, + "loss": 1.4121, + "step": 11674 + }, + { + "epoch": 1.3563752541388325, + "grad_norm": 0.586742639541626, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 11675 + }, + { + "epoch": 1.356491431890793, + "grad_norm": 0.5911440253257751, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 11676 + }, + { + "epoch": 1.3566076096427535, + "grad_norm": 0.6567511558532715, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 11677 + }, + { + "epoch": 1.356723787394714, + "grad_norm": 0.5498983263969421, + "learning_rate": 0.0001, + "loss": 1.269, + "step": 11678 + }, + { + "epoch": 1.3568399651466745, + "grad_norm": 0.5766180753707886, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 11679 + }, + { + "epoch": 1.356956142898635, + "grad_norm": 0.5323266386985779, + "learning_rate": 0.0001, + "loss": 1.2326, + "step": 11680 + }, + { + "epoch": 1.3570723206505955, + "grad_norm": 0.5497226119041443, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 11681 + }, + { + "epoch": 1.357188498402556, + "grad_norm": 0.5257942080497742, + "learning_rate": 0.0001, + "loss": 1.2964, + "step": 11682 + }, + { + "epoch": 1.3573046761545164, + "grad_norm": 0.6501601338386536, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 11683 + }, + { + "epoch": 1.357420853906477, + "grad_norm": 0.5762137174606323, + "learning_rate": 0.0001, + "loss": 1.3364, + "step": 11684 + }, + { + "epoch": 1.3575370316584374, + "grad_norm": 0.6714363694190979, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 11685 + }, + { + "epoch": 1.357653209410398, + "grad_norm": 0.614554762840271, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 11686 + }, + { + "epoch": 1.3577693871623584, + "grad_norm": 0.5594595670700073, + "learning_rate": 0.0001, + "loss": 1.3527, + "step": 11687 + }, + { + "epoch": 1.3578855649143189, + "grad_norm": 0.6071390509605408, + "learning_rate": 0.0001, + "loss": 1.4608, + "step": 11688 + }, + { + "epoch": 1.3580017426662794, + "grad_norm": 0.5696654319763184, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 11689 + }, + { + "epoch": 1.3581179204182399, + "grad_norm": 0.5189816355705261, + "learning_rate": 0.0001, + "loss": 1.2097, + "step": 11690 + }, + { + "epoch": 1.3582340981702004, + "grad_norm": 0.5831829905509949, + "learning_rate": 0.0001, + "loss": 1.5446, + "step": 11691 + }, + { + "epoch": 1.3583502759221608, + "grad_norm": 0.6040063500404358, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 11692 + }, + { + "epoch": 1.3584664536741213, + "grad_norm": 0.5630427598953247, + "learning_rate": 0.0001, + "loss": 1.384, + "step": 11693 + }, + { + "epoch": 1.3585826314260818, + "grad_norm": 0.6305066347122192, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 11694 + }, + { + "epoch": 1.3586988091780423, + "grad_norm": 0.614304780960083, + "learning_rate": 0.0001, + "loss": 1.4363, + "step": 11695 + }, + { + "epoch": 1.3588149869300028, + "grad_norm": 0.5854483842849731, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 11696 + }, + { + "epoch": 1.3589311646819633, + "grad_norm": 0.5789347290992737, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 11697 + }, + { + "epoch": 1.3590473424339238, + "grad_norm": 0.5860118269920349, + "learning_rate": 0.0001, + "loss": 1.4509, + "step": 11698 + }, + { + "epoch": 1.3591635201858843, + "grad_norm": 0.5637704730033875, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 11699 + }, + { + "epoch": 1.359279697937845, + "grad_norm": 0.6265964508056641, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 11700 + }, + { + "epoch": 1.3593958756898055, + "grad_norm": 0.6148953437805176, + "learning_rate": 0.0001, + "loss": 1.4862, + "step": 11701 + }, + { + "epoch": 1.359512053441766, + "grad_norm": 0.6008301973342896, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 11702 + }, + { + "epoch": 1.3596282311937264, + "grad_norm": 0.6362335681915283, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 11703 + }, + { + "epoch": 1.359744408945687, + "grad_norm": 0.6056876182556152, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 11704 + }, + { + "epoch": 1.3598605866976474, + "grad_norm": 0.625529944896698, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 11705 + }, + { + "epoch": 1.359976764449608, + "grad_norm": 0.6198188662528992, + "learning_rate": 0.0001, + "loss": 1.3204, + "step": 11706 + }, + { + "epoch": 1.3600929422015684, + "grad_norm": 0.540850818157196, + "learning_rate": 0.0001, + "loss": 1.3004, + "step": 11707 + }, + { + "epoch": 1.360209119953529, + "grad_norm": 0.6461174488067627, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 11708 + }, + { + "epoch": 1.3603252977054894, + "grad_norm": 0.5790330767631531, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 11709 + }, + { + "epoch": 1.3604414754574499, + "grad_norm": 0.6312645673751831, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 11710 + }, + { + "epoch": 1.3605576532094104, + "grad_norm": 0.5561741590499878, + "learning_rate": 0.0001, + "loss": 1.2689, + "step": 11711 + }, + { + "epoch": 1.3606738309613708, + "grad_norm": 0.5601104497909546, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 11712 + }, + { + "epoch": 1.3607900087133313, + "grad_norm": 0.6619279384613037, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 11713 + }, + { + "epoch": 1.3609061864652918, + "grad_norm": 0.6291702389717102, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 11714 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 0.6017669439315796, + "learning_rate": 0.0001, + "loss": 1.3569, + "step": 11715 + }, + { + "epoch": 1.361138541969213, + "grad_norm": 0.5894443988800049, + "learning_rate": 0.0001, + "loss": 1.5589, + "step": 11716 + }, + { + "epoch": 1.3612547197211735, + "grad_norm": 0.6237130165100098, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 11717 + }, + { + "epoch": 1.361370897473134, + "grad_norm": 0.5698156952857971, + "learning_rate": 0.0001, + "loss": 1.3084, + "step": 11718 + }, + { + "epoch": 1.3614870752250945, + "grad_norm": 0.5625483393669128, + "learning_rate": 0.0001, + "loss": 1.3964, + "step": 11719 + }, + { + "epoch": 1.361603252977055, + "grad_norm": 0.6011530756950378, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 11720 + }, + { + "epoch": 1.3617194307290155, + "grad_norm": 0.5694208741188049, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 11721 + }, + { + "epoch": 1.361835608480976, + "grad_norm": 0.6437340974807739, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 11722 + }, + { + "epoch": 1.3619517862329364, + "grad_norm": 0.5433085560798645, + "learning_rate": 0.0001, + "loss": 1.2774, + "step": 11723 + }, + { + "epoch": 1.362067963984897, + "grad_norm": 0.5646154880523682, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 11724 + }, + { + "epoch": 1.3621841417368574, + "grad_norm": 0.5664541125297546, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 11725 + }, + { + "epoch": 1.362300319488818, + "grad_norm": 0.536392867565155, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 11726 + }, + { + "epoch": 1.3624164972407784, + "grad_norm": 0.578348696231842, + "learning_rate": 0.0001, + "loss": 1.4304, + "step": 11727 + }, + { + "epoch": 1.362532674992739, + "grad_norm": 0.633996307849884, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 11728 + }, + { + "epoch": 1.3626488527446994, + "grad_norm": 0.5468116998672485, + "learning_rate": 0.0001, + "loss": 1.4625, + "step": 11729 + }, + { + "epoch": 1.3627650304966599, + "grad_norm": 0.5580371618270874, + "learning_rate": 0.0001, + "loss": 1.3462, + "step": 11730 + }, + { + "epoch": 1.3628812082486204, + "grad_norm": 0.5752867460250854, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 11731 + }, + { + "epoch": 1.3629973860005808, + "grad_norm": 0.626487135887146, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 11732 + }, + { + "epoch": 1.3631135637525413, + "grad_norm": 0.5772953033447266, + "learning_rate": 0.0001, + "loss": 1.4676, + "step": 11733 + }, + { + "epoch": 1.3632297415045018, + "grad_norm": 0.5664294362068176, + "learning_rate": 0.0001, + "loss": 1.3619, + "step": 11734 + }, + { + "epoch": 1.3633459192564623, + "grad_norm": 0.5935901999473572, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 11735 + }, + { + "epoch": 1.3634620970084228, + "grad_norm": 0.6294174194335938, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 11736 + }, + { + "epoch": 1.3635782747603833, + "grad_norm": 0.6048177480697632, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 11737 + }, + { + "epoch": 1.3636944525123438, + "grad_norm": 0.6142577528953552, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 11738 + }, + { + "epoch": 1.3638106302643043, + "grad_norm": 0.549647867679596, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 11739 + }, + { + "epoch": 1.3639268080162648, + "grad_norm": 0.6002432703971863, + "learning_rate": 0.0001, + "loss": 1.557, + "step": 11740 + }, + { + "epoch": 1.3640429857682252, + "grad_norm": 0.5926917791366577, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 11741 + }, + { + "epoch": 1.364159163520186, + "grad_norm": 0.5604010224342346, + "learning_rate": 0.0001, + "loss": 1.5695, + "step": 11742 + }, + { + "epoch": 1.3642753412721464, + "grad_norm": 0.5525916218757629, + "learning_rate": 0.0001, + "loss": 1.3864, + "step": 11743 + }, + { + "epoch": 1.364391519024107, + "grad_norm": 0.5697430968284607, + "learning_rate": 0.0001, + "loss": 1.3873, + "step": 11744 + }, + { + "epoch": 1.3645076967760674, + "grad_norm": 0.5901938080787659, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 11745 + }, + { + "epoch": 1.364623874528028, + "grad_norm": 0.6322489976882935, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 11746 + }, + { + "epoch": 1.3647400522799884, + "grad_norm": 0.563325822353363, + "learning_rate": 0.0001, + "loss": 1.4027, + "step": 11747 + }, + { + "epoch": 1.364856230031949, + "grad_norm": 0.5789667963981628, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 11748 + }, + { + "epoch": 1.3649724077839094, + "grad_norm": 0.58510422706604, + "learning_rate": 0.0001, + "loss": 1.4739, + "step": 11749 + }, + { + "epoch": 1.3650885855358699, + "grad_norm": 0.5673305988311768, + "learning_rate": 0.0001, + "loss": 1.3469, + "step": 11750 + }, + { + "epoch": 1.3652047632878304, + "grad_norm": 0.6077521443367004, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 11751 + }, + { + "epoch": 1.3653209410397908, + "grad_norm": 0.6002272963523865, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 11752 + }, + { + "epoch": 1.3654371187917513, + "grad_norm": 0.5982224345207214, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 11753 + }, + { + "epoch": 1.3655532965437118, + "grad_norm": 0.5681218504905701, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 11754 + }, + { + "epoch": 1.3656694742956723, + "grad_norm": 0.6169049739837646, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 11755 + }, + { + "epoch": 1.3657856520476328, + "grad_norm": 0.5828377604484558, + "learning_rate": 0.0001, + "loss": 1.4077, + "step": 11756 + }, + { + "epoch": 1.3659018297995935, + "grad_norm": 0.5715786814689636, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 11757 + }, + { + "epoch": 1.366018007551554, + "grad_norm": 0.5703264474868774, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 11758 + }, + { + "epoch": 1.3661341853035145, + "grad_norm": 0.6128681302070618, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 11759 + }, + { + "epoch": 1.366250363055475, + "grad_norm": 0.5742857456207275, + "learning_rate": 0.0001, + "loss": 1.3648, + "step": 11760 + }, + { + "epoch": 1.3663665408074355, + "grad_norm": 0.5583314895629883, + "learning_rate": 0.0001, + "loss": 1.4086, + "step": 11761 + }, + { + "epoch": 1.366482718559396, + "grad_norm": 0.5679374933242798, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 11762 + }, + { + "epoch": 1.3665988963113564, + "grad_norm": 0.5741711258888245, + "learning_rate": 0.0001, + "loss": 1.4876, + "step": 11763 + }, + { + "epoch": 1.366715074063317, + "grad_norm": 0.5635258555412292, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 11764 + }, + { + "epoch": 1.3668312518152774, + "grad_norm": 0.5916835069656372, + "learning_rate": 0.0001, + "loss": 1.3363, + "step": 11765 + }, + { + "epoch": 1.366947429567238, + "grad_norm": 0.5715609192848206, + "learning_rate": 0.0001, + "loss": 1.3299, + "step": 11766 + }, + { + "epoch": 1.3670636073191984, + "grad_norm": 0.6283421516418457, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 11767 + }, + { + "epoch": 1.367179785071159, + "grad_norm": 0.557995617389679, + "learning_rate": 0.0001, + "loss": 1.3089, + "step": 11768 + }, + { + "epoch": 1.3672959628231194, + "grad_norm": 0.5753653049468994, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 11769 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 0.6110243797302246, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 11770 + }, + { + "epoch": 1.3675283183270404, + "grad_norm": 0.642266571521759, + "learning_rate": 0.0001, + "loss": 1.4771, + "step": 11771 + }, + { + "epoch": 1.3676444960790008, + "grad_norm": 0.5759993195533752, + "learning_rate": 0.0001, + "loss": 1.3839, + "step": 11772 + }, + { + "epoch": 1.3677606738309613, + "grad_norm": 0.5989052653312683, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 11773 + }, + { + "epoch": 1.3678768515829218, + "grad_norm": 0.5996981263160706, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 11774 + }, + { + "epoch": 1.3679930293348823, + "grad_norm": 0.5985528826713562, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 11775 + }, + { + "epoch": 1.3681092070868428, + "grad_norm": 0.5674298405647278, + "learning_rate": 0.0001, + "loss": 1.3278, + "step": 11776 + }, + { + "epoch": 1.3682253848388033, + "grad_norm": 0.6040525436401367, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 11777 + }, + { + "epoch": 1.3683415625907638, + "grad_norm": 0.5432946085929871, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 11778 + }, + { + "epoch": 1.3684577403427243, + "grad_norm": 0.5686826109886169, + "learning_rate": 0.0001, + "loss": 1.4459, + "step": 11779 + }, + { + "epoch": 1.3685739180946848, + "grad_norm": 0.5497419238090515, + "learning_rate": 0.0001, + "loss": 1.2312, + "step": 11780 + }, + { + "epoch": 1.3686900958466452, + "grad_norm": 0.6301116943359375, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 11781 + }, + { + "epoch": 1.3688062735986057, + "grad_norm": 0.562529981136322, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 11782 + }, + { + "epoch": 1.3689224513505664, + "grad_norm": 0.6351971626281738, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 11783 + }, + { + "epoch": 1.369038629102527, + "grad_norm": 0.5856718420982361, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 11784 + }, + { + "epoch": 1.3691548068544874, + "grad_norm": 0.5878714323043823, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 11785 + }, + { + "epoch": 1.369270984606448, + "grad_norm": 0.5866144299507141, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 11786 + }, + { + "epoch": 1.3693871623584084, + "grad_norm": 0.5930546522140503, + "learning_rate": 0.0001, + "loss": 1.436, + "step": 11787 + }, + { + "epoch": 1.369503340110369, + "grad_norm": 0.5738999247550964, + "learning_rate": 0.0001, + "loss": 1.3887, + "step": 11788 + }, + { + "epoch": 1.3696195178623294, + "grad_norm": 0.6546767950057983, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 11789 + }, + { + "epoch": 1.3697356956142899, + "grad_norm": 0.5498051643371582, + "learning_rate": 0.0001, + "loss": 1.3995, + "step": 11790 + }, + { + "epoch": 1.3698518733662504, + "grad_norm": 0.5807487368583679, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 11791 + }, + { + "epoch": 1.3699680511182108, + "grad_norm": 0.582394003868103, + "learning_rate": 0.0001, + "loss": 1.4096, + "step": 11792 + }, + { + "epoch": 1.3700842288701713, + "grad_norm": 0.5830745100975037, + "learning_rate": 0.0001, + "loss": 1.4392, + "step": 11793 + }, + { + "epoch": 1.3702004066221318, + "grad_norm": 0.6170597672462463, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 11794 + }, + { + "epoch": 1.3703165843740923, + "grad_norm": 0.5893037915229797, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 11795 + }, + { + "epoch": 1.3704327621260528, + "grad_norm": 0.618791401386261, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 11796 + }, + { + "epoch": 1.3705489398780133, + "grad_norm": 0.6027114987373352, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 11797 + }, + { + "epoch": 1.3706651176299738, + "grad_norm": 0.5932108163833618, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 11798 + }, + { + "epoch": 1.3707812953819345, + "grad_norm": 0.6085308790206909, + "learning_rate": 0.0001, + "loss": 1.4098, + "step": 11799 + }, + { + "epoch": 1.370897473133895, + "grad_norm": 0.6194184422492981, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 11800 + }, + { + "epoch": 1.3710136508858555, + "grad_norm": 0.5471352934837341, + "learning_rate": 0.0001, + "loss": 1.2728, + "step": 11801 + }, + { + "epoch": 1.371129828637816, + "grad_norm": 0.5235902667045593, + "learning_rate": 0.0001, + "loss": 1.3627, + "step": 11802 + }, + { + "epoch": 1.3712460063897765, + "grad_norm": 0.6246318817138672, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 11803 + }, + { + "epoch": 1.371362184141737, + "grad_norm": 0.6138882637023926, + "learning_rate": 0.0001, + "loss": 1.2501, + "step": 11804 + }, + { + "epoch": 1.3714783618936974, + "grad_norm": 0.6174538731575012, + "learning_rate": 0.0001, + "loss": 1.34, + "step": 11805 + }, + { + "epoch": 1.371594539645658, + "grad_norm": 0.6327789425849915, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 11806 + }, + { + "epoch": 1.3717107173976184, + "grad_norm": 0.5819458365440369, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 11807 + }, + { + "epoch": 1.371826895149579, + "grad_norm": 0.5566579699516296, + "learning_rate": 0.0001, + "loss": 1.3598, + "step": 11808 + }, + { + "epoch": 1.3719430729015394, + "grad_norm": 0.608745813369751, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 11809 + }, + { + "epoch": 1.3720592506534999, + "grad_norm": 0.6066353917121887, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 11810 + }, + { + "epoch": 1.3721754284054604, + "grad_norm": 0.6526094079017639, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 11811 + }, + { + "epoch": 1.3722916061574209, + "grad_norm": 0.5477768778800964, + "learning_rate": 0.0001, + "loss": 1.4258, + "step": 11812 + }, + { + "epoch": 1.3724077839093813, + "grad_norm": 0.5935622453689575, + "learning_rate": 0.0001, + "loss": 1.4168, + "step": 11813 + }, + { + "epoch": 1.3725239616613418, + "grad_norm": 0.6308596730232239, + "learning_rate": 0.0001, + "loss": 1.3882, + "step": 11814 + }, + { + "epoch": 1.3726401394133023, + "grad_norm": 0.5651733875274658, + "learning_rate": 0.0001, + "loss": 1.2528, + "step": 11815 + }, + { + "epoch": 1.3727563171652628, + "grad_norm": 0.5938041806221008, + "learning_rate": 0.0001, + "loss": 1.4036, + "step": 11816 + }, + { + "epoch": 1.3728724949172233, + "grad_norm": 0.584787905216217, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 11817 + }, + { + "epoch": 1.3729886726691838, + "grad_norm": 0.5955517292022705, + "learning_rate": 0.0001, + "loss": 1.3351, + "step": 11818 + }, + { + "epoch": 1.3731048504211443, + "grad_norm": 0.595584511756897, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 11819 + }, + { + "epoch": 1.3732210281731048, + "grad_norm": 0.5970519781112671, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 11820 + }, + { + "epoch": 1.3733372059250653, + "grad_norm": 0.579200029373169, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 11821 + }, + { + "epoch": 1.3734533836770257, + "grad_norm": 0.6147415637969971, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 11822 + }, + { + "epoch": 1.3735695614289862, + "grad_norm": 0.6013969779014587, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 11823 + }, + { + "epoch": 1.3736857391809467, + "grad_norm": 0.5508737564086914, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 11824 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 0.5326734185218811, + "learning_rate": 0.0001, + "loss": 1.3177, + "step": 11825 + }, + { + "epoch": 1.373918094684868, + "grad_norm": 0.5824387073516846, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 11826 + }, + { + "epoch": 1.3740342724368284, + "grad_norm": 0.5780125856399536, + "learning_rate": 0.0001, + "loss": 1.3633, + "step": 11827 + }, + { + "epoch": 1.374150450188789, + "grad_norm": 0.56535804271698, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 11828 + }, + { + "epoch": 1.3742666279407494, + "grad_norm": 0.6138978600502014, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 11829 + }, + { + "epoch": 1.3743828056927099, + "grad_norm": 0.6302936673164368, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 11830 + }, + { + "epoch": 1.3744989834446704, + "grad_norm": 0.5425077080726624, + "learning_rate": 0.0001, + "loss": 1.2737, + "step": 11831 + }, + { + "epoch": 1.3746151611966309, + "grad_norm": 0.5782576203346252, + "learning_rate": 0.0001, + "loss": 1.3394, + "step": 11832 + }, + { + "epoch": 1.3747313389485913, + "grad_norm": 0.5924588441848755, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 11833 + }, + { + "epoch": 1.3748475167005518, + "grad_norm": 0.6358031034469604, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 11834 + }, + { + "epoch": 1.3749636944525123, + "grad_norm": 0.5872380137443542, + "learning_rate": 0.0001, + "loss": 1.3631, + "step": 11835 + }, + { + "epoch": 1.3750798722044728, + "grad_norm": 0.5860925316810608, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 11836 + }, + { + "epoch": 1.3751960499564333, + "grad_norm": 0.5498318076133728, + "learning_rate": 0.0001, + "loss": 1.4259, + "step": 11837 + }, + { + "epoch": 1.3753122277083938, + "grad_norm": 0.5745248198509216, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 11838 + }, + { + "epoch": 1.3754284054603543, + "grad_norm": 0.6191520690917969, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 11839 + }, + { + "epoch": 1.3755445832123148, + "grad_norm": 0.5659325122833252, + "learning_rate": 0.0001, + "loss": 1.3464, + "step": 11840 + }, + { + "epoch": 1.3756607609642755, + "grad_norm": 0.5831794738769531, + "learning_rate": 0.0001, + "loss": 1.3686, + "step": 11841 + }, + { + "epoch": 1.375776938716236, + "grad_norm": 0.6090126633644104, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 11842 + }, + { + "epoch": 1.3758931164681965, + "grad_norm": 0.5565717816352844, + "learning_rate": 0.0001, + "loss": 1.4553, + "step": 11843 + }, + { + "epoch": 1.376009294220157, + "grad_norm": 0.6339083909988403, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 11844 + }, + { + "epoch": 1.3761254719721174, + "grad_norm": 0.5814673900604248, + "learning_rate": 0.0001, + "loss": 1.3132, + "step": 11845 + }, + { + "epoch": 1.376241649724078, + "grad_norm": 0.5672376155853271, + "learning_rate": 0.0001, + "loss": 1.3554, + "step": 11846 + }, + { + "epoch": 1.3763578274760384, + "grad_norm": 0.6251681447029114, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 11847 + }, + { + "epoch": 1.376474005227999, + "grad_norm": 0.5934876203536987, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 11848 + }, + { + "epoch": 1.3765901829799594, + "grad_norm": 0.6062328219413757, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 11849 + }, + { + "epoch": 1.3767063607319199, + "grad_norm": 0.6141518354415894, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 11850 + }, + { + "epoch": 1.3768225384838804, + "grad_norm": 0.5661596059799194, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 11851 + }, + { + "epoch": 1.3769387162358409, + "grad_norm": 0.5913867354393005, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 11852 + }, + { + "epoch": 1.3770548939878013, + "grad_norm": 0.6132528185844421, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 11853 + }, + { + "epoch": 1.3771710717397618, + "grad_norm": 0.5519713163375854, + "learning_rate": 0.0001, + "loss": 1.2775, + "step": 11854 + }, + { + "epoch": 1.3772872494917223, + "grad_norm": 0.586379885673523, + "learning_rate": 0.0001, + "loss": 1.3913, + "step": 11855 + }, + { + "epoch": 1.3774034272436828, + "grad_norm": 0.6136939525604248, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 11856 + }, + { + "epoch": 1.3775196049956433, + "grad_norm": 0.6094126105308533, + "learning_rate": 0.0001, + "loss": 1.4171, + "step": 11857 + }, + { + "epoch": 1.3776357827476038, + "grad_norm": 0.6217323541641235, + "learning_rate": 0.0001, + "loss": 1.4317, + "step": 11858 + }, + { + "epoch": 1.3777519604995643, + "grad_norm": 0.6349629163742065, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 11859 + }, + { + "epoch": 1.3778681382515248, + "grad_norm": 0.565229058265686, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 11860 + }, + { + "epoch": 1.3779843160034853, + "grad_norm": 0.6026855707168579, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 11861 + }, + { + "epoch": 1.3781004937554457, + "grad_norm": 0.5912261009216309, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 11862 + }, + { + "epoch": 1.3782166715074062, + "grad_norm": 0.5594475865364075, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 11863 + }, + { + "epoch": 1.3783328492593667, + "grad_norm": 0.5987929701805115, + "learning_rate": 0.0001, + "loss": 1.4032, + "step": 11864 + }, + { + "epoch": 1.3784490270113272, + "grad_norm": 0.5730629563331604, + "learning_rate": 0.0001, + "loss": 1.338, + "step": 11865 + }, + { + "epoch": 1.3785652047632877, + "grad_norm": 0.5603897571563721, + "learning_rate": 0.0001, + "loss": 1.4159, + "step": 11866 + }, + { + "epoch": 1.3786813825152484, + "grad_norm": 0.6142343878746033, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 11867 + }, + { + "epoch": 1.378797560267209, + "grad_norm": 0.6077200174331665, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 11868 + }, + { + "epoch": 1.3789137380191694, + "grad_norm": 0.5924538373947144, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 11869 + }, + { + "epoch": 1.3790299157711299, + "grad_norm": 0.572593092918396, + "learning_rate": 0.0001, + "loss": 1.3483, + "step": 11870 + }, + { + "epoch": 1.3791460935230904, + "grad_norm": 0.5781865119934082, + "learning_rate": 0.0001, + "loss": 1.2929, + "step": 11871 + }, + { + "epoch": 1.3792622712750509, + "grad_norm": 0.6432904005050659, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 11872 + }, + { + "epoch": 1.3793784490270113, + "grad_norm": 0.5920875072479248, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 11873 + }, + { + "epoch": 1.3794946267789718, + "grad_norm": 0.5507972836494446, + "learning_rate": 0.0001, + "loss": 1.3918, + "step": 11874 + }, + { + "epoch": 1.3796108045309323, + "grad_norm": 0.5901596546173096, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 11875 + }, + { + "epoch": 1.3797269822828928, + "grad_norm": 0.6390935182571411, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 11876 + }, + { + "epoch": 1.3798431600348533, + "grad_norm": 0.6109213829040527, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 11877 + }, + { + "epoch": 1.3799593377868138, + "grad_norm": 0.5569701790809631, + "learning_rate": 0.0001, + "loss": 1.3981, + "step": 11878 + }, + { + "epoch": 1.3800755155387743, + "grad_norm": 0.5873339772224426, + "learning_rate": 0.0001, + "loss": 1.4033, + "step": 11879 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 0.6170241236686707, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 11880 + }, + { + "epoch": 1.3803078710426953, + "grad_norm": 0.5810169577598572, + "learning_rate": 0.0001, + "loss": 1.3362, + "step": 11881 + }, + { + "epoch": 1.3804240487946557, + "grad_norm": 0.6244444251060486, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 11882 + }, + { + "epoch": 1.3805402265466165, + "grad_norm": 0.5936790704727173, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 11883 + }, + { + "epoch": 1.380656404298577, + "grad_norm": 0.5871090888977051, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 11884 + }, + { + "epoch": 1.3807725820505374, + "grad_norm": 0.6491077542304993, + "learning_rate": 0.0001, + "loss": 1.3812, + "step": 11885 + }, + { + "epoch": 1.380888759802498, + "grad_norm": 0.5600960850715637, + "learning_rate": 0.0001, + "loss": 1.2137, + "step": 11886 + }, + { + "epoch": 1.3810049375544584, + "grad_norm": 0.6018544435501099, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 11887 + }, + { + "epoch": 1.381121115306419, + "grad_norm": 0.58489990234375, + "learning_rate": 0.0001, + "loss": 1.3715, + "step": 11888 + }, + { + "epoch": 1.3812372930583794, + "grad_norm": 0.5836353302001953, + "learning_rate": 0.0001, + "loss": 1.2618, + "step": 11889 + }, + { + "epoch": 1.3813534708103399, + "grad_norm": 0.6115884184837341, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 11890 + }, + { + "epoch": 1.3814696485623004, + "grad_norm": 0.6390863060951233, + "learning_rate": 0.0001, + "loss": 1.4521, + "step": 11891 + }, + { + "epoch": 1.3815858263142609, + "grad_norm": 0.5852669477462769, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 11892 + }, + { + "epoch": 1.3817020040662213, + "grad_norm": 0.6284686923027039, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 11893 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 0.6020316481590271, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 11894 + }, + { + "epoch": 1.3819343595701423, + "grad_norm": 0.650677502155304, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 11895 + }, + { + "epoch": 1.3820505373221028, + "grad_norm": 0.5915738940238953, + "learning_rate": 0.0001, + "loss": 1.4075, + "step": 11896 + }, + { + "epoch": 1.3821667150740633, + "grad_norm": 0.6179349422454834, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 11897 + }, + { + "epoch": 1.3822828928260238, + "grad_norm": 0.6398385167121887, + "learning_rate": 0.0001, + "loss": 1.2884, + "step": 11898 + }, + { + "epoch": 1.3823990705779843, + "grad_norm": 0.6043710112571716, + "learning_rate": 0.0001, + "loss": 1.4017, + "step": 11899 + }, + { + "epoch": 1.3825152483299448, + "grad_norm": 0.5860773921012878, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 11900 + }, + { + "epoch": 1.3826314260819053, + "grad_norm": 0.6333348751068115, + "learning_rate": 0.0001, + "loss": 1.3162, + "step": 11901 + }, + { + "epoch": 1.3827476038338657, + "grad_norm": 0.6222668290138245, + "learning_rate": 0.0001, + "loss": 1.4695, + "step": 11902 + }, + { + "epoch": 1.3828637815858262, + "grad_norm": 0.5919185280799866, + "learning_rate": 0.0001, + "loss": 1.3576, + "step": 11903 + }, + { + "epoch": 1.3829799593377867, + "grad_norm": 0.5540260076522827, + "learning_rate": 0.0001, + "loss": 1.2607, + "step": 11904 + }, + { + "epoch": 1.3830961370897472, + "grad_norm": 0.6326545476913452, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 11905 + }, + { + "epoch": 1.3832123148417077, + "grad_norm": 0.5651580691337585, + "learning_rate": 0.0001, + "loss": 1.3546, + "step": 11906 + }, + { + "epoch": 1.3833284925936682, + "grad_norm": 0.5845093131065369, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 11907 + }, + { + "epoch": 1.3834446703456287, + "grad_norm": 0.6123130321502686, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 11908 + }, + { + "epoch": 1.3835608480975894, + "grad_norm": 0.5754287242889404, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 11909 + }, + { + "epoch": 1.3836770258495499, + "grad_norm": 0.6103564500808716, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 11910 + }, + { + "epoch": 1.3837932036015104, + "grad_norm": 0.6126376390457153, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 11911 + }, + { + "epoch": 1.3839093813534709, + "grad_norm": 0.5791416168212891, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 11912 + }, + { + "epoch": 1.3840255591054313, + "grad_norm": 0.5979540944099426, + "learning_rate": 0.0001, + "loss": 1.3736, + "step": 11913 + }, + { + "epoch": 1.3841417368573918, + "grad_norm": 0.5859628319740295, + "learning_rate": 0.0001, + "loss": 1.2924, + "step": 11914 + }, + { + "epoch": 1.3842579146093523, + "grad_norm": 0.6160323619842529, + "learning_rate": 0.0001, + "loss": 1.4061, + "step": 11915 + }, + { + "epoch": 1.3843740923613128, + "grad_norm": 0.6810671091079712, + "learning_rate": 0.0001, + "loss": 1.3882, + "step": 11916 + }, + { + "epoch": 1.3844902701132733, + "grad_norm": 0.6706360578536987, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 11917 + }, + { + "epoch": 1.3846064478652338, + "grad_norm": 0.5953517556190491, + "learning_rate": 0.0001, + "loss": 1.3044, + "step": 11918 + }, + { + "epoch": 1.3847226256171943, + "grad_norm": 0.5882486701011658, + "learning_rate": 0.0001, + "loss": 1.4569, + "step": 11919 + }, + { + "epoch": 1.3848388033691548, + "grad_norm": 0.6317164301872253, + "learning_rate": 0.0001, + "loss": 1.5054, + "step": 11920 + }, + { + "epoch": 1.3849549811211153, + "grad_norm": 0.6241341233253479, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 11921 + }, + { + "epoch": 1.3850711588730757, + "grad_norm": 0.6015408635139465, + "learning_rate": 0.0001, + "loss": 1.4114, + "step": 11922 + }, + { + "epoch": 1.3851873366250362, + "grad_norm": 0.5517604351043701, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 11923 + }, + { + "epoch": 1.3853035143769967, + "grad_norm": 0.6593002080917358, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 11924 + }, + { + "epoch": 1.3854196921289574, + "grad_norm": 0.5856340527534485, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 11925 + }, + { + "epoch": 1.385535869880918, + "grad_norm": 0.6336721181869507, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 11926 + }, + { + "epoch": 1.3856520476328784, + "grad_norm": 0.5610465407371521, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 11927 + }, + { + "epoch": 1.385768225384839, + "grad_norm": 0.6065878868103027, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 11928 + }, + { + "epoch": 1.3858844031367994, + "grad_norm": 0.5784077048301697, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 11929 + }, + { + "epoch": 1.3860005808887599, + "grad_norm": 0.5930120348930359, + "learning_rate": 0.0001, + "loss": 1.2691, + "step": 11930 + }, + { + "epoch": 1.3861167586407204, + "grad_norm": 0.599595308303833, + "learning_rate": 0.0001, + "loss": 1.3383, + "step": 11931 + }, + { + "epoch": 1.3862329363926809, + "grad_norm": 0.6132897734642029, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 11932 + }, + { + "epoch": 1.3863491141446413, + "grad_norm": 0.5871015191078186, + "learning_rate": 0.0001, + "loss": 1.3863, + "step": 11933 + }, + { + "epoch": 1.3864652918966018, + "grad_norm": 0.588950514793396, + "learning_rate": 0.0001, + "loss": 1.3364, + "step": 11934 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 0.5841085314750671, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 11935 + }, + { + "epoch": 1.3866976474005228, + "grad_norm": 0.5449914932250977, + "learning_rate": 0.0001, + "loss": 1.3709, + "step": 11936 + }, + { + "epoch": 1.3868138251524833, + "grad_norm": 0.6109606623649597, + "learning_rate": 0.0001, + "loss": 1.3732, + "step": 11937 + }, + { + "epoch": 1.3869300029044438, + "grad_norm": 0.6314137578010559, + "learning_rate": 0.0001, + "loss": 1.3493, + "step": 11938 + }, + { + "epoch": 1.3870461806564043, + "grad_norm": 0.6377010345458984, + "learning_rate": 0.0001, + "loss": 1.3789, + "step": 11939 + }, + { + "epoch": 1.3871623584083648, + "grad_norm": 0.6468605399131775, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 11940 + }, + { + "epoch": 1.3872785361603253, + "grad_norm": 0.6271966099739075, + "learning_rate": 0.0001, + "loss": 1.5676, + "step": 11941 + }, + { + "epoch": 1.3873947139122857, + "grad_norm": 0.5833338499069214, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 11942 + }, + { + "epoch": 1.3875108916642462, + "grad_norm": 0.5465646386146545, + "learning_rate": 0.0001, + "loss": 1.2402, + "step": 11943 + }, + { + "epoch": 1.3876270694162067, + "grad_norm": 0.5553290247917175, + "learning_rate": 0.0001, + "loss": 1.3301, + "step": 11944 + }, + { + "epoch": 1.3877432471681672, + "grad_norm": 0.6450720429420471, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 11945 + }, + { + "epoch": 1.3878594249201277, + "grad_norm": 0.6172477602958679, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 11946 + }, + { + "epoch": 1.3879756026720882, + "grad_norm": 0.6228959560394287, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 11947 + }, + { + "epoch": 1.3880917804240487, + "grad_norm": 0.6162193417549133, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 11948 + }, + { + "epoch": 1.3882079581760092, + "grad_norm": 0.6280121803283691, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 11949 + }, + { + "epoch": 1.3883241359279697, + "grad_norm": 0.5984324812889099, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 11950 + }, + { + "epoch": 1.3884403136799304, + "grad_norm": 0.5944661498069763, + "learning_rate": 0.0001, + "loss": 1.4478, + "step": 11951 + }, + { + "epoch": 1.3885564914318909, + "grad_norm": 0.6321014165878296, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 11952 + }, + { + "epoch": 1.3886726691838513, + "grad_norm": 0.6064655780792236, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 11953 + }, + { + "epoch": 1.3887888469358118, + "grad_norm": 0.5939200520515442, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 11954 + }, + { + "epoch": 1.3889050246877723, + "grad_norm": 0.5834383368492126, + "learning_rate": 0.0001, + "loss": 1.3573, + "step": 11955 + }, + { + "epoch": 1.3890212024397328, + "grad_norm": 0.5999382138252258, + "learning_rate": 0.0001, + "loss": 1.3501, + "step": 11956 + }, + { + "epoch": 1.3891373801916933, + "grad_norm": 0.652275562286377, + "learning_rate": 0.0001, + "loss": 1.4214, + "step": 11957 + }, + { + "epoch": 1.3892535579436538, + "grad_norm": 0.6107578277587891, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 11958 + }, + { + "epoch": 1.3893697356956143, + "grad_norm": 0.6266455054283142, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 11959 + }, + { + "epoch": 1.3894859134475748, + "grad_norm": 0.5955777168273926, + "learning_rate": 0.0001, + "loss": 1.4787, + "step": 11960 + }, + { + "epoch": 1.3896020911995353, + "grad_norm": 0.6200416684150696, + "learning_rate": 0.0001, + "loss": 1.4147, + "step": 11961 + }, + { + "epoch": 1.3897182689514957, + "grad_norm": 0.6102170348167419, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 11962 + }, + { + "epoch": 1.3898344467034562, + "grad_norm": 0.5599600076675415, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 11963 + }, + { + "epoch": 1.3899506244554167, + "grad_norm": 0.6110653281211853, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 11964 + }, + { + "epoch": 1.3900668022073772, + "grad_norm": 0.5802760720252991, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 11965 + }, + { + "epoch": 1.3901829799593377, + "grad_norm": 0.5832214951515198, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 11966 + }, + { + "epoch": 1.3902991577112984, + "grad_norm": 0.5920937657356262, + "learning_rate": 0.0001, + "loss": 1.3629, + "step": 11967 + }, + { + "epoch": 1.390415335463259, + "grad_norm": 0.6126205921173096, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 11968 + }, + { + "epoch": 1.3905315132152194, + "grad_norm": 0.6226344108581543, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 11969 + }, + { + "epoch": 1.3906476909671799, + "grad_norm": 0.5618362426757812, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 11970 + }, + { + "epoch": 1.3907638687191404, + "grad_norm": 0.5985326766967773, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 11971 + }, + { + "epoch": 1.3908800464711009, + "grad_norm": 0.611813485622406, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 11972 + }, + { + "epoch": 1.3909962242230614, + "grad_norm": 0.5800209641456604, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 11973 + }, + { + "epoch": 1.3911124019750218, + "grad_norm": 0.6421018838882446, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 11974 + }, + { + "epoch": 1.3912285797269823, + "grad_norm": 0.5720347166061401, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 11975 + }, + { + "epoch": 1.3913447574789428, + "grad_norm": 0.5714155435562134, + "learning_rate": 0.0001, + "loss": 1.3498, + "step": 11976 + }, + { + "epoch": 1.3914609352309033, + "grad_norm": 0.5725839734077454, + "learning_rate": 0.0001, + "loss": 1.479, + "step": 11977 + }, + { + "epoch": 1.3915771129828638, + "grad_norm": 0.5719639658927917, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 11978 + }, + { + "epoch": 1.3916932907348243, + "grad_norm": 0.5681836605072021, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 11979 + }, + { + "epoch": 1.3918094684867848, + "grad_norm": 0.5832224488258362, + "learning_rate": 0.0001, + "loss": 1.5036, + "step": 11980 + }, + { + "epoch": 1.3919256462387453, + "grad_norm": 0.6468372941017151, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 11981 + }, + { + "epoch": 1.3920418239907058, + "grad_norm": 0.5739614367485046, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 11982 + }, + { + "epoch": 1.3921580017426662, + "grad_norm": 0.6735573410987854, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 11983 + }, + { + "epoch": 1.3922741794946267, + "grad_norm": 0.6430903077125549, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 11984 + }, + { + "epoch": 1.3923903572465872, + "grad_norm": 0.5974782705307007, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 11985 + }, + { + "epoch": 1.3925065349985477, + "grad_norm": 0.6093497276306152, + "learning_rate": 0.0001, + "loss": 1.4616, + "step": 11986 + }, + { + "epoch": 1.3926227127505082, + "grad_norm": 0.5952013731002808, + "learning_rate": 0.0001, + "loss": 1.481, + "step": 11987 + }, + { + "epoch": 1.3927388905024687, + "grad_norm": 0.577219545841217, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 11988 + }, + { + "epoch": 1.3928550682544292, + "grad_norm": 0.5776684284210205, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 11989 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 0.580201268196106, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 11990 + }, + { + "epoch": 1.3930874237583502, + "grad_norm": 0.5810071229934692, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 11991 + }, + { + "epoch": 1.3932036015103106, + "grad_norm": 0.5787037014961243, + "learning_rate": 0.0001, + "loss": 1.3579, + "step": 11992 + }, + { + "epoch": 1.3933197792622714, + "grad_norm": 0.5724977850914001, + "learning_rate": 0.0001, + "loss": 1.4215, + "step": 11993 + }, + { + "epoch": 1.3934359570142318, + "grad_norm": 0.5736649632453918, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 11994 + }, + { + "epoch": 1.3935521347661923, + "grad_norm": 0.6170047521591187, + "learning_rate": 0.0001, + "loss": 1.3581, + "step": 11995 + }, + { + "epoch": 1.3936683125181528, + "grad_norm": 0.5980851650238037, + "learning_rate": 0.0001, + "loss": 1.5077, + "step": 11996 + }, + { + "epoch": 1.3937844902701133, + "grad_norm": 0.5532777309417725, + "learning_rate": 0.0001, + "loss": 1.3881, + "step": 11997 + }, + { + "epoch": 1.3939006680220738, + "grad_norm": 0.6409569382667542, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 11998 + }, + { + "epoch": 1.3940168457740343, + "grad_norm": 0.564768373966217, + "learning_rate": 0.0001, + "loss": 1.3573, + "step": 11999 + }, + { + "epoch": 1.3941330235259948, + "grad_norm": 0.6319162845611572, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 12000 + }, + { + "epoch": 1.3942492012779553, + "grad_norm": 0.537635862827301, + "learning_rate": 0.0001, + "loss": 1.2859, + "step": 12001 + }, + { + "epoch": 1.3943653790299158, + "grad_norm": 0.6129380464553833, + "learning_rate": 0.0001, + "loss": 1.4286, + "step": 12002 + }, + { + "epoch": 1.3944815567818762, + "grad_norm": 0.5544732809066772, + "learning_rate": 0.0001, + "loss": 1.328, + "step": 12003 + }, + { + "epoch": 1.3945977345338367, + "grad_norm": 0.5666175484657288, + "learning_rate": 0.0001, + "loss": 1.448, + "step": 12004 + }, + { + "epoch": 1.3947139122857972, + "grad_norm": 0.5639254450798035, + "learning_rate": 0.0001, + "loss": 1.2761, + "step": 12005 + }, + { + "epoch": 1.3948300900377577, + "grad_norm": 0.6033602952957153, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 12006 + }, + { + "epoch": 1.3949462677897182, + "grad_norm": 0.6276678442955017, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 12007 + }, + { + "epoch": 1.3950624455416787, + "grad_norm": 0.5830793976783752, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 12008 + }, + { + "epoch": 1.3951786232936394, + "grad_norm": 0.5950773358345032, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 12009 + }, + { + "epoch": 1.3952948010455999, + "grad_norm": 0.6116399168968201, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 12010 + }, + { + "epoch": 1.3954109787975604, + "grad_norm": 0.5737411975860596, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 12011 + }, + { + "epoch": 1.3955271565495209, + "grad_norm": 0.5670390129089355, + "learning_rate": 0.0001, + "loss": 1.3832, + "step": 12012 + }, + { + "epoch": 1.3956433343014814, + "grad_norm": 0.5643200278282166, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 12013 + }, + { + "epoch": 1.3957595120534418, + "grad_norm": 0.588569700717926, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 12014 + }, + { + "epoch": 1.3958756898054023, + "grad_norm": 0.5916202664375305, + "learning_rate": 0.0001, + "loss": 1.3856, + "step": 12015 + }, + { + "epoch": 1.3959918675573628, + "grad_norm": 0.57044517993927, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 12016 + }, + { + "epoch": 1.3961080453093233, + "grad_norm": 0.5924696326255798, + "learning_rate": 0.0001, + "loss": 1.4195, + "step": 12017 + }, + { + "epoch": 1.3962242230612838, + "grad_norm": 0.5659184455871582, + "learning_rate": 0.0001, + "loss": 1.3907, + "step": 12018 + }, + { + "epoch": 1.3963404008132443, + "grad_norm": 0.6118780970573425, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 12019 + }, + { + "epoch": 1.3964565785652048, + "grad_norm": 0.569577693939209, + "learning_rate": 0.0001, + "loss": 1.3672, + "step": 12020 + }, + { + "epoch": 1.3965727563171653, + "grad_norm": 0.5691976547241211, + "learning_rate": 0.0001, + "loss": 1.3072, + "step": 12021 + }, + { + "epoch": 1.3966889340691258, + "grad_norm": 0.6030403971672058, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 12022 + }, + { + "epoch": 1.3968051118210862, + "grad_norm": 0.560248851776123, + "learning_rate": 0.0001, + "loss": 1.2186, + "step": 12023 + }, + { + "epoch": 1.3969212895730467, + "grad_norm": 0.5866391658782959, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 12024 + }, + { + "epoch": 1.3970374673250072, + "grad_norm": 0.624195396900177, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 12025 + }, + { + "epoch": 1.3971536450769677, + "grad_norm": 0.6529926657676697, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 12026 + }, + { + "epoch": 1.3972698228289282, + "grad_norm": 0.6984851360321045, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 12027 + }, + { + "epoch": 1.3973860005808887, + "grad_norm": 0.6242703795433044, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 12028 + }, + { + "epoch": 1.3975021783328492, + "grad_norm": 0.5590314865112305, + "learning_rate": 0.0001, + "loss": 1.3682, + "step": 12029 + }, + { + "epoch": 1.3976183560848097, + "grad_norm": 0.6193909049034119, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 12030 + }, + { + "epoch": 1.3977345338367702, + "grad_norm": 0.5770513415336609, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 12031 + }, + { + "epoch": 1.3978507115887306, + "grad_norm": 0.5817797780036926, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 12032 + }, + { + "epoch": 1.3979668893406911, + "grad_norm": 0.5368718504905701, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 12033 + }, + { + "epoch": 1.3980830670926516, + "grad_norm": 0.6158215403556824, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 12034 + }, + { + "epoch": 1.3981992448446123, + "grad_norm": 0.5757713317871094, + "learning_rate": 0.0001, + "loss": 1.3353, + "step": 12035 + }, + { + "epoch": 1.3983154225965728, + "grad_norm": 0.5898905992507935, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 12036 + }, + { + "epoch": 1.3984316003485333, + "grad_norm": 0.5727129578590393, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 12037 + }, + { + "epoch": 1.3985477781004938, + "grad_norm": 0.573549747467041, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 12038 + }, + { + "epoch": 1.3986639558524543, + "grad_norm": 0.5966438055038452, + "learning_rate": 0.0001, + "loss": 1.4093, + "step": 12039 + }, + { + "epoch": 1.3987801336044148, + "grad_norm": 0.5646287202835083, + "learning_rate": 0.0001, + "loss": 1.4349, + "step": 12040 + }, + { + "epoch": 1.3988963113563753, + "grad_norm": 0.6419237852096558, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 12041 + }, + { + "epoch": 1.3990124891083358, + "grad_norm": 0.62077397108078, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 12042 + }, + { + "epoch": 1.3991286668602962, + "grad_norm": 0.5966094136238098, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 12043 + }, + { + "epoch": 1.3992448446122567, + "grad_norm": 0.5946581363677979, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 12044 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 0.5981970429420471, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 12045 + }, + { + "epoch": 1.3994772001161777, + "grad_norm": 0.5579191446304321, + "learning_rate": 0.0001, + "loss": 1.3866, + "step": 12046 + }, + { + "epoch": 1.3995933778681382, + "grad_norm": 0.5763770937919617, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 12047 + }, + { + "epoch": 1.3997095556200987, + "grad_norm": 0.5983228087425232, + "learning_rate": 0.0001, + "loss": 1.3796, + "step": 12048 + }, + { + "epoch": 1.3998257333720592, + "grad_norm": 0.5548502206802368, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 12049 + }, + { + "epoch": 1.3999419111240199, + "grad_norm": 0.598829984664917, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 12050 + }, + { + "epoch": 1.4000580888759804, + "grad_norm": 0.5747253894805908, + "learning_rate": 0.0001, + "loss": 1.2931, + "step": 12051 + }, + { + "epoch": 1.4001742666279409, + "grad_norm": 0.5953359007835388, + "learning_rate": 0.0001, + "loss": 1.3444, + "step": 12052 + }, + { + "epoch": 1.4002904443799014, + "grad_norm": 0.5862138271331787, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 12053 + }, + { + "epoch": 1.4004066221318618, + "grad_norm": 0.5845853090286255, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 12054 + }, + { + "epoch": 1.4005227998838223, + "grad_norm": 0.5902648568153381, + "learning_rate": 0.0001, + "loss": 1.372, + "step": 12055 + }, + { + "epoch": 1.4006389776357828, + "grad_norm": 0.6484795212745667, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 12056 + }, + { + "epoch": 1.4007551553877433, + "grad_norm": 0.6351671814918518, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 12057 + }, + { + "epoch": 1.4008713331397038, + "grad_norm": 0.5990087389945984, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 12058 + }, + { + "epoch": 1.4009875108916643, + "grad_norm": 0.5999724864959717, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 12059 + }, + { + "epoch": 1.4011036886436248, + "grad_norm": 0.6274200081825256, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 12060 + }, + { + "epoch": 1.4012198663955853, + "grad_norm": 0.5672786831855774, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 12061 + }, + { + "epoch": 1.4013360441475458, + "grad_norm": 0.6059845089912415, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 12062 + }, + { + "epoch": 1.4014522218995062, + "grad_norm": 0.583419144153595, + "learning_rate": 0.0001, + "loss": 1.3577, + "step": 12063 + }, + { + "epoch": 1.4015683996514667, + "grad_norm": 0.5932354927062988, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 12064 + }, + { + "epoch": 1.4016845774034272, + "grad_norm": 0.6560299396514893, + "learning_rate": 0.0001, + "loss": 1.4958, + "step": 12065 + }, + { + "epoch": 1.4018007551553877, + "grad_norm": 0.5621389150619507, + "learning_rate": 0.0001, + "loss": 1.5461, + "step": 12066 + }, + { + "epoch": 1.4019169329073482, + "grad_norm": 0.5786882638931274, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 12067 + }, + { + "epoch": 1.4020331106593087, + "grad_norm": 0.5459783673286438, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 12068 + }, + { + "epoch": 1.4021492884112692, + "grad_norm": 0.5643264651298523, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 12069 + }, + { + "epoch": 1.4022654661632297, + "grad_norm": 0.5764142870903015, + "learning_rate": 0.0001, + "loss": 1.3799, + "step": 12070 + }, + { + "epoch": 1.4023816439151902, + "grad_norm": 0.5647251009941101, + "learning_rate": 0.0001, + "loss": 1.4376, + "step": 12071 + }, + { + "epoch": 1.4024978216671506, + "grad_norm": 0.5865373611450195, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 12072 + }, + { + "epoch": 1.4026139994191111, + "grad_norm": 0.5826906561851501, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 12073 + }, + { + "epoch": 1.4027301771710716, + "grad_norm": 0.5692470669746399, + "learning_rate": 0.0001, + "loss": 1.3101, + "step": 12074 + }, + { + "epoch": 1.4028463549230321, + "grad_norm": 0.5442948937416077, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 12075 + }, + { + "epoch": 1.4029625326749926, + "grad_norm": 0.5658257007598877, + "learning_rate": 0.0001, + "loss": 1.3957, + "step": 12076 + }, + { + "epoch": 1.4030787104269533, + "grad_norm": 0.595670759677887, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 12077 + }, + { + "epoch": 1.4031948881789138, + "grad_norm": 0.6052472591400146, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 12078 + }, + { + "epoch": 1.4033110659308743, + "grad_norm": 0.571075975894928, + "learning_rate": 0.0001, + "loss": 1.3262, + "step": 12079 + }, + { + "epoch": 1.4034272436828348, + "grad_norm": 0.5579153299331665, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 12080 + }, + { + "epoch": 1.4035434214347953, + "grad_norm": 0.5987997055053711, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 12081 + }, + { + "epoch": 1.4036595991867558, + "grad_norm": 0.5797942280769348, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 12082 + }, + { + "epoch": 1.4037757769387162, + "grad_norm": 0.5952906012535095, + "learning_rate": 0.0001, + "loss": 1.3313, + "step": 12083 + }, + { + "epoch": 1.4038919546906767, + "grad_norm": 0.6068854331970215, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 12084 + }, + { + "epoch": 1.4040081324426372, + "grad_norm": 0.5405552983283997, + "learning_rate": 0.0001, + "loss": 1.3378, + "step": 12085 + }, + { + "epoch": 1.4041243101945977, + "grad_norm": 0.572024405002594, + "learning_rate": 0.0001, + "loss": 1.425, + "step": 12086 + }, + { + "epoch": 1.4042404879465582, + "grad_norm": 0.5662723183631897, + "learning_rate": 0.0001, + "loss": 1.3541, + "step": 12087 + }, + { + "epoch": 1.4043566656985187, + "grad_norm": 0.6154362559318542, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 12088 + }, + { + "epoch": 1.4044728434504792, + "grad_norm": 0.6339954733848572, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 12089 + }, + { + "epoch": 1.4045890212024397, + "grad_norm": 0.6176583170890808, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 12090 + }, + { + "epoch": 1.4047051989544002, + "grad_norm": 0.6026437878608704, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 12091 + }, + { + "epoch": 1.4048213767063609, + "grad_norm": 0.6249407529830933, + "learning_rate": 0.0001, + "loss": 1.4091, + "step": 12092 + }, + { + "epoch": 1.4049375544583214, + "grad_norm": 0.583416223526001, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 12093 + }, + { + "epoch": 1.4050537322102818, + "grad_norm": 0.6077203750610352, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 12094 + }, + { + "epoch": 1.4051699099622423, + "grad_norm": 0.6278978586196899, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 12095 + }, + { + "epoch": 1.4052860877142028, + "grad_norm": 0.5796852707862854, + "learning_rate": 0.0001, + "loss": 1.4089, + "step": 12096 + }, + { + "epoch": 1.4054022654661633, + "grad_norm": 0.6815804839134216, + "learning_rate": 0.0001, + "loss": 1.596, + "step": 12097 + }, + { + "epoch": 1.4055184432181238, + "grad_norm": 0.5999861359596252, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 12098 + }, + { + "epoch": 1.4056346209700843, + "grad_norm": 0.5877710580825806, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 12099 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 0.5974360108375549, + "learning_rate": 0.0001, + "loss": 1.4517, + "step": 12100 + }, + { + "epoch": 1.4058669764740053, + "grad_norm": 0.6353722810745239, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 12101 + }, + { + "epoch": 1.4059831542259658, + "grad_norm": 0.5967050194740295, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 12102 + }, + { + "epoch": 1.4060993319779262, + "grad_norm": 0.5885502099990845, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 12103 + }, + { + "epoch": 1.4062155097298867, + "grad_norm": 0.5896927118301392, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 12104 + }, + { + "epoch": 1.4063316874818472, + "grad_norm": 0.536248505115509, + "learning_rate": 0.0001, + "loss": 1.3211, + "step": 12105 + }, + { + "epoch": 1.4064478652338077, + "grad_norm": 0.5472630858421326, + "learning_rate": 0.0001, + "loss": 1.43, + "step": 12106 + }, + { + "epoch": 1.4065640429857682, + "grad_norm": 0.5885692238807678, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 12107 + }, + { + "epoch": 1.4066802207377287, + "grad_norm": 0.5371417999267578, + "learning_rate": 0.0001, + "loss": 1.187, + "step": 12108 + }, + { + "epoch": 1.4067963984896892, + "grad_norm": 0.5889664888381958, + "learning_rate": 0.0001, + "loss": 1.3359, + "step": 12109 + }, + { + "epoch": 1.4069125762416497, + "grad_norm": 0.6135367751121521, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 12110 + }, + { + "epoch": 1.4070287539936102, + "grad_norm": 0.5663803815841675, + "learning_rate": 0.0001, + "loss": 1.2486, + "step": 12111 + }, + { + "epoch": 1.4071449317455706, + "grad_norm": 0.6089535355567932, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 12112 + }, + { + "epoch": 1.4072611094975311, + "grad_norm": 0.6175603270530701, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 12113 + }, + { + "epoch": 1.4073772872494916, + "grad_norm": 0.5911760330200195, + "learning_rate": 0.0001, + "loss": 1.3986, + "step": 12114 + }, + { + "epoch": 1.4074934650014521, + "grad_norm": 0.6093399524688721, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 12115 + }, + { + "epoch": 1.4076096427534126, + "grad_norm": 0.5803685188293457, + "learning_rate": 0.0001, + "loss": 1.3183, + "step": 12116 + }, + { + "epoch": 1.407725820505373, + "grad_norm": 0.5896522998809814, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 12117 + }, + { + "epoch": 1.4078419982573336, + "grad_norm": 0.573806881904602, + "learning_rate": 0.0001, + "loss": 1.3718, + "step": 12118 + }, + { + "epoch": 1.4079581760092943, + "grad_norm": 0.5594179034233093, + "learning_rate": 0.0001, + "loss": 1.3664, + "step": 12119 + }, + { + "epoch": 1.4080743537612548, + "grad_norm": 0.6025107502937317, + "learning_rate": 0.0001, + "loss": 1.3603, + "step": 12120 + }, + { + "epoch": 1.4081905315132153, + "grad_norm": 0.5635510683059692, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 12121 + }, + { + "epoch": 1.4083067092651758, + "grad_norm": 0.5392552614212036, + "learning_rate": 0.0001, + "loss": 1.3761, + "step": 12122 + }, + { + "epoch": 1.4084228870171362, + "grad_norm": 0.5588886141777039, + "learning_rate": 0.0001, + "loss": 1.4119, + "step": 12123 + }, + { + "epoch": 1.4085390647690967, + "grad_norm": 0.5739279389381409, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 12124 + }, + { + "epoch": 1.4086552425210572, + "grad_norm": 0.5879178047180176, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 12125 + }, + { + "epoch": 1.4087714202730177, + "grad_norm": 0.5897621512413025, + "learning_rate": 0.0001, + "loss": 1.5896, + "step": 12126 + }, + { + "epoch": 1.4088875980249782, + "grad_norm": 0.6275389194488525, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 12127 + }, + { + "epoch": 1.4090037757769387, + "grad_norm": 0.6143684983253479, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 12128 + }, + { + "epoch": 1.4091199535288992, + "grad_norm": 0.5457308292388916, + "learning_rate": 0.0001, + "loss": 1.305, + "step": 12129 + }, + { + "epoch": 1.4092361312808597, + "grad_norm": 0.6137149930000305, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 12130 + }, + { + "epoch": 1.4093523090328202, + "grad_norm": 0.5918616056442261, + "learning_rate": 0.0001, + "loss": 1.3573, + "step": 12131 + }, + { + "epoch": 1.4094684867847806, + "grad_norm": 0.5807372331619263, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 12132 + }, + { + "epoch": 1.4095846645367411, + "grad_norm": 0.6110495328903198, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 12133 + }, + { + "epoch": 1.4097008422887018, + "grad_norm": 0.5787477493286133, + "learning_rate": 0.0001, + "loss": 1.3938, + "step": 12134 + }, + { + "epoch": 1.4098170200406623, + "grad_norm": 0.5760836005210876, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 12135 + }, + { + "epoch": 1.4099331977926228, + "grad_norm": 0.6453091502189636, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 12136 + }, + { + "epoch": 1.4100493755445833, + "grad_norm": 0.6404998898506165, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 12137 + }, + { + "epoch": 1.4101655532965438, + "grad_norm": 0.6601923108100891, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 12138 + }, + { + "epoch": 1.4102817310485043, + "grad_norm": 0.6087546944618225, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 12139 + }, + { + "epoch": 1.4103979088004648, + "grad_norm": 0.5570737719535828, + "learning_rate": 0.0001, + "loss": 1.2991, + "step": 12140 + }, + { + "epoch": 1.4105140865524253, + "grad_norm": 0.5413463115692139, + "learning_rate": 0.0001, + "loss": 1.185, + "step": 12141 + }, + { + "epoch": 1.4106302643043858, + "grad_norm": 0.5919083952903748, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 12142 + }, + { + "epoch": 1.4107464420563463, + "grad_norm": 0.593340277671814, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 12143 + }, + { + "epoch": 1.4108626198083067, + "grad_norm": 0.6711177229881287, + "learning_rate": 0.0001, + "loss": 1.4589, + "step": 12144 + }, + { + "epoch": 1.4109787975602672, + "grad_norm": 0.6454046368598938, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 12145 + }, + { + "epoch": 1.4110949753122277, + "grad_norm": 0.576941967010498, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 12146 + }, + { + "epoch": 1.4112111530641882, + "grad_norm": 0.5717135667800903, + "learning_rate": 0.0001, + "loss": 1.4607, + "step": 12147 + }, + { + "epoch": 1.4113273308161487, + "grad_norm": 0.5811867117881775, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 12148 + }, + { + "epoch": 1.4114435085681092, + "grad_norm": 0.5930108428001404, + "learning_rate": 0.0001, + "loss": 1.425, + "step": 12149 + }, + { + "epoch": 1.4115596863200697, + "grad_norm": 0.6307324767112732, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 12150 + }, + { + "epoch": 1.4116758640720302, + "grad_norm": 0.5676311254501343, + "learning_rate": 0.0001, + "loss": 1.3643, + "step": 12151 + }, + { + "epoch": 1.4117920418239907, + "grad_norm": 0.6175429821014404, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 12152 + }, + { + "epoch": 1.4119082195759511, + "grad_norm": 0.6128252744674683, + "learning_rate": 0.0001, + "loss": 1.5466, + "step": 12153 + }, + { + "epoch": 1.4120243973279116, + "grad_norm": 0.5954226851463318, + "learning_rate": 0.0001, + "loss": 1.4271, + "step": 12154 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 0.6151418089866638, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 12155 + }, + { + "epoch": 1.4122567528318326, + "grad_norm": 0.6414467096328735, + "learning_rate": 0.0001, + "loss": 1.4582, + "step": 12156 + }, + { + "epoch": 1.412372930583793, + "grad_norm": 0.6506933569908142, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 12157 + }, + { + "epoch": 1.4124891083357536, + "grad_norm": 0.603602409362793, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 12158 + }, + { + "epoch": 1.412605286087714, + "grad_norm": 0.5889068245887756, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 12159 + }, + { + "epoch": 1.4127214638396748, + "grad_norm": 0.5658511519432068, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 12160 + }, + { + "epoch": 1.4128376415916353, + "grad_norm": 0.5495668053627014, + "learning_rate": 0.0001, + "loss": 1.3484, + "step": 12161 + }, + { + "epoch": 1.4129538193435958, + "grad_norm": 0.5731701254844666, + "learning_rate": 0.0001, + "loss": 1.3556, + "step": 12162 + }, + { + "epoch": 1.4130699970955563, + "grad_norm": 0.6092509031295776, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 12163 + }, + { + "epoch": 1.4131861748475167, + "grad_norm": 0.6485771536827087, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 12164 + }, + { + "epoch": 1.4133023525994772, + "grad_norm": 0.5795009136199951, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 12165 + }, + { + "epoch": 1.4134185303514377, + "grad_norm": 0.5661119222640991, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 12166 + }, + { + "epoch": 1.4135347081033982, + "grad_norm": 0.5987005829811096, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 12167 + }, + { + "epoch": 1.4136508858553587, + "grad_norm": 0.5951616168022156, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 12168 + }, + { + "epoch": 1.4137670636073192, + "grad_norm": 0.6180946826934814, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 12169 + }, + { + "epoch": 1.4138832413592797, + "grad_norm": 0.5951772928237915, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 12170 + }, + { + "epoch": 1.4139994191112402, + "grad_norm": 0.5945565700531006, + "learning_rate": 0.0001, + "loss": 1.3671, + "step": 12171 + }, + { + "epoch": 1.4141155968632007, + "grad_norm": 0.6298597455024719, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 12172 + }, + { + "epoch": 1.4142317746151611, + "grad_norm": 0.5758437514305115, + "learning_rate": 0.0001, + "loss": 1.478, + "step": 12173 + }, + { + "epoch": 1.4143479523671216, + "grad_norm": 0.6084062457084656, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 12174 + }, + { + "epoch": 1.4144641301190821, + "grad_norm": 0.5881122946739197, + "learning_rate": 0.0001, + "loss": 1.3891, + "step": 12175 + }, + { + "epoch": 1.4145803078710428, + "grad_norm": 0.6065901517868042, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 12176 + }, + { + "epoch": 1.4146964856230033, + "grad_norm": 0.57171231508255, + "learning_rate": 0.0001, + "loss": 1.353, + "step": 12177 + }, + { + "epoch": 1.4148126633749638, + "grad_norm": 0.606834888458252, + "learning_rate": 0.0001, + "loss": 1.4338, + "step": 12178 + }, + { + "epoch": 1.4149288411269243, + "grad_norm": 0.6262538433074951, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 12179 + }, + { + "epoch": 1.4150450188788848, + "grad_norm": 0.6646376252174377, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 12180 + }, + { + "epoch": 1.4151611966308453, + "grad_norm": 0.6121298670768738, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 12181 + }, + { + "epoch": 1.4152773743828058, + "grad_norm": 0.5688852071762085, + "learning_rate": 0.0001, + "loss": 1.4792, + "step": 12182 + }, + { + "epoch": 1.4153935521347663, + "grad_norm": 0.5843722820281982, + "learning_rate": 0.0001, + "loss": 1.3999, + "step": 12183 + }, + { + "epoch": 1.4155097298867267, + "grad_norm": 0.6013531684875488, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 12184 + }, + { + "epoch": 1.4156259076386872, + "grad_norm": 0.5897423028945923, + "learning_rate": 0.0001, + "loss": 1.4682, + "step": 12185 + }, + { + "epoch": 1.4157420853906477, + "grad_norm": 0.6187875270843506, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 12186 + }, + { + "epoch": 1.4158582631426082, + "grad_norm": 0.5957651734352112, + "learning_rate": 0.0001, + "loss": 1.4374, + "step": 12187 + }, + { + "epoch": 1.4159744408945687, + "grad_norm": 0.6102905869483948, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 12188 + }, + { + "epoch": 1.4160906186465292, + "grad_norm": 0.5826073288917542, + "learning_rate": 0.0001, + "loss": 1.3984, + "step": 12189 + }, + { + "epoch": 1.4162067963984897, + "grad_norm": 0.6257545948028564, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 12190 + }, + { + "epoch": 1.4163229741504502, + "grad_norm": 0.6139261722564697, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 12191 + }, + { + "epoch": 1.4164391519024107, + "grad_norm": 0.6307207942008972, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 12192 + }, + { + "epoch": 1.4165553296543711, + "grad_norm": 0.603281557559967, + "learning_rate": 0.0001, + "loss": 1.4784, + "step": 12193 + }, + { + "epoch": 1.4166715074063316, + "grad_norm": 0.5983949303627014, + "learning_rate": 0.0001, + "loss": 1.4863, + "step": 12194 + }, + { + "epoch": 1.4167876851582921, + "grad_norm": 0.5481703281402588, + "learning_rate": 0.0001, + "loss": 1.4079, + "step": 12195 + }, + { + "epoch": 1.4169038629102526, + "grad_norm": 0.6520917415618896, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 12196 + }, + { + "epoch": 1.417020040662213, + "grad_norm": 0.553615927696228, + "learning_rate": 0.0001, + "loss": 1.3623, + "step": 12197 + }, + { + "epoch": 1.4171362184141736, + "grad_norm": 0.5994259119033813, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 12198 + }, + { + "epoch": 1.417252396166134, + "grad_norm": 0.5477364659309387, + "learning_rate": 0.0001, + "loss": 1.3191, + "step": 12199 + }, + { + "epoch": 1.4173685739180946, + "grad_norm": 0.6125954389572144, + "learning_rate": 0.0001, + "loss": 1.4055, + "step": 12200 + }, + { + "epoch": 1.417484751670055, + "grad_norm": 0.642378032207489, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 12201 + }, + { + "epoch": 1.4176009294220158, + "grad_norm": 0.5678790807723999, + "learning_rate": 0.0001, + "loss": 1.3472, + "step": 12202 + }, + { + "epoch": 1.4177171071739763, + "grad_norm": 0.6221288442611694, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 12203 + }, + { + "epoch": 1.4178332849259367, + "grad_norm": 0.6019090414047241, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 12204 + }, + { + "epoch": 1.4179494626778972, + "grad_norm": 0.6357331275939941, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 12205 + }, + { + "epoch": 1.4180656404298577, + "grad_norm": 0.5553552508354187, + "learning_rate": 0.0001, + "loss": 1.2786, + "step": 12206 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 0.5938397645950317, + "learning_rate": 0.0001, + "loss": 1.4726, + "step": 12207 + }, + { + "epoch": 1.4182979959337787, + "grad_norm": 0.5866955518722534, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 12208 + }, + { + "epoch": 1.4184141736857392, + "grad_norm": 0.5551325678825378, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 12209 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 0.5848428606987, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 12210 + }, + { + "epoch": 1.4186465291896602, + "grad_norm": 0.5819724798202515, + "learning_rate": 0.0001, + "loss": 1.288, + "step": 12211 + }, + { + "epoch": 1.4187627069416207, + "grad_norm": 0.5821203589439392, + "learning_rate": 0.0001, + "loss": 1.3893, + "step": 12212 + }, + { + "epoch": 1.4188788846935811, + "grad_norm": 0.6024807691574097, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 12213 + }, + { + "epoch": 1.4189950624455416, + "grad_norm": 0.5902191996574402, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 12214 + }, + { + "epoch": 1.4191112401975021, + "grad_norm": 0.5712905526161194, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 12215 + }, + { + "epoch": 1.4192274179494626, + "grad_norm": 0.5864148139953613, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 12216 + }, + { + "epoch": 1.419343595701423, + "grad_norm": 0.5870895385742188, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 12217 + }, + { + "epoch": 1.4194597734533838, + "grad_norm": 0.6374808549880981, + "learning_rate": 0.0001, + "loss": 1.6601, + "step": 12218 + }, + { + "epoch": 1.4195759512053443, + "grad_norm": 0.5821941494941711, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 12219 + }, + { + "epoch": 1.4196921289573048, + "grad_norm": 0.6282082796096802, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 12220 + }, + { + "epoch": 1.4198083067092653, + "grad_norm": 0.5595186352729797, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 12221 + }, + { + "epoch": 1.4199244844612258, + "grad_norm": 0.5783095359802246, + "learning_rate": 0.0001, + "loss": 1.2876, + "step": 12222 + }, + { + "epoch": 1.4200406622131863, + "grad_norm": 0.6268340349197388, + "learning_rate": 0.0001, + "loss": 1.2669, + "step": 12223 + }, + { + "epoch": 1.4201568399651467, + "grad_norm": 0.6090834140777588, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 12224 + }, + { + "epoch": 1.4202730177171072, + "grad_norm": 0.616369903087616, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 12225 + }, + { + "epoch": 1.4203891954690677, + "grad_norm": 0.5462055802345276, + "learning_rate": 0.0001, + "loss": 1.3814, + "step": 12226 + }, + { + "epoch": 1.4205053732210282, + "grad_norm": 0.6000543236732483, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 12227 + }, + { + "epoch": 1.4206215509729887, + "grad_norm": 0.5703785419464111, + "learning_rate": 0.0001, + "loss": 1.3543, + "step": 12228 + }, + { + "epoch": 1.4207377287249492, + "grad_norm": 0.6079279780387878, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 12229 + }, + { + "epoch": 1.4208539064769097, + "grad_norm": 0.5839101076126099, + "learning_rate": 0.0001, + "loss": 1.3544, + "step": 12230 + }, + { + "epoch": 1.4209700842288702, + "grad_norm": 0.5797370672225952, + "learning_rate": 0.0001, + "loss": 1.3705, + "step": 12231 + }, + { + "epoch": 1.4210862619808307, + "grad_norm": 0.6282606720924377, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 12232 + }, + { + "epoch": 1.4212024397327911, + "grad_norm": 0.6144933700561523, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 12233 + }, + { + "epoch": 1.4213186174847516, + "grad_norm": 0.5617024898529053, + "learning_rate": 0.0001, + "loss": 1.4082, + "step": 12234 + }, + { + "epoch": 1.4214347952367121, + "grad_norm": 0.6399552226066589, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 12235 + }, + { + "epoch": 1.4215509729886726, + "grad_norm": 0.5742717385292053, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 12236 + }, + { + "epoch": 1.421667150740633, + "grad_norm": 0.5643975734710693, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 12237 + }, + { + "epoch": 1.4217833284925936, + "grad_norm": 0.593525230884552, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 12238 + }, + { + "epoch": 1.421899506244554, + "grad_norm": 0.5971342325210571, + "learning_rate": 0.0001, + "loss": 1.488, + "step": 12239 + }, + { + "epoch": 1.4220156839965146, + "grad_norm": 0.6006376147270203, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 12240 + }, + { + "epoch": 1.422131861748475, + "grad_norm": 0.6138136982917786, + "learning_rate": 0.0001, + "loss": 1.4406, + "step": 12241 + }, + { + "epoch": 1.4222480395004355, + "grad_norm": 0.5824870467185974, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 12242 + }, + { + "epoch": 1.422364217252396, + "grad_norm": 0.6046587824821472, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 12243 + }, + { + "epoch": 1.4224803950043567, + "grad_norm": 0.5978199243545532, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 12244 + }, + { + "epoch": 1.4225965727563172, + "grad_norm": 0.5881246328353882, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 12245 + }, + { + "epoch": 1.4227127505082777, + "grad_norm": 0.5994898676872253, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 12246 + }, + { + "epoch": 1.4228289282602382, + "grad_norm": 0.6213389039039612, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 12247 + }, + { + "epoch": 1.4229451060121987, + "grad_norm": 0.5992786884307861, + "learning_rate": 0.0001, + "loss": 1.3701, + "step": 12248 + }, + { + "epoch": 1.4230612837641592, + "grad_norm": 0.5737839341163635, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 12249 + }, + { + "epoch": 1.4231774615161197, + "grad_norm": 0.6159092783927917, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 12250 + }, + { + "epoch": 1.4232936392680802, + "grad_norm": 0.596993625164032, + "learning_rate": 0.0001, + "loss": 1.3847, + "step": 12251 + }, + { + "epoch": 1.4234098170200407, + "grad_norm": 0.5880178213119507, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 12252 + }, + { + "epoch": 1.4235259947720011, + "grad_norm": 0.6582285165786743, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 12253 + }, + { + "epoch": 1.4236421725239616, + "grad_norm": 0.608213484287262, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 12254 + }, + { + "epoch": 1.4237583502759221, + "grad_norm": 0.5370960235595703, + "learning_rate": 0.0001, + "loss": 1.4218, + "step": 12255 + }, + { + "epoch": 1.4238745280278826, + "grad_norm": 0.6302644610404968, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 12256 + }, + { + "epoch": 1.423990705779843, + "grad_norm": 0.5852667093276978, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 12257 + }, + { + "epoch": 1.4241068835318036, + "grad_norm": 0.6241486072540283, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 12258 + }, + { + "epoch": 1.424223061283764, + "grad_norm": 0.5911672711372375, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 12259 + }, + { + "epoch": 1.4243392390357248, + "grad_norm": 0.5758203268051147, + "learning_rate": 0.0001, + "loss": 1.4032, + "step": 12260 + }, + { + "epoch": 1.4244554167876853, + "grad_norm": 0.6107878088951111, + "learning_rate": 0.0001, + "loss": 1.5565, + "step": 12261 + }, + { + "epoch": 1.4245715945396458, + "grad_norm": 0.5618066787719727, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 12262 + }, + { + "epoch": 1.4246877722916063, + "grad_norm": 0.6158115863800049, + "learning_rate": 0.0001, + "loss": 1.8096, + "step": 12263 + }, + { + "epoch": 1.4248039500435667, + "grad_norm": 0.6369965672492981, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 12264 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 0.6416753530502319, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 12265 + }, + { + "epoch": 1.4250363055474877, + "grad_norm": 0.5738315582275391, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 12266 + }, + { + "epoch": 1.4251524832994482, + "grad_norm": 0.559740424156189, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 12267 + }, + { + "epoch": 1.4252686610514087, + "grad_norm": 0.5917907953262329, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 12268 + }, + { + "epoch": 1.4253848388033692, + "grad_norm": 0.5424224734306335, + "learning_rate": 0.0001, + "loss": 1.3014, + "step": 12269 + }, + { + "epoch": 1.4255010165553297, + "grad_norm": 0.5904528498649597, + "learning_rate": 0.0001, + "loss": 1.4219, + "step": 12270 + }, + { + "epoch": 1.4256171943072902, + "grad_norm": 0.6030588746070862, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 12271 + }, + { + "epoch": 1.4257333720592507, + "grad_norm": 0.5945565104484558, + "learning_rate": 0.0001, + "loss": 1.3429, + "step": 12272 + }, + { + "epoch": 1.4258495498112111, + "grad_norm": 0.6340063810348511, + "learning_rate": 0.0001, + "loss": 1.5204, + "step": 12273 + }, + { + "epoch": 1.4259657275631716, + "grad_norm": 0.5972619652748108, + "learning_rate": 0.0001, + "loss": 1.2667, + "step": 12274 + }, + { + "epoch": 1.4260819053151321, + "grad_norm": 0.6026214361190796, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 12275 + }, + { + "epoch": 1.4261980830670926, + "grad_norm": 0.6557515859603882, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 12276 + }, + { + "epoch": 1.426314260819053, + "grad_norm": 0.6378829479217529, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 12277 + }, + { + "epoch": 1.4264304385710136, + "grad_norm": 0.655852735042572, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 12278 + }, + { + "epoch": 1.426546616322974, + "grad_norm": 0.5908859968185425, + "learning_rate": 0.0001, + "loss": 1.3729, + "step": 12279 + }, + { + "epoch": 1.4266627940749346, + "grad_norm": 0.6130912899971008, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 12280 + }, + { + "epoch": 1.426778971826895, + "grad_norm": 0.5807083249092102, + "learning_rate": 0.0001, + "loss": 1.4061, + "step": 12281 + }, + { + "epoch": 1.4268951495788555, + "grad_norm": 0.6125562191009521, + "learning_rate": 0.0001, + "loss": 1.3565, + "step": 12282 + }, + { + "epoch": 1.427011327330816, + "grad_norm": 0.5933031439781189, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 12283 + }, + { + "epoch": 1.4271275050827765, + "grad_norm": 0.5931923389434814, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 12284 + }, + { + "epoch": 1.427243682834737, + "grad_norm": 0.5963932275772095, + "learning_rate": 0.0001, + "loss": 1.3515, + "step": 12285 + }, + { + "epoch": 1.4273598605866977, + "grad_norm": 0.5983773469924927, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 12286 + }, + { + "epoch": 1.4274760383386582, + "grad_norm": 0.5747543573379517, + "learning_rate": 0.0001, + "loss": 1.398, + "step": 12287 + }, + { + "epoch": 1.4275922160906187, + "grad_norm": 0.6180018782615662, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 12288 + }, + { + "epoch": 1.4277083938425792, + "grad_norm": 0.6087707281112671, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 12289 + }, + { + "epoch": 1.4278245715945397, + "grad_norm": 0.5981317758560181, + "learning_rate": 0.0001, + "loss": 1.4278, + "step": 12290 + }, + { + "epoch": 1.4279407493465002, + "grad_norm": 0.5648661255836487, + "learning_rate": 0.0001, + "loss": 1.2978, + "step": 12291 + }, + { + "epoch": 1.4280569270984607, + "grad_norm": 0.6909130215644836, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 12292 + }, + { + "epoch": 1.4281731048504211, + "grad_norm": 0.603410005569458, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 12293 + }, + { + "epoch": 1.4282892826023816, + "grad_norm": 0.5939321517944336, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 12294 + }, + { + "epoch": 1.4284054603543421, + "grad_norm": 0.5982928276062012, + "learning_rate": 0.0001, + "loss": 1.4063, + "step": 12295 + }, + { + "epoch": 1.4285216381063026, + "grad_norm": 0.6024434566497803, + "learning_rate": 0.0001, + "loss": 1.3077, + "step": 12296 + }, + { + "epoch": 1.428637815858263, + "grad_norm": 0.656810998916626, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 12297 + }, + { + "epoch": 1.4287539936102236, + "grad_norm": 0.6280006170272827, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 12298 + }, + { + "epoch": 1.428870171362184, + "grad_norm": 0.6182981729507446, + "learning_rate": 0.0001, + "loss": 1.4683, + "step": 12299 + }, + { + "epoch": 1.4289863491141446, + "grad_norm": 0.6073260307312012, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 12300 + }, + { + "epoch": 1.429102526866105, + "grad_norm": 0.6266109943389893, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 12301 + }, + { + "epoch": 1.4292187046180658, + "grad_norm": 0.5566325187683105, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 12302 + }, + { + "epoch": 1.4293348823700263, + "grad_norm": 0.6451058387756348, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 12303 + }, + { + "epoch": 1.4294510601219867, + "grad_norm": 0.6131970882415771, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 12304 + }, + { + "epoch": 1.4295672378739472, + "grad_norm": 0.5836344361305237, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 12305 + }, + { + "epoch": 1.4296834156259077, + "grad_norm": 0.6282364726066589, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 12306 + }, + { + "epoch": 1.4297995933778682, + "grad_norm": 0.5855079889297485, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 12307 + }, + { + "epoch": 1.4299157711298287, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 12308 + }, + { + "epoch": 1.4300319488817892, + "grad_norm": 0.5874128341674805, + "learning_rate": 0.0001, + "loss": 1.5141, + "step": 12309 + }, + { + "epoch": 1.4301481266337497, + "grad_norm": 0.6407783031463623, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 12310 + }, + { + "epoch": 1.4302643043857102, + "grad_norm": 0.6048932671546936, + "learning_rate": 0.0001, + "loss": 1.4187, + "step": 12311 + }, + { + "epoch": 1.4303804821376707, + "grad_norm": 0.6171375513076782, + "learning_rate": 0.0001, + "loss": 1.4019, + "step": 12312 + }, + { + "epoch": 1.4304966598896312, + "grad_norm": 0.5970510244369507, + "learning_rate": 0.0001, + "loss": 1.3356, + "step": 12313 + }, + { + "epoch": 1.4306128376415916, + "grad_norm": 0.5908089280128479, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 12314 + }, + { + "epoch": 1.4307290153935521, + "grad_norm": 0.5906482338905334, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 12315 + }, + { + "epoch": 1.4308451931455126, + "grad_norm": 0.5727223753929138, + "learning_rate": 0.0001, + "loss": 1.2765, + "step": 12316 + }, + { + "epoch": 1.430961370897473, + "grad_norm": 0.5993191599845886, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 12317 + }, + { + "epoch": 1.4310775486494336, + "grad_norm": 0.6337776184082031, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 12318 + }, + { + "epoch": 1.431193726401394, + "grad_norm": 0.6048466563224792, + "learning_rate": 0.0001, + "loss": 1.3284, + "step": 12319 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 0.5903589129447937, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 12320 + }, + { + "epoch": 1.431426081905315, + "grad_norm": 0.5887618064880371, + "learning_rate": 0.0001, + "loss": 1.3265, + "step": 12321 + }, + { + "epoch": 1.4315422596572756, + "grad_norm": 0.6316114068031311, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 12322 + }, + { + "epoch": 1.431658437409236, + "grad_norm": 0.6685931086540222, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 12323 + }, + { + "epoch": 1.4317746151611965, + "grad_norm": 0.6843975186347961, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 12324 + }, + { + "epoch": 1.431890792913157, + "grad_norm": 0.5919124484062195, + "learning_rate": 0.0001, + "loss": 1.3703, + "step": 12325 + }, + { + "epoch": 1.4320069706651175, + "grad_norm": 0.5988925695419312, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 12326 + }, + { + "epoch": 1.432123148417078, + "grad_norm": 0.574201762676239, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 12327 + }, + { + "epoch": 1.4322393261690387, + "grad_norm": 0.6083642840385437, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 12328 + }, + { + "epoch": 1.4323555039209992, + "grad_norm": 0.5715479850769043, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 12329 + }, + { + "epoch": 1.4324716816729597, + "grad_norm": 0.6147144436836243, + "learning_rate": 0.0001, + "loss": 1.5643, + "step": 12330 + }, + { + "epoch": 1.4325878594249202, + "grad_norm": 0.6399287581443787, + "learning_rate": 0.0001, + "loss": 1.2921, + "step": 12331 + }, + { + "epoch": 1.4327040371768807, + "grad_norm": 0.6057835221290588, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 12332 + }, + { + "epoch": 1.4328202149288412, + "grad_norm": 0.6630198955535889, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 12333 + }, + { + "epoch": 1.4329363926808016, + "grad_norm": 0.6036765575408936, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 12334 + }, + { + "epoch": 1.4330525704327621, + "grad_norm": 0.5870527029037476, + "learning_rate": 0.0001, + "loss": 1.3427, + "step": 12335 + }, + { + "epoch": 1.4331687481847226, + "grad_norm": 0.6059634685516357, + "learning_rate": 0.0001, + "loss": 1.349, + "step": 12336 + }, + { + "epoch": 1.433284925936683, + "grad_norm": 0.5747280120849609, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 12337 + }, + { + "epoch": 1.4334011036886436, + "grad_norm": 0.6289669275283813, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 12338 + }, + { + "epoch": 1.433517281440604, + "grad_norm": 0.5724921226501465, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 12339 + }, + { + "epoch": 1.4336334591925646, + "grad_norm": 0.6268770098686218, + "learning_rate": 0.0001, + "loss": 1.4344, + "step": 12340 + }, + { + "epoch": 1.433749636944525, + "grad_norm": 0.6141247749328613, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 12341 + }, + { + "epoch": 1.4338658146964856, + "grad_norm": 0.702262282371521, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 12342 + }, + { + "epoch": 1.433981992448446, + "grad_norm": 0.595066249370575, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 12343 + }, + { + "epoch": 1.4340981702004068, + "grad_norm": 0.6045823693275452, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 12344 + }, + { + "epoch": 1.4342143479523672, + "grad_norm": 0.5839776992797852, + "learning_rate": 0.0001, + "loss": 1.4221, + "step": 12345 + }, + { + "epoch": 1.4343305257043277, + "grad_norm": 0.6514957547187805, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 12346 + }, + { + "epoch": 1.4344467034562882, + "grad_norm": 0.5899053812026978, + "learning_rate": 0.0001, + "loss": 1.3677, + "step": 12347 + }, + { + "epoch": 1.4345628812082487, + "grad_norm": 0.5787526369094849, + "learning_rate": 0.0001, + "loss": 1.4326, + "step": 12348 + }, + { + "epoch": 1.4346790589602092, + "grad_norm": 0.5898358821868896, + "learning_rate": 0.0001, + "loss": 1.4003, + "step": 12349 + }, + { + "epoch": 1.4347952367121697, + "grad_norm": 0.6458524465560913, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 12350 + }, + { + "epoch": 1.4349114144641302, + "grad_norm": 0.646244466304779, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 12351 + }, + { + "epoch": 1.4350275922160907, + "grad_norm": 0.5955740213394165, + "learning_rate": 0.0001, + "loss": 1.3983, + "step": 12352 + }, + { + "epoch": 1.4351437699680512, + "grad_norm": 0.5988348722457886, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 12353 + }, + { + "epoch": 1.4352599477200116, + "grad_norm": 0.5847843885421753, + "learning_rate": 0.0001, + "loss": 1.295, + "step": 12354 + }, + { + "epoch": 1.4353761254719721, + "grad_norm": 0.6146335005760193, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 12355 + }, + { + "epoch": 1.4354923032239326, + "grad_norm": 0.604469895362854, + "learning_rate": 0.0001, + "loss": 1.4477, + "step": 12356 + }, + { + "epoch": 1.435608480975893, + "grad_norm": 0.6842860579490662, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 12357 + }, + { + "epoch": 1.4357246587278536, + "grad_norm": 0.5877547860145569, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 12358 + }, + { + "epoch": 1.435840836479814, + "grad_norm": 0.687014102935791, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 12359 + }, + { + "epoch": 1.4359570142317746, + "grad_norm": 0.5968105792999268, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 12360 + }, + { + "epoch": 1.436073191983735, + "grad_norm": 0.5516940355300903, + "learning_rate": 0.0001, + "loss": 1.3209, + "step": 12361 + }, + { + "epoch": 1.4361893697356956, + "grad_norm": 0.6110051274299622, + "learning_rate": 0.0001, + "loss": 1.5701, + "step": 12362 + }, + { + "epoch": 1.436305547487656, + "grad_norm": 0.5829412937164307, + "learning_rate": 0.0001, + "loss": 1.4339, + "step": 12363 + }, + { + "epoch": 1.4364217252396165, + "grad_norm": 0.6736522912979126, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 12364 + }, + { + "epoch": 1.436537902991577, + "grad_norm": 0.5804245471954346, + "learning_rate": 0.0001, + "loss": 1.3728, + "step": 12365 + }, + { + "epoch": 1.4366540807435375, + "grad_norm": 0.6237603425979614, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 12366 + }, + { + "epoch": 1.436770258495498, + "grad_norm": 0.6224261522293091, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 12367 + }, + { + "epoch": 1.4368864362474585, + "grad_norm": 0.599617600440979, + "learning_rate": 0.0001, + "loss": 1.3436, + "step": 12368 + }, + { + "epoch": 1.437002613999419, + "grad_norm": 0.5778672695159912, + "learning_rate": 0.0001, + "loss": 1.3741, + "step": 12369 + }, + { + "epoch": 1.4371187917513797, + "grad_norm": 0.6169531941413879, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 12370 + }, + { + "epoch": 1.4372349695033402, + "grad_norm": 0.6243462562561035, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 12371 + }, + { + "epoch": 1.4373511472553007, + "grad_norm": 0.6082959175109863, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 12372 + }, + { + "epoch": 1.4374673250072612, + "grad_norm": 0.5401190519332886, + "learning_rate": 0.0001, + "loss": 1.303, + "step": 12373 + }, + { + "epoch": 1.4375835027592216, + "grad_norm": 0.5593122243881226, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 12374 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 0.6530990600585938, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 12375 + }, + { + "epoch": 1.4378158582631426, + "grad_norm": 0.5983834266662598, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 12376 + }, + { + "epoch": 1.437932036015103, + "grad_norm": 0.6394164562225342, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 12377 + }, + { + "epoch": 1.4380482137670636, + "grad_norm": 0.5918828845024109, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 12378 + }, + { + "epoch": 1.438164391519024, + "grad_norm": 0.5928050875663757, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 12379 + }, + { + "epoch": 1.4382805692709846, + "grad_norm": 0.6321570873260498, + "learning_rate": 0.0001, + "loss": 1.499, + "step": 12380 + }, + { + "epoch": 1.438396747022945, + "grad_norm": 0.5853205323219299, + "learning_rate": 0.0001, + "loss": 1.4488, + "step": 12381 + }, + { + "epoch": 1.4385129247749056, + "grad_norm": 0.5675798058509827, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 12382 + }, + { + "epoch": 1.438629102526866, + "grad_norm": 0.6351810097694397, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 12383 + }, + { + "epoch": 1.4387452802788265, + "grad_norm": 0.575139582157135, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 12384 + }, + { + "epoch": 1.438861458030787, + "grad_norm": 0.625085711479187, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 12385 + }, + { + "epoch": 1.4389776357827477, + "grad_norm": 0.5660273432731628, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 12386 + }, + { + "epoch": 1.4390938135347082, + "grad_norm": 0.642395555973053, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 12387 + }, + { + "epoch": 1.4392099912866687, + "grad_norm": 0.5889819860458374, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 12388 + }, + { + "epoch": 1.4393261690386292, + "grad_norm": 0.5822515487670898, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 12389 + }, + { + "epoch": 1.4394423467905897, + "grad_norm": 0.5766950249671936, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 12390 + }, + { + "epoch": 1.4395585245425502, + "grad_norm": 0.6112682819366455, + "learning_rate": 0.0001, + "loss": 1.3334, + "step": 12391 + }, + { + "epoch": 1.4396747022945107, + "grad_norm": 0.6508562564849854, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 12392 + }, + { + "epoch": 1.4397908800464712, + "grad_norm": 0.6107341647148132, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 12393 + }, + { + "epoch": 1.4399070577984316, + "grad_norm": 0.5927090048789978, + "learning_rate": 0.0001, + "loss": 1.3687, + "step": 12394 + }, + { + "epoch": 1.4400232355503921, + "grad_norm": 0.6269469261169434, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 12395 + }, + { + "epoch": 1.4401394133023526, + "grad_norm": 0.6517143249511719, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 12396 + }, + { + "epoch": 1.4402555910543131, + "grad_norm": 0.5869902968406677, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 12397 + }, + { + "epoch": 1.4403717688062736, + "grad_norm": 0.6312974691390991, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 12398 + }, + { + "epoch": 1.440487946558234, + "grad_norm": 0.5878767371177673, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 12399 + }, + { + "epoch": 1.4406041243101946, + "grad_norm": 0.6097482442855835, + "learning_rate": 0.0001, + "loss": 1.5549, + "step": 12400 + }, + { + "epoch": 1.440720302062155, + "grad_norm": 0.5648863315582275, + "learning_rate": 0.0001, + "loss": 1.2942, + "step": 12401 + }, + { + "epoch": 1.4408364798141156, + "grad_norm": 0.5527321100234985, + "learning_rate": 0.0001, + "loss": 1.2734, + "step": 12402 + }, + { + "epoch": 1.440952657566076, + "grad_norm": 0.587592601776123, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 12403 + }, + { + "epoch": 1.4410688353180365, + "grad_norm": 0.6088840961456299, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 12404 + }, + { + "epoch": 1.441185013069997, + "grad_norm": 0.5852679014205933, + "learning_rate": 0.0001, + "loss": 1.3525, + "step": 12405 + }, + { + "epoch": 1.4413011908219575, + "grad_norm": 0.65325528383255, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 12406 + }, + { + "epoch": 1.441417368573918, + "grad_norm": 0.6041934490203857, + "learning_rate": 0.0001, + "loss": 1.4435, + "step": 12407 + }, + { + "epoch": 1.4415335463258785, + "grad_norm": 0.6076403856277466, + "learning_rate": 0.0001, + "loss": 1.3845, + "step": 12408 + }, + { + "epoch": 1.441649724077839, + "grad_norm": 0.6228477954864502, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 12409 + }, + { + "epoch": 1.4417659018297995, + "grad_norm": 0.6112052202224731, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 12410 + }, + { + "epoch": 1.44188207958176, + "grad_norm": 0.5732336640357971, + "learning_rate": 0.0001, + "loss": 1.2829, + "step": 12411 + }, + { + "epoch": 1.4419982573337207, + "grad_norm": 0.6284494400024414, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 12412 + }, + { + "epoch": 1.4421144350856812, + "grad_norm": 0.5829569101333618, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 12413 + }, + { + "epoch": 1.4422306128376416, + "grad_norm": 0.6057117581367493, + "learning_rate": 0.0001, + "loss": 1.4082, + "step": 12414 + }, + { + "epoch": 1.4423467905896021, + "grad_norm": 0.6320570111274719, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 12415 + }, + { + "epoch": 1.4424629683415626, + "grad_norm": 0.6196115612983704, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 12416 + }, + { + "epoch": 1.4425791460935231, + "grad_norm": 0.5623551607131958, + "learning_rate": 0.0001, + "loss": 1.3886, + "step": 12417 + }, + { + "epoch": 1.4426953238454836, + "grad_norm": 0.6607520580291748, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 12418 + }, + { + "epoch": 1.442811501597444, + "grad_norm": 0.617299497127533, + "learning_rate": 0.0001, + "loss": 1.3807, + "step": 12419 + }, + { + "epoch": 1.4429276793494046, + "grad_norm": 0.6651842594146729, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 12420 + }, + { + "epoch": 1.443043857101365, + "grad_norm": 0.5932329893112183, + "learning_rate": 0.0001, + "loss": 1.4479, + "step": 12421 + }, + { + "epoch": 1.4431600348533256, + "grad_norm": 0.6579514145851135, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 12422 + }, + { + "epoch": 1.443276212605286, + "grad_norm": 0.664814829826355, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 12423 + }, + { + "epoch": 1.4433923903572465, + "grad_norm": 0.6053681373596191, + "learning_rate": 0.0001, + "loss": 1.4038, + "step": 12424 + }, + { + "epoch": 1.443508568109207, + "grad_norm": 0.5771030187606812, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 12425 + }, + { + "epoch": 1.4436247458611675, + "grad_norm": 0.594408392906189, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 12426 + }, + { + "epoch": 1.4437409236131282, + "grad_norm": 0.6161181330680847, + "learning_rate": 0.0001, + "loss": 1.3431, + "step": 12427 + }, + { + "epoch": 1.4438571013650887, + "grad_norm": 0.6196521520614624, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 12428 + }, + { + "epoch": 1.4439732791170492, + "grad_norm": 0.6354605555534363, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 12429 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 0.6167144179344177, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 12430 + }, + { + "epoch": 1.4442056346209702, + "grad_norm": 0.6271317601203918, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 12431 + }, + { + "epoch": 1.4443218123729307, + "grad_norm": 0.6042360067367554, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 12432 + }, + { + "epoch": 1.4444379901248912, + "grad_norm": 0.5653975605964661, + "learning_rate": 0.0001, + "loss": 1.3804, + "step": 12433 + }, + { + "epoch": 1.4445541678768516, + "grad_norm": 0.5969522595405579, + "learning_rate": 0.0001, + "loss": 1.5427, + "step": 12434 + }, + { + "epoch": 1.4446703456288121, + "grad_norm": 0.5756595730781555, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 12435 + }, + { + "epoch": 1.4447865233807726, + "grad_norm": 0.5508007407188416, + "learning_rate": 0.0001, + "loss": 1.3658, + "step": 12436 + }, + { + "epoch": 1.4449027011327331, + "grad_norm": 0.578050971031189, + "learning_rate": 0.0001, + "loss": 1.4602, + "step": 12437 + }, + { + "epoch": 1.4450188788846936, + "grad_norm": 0.5895441174507141, + "learning_rate": 0.0001, + "loss": 1.3945, + "step": 12438 + }, + { + "epoch": 1.445135056636654, + "grad_norm": 0.5890359282493591, + "learning_rate": 0.0001, + "loss": 1.3473, + "step": 12439 + }, + { + "epoch": 1.4452512343886146, + "grad_norm": 0.5464291572570801, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 12440 + }, + { + "epoch": 1.445367412140575, + "grad_norm": 0.6562072038650513, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 12441 + }, + { + "epoch": 1.4454835898925356, + "grad_norm": 0.5997947454452515, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 12442 + }, + { + "epoch": 1.445599767644496, + "grad_norm": 0.5633268356323242, + "learning_rate": 0.0001, + "loss": 1.2866, + "step": 12443 + }, + { + "epoch": 1.4457159453964565, + "grad_norm": 0.5751778483390808, + "learning_rate": 0.0001, + "loss": 1.3926, + "step": 12444 + }, + { + "epoch": 1.445832123148417, + "grad_norm": 0.6264200210571289, + "learning_rate": 0.0001, + "loss": 1.4634, + "step": 12445 + }, + { + "epoch": 1.4459483009003775, + "grad_norm": 0.5811033844947815, + "learning_rate": 0.0001, + "loss": 1.3373, + "step": 12446 + }, + { + "epoch": 1.446064478652338, + "grad_norm": 0.5755273103713989, + "learning_rate": 0.0001, + "loss": 1.4198, + "step": 12447 + }, + { + "epoch": 1.4461806564042985, + "grad_norm": 0.6127162575721741, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 12448 + }, + { + "epoch": 1.446296834156259, + "grad_norm": 0.631125271320343, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 12449 + }, + { + "epoch": 1.4464130119082195, + "grad_norm": 0.6190559267997742, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 12450 + }, + { + "epoch": 1.44652918966018, + "grad_norm": 0.6076474189758301, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 12451 + }, + { + "epoch": 1.4466453674121404, + "grad_norm": 0.6262378692626953, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 12452 + }, + { + "epoch": 1.446761545164101, + "grad_norm": 0.6727213859558105, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 12453 + }, + { + "epoch": 1.4468777229160616, + "grad_norm": 0.5651970505714417, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 12454 + }, + { + "epoch": 1.4469939006680221, + "grad_norm": 0.5852252244949341, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 12455 + }, + { + "epoch": 1.4471100784199826, + "grad_norm": 0.6040740609169006, + "learning_rate": 0.0001, + "loss": 1.3637, + "step": 12456 + }, + { + "epoch": 1.4472262561719431, + "grad_norm": 0.6351321935653687, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 12457 + }, + { + "epoch": 1.4473424339239036, + "grad_norm": 0.5973447561264038, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 12458 + }, + { + "epoch": 1.447458611675864, + "grad_norm": 0.5978641510009766, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 12459 + }, + { + "epoch": 1.4475747894278246, + "grad_norm": 0.6435196995735168, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 12460 + }, + { + "epoch": 1.447690967179785, + "grad_norm": 0.5686025619506836, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 12461 + }, + { + "epoch": 1.4478071449317456, + "grad_norm": 0.5564260482788086, + "learning_rate": 0.0001, + "loss": 1.2317, + "step": 12462 + }, + { + "epoch": 1.447923322683706, + "grad_norm": 0.6230119466781616, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 12463 + }, + { + "epoch": 1.4480395004356665, + "grad_norm": 0.5975537896156311, + "learning_rate": 0.0001, + "loss": 1.4125, + "step": 12464 + }, + { + "epoch": 1.448155678187627, + "grad_norm": 0.5980433821678162, + "learning_rate": 0.0001, + "loss": 1.444, + "step": 12465 + }, + { + "epoch": 1.4482718559395875, + "grad_norm": 0.5679630041122437, + "learning_rate": 0.0001, + "loss": 1.2907, + "step": 12466 + }, + { + "epoch": 1.448388033691548, + "grad_norm": 0.643408477306366, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 12467 + }, + { + "epoch": 1.4485042114435085, + "grad_norm": 0.6029306054115295, + "learning_rate": 0.0001, + "loss": 1.4144, + "step": 12468 + }, + { + "epoch": 1.4486203891954692, + "grad_norm": 0.624788224697113, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 12469 + }, + { + "epoch": 1.4487365669474297, + "grad_norm": 0.6190298199653625, + "learning_rate": 0.0001, + "loss": 1.2963, + "step": 12470 + }, + { + "epoch": 1.4488527446993902, + "grad_norm": 0.6062245965003967, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 12471 + }, + { + "epoch": 1.4489689224513507, + "grad_norm": 0.6660163402557373, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 12472 + }, + { + "epoch": 1.4490851002033112, + "grad_norm": 0.5759906768798828, + "learning_rate": 0.0001, + "loss": 1.3949, + "step": 12473 + }, + { + "epoch": 1.4492012779552716, + "grad_norm": 0.6217337846755981, + "learning_rate": 0.0001, + "loss": 1.483, + "step": 12474 + }, + { + "epoch": 1.4493174557072321, + "grad_norm": 0.5870078802108765, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 12475 + }, + { + "epoch": 1.4494336334591926, + "grad_norm": 0.5740808844566345, + "learning_rate": 0.0001, + "loss": 1.2145, + "step": 12476 + }, + { + "epoch": 1.4495498112111531, + "grad_norm": 0.603854775428772, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 12477 + }, + { + "epoch": 1.4496659889631136, + "grad_norm": 0.5887278914451599, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 12478 + }, + { + "epoch": 1.449782166715074, + "grad_norm": 0.5863503217697144, + "learning_rate": 0.0001, + "loss": 1.4259, + "step": 12479 + }, + { + "epoch": 1.4498983444670346, + "grad_norm": 0.5903075337409973, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 12480 + }, + { + "epoch": 1.450014522218995, + "grad_norm": 0.6201311945915222, + "learning_rate": 0.0001, + "loss": 1.4628, + "step": 12481 + }, + { + "epoch": 1.4501306999709556, + "grad_norm": 0.6051203608512878, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 12482 + }, + { + "epoch": 1.450246877722916, + "grad_norm": 0.6963009834289551, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 12483 + }, + { + "epoch": 1.4503630554748765, + "grad_norm": 0.6066775918006897, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 12484 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 0.5624590516090393, + "learning_rate": 0.0001, + "loss": 1.2795, + "step": 12485 + }, + { + "epoch": 1.4505954109787975, + "grad_norm": 0.5763733983039856, + "learning_rate": 0.0001, + "loss": 1.3443, + "step": 12486 + }, + { + "epoch": 1.450711588730758, + "grad_norm": 0.6045727729797363, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 12487 + }, + { + "epoch": 1.4508277664827185, + "grad_norm": 0.6144154667854309, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 12488 + }, + { + "epoch": 1.450943944234679, + "grad_norm": 0.6358152031898499, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 12489 + }, + { + "epoch": 1.4510601219866395, + "grad_norm": 0.5821871161460876, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 12490 + }, + { + "epoch": 1.4511762997386, + "grad_norm": 0.6219722032546997, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 12491 + }, + { + "epoch": 1.4512924774905605, + "grad_norm": 0.5885584354400635, + "learning_rate": 0.0001, + "loss": 1.3345, + "step": 12492 + }, + { + "epoch": 1.451408655242521, + "grad_norm": 0.6406071782112122, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 12493 + }, + { + "epoch": 1.4515248329944814, + "grad_norm": 0.5714595317840576, + "learning_rate": 0.0001, + "loss": 1.2703, + "step": 12494 + }, + { + "epoch": 1.451641010746442, + "grad_norm": 0.6086097955703735, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 12495 + }, + { + "epoch": 1.4517571884984026, + "grad_norm": 0.6160340309143066, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 12496 + }, + { + "epoch": 1.4518733662503631, + "grad_norm": 0.6223379969596863, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 12497 + }, + { + "epoch": 1.4519895440023236, + "grad_norm": 0.6125460267066956, + "learning_rate": 0.0001, + "loss": 1.3264, + "step": 12498 + }, + { + "epoch": 1.452105721754284, + "grad_norm": 0.6102118492126465, + "learning_rate": 0.0001, + "loss": 1.3831, + "step": 12499 + }, + { + "epoch": 1.4522218995062446, + "grad_norm": 0.6099421381950378, + "learning_rate": 0.0001, + "loss": 1.3743, + "step": 12500 + }, + { + "epoch": 1.452338077258205, + "grad_norm": 0.559723436832428, + "learning_rate": 0.0001, + "loss": 1.3618, + "step": 12501 + }, + { + "epoch": 1.4524542550101656, + "grad_norm": 0.6058681011199951, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 12502 + }, + { + "epoch": 1.452570432762126, + "grad_norm": 0.6532595753669739, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 12503 + }, + { + "epoch": 1.4526866105140865, + "grad_norm": 0.6153339743614197, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 12504 + }, + { + "epoch": 1.452802788266047, + "grad_norm": 0.5688385963439941, + "learning_rate": 0.0001, + "loss": 1.2545, + "step": 12505 + }, + { + "epoch": 1.4529189660180075, + "grad_norm": 0.6670095920562744, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 12506 + }, + { + "epoch": 1.453035143769968, + "grad_norm": 0.6240342259407043, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 12507 + }, + { + "epoch": 1.4531513215219285, + "grad_norm": 0.6448224186897278, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 12508 + }, + { + "epoch": 1.453267499273889, + "grad_norm": 0.6262985467910767, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 12509 + }, + { + "epoch": 1.4533836770258495, + "grad_norm": 0.580199122428894, + "learning_rate": 0.0001, + "loss": 1.3952, + "step": 12510 + }, + { + "epoch": 1.4534998547778102, + "grad_norm": 0.5817636847496033, + "learning_rate": 0.0001, + "loss": 1.3002, + "step": 12511 + }, + { + "epoch": 1.4536160325297707, + "grad_norm": 0.6186819672584534, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 12512 + }, + { + "epoch": 1.4537322102817312, + "grad_norm": 0.6286445260047913, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 12513 + }, + { + "epoch": 1.4538483880336917, + "grad_norm": 0.6268229484558105, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 12514 + }, + { + "epoch": 1.4539645657856521, + "grad_norm": 0.5959900617599487, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 12515 + }, + { + "epoch": 1.4540807435376126, + "grad_norm": 0.6190244555473328, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 12516 + }, + { + "epoch": 1.4541969212895731, + "grad_norm": 0.6077380180358887, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 12517 + }, + { + "epoch": 1.4543130990415336, + "grad_norm": 0.6312915086746216, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 12518 + }, + { + "epoch": 1.454429276793494, + "grad_norm": 0.642156720161438, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 12519 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.6618093252182007, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 12520 + }, + { + "epoch": 1.454661632297415, + "grad_norm": 0.61574786901474, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 12521 + }, + { + "epoch": 1.4547778100493756, + "grad_norm": 0.5520590543746948, + "learning_rate": 0.0001, + "loss": 1.1745, + "step": 12522 + }, + { + "epoch": 1.454893987801336, + "grad_norm": 0.6121432185173035, + "learning_rate": 0.0001, + "loss": 1.4132, + "step": 12523 + }, + { + "epoch": 1.4550101655532965, + "grad_norm": 0.6063690185546875, + "learning_rate": 0.0001, + "loss": 1.3249, + "step": 12524 + }, + { + "epoch": 1.455126343305257, + "grad_norm": 0.6351370811462402, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 12525 + }, + { + "epoch": 1.4552425210572175, + "grad_norm": 0.5611215829849243, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 12526 + }, + { + "epoch": 1.455358698809178, + "grad_norm": 0.6196964383125305, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 12527 + }, + { + "epoch": 1.4554748765611385, + "grad_norm": 0.5891339778900146, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 12528 + }, + { + "epoch": 1.455591054313099, + "grad_norm": 0.634013295173645, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 12529 + }, + { + "epoch": 1.4557072320650595, + "grad_norm": 0.5923905968666077, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 12530 + }, + { + "epoch": 1.45582340981702, + "grad_norm": 0.6321912407875061, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 12531 + }, + { + "epoch": 1.4559395875689805, + "grad_norm": 0.5776755213737488, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 12532 + }, + { + "epoch": 1.456055765320941, + "grad_norm": 0.5691312551498413, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 12533 + }, + { + "epoch": 1.4561719430729014, + "grad_norm": 0.6355810165405273, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 12534 + }, + { + "epoch": 1.456288120824862, + "grad_norm": 0.6114268898963928, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 12535 + }, + { + "epoch": 1.4564042985768224, + "grad_norm": 0.5496631860733032, + "learning_rate": 0.0001, + "loss": 1.3485, + "step": 12536 + }, + { + "epoch": 1.4565204763287831, + "grad_norm": 0.6105073094367981, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 12537 + }, + { + "epoch": 1.4566366540807436, + "grad_norm": 0.5859121680259705, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 12538 + }, + { + "epoch": 1.456752831832704, + "grad_norm": 0.6114695072174072, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 12539 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 0.6075177788734436, + "learning_rate": 0.0001, + "loss": 1.3643, + "step": 12540 + }, + { + "epoch": 1.456985187336625, + "grad_norm": 0.5910870432853699, + "learning_rate": 0.0001, + "loss": 1.4241, + "step": 12541 + }, + { + "epoch": 1.4571013650885856, + "grad_norm": 0.595967710018158, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 12542 + }, + { + "epoch": 1.457217542840546, + "grad_norm": 0.6669548749923706, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 12543 + }, + { + "epoch": 1.4573337205925065, + "grad_norm": 0.6057718396186829, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 12544 + }, + { + "epoch": 1.457449898344467, + "grad_norm": 0.5876947641372681, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 12545 + }, + { + "epoch": 1.4575660760964275, + "grad_norm": 0.6106549501419067, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 12546 + }, + { + "epoch": 1.457682253848388, + "grad_norm": 0.6266193985939026, + "learning_rate": 0.0001, + "loss": 1.2981, + "step": 12547 + }, + { + "epoch": 1.4577984316003485, + "grad_norm": 0.6059079170227051, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 12548 + }, + { + "epoch": 1.457914609352309, + "grad_norm": 0.5994994640350342, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 12549 + }, + { + "epoch": 1.4580307871042695, + "grad_norm": 0.6154032945632935, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 12550 + }, + { + "epoch": 1.45814696485623, + "grad_norm": 0.6104035377502441, + "learning_rate": 0.0001, + "loss": 1.5981, + "step": 12551 + }, + { + "epoch": 1.4582631426081905, + "grad_norm": 0.6142469644546509, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 12552 + }, + { + "epoch": 1.4583793203601512, + "grad_norm": 0.574055552482605, + "learning_rate": 0.0001, + "loss": 1.309, + "step": 12553 + }, + { + "epoch": 1.4584954981121117, + "grad_norm": 0.5898140668869019, + "learning_rate": 0.0001, + "loss": 1.3786, + "step": 12554 + }, + { + "epoch": 1.4586116758640721, + "grad_norm": 0.5797202587127686, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 12555 + }, + { + "epoch": 1.4587278536160326, + "grad_norm": 0.554664134979248, + "learning_rate": 0.0001, + "loss": 1.3166, + "step": 12556 + }, + { + "epoch": 1.4588440313679931, + "grad_norm": 0.6085032820701599, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 12557 + }, + { + "epoch": 1.4589602091199536, + "grad_norm": 0.625577986240387, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 12558 + }, + { + "epoch": 1.459076386871914, + "grad_norm": 0.6222672462463379, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 12559 + }, + { + "epoch": 1.4591925646238746, + "grad_norm": 0.61932772397995, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 12560 + }, + { + "epoch": 1.459308742375835, + "grad_norm": 0.590408444404602, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 12561 + }, + { + "epoch": 1.4594249201277956, + "grad_norm": 0.6052151918411255, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 12562 + }, + { + "epoch": 1.459541097879756, + "grad_norm": 0.5636199712753296, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 12563 + }, + { + "epoch": 1.4596572756317165, + "grad_norm": 0.639377236366272, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 12564 + }, + { + "epoch": 1.459773453383677, + "grad_norm": 0.5761369466781616, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 12565 + }, + { + "epoch": 1.4598896311356375, + "grad_norm": 0.59371018409729, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 12566 + }, + { + "epoch": 1.460005808887598, + "grad_norm": 0.6135494112968445, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 12567 + }, + { + "epoch": 1.4601219866395585, + "grad_norm": 0.6341174840927124, + "learning_rate": 0.0001, + "loss": 1.2997, + "step": 12568 + }, + { + "epoch": 1.460238164391519, + "grad_norm": 0.6097371578216553, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 12569 + }, + { + "epoch": 1.4603543421434795, + "grad_norm": 0.5795385837554932, + "learning_rate": 0.0001, + "loss": 1.3574, + "step": 12570 + }, + { + "epoch": 1.46047051989544, + "grad_norm": 0.618484377861023, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 12571 + }, + { + "epoch": 1.4605866976474005, + "grad_norm": 0.5799589157104492, + "learning_rate": 0.0001, + "loss": 1.4669, + "step": 12572 + }, + { + "epoch": 1.460702875399361, + "grad_norm": 0.6043416261672974, + "learning_rate": 0.0001, + "loss": 1.39, + "step": 12573 + }, + { + "epoch": 1.4608190531513214, + "grad_norm": 0.5822587013244629, + "learning_rate": 0.0001, + "loss": 1.3416, + "step": 12574 + }, + { + "epoch": 1.460935230903282, + "grad_norm": 0.6009185910224915, + "learning_rate": 0.0001, + "loss": 1.3142, + "step": 12575 + }, + { + "epoch": 1.4610514086552424, + "grad_norm": 0.5679329633712769, + "learning_rate": 0.0001, + "loss": 1.3666, + "step": 12576 + }, + { + "epoch": 1.461167586407203, + "grad_norm": 0.6215576529502869, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 12577 + }, + { + "epoch": 1.4612837641591634, + "grad_norm": 0.5975043177604675, + "learning_rate": 0.0001, + "loss": 1.39, + "step": 12578 + }, + { + "epoch": 1.461399941911124, + "grad_norm": 0.5994958281517029, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 12579 + }, + { + "epoch": 1.4615161196630846, + "grad_norm": 0.6470338106155396, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 12580 + }, + { + "epoch": 1.461632297415045, + "grad_norm": 0.7016136050224304, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 12581 + }, + { + "epoch": 1.4617484751670056, + "grad_norm": 0.5727640986442566, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 12582 + }, + { + "epoch": 1.461864652918966, + "grad_norm": 0.5754709839820862, + "learning_rate": 0.0001, + "loss": 1.4004, + "step": 12583 + }, + { + "epoch": 1.4619808306709265, + "grad_norm": 0.5758077502250671, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 12584 + }, + { + "epoch": 1.462097008422887, + "grad_norm": 0.5902067422866821, + "learning_rate": 0.0001, + "loss": 1.3025, + "step": 12585 + }, + { + "epoch": 1.4622131861748475, + "grad_norm": 0.5974416136741638, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 12586 + }, + { + "epoch": 1.462329363926808, + "grad_norm": 0.5915681719779968, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 12587 + }, + { + "epoch": 1.4624455416787685, + "grad_norm": 0.5832463502883911, + "learning_rate": 0.0001, + "loss": 1.4135, + "step": 12588 + }, + { + "epoch": 1.462561719430729, + "grad_norm": 0.6045186519622803, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 12589 + }, + { + "epoch": 1.4626778971826895, + "grad_norm": 0.634339451789856, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 12590 + }, + { + "epoch": 1.46279407493465, + "grad_norm": 0.5863074660301208, + "learning_rate": 0.0001, + "loss": 1.4268, + "step": 12591 + }, + { + "epoch": 1.4629102526866105, + "grad_norm": 0.577804684638977, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 12592 + }, + { + "epoch": 1.463026430438571, + "grad_norm": 0.6123642921447754, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 12593 + }, + { + "epoch": 1.4631426081905314, + "grad_norm": 0.6405830979347229, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 12594 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 0.6347610354423523, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 12595 + }, + { + "epoch": 1.4633749636944526, + "grad_norm": 0.5713246464729309, + "learning_rate": 0.0001, + "loss": 1.4615, + "step": 12596 + }, + { + "epoch": 1.4634911414464131, + "grad_norm": 0.5953773260116577, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 12597 + }, + { + "epoch": 1.4636073191983736, + "grad_norm": 0.5882254838943481, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 12598 + }, + { + "epoch": 1.463723496950334, + "grad_norm": 0.5993982553482056, + "learning_rate": 0.0001, + "loss": 1.3766, + "step": 12599 + }, + { + "epoch": 1.4638396747022946, + "grad_norm": 0.6374693512916565, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 12600 + }, + { + "epoch": 1.463955852454255, + "grad_norm": 0.6188412308692932, + "learning_rate": 0.0001, + "loss": 1.2187, + "step": 12601 + }, + { + "epoch": 1.4640720302062156, + "grad_norm": 0.5834885835647583, + "learning_rate": 0.0001, + "loss": 1.4844, + "step": 12602 + }, + { + "epoch": 1.464188207958176, + "grad_norm": 0.6189852356910706, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 12603 + }, + { + "epoch": 1.4643043857101365, + "grad_norm": 0.6446365714073181, + "learning_rate": 0.0001, + "loss": 1.6013, + "step": 12604 + }, + { + "epoch": 1.464420563462097, + "grad_norm": 0.6015826463699341, + "learning_rate": 0.0001, + "loss": 1.4581, + "step": 12605 + }, + { + "epoch": 1.4645367412140575, + "grad_norm": 0.6048887372016907, + "learning_rate": 0.0001, + "loss": 1.391, + "step": 12606 + }, + { + "epoch": 1.464652918966018, + "grad_norm": 0.5923680663108826, + "learning_rate": 0.0001, + "loss": 1.3832, + "step": 12607 + }, + { + "epoch": 1.4647690967179785, + "grad_norm": 0.6021586656570435, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 12608 + }, + { + "epoch": 1.464885274469939, + "grad_norm": 0.5922961831092834, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 12609 + }, + { + "epoch": 1.4650014522218995, + "grad_norm": 0.5911400318145752, + "learning_rate": 0.0001, + "loss": 1.3201, + "step": 12610 + }, + { + "epoch": 1.46511762997386, + "grad_norm": 0.6118112206459045, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 12611 + }, + { + "epoch": 1.4652338077258205, + "grad_norm": 0.5985094308853149, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 12612 + }, + { + "epoch": 1.465349985477781, + "grad_norm": 0.6050367951393127, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 12613 + }, + { + "epoch": 1.4654661632297414, + "grad_norm": 0.5625033378601074, + "learning_rate": 0.0001, + "loss": 1.1263, + "step": 12614 + }, + { + "epoch": 1.465582340981702, + "grad_norm": 0.6526144742965698, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 12615 + }, + { + "epoch": 1.4656985187336624, + "grad_norm": 0.5911290645599365, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 12616 + }, + { + "epoch": 1.465814696485623, + "grad_norm": 0.610272228717804, + "learning_rate": 0.0001, + "loss": 1.3182, + "step": 12617 + }, + { + "epoch": 1.4659308742375834, + "grad_norm": 0.6235426068305969, + "learning_rate": 0.0001, + "loss": 1.6892, + "step": 12618 + }, + { + "epoch": 1.4660470519895439, + "grad_norm": 0.5913504362106323, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 12619 + }, + { + "epoch": 1.4661632297415044, + "grad_norm": 0.6157997846603394, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 12620 + }, + { + "epoch": 1.466279407493465, + "grad_norm": 0.6300349235534668, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 12621 + }, + { + "epoch": 1.4663955852454256, + "grad_norm": 0.5744130611419678, + "learning_rate": 0.0001, + "loss": 1.2242, + "step": 12622 + }, + { + "epoch": 1.466511762997386, + "grad_norm": 0.6219088435173035, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 12623 + }, + { + "epoch": 1.4666279407493465, + "grad_norm": 0.5827035307884216, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 12624 + }, + { + "epoch": 1.466744118501307, + "grad_norm": 0.6625955104827881, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 12625 + }, + { + "epoch": 1.4668602962532675, + "grad_norm": 0.6402918100357056, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 12626 + }, + { + "epoch": 1.466976474005228, + "grad_norm": 0.5460782647132874, + "learning_rate": 0.0001, + "loss": 1.3716, + "step": 12627 + }, + { + "epoch": 1.4670926517571885, + "grad_norm": 0.5440313220024109, + "learning_rate": 0.0001, + "loss": 1.3949, + "step": 12628 + }, + { + "epoch": 1.467208829509149, + "grad_norm": 0.6203206181526184, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 12629 + }, + { + "epoch": 1.4673250072611095, + "grad_norm": 0.59822678565979, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 12630 + }, + { + "epoch": 1.46744118501307, + "grad_norm": 0.596192479133606, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 12631 + }, + { + "epoch": 1.4675573627650305, + "grad_norm": 0.567420482635498, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 12632 + }, + { + "epoch": 1.467673540516991, + "grad_norm": 0.6413534879684448, + "learning_rate": 0.0001, + "loss": 1.3717, + "step": 12633 + }, + { + "epoch": 1.4677897182689514, + "grad_norm": 0.5863707065582275, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 12634 + }, + { + "epoch": 1.467905896020912, + "grad_norm": 0.5698891282081604, + "learning_rate": 0.0001, + "loss": 1.3382, + "step": 12635 + }, + { + "epoch": 1.4680220737728724, + "grad_norm": 0.6080556511878967, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 12636 + }, + { + "epoch": 1.4681382515248331, + "grad_norm": 0.6110317707061768, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 12637 + }, + { + "epoch": 1.4682544292767936, + "grad_norm": 0.6099866032600403, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 12638 + }, + { + "epoch": 1.468370607028754, + "grad_norm": 0.5823593139648438, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 12639 + }, + { + "epoch": 1.4684867847807146, + "grad_norm": 0.6026428937911987, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 12640 + }, + { + "epoch": 1.468602962532675, + "grad_norm": 0.5801883935928345, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 12641 + }, + { + "epoch": 1.4687191402846356, + "grad_norm": 0.5360670685768127, + "learning_rate": 0.0001, + "loss": 1.3604, + "step": 12642 + }, + { + "epoch": 1.468835318036596, + "grad_norm": 0.5586822032928467, + "learning_rate": 0.0001, + "loss": 1.3045, + "step": 12643 + }, + { + "epoch": 1.4689514957885565, + "grad_norm": 0.5789737701416016, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 12644 + }, + { + "epoch": 1.469067673540517, + "grad_norm": 0.5735986828804016, + "learning_rate": 0.0001, + "loss": 1.3718, + "step": 12645 + }, + { + "epoch": 1.4691838512924775, + "grad_norm": 0.7054757475852966, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 12646 + }, + { + "epoch": 1.469300029044438, + "grad_norm": 0.6231496930122375, + "learning_rate": 0.0001, + "loss": 1.4146, + "step": 12647 + }, + { + "epoch": 1.4694162067963985, + "grad_norm": 0.5634230375289917, + "learning_rate": 0.0001, + "loss": 1.3909, + "step": 12648 + }, + { + "epoch": 1.469532384548359, + "grad_norm": 0.6670734286308289, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 12649 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 0.592263400554657, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 12650 + }, + { + "epoch": 1.46976474005228, + "grad_norm": 0.6188017725944519, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 12651 + }, + { + "epoch": 1.4698809178042405, + "grad_norm": 0.5997496247291565, + "learning_rate": 0.0001, + "loss": 1.4838, + "step": 12652 + }, + { + "epoch": 1.469997095556201, + "grad_norm": 0.6102793216705322, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 12653 + }, + { + "epoch": 1.4701132733081614, + "grad_norm": 0.605833888053894, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 12654 + }, + { + "epoch": 1.470229451060122, + "grad_norm": 0.606594443321228, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 12655 + }, + { + "epoch": 1.4703456288120824, + "grad_norm": 0.5853731036186218, + "learning_rate": 0.0001, + "loss": 1.3908, + "step": 12656 + }, + { + "epoch": 1.470461806564043, + "grad_norm": 0.59587162733078, + "learning_rate": 0.0001, + "loss": 1.343, + "step": 12657 + }, + { + "epoch": 1.4705779843160034, + "grad_norm": 0.6557053327560425, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 12658 + }, + { + "epoch": 1.4706941620679639, + "grad_norm": 0.6285101771354675, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 12659 + }, + { + "epoch": 1.4708103398199244, + "grad_norm": 0.5789119005203247, + "learning_rate": 0.0001, + "loss": 1.2987, + "step": 12660 + }, + { + "epoch": 1.4709265175718849, + "grad_norm": 0.5933101177215576, + "learning_rate": 0.0001, + "loss": 1.3469, + "step": 12661 + }, + { + "epoch": 1.4710426953238454, + "grad_norm": 0.5343286395072937, + "learning_rate": 0.0001, + "loss": 1.3598, + "step": 12662 + }, + { + "epoch": 1.471158873075806, + "grad_norm": 0.6221956610679626, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 12663 + }, + { + "epoch": 1.4712750508277666, + "grad_norm": 0.6174246668815613, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 12664 + }, + { + "epoch": 1.471391228579727, + "grad_norm": 0.6641348600387573, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 12665 + }, + { + "epoch": 1.4715074063316875, + "grad_norm": 0.5612043738365173, + "learning_rate": 0.0001, + "loss": 1.2723, + "step": 12666 + }, + { + "epoch": 1.471623584083648, + "grad_norm": 0.6334659457206726, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 12667 + }, + { + "epoch": 1.4717397618356085, + "grad_norm": 0.5860728025436401, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 12668 + }, + { + "epoch": 1.471855939587569, + "grad_norm": 0.6046940684318542, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 12669 + }, + { + "epoch": 1.4719721173395295, + "grad_norm": 0.5735712051391602, + "learning_rate": 0.0001, + "loss": 1.3659, + "step": 12670 + }, + { + "epoch": 1.47208829509149, + "grad_norm": 0.6405760049819946, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 12671 + }, + { + "epoch": 1.4722044728434505, + "grad_norm": 0.6492209434509277, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 12672 + }, + { + "epoch": 1.472320650595411, + "grad_norm": 0.5880120396614075, + "learning_rate": 0.0001, + "loss": 1.2946, + "step": 12673 + }, + { + "epoch": 1.4724368283473714, + "grad_norm": 0.5945394039154053, + "learning_rate": 0.0001, + "loss": 1.3463, + "step": 12674 + }, + { + "epoch": 1.472553006099332, + "grad_norm": 0.6104127168655396, + "learning_rate": 0.0001, + "loss": 1.4624, + "step": 12675 + }, + { + "epoch": 1.4726691838512924, + "grad_norm": 0.6284909248352051, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 12676 + }, + { + "epoch": 1.472785361603253, + "grad_norm": 0.6103723645210266, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 12677 + }, + { + "epoch": 1.4729015393552134, + "grad_norm": 0.6179032921791077, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 12678 + }, + { + "epoch": 1.473017717107174, + "grad_norm": 0.5750223398208618, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 12679 + }, + { + "epoch": 1.4731338948591346, + "grad_norm": 0.5653707981109619, + "learning_rate": 0.0001, + "loss": 1.2779, + "step": 12680 + }, + { + "epoch": 1.473250072611095, + "grad_norm": 0.6165471076965332, + "learning_rate": 0.0001, + "loss": 1.3156, + "step": 12681 + }, + { + "epoch": 1.4733662503630556, + "grad_norm": 0.5788655281066895, + "learning_rate": 0.0001, + "loss": 1.3653, + "step": 12682 + }, + { + "epoch": 1.473482428115016, + "grad_norm": 0.5805605053901672, + "learning_rate": 0.0001, + "loss": 1.4103, + "step": 12683 + }, + { + "epoch": 1.4735986058669766, + "grad_norm": 0.6233782172203064, + "learning_rate": 0.0001, + "loss": 1.427, + "step": 12684 + }, + { + "epoch": 1.473714783618937, + "grad_norm": 0.6231675744056702, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 12685 + }, + { + "epoch": 1.4738309613708975, + "grad_norm": 0.5923117399215698, + "learning_rate": 0.0001, + "loss": 1.3208, + "step": 12686 + }, + { + "epoch": 1.473947139122858, + "grad_norm": 0.6191889047622681, + "learning_rate": 0.0001, + "loss": 1.4431, + "step": 12687 + }, + { + "epoch": 1.4740633168748185, + "grad_norm": 0.5808857083320618, + "learning_rate": 0.0001, + "loss": 1.4314, + "step": 12688 + }, + { + "epoch": 1.474179494626779, + "grad_norm": 0.6818951964378357, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 12689 + }, + { + "epoch": 1.4742956723787395, + "grad_norm": 0.6566728353500366, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 12690 + }, + { + "epoch": 1.4744118501307, + "grad_norm": 0.6200664639472961, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 12691 + }, + { + "epoch": 1.4745280278826605, + "grad_norm": 0.5786384344100952, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 12692 + }, + { + "epoch": 1.474644205634621, + "grad_norm": 0.5918937921524048, + "learning_rate": 0.0001, + "loss": 1.3155, + "step": 12693 + }, + { + "epoch": 1.4747603833865814, + "grad_norm": 0.6062121987342834, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 12694 + }, + { + "epoch": 1.474876561138542, + "grad_norm": 0.614924967288971, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 12695 + }, + { + "epoch": 1.4749927388905024, + "grad_norm": 0.6098072528839111, + "learning_rate": 0.0001, + "loss": 1.3263, + "step": 12696 + }, + { + "epoch": 1.475108916642463, + "grad_norm": 0.6092824935913086, + "learning_rate": 0.0001, + "loss": 1.4069, + "step": 12697 + }, + { + "epoch": 1.4752250943944234, + "grad_norm": 0.6376292109489441, + "learning_rate": 0.0001, + "loss": 1.4499, + "step": 12698 + }, + { + "epoch": 1.4753412721463839, + "grad_norm": 0.6538935899734497, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 12699 + }, + { + "epoch": 1.4754574498983444, + "grad_norm": 0.6000575423240662, + "learning_rate": 0.0001, + "loss": 1.3261, + "step": 12700 + }, + { + "epoch": 1.4755736276503049, + "grad_norm": 0.6823968887329102, + "learning_rate": 0.0001, + "loss": 1.2802, + "step": 12701 + }, + { + "epoch": 1.4756898054022654, + "grad_norm": 0.6618595123291016, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 12702 + }, + { + "epoch": 1.4758059831542258, + "grad_norm": 0.6516097187995911, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 12703 + }, + { + "epoch": 1.4759221609061863, + "grad_norm": 0.6231207251548767, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 12704 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 0.6442578434944153, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 12705 + }, + { + "epoch": 1.4761545164101075, + "grad_norm": 0.6241633296012878, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 12706 + }, + { + "epoch": 1.476270694162068, + "grad_norm": 0.5882102847099304, + "learning_rate": 0.0001, + "loss": 1.3378, + "step": 12707 + }, + { + "epoch": 1.4763868719140285, + "grad_norm": 0.6080202460289001, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 12708 + }, + { + "epoch": 1.476503049665989, + "grad_norm": 0.6191835999488831, + "learning_rate": 0.0001, + "loss": 1.4236, + "step": 12709 + }, + { + "epoch": 1.4766192274179495, + "grad_norm": 0.5939556360244751, + "learning_rate": 0.0001, + "loss": 1.3395, + "step": 12710 + }, + { + "epoch": 1.47673540516991, + "grad_norm": 0.6249536275863647, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 12711 + }, + { + "epoch": 1.4768515829218705, + "grad_norm": 0.5902261137962341, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 12712 + }, + { + "epoch": 1.476967760673831, + "grad_norm": 0.6220241785049438, + "learning_rate": 0.0001, + "loss": 1.5443, + "step": 12713 + }, + { + "epoch": 1.4770839384257914, + "grad_norm": 0.5737898349761963, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 12714 + }, + { + "epoch": 1.477200116177752, + "grad_norm": 0.6006242036819458, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 12715 + }, + { + "epoch": 1.4773162939297124, + "grad_norm": 0.6238159537315369, + "learning_rate": 0.0001, + "loss": 1.3714, + "step": 12716 + }, + { + "epoch": 1.477432471681673, + "grad_norm": 0.5738762617111206, + "learning_rate": 0.0001, + "loss": 1.3738, + "step": 12717 + }, + { + "epoch": 1.4775486494336334, + "grad_norm": 0.573409914970398, + "learning_rate": 0.0001, + "loss": 1.2755, + "step": 12718 + }, + { + "epoch": 1.4776648271855939, + "grad_norm": 0.5664563775062561, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 12719 + }, + { + "epoch": 1.4777810049375544, + "grad_norm": 0.6218703389167786, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 12720 + }, + { + "epoch": 1.477897182689515, + "grad_norm": 0.5889036655426025, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 12721 + }, + { + "epoch": 1.4780133604414756, + "grad_norm": 0.6417556405067444, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 12722 + }, + { + "epoch": 1.478129538193436, + "grad_norm": 0.5941647291183472, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 12723 + }, + { + "epoch": 1.4782457159453966, + "grad_norm": 0.5559972524642944, + "learning_rate": 0.0001, + "loss": 1.4296, + "step": 12724 + }, + { + "epoch": 1.478361893697357, + "grad_norm": 0.6716883182525635, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 12725 + }, + { + "epoch": 1.4784780714493175, + "grad_norm": 0.582317054271698, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 12726 + }, + { + "epoch": 1.478594249201278, + "grad_norm": 0.5578384399414062, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 12727 + }, + { + "epoch": 1.4787104269532385, + "grad_norm": 0.6159401535987854, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 12728 + }, + { + "epoch": 1.478826604705199, + "grad_norm": 0.5790227055549622, + "learning_rate": 0.0001, + "loss": 1.2968, + "step": 12729 + }, + { + "epoch": 1.4789427824571595, + "grad_norm": 0.5429813265800476, + "learning_rate": 0.0001, + "loss": 1.2731, + "step": 12730 + }, + { + "epoch": 1.47905896020912, + "grad_norm": 0.6098136901855469, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 12731 + }, + { + "epoch": 1.4791751379610805, + "grad_norm": 0.6206817030906677, + "learning_rate": 0.0001, + "loss": 1.3591, + "step": 12732 + }, + { + "epoch": 1.479291315713041, + "grad_norm": 0.6183215975761414, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 12733 + }, + { + "epoch": 1.4794074934650014, + "grad_norm": 0.6770209670066833, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 12734 + }, + { + "epoch": 1.479523671216962, + "grad_norm": 0.6122325658798218, + "learning_rate": 0.0001, + "loss": 1.3449, + "step": 12735 + }, + { + "epoch": 1.4796398489689224, + "grad_norm": 0.6098016500473022, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 12736 + }, + { + "epoch": 1.479756026720883, + "grad_norm": 0.6338397860527039, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 12737 + }, + { + "epoch": 1.4798722044728434, + "grad_norm": 0.6147565841674805, + "learning_rate": 0.0001, + "loss": 1.5078, + "step": 12738 + }, + { + "epoch": 1.4799883822248039, + "grad_norm": 0.622284471988678, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 12739 + }, + { + "epoch": 1.4801045599767644, + "grad_norm": 0.5711101293563843, + "learning_rate": 0.0001, + "loss": 1.4056, + "step": 12740 + }, + { + "epoch": 1.4802207377287249, + "grad_norm": 0.5917078852653503, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 12741 + }, + { + "epoch": 1.4803369154806854, + "grad_norm": 0.5660672783851624, + "learning_rate": 0.0001, + "loss": 1.3994, + "step": 12742 + }, + { + "epoch": 1.4804530932326458, + "grad_norm": 0.586409330368042, + "learning_rate": 0.0001, + "loss": 1.3446, + "step": 12743 + }, + { + "epoch": 1.4805692709846063, + "grad_norm": 0.6024650931358337, + "learning_rate": 0.0001, + "loss": 1.4112, + "step": 12744 + }, + { + "epoch": 1.4806854487365668, + "grad_norm": 0.5979096293449402, + "learning_rate": 0.0001, + "loss": 1.4108, + "step": 12745 + }, + { + "epoch": 1.4808016264885273, + "grad_norm": 0.6005404591560364, + "learning_rate": 0.0001, + "loss": 1.4058, + "step": 12746 + }, + { + "epoch": 1.480917804240488, + "grad_norm": 0.6152083873748779, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 12747 + }, + { + "epoch": 1.4810339819924485, + "grad_norm": 0.6299720406532288, + "learning_rate": 0.0001, + "loss": 1.306, + "step": 12748 + }, + { + "epoch": 1.481150159744409, + "grad_norm": 0.6793599724769592, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 12749 + }, + { + "epoch": 1.4812663374963695, + "grad_norm": 0.5687341690063477, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 12750 + }, + { + "epoch": 1.48138251524833, + "grad_norm": 0.5625494718551636, + "learning_rate": 0.0001, + "loss": 1.3357, + "step": 12751 + }, + { + "epoch": 1.4814986930002905, + "grad_norm": 0.6246379017829895, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 12752 + }, + { + "epoch": 1.481614870752251, + "grad_norm": 0.6253631114959717, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 12753 + }, + { + "epoch": 1.4817310485042114, + "grad_norm": 0.603517472743988, + "learning_rate": 0.0001, + "loss": 1.3907, + "step": 12754 + }, + { + "epoch": 1.481847226256172, + "grad_norm": 0.5724433660507202, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 12755 + }, + { + "epoch": 1.4819634040081324, + "grad_norm": 0.6194106340408325, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 12756 + }, + { + "epoch": 1.482079581760093, + "grad_norm": 0.6359567642211914, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 12757 + }, + { + "epoch": 1.4821957595120534, + "grad_norm": 0.5960992574691772, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 12758 + }, + { + "epoch": 1.482311937264014, + "grad_norm": 0.5790872573852539, + "learning_rate": 0.0001, + "loss": 1.3061, + "step": 12759 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 0.5998115539550781, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 12760 + }, + { + "epoch": 1.4825442927679349, + "grad_norm": 0.5446498394012451, + "learning_rate": 0.0001, + "loss": 1.3484, + "step": 12761 + }, + { + "epoch": 1.4826604705198954, + "grad_norm": 0.6086790561676025, + "learning_rate": 0.0001, + "loss": 1.4267, + "step": 12762 + }, + { + "epoch": 1.482776648271856, + "grad_norm": 0.5855661630630493, + "learning_rate": 0.0001, + "loss": 1.1741, + "step": 12763 + }, + { + "epoch": 1.4828928260238166, + "grad_norm": 0.6026800274848938, + "learning_rate": 0.0001, + "loss": 1.5557, + "step": 12764 + }, + { + "epoch": 1.483009003775777, + "grad_norm": 0.5826662182807922, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 12765 + }, + { + "epoch": 1.4831251815277375, + "grad_norm": 0.6261777281761169, + "learning_rate": 0.0001, + "loss": 1.3935, + "step": 12766 + }, + { + "epoch": 1.483241359279698, + "grad_norm": 0.6069555282592773, + "learning_rate": 0.0001, + "loss": 1.3201, + "step": 12767 + }, + { + "epoch": 1.4833575370316585, + "grad_norm": 0.6280861496925354, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 12768 + }, + { + "epoch": 1.483473714783619, + "grad_norm": 0.6123801469802856, + "learning_rate": 0.0001, + "loss": 1.2891, + "step": 12769 + }, + { + "epoch": 1.4835898925355795, + "grad_norm": 0.5813351273536682, + "learning_rate": 0.0001, + "loss": 1.3189, + "step": 12770 + }, + { + "epoch": 1.48370607028754, + "grad_norm": 0.6434810161590576, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 12771 + }, + { + "epoch": 1.4838222480395005, + "grad_norm": 0.6274320483207703, + "learning_rate": 0.0001, + "loss": 1.5605, + "step": 12772 + }, + { + "epoch": 1.483938425791461, + "grad_norm": 0.5794488787651062, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 12773 + }, + { + "epoch": 1.4840546035434214, + "grad_norm": 0.5843713879585266, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 12774 + }, + { + "epoch": 1.484170781295382, + "grad_norm": 0.5706945657730103, + "learning_rate": 0.0001, + "loss": 1.4438, + "step": 12775 + }, + { + "epoch": 1.4842869590473424, + "grad_norm": 0.6139296293258667, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 12776 + }, + { + "epoch": 1.484403136799303, + "grad_norm": 0.658415675163269, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 12777 + }, + { + "epoch": 1.4845193145512634, + "grad_norm": 0.6205752491950989, + "learning_rate": 0.0001, + "loss": 1.4368, + "step": 12778 + }, + { + "epoch": 1.484635492303224, + "grad_norm": 0.5956551432609558, + "learning_rate": 0.0001, + "loss": 1.4247, + "step": 12779 + }, + { + "epoch": 1.4847516700551844, + "grad_norm": 0.5731070637702942, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 12780 + }, + { + "epoch": 1.4848678478071449, + "grad_norm": 0.5833913683891296, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 12781 + }, + { + "epoch": 1.4849840255591054, + "grad_norm": 0.5474773645401001, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 12782 + }, + { + "epoch": 1.4851002033110658, + "grad_norm": 0.5622663497924805, + "learning_rate": 0.0001, + "loss": 1.4723, + "step": 12783 + }, + { + "epoch": 1.4852163810630263, + "grad_norm": 0.5745351314544678, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 12784 + }, + { + "epoch": 1.4853325588149868, + "grad_norm": 0.5686841607093811, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 12785 + }, + { + "epoch": 1.4854487365669473, + "grad_norm": 0.5676813721656799, + "learning_rate": 0.0001, + "loss": 1.3021, + "step": 12786 + }, + { + "epoch": 1.4855649143189078, + "grad_norm": 0.5953998565673828, + "learning_rate": 0.0001, + "loss": 1.3295, + "step": 12787 + }, + { + "epoch": 1.4856810920708683, + "grad_norm": 0.5832408666610718, + "learning_rate": 0.0001, + "loss": 1.4133, + "step": 12788 + }, + { + "epoch": 1.485797269822829, + "grad_norm": 0.6295830011367798, + "learning_rate": 0.0001, + "loss": 1.4368, + "step": 12789 + }, + { + "epoch": 1.4859134475747895, + "grad_norm": 0.595488965511322, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 12790 + }, + { + "epoch": 1.48602962532675, + "grad_norm": 0.5632891654968262, + "learning_rate": 0.0001, + "loss": 1.3456, + "step": 12791 + }, + { + "epoch": 1.4861458030787105, + "grad_norm": 0.5827213525772095, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 12792 + }, + { + "epoch": 1.486261980830671, + "grad_norm": 0.58870929479599, + "learning_rate": 0.0001, + "loss": 1.3798, + "step": 12793 + }, + { + "epoch": 1.4863781585826314, + "grad_norm": 0.566349983215332, + "learning_rate": 0.0001, + "loss": 1.2853, + "step": 12794 + }, + { + "epoch": 1.486494336334592, + "grad_norm": 0.5674836039543152, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 12795 + }, + { + "epoch": 1.4866105140865524, + "grad_norm": 0.6061223745346069, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 12796 + }, + { + "epoch": 1.486726691838513, + "grad_norm": 0.6239713430404663, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 12797 + }, + { + "epoch": 1.4868428695904734, + "grad_norm": 0.5661698579788208, + "learning_rate": 0.0001, + "loss": 1.3916, + "step": 12798 + }, + { + "epoch": 1.486959047342434, + "grad_norm": 0.6092931628227234, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 12799 + }, + { + "epoch": 1.4870752250943944, + "grad_norm": 0.6277779936790466, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 12800 + }, + { + "epoch": 1.4871914028463549, + "grad_norm": 0.5889933705329895, + "learning_rate": 0.0001, + "loss": 1.3339, + "step": 12801 + }, + { + "epoch": 1.4873075805983154, + "grad_norm": 0.5621128082275391, + "learning_rate": 0.0001, + "loss": 1.2968, + "step": 12802 + }, + { + "epoch": 1.4874237583502758, + "grad_norm": 0.5749889016151428, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 12803 + }, + { + "epoch": 1.4875399361022366, + "grad_norm": 0.6799257397651672, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 12804 + }, + { + "epoch": 1.487656113854197, + "grad_norm": 0.575088381767273, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 12805 + }, + { + "epoch": 1.4877722916061575, + "grad_norm": 0.6447527408599854, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 12806 + }, + { + "epoch": 1.487888469358118, + "grad_norm": 0.6129550337791443, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 12807 + }, + { + "epoch": 1.4880046471100785, + "grad_norm": 0.6152228713035583, + "learning_rate": 0.0001, + "loss": 1.388, + "step": 12808 + }, + { + "epoch": 1.488120824862039, + "grad_norm": 0.6193944215774536, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 12809 + }, + { + "epoch": 1.4882370026139995, + "grad_norm": 0.6093383431434631, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 12810 + }, + { + "epoch": 1.48835318036596, + "grad_norm": 0.589266300201416, + "learning_rate": 0.0001, + "loss": 1.3637, + "step": 12811 + }, + { + "epoch": 1.4884693581179205, + "grad_norm": 0.6045963764190674, + "learning_rate": 0.0001, + "loss": 1.4242, + "step": 12812 + }, + { + "epoch": 1.488585535869881, + "grad_norm": 0.5933640003204346, + "learning_rate": 0.0001, + "loss": 1.3325, + "step": 12813 + }, + { + "epoch": 1.4887017136218414, + "grad_norm": 0.5831065773963928, + "learning_rate": 0.0001, + "loss": 1.4035, + "step": 12814 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 0.6712548136711121, + "learning_rate": 0.0001, + "loss": 1.4683, + "step": 12815 + }, + { + "epoch": 1.4889340691257624, + "grad_norm": 0.5910773277282715, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 12816 + }, + { + "epoch": 1.489050246877723, + "grad_norm": 0.6439294219017029, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 12817 + }, + { + "epoch": 1.4891664246296834, + "grad_norm": 0.5771844387054443, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 12818 + }, + { + "epoch": 1.489282602381644, + "grad_norm": 0.6531968712806702, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 12819 + }, + { + "epoch": 1.4893987801336044, + "grad_norm": 0.6017487049102783, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 12820 + }, + { + "epoch": 1.4895149578855649, + "grad_norm": 0.6122699975967407, + "learning_rate": 0.0001, + "loss": 1.4481, + "step": 12821 + }, + { + "epoch": 1.4896311356375254, + "grad_norm": 0.6408692598342896, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 12822 + }, + { + "epoch": 1.4897473133894858, + "grad_norm": 0.6655322909355164, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 12823 + }, + { + "epoch": 1.4898634911414463, + "grad_norm": 0.5901119709014893, + "learning_rate": 0.0001, + "loss": 1.3471, + "step": 12824 + }, + { + "epoch": 1.4899796688934068, + "grad_norm": 0.574163019657135, + "learning_rate": 0.0001, + "loss": 1.3641, + "step": 12825 + }, + { + "epoch": 1.4900958466453673, + "grad_norm": 0.641035795211792, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 12826 + }, + { + "epoch": 1.4902120243973278, + "grad_norm": 0.6070435643196106, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 12827 + }, + { + "epoch": 1.4903282021492883, + "grad_norm": 0.6125596761703491, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 12828 + }, + { + "epoch": 1.4904443799012488, + "grad_norm": 0.5792481899261475, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 12829 + }, + { + "epoch": 1.4905605576532093, + "grad_norm": 0.6352988481521606, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 12830 + }, + { + "epoch": 1.49067673540517, + "grad_norm": 0.6231880187988281, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 12831 + }, + { + "epoch": 1.4907929131571305, + "grad_norm": 0.5636422634124756, + "learning_rate": 0.0001, + "loss": 1.1924, + "step": 12832 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 0.6103519797325134, + "learning_rate": 0.0001, + "loss": 1.5181, + "step": 12833 + }, + { + "epoch": 1.4910252686610515, + "grad_norm": 0.5947346687316895, + "learning_rate": 0.0001, + "loss": 1.3394, + "step": 12834 + }, + { + "epoch": 1.491141446413012, + "grad_norm": 0.6331939101219177, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 12835 + }, + { + "epoch": 1.4912576241649724, + "grad_norm": 0.5931614637374878, + "learning_rate": 0.0001, + "loss": 1.2726, + "step": 12836 + }, + { + "epoch": 1.491373801916933, + "grad_norm": 0.6468703150749207, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 12837 + }, + { + "epoch": 1.4914899796688934, + "grad_norm": 0.6426784992218018, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 12838 + }, + { + "epoch": 1.491606157420854, + "grad_norm": 0.6114495992660522, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 12839 + }, + { + "epoch": 1.4917223351728144, + "grad_norm": 0.5668593645095825, + "learning_rate": 0.0001, + "loss": 1.3923, + "step": 12840 + }, + { + "epoch": 1.4918385129247749, + "grad_norm": 0.6152085661888123, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 12841 + }, + { + "epoch": 1.4919546906767354, + "grad_norm": 0.6062020063400269, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 12842 + }, + { + "epoch": 1.4920708684286959, + "grad_norm": 0.589048445224762, + "learning_rate": 0.0001, + "loss": 1.4028, + "step": 12843 + }, + { + "epoch": 1.4921870461806563, + "grad_norm": 0.6015335917472839, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 12844 + }, + { + "epoch": 1.4923032239326168, + "grad_norm": 0.581270158290863, + "learning_rate": 0.0001, + "loss": 1.4128, + "step": 12845 + }, + { + "epoch": 1.4924194016845775, + "grad_norm": 0.5742862820625305, + "learning_rate": 0.0001, + "loss": 1.4333, + "step": 12846 + }, + { + "epoch": 1.492535579436538, + "grad_norm": 0.6063739061355591, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 12847 + }, + { + "epoch": 1.4926517571884985, + "grad_norm": 0.5964763760566711, + "learning_rate": 0.0001, + "loss": 1.3218, + "step": 12848 + }, + { + "epoch": 1.492767934940459, + "grad_norm": 0.6101694107055664, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 12849 + }, + { + "epoch": 1.4928841126924195, + "grad_norm": 0.6307281851768494, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 12850 + }, + { + "epoch": 1.49300029044438, + "grad_norm": 0.5971539616584778, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 12851 + }, + { + "epoch": 1.4931164681963405, + "grad_norm": 0.6096937656402588, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 12852 + }, + { + "epoch": 1.493232645948301, + "grad_norm": 0.6482102274894714, + "learning_rate": 0.0001, + "loss": 1.4308, + "step": 12853 + }, + { + "epoch": 1.4933488237002615, + "grad_norm": 0.6256344318389893, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 12854 + }, + { + "epoch": 1.493465001452222, + "grad_norm": 0.5984079837799072, + "learning_rate": 0.0001, + "loss": 1.4259, + "step": 12855 + }, + { + "epoch": 1.4935811792041824, + "grad_norm": 0.5901398658752441, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 12856 + }, + { + "epoch": 1.493697356956143, + "grad_norm": 0.6232024431228638, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 12857 + }, + { + "epoch": 1.4938135347081034, + "grad_norm": 0.5728425979614258, + "learning_rate": 0.0001, + "loss": 1.2854, + "step": 12858 + }, + { + "epoch": 1.493929712460064, + "grad_norm": 0.617725670337677, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 12859 + }, + { + "epoch": 1.4940458902120244, + "grad_norm": 0.6785517334938049, + "learning_rate": 0.0001, + "loss": 1.4123, + "step": 12860 + }, + { + "epoch": 1.4941620679639849, + "grad_norm": 0.6345421075820923, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 12861 + }, + { + "epoch": 1.4942782457159454, + "grad_norm": 0.5922654271125793, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 12862 + }, + { + "epoch": 1.4943944234679059, + "grad_norm": 0.6028105616569519, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 12863 + }, + { + "epoch": 1.4945106012198663, + "grad_norm": 0.6089810132980347, + "learning_rate": 0.0001, + "loss": 1.333, + "step": 12864 + }, + { + "epoch": 1.4946267789718268, + "grad_norm": 0.6128267645835876, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 12865 + }, + { + "epoch": 1.4947429567237873, + "grad_norm": 0.7104302644729614, + "learning_rate": 0.0001, + "loss": 1.7867, + "step": 12866 + }, + { + "epoch": 1.4948591344757478, + "grad_norm": 0.6253650784492493, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 12867 + }, + { + "epoch": 1.4949753122277083, + "grad_norm": 0.6499916911125183, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 12868 + }, + { + "epoch": 1.4950914899796688, + "grad_norm": 0.6280500888824463, + "learning_rate": 0.0001, + "loss": 1.421, + "step": 12869 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 0.6782636046409607, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 12870 + }, + { + "epoch": 1.4953238454835898, + "grad_norm": 0.6510661840438843, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 12871 + }, + { + "epoch": 1.4954400232355505, + "grad_norm": 0.6078386306762695, + "learning_rate": 0.0001, + "loss": 1.2536, + "step": 12872 + }, + { + "epoch": 1.495556200987511, + "grad_norm": 0.5885645747184753, + "learning_rate": 0.0001, + "loss": 1.3384, + "step": 12873 + }, + { + "epoch": 1.4956723787394715, + "grad_norm": 0.560592770576477, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 12874 + }, + { + "epoch": 1.495788556491432, + "grad_norm": 0.6519601345062256, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 12875 + }, + { + "epoch": 1.4959047342433924, + "grad_norm": 0.6099236607551575, + "learning_rate": 0.0001, + "loss": 1.4496, + "step": 12876 + }, + { + "epoch": 1.496020911995353, + "grad_norm": 0.5811101794242859, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 12877 + }, + { + "epoch": 1.4961370897473134, + "grad_norm": 0.5691701769828796, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 12878 + }, + { + "epoch": 1.496253267499274, + "grad_norm": 0.568541944026947, + "learning_rate": 0.0001, + "loss": 1.2409, + "step": 12879 + }, + { + "epoch": 1.4963694452512344, + "grad_norm": 0.6283146142959595, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 12880 + }, + { + "epoch": 1.4964856230031949, + "grad_norm": 0.6234946250915527, + "learning_rate": 0.0001, + "loss": 1.5406, + "step": 12881 + }, + { + "epoch": 1.4966018007551554, + "grad_norm": 0.6640174984931946, + "learning_rate": 0.0001, + "loss": 1.756, + "step": 12882 + }, + { + "epoch": 1.4967179785071159, + "grad_norm": 0.645535945892334, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 12883 + }, + { + "epoch": 1.4968341562590763, + "grad_norm": 0.615752637386322, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 12884 + }, + { + "epoch": 1.4969503340110368, + "grad_norm": 0.6437330842018127, + "learning_rate": 0.0001, + "loss": 1.764, + "step": 12885 + }, + { + "epoch": 1.4970665117629973, + "grad_norm": 0.633956789970398, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 12886 + }, + { + "epoch": 1.4971826895149578, + "grad_norm": 0.625065803527832, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 12887 + }, + { + "epoch": 1.4972988672669185, + "grad_norm": 0.5906104445457458, + "learning_rate": 0.0001, + "loss": 1.2654, + "step": 12888 + }, + { + "epoch": 1.497415045018879, + "grad_norm": 0.6069050431251526, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 12889 + }, + { + "epoch": 1.4975312227708395, + "grad_norm": 0.5757995247840881, + "learning_rate": 0.0001, + "loss": 1.2766, + "step": 12890 + }, + { + "epoch": 1.4976474005228, + "grad_norm": 0.6138774752616882, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 12891 + }, + { + "epoch": 1.4977635782747605, + "grad_norm": 0.6091035008430481, + "learning_rate": 0.0001, + "loss": 1.4733, + "step": 12892 + }, + { + "epoch": 1.497879756026721, + "grad_norm": 0.5950594544410706, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 12893 + }, + { + "epoch": 1.4979959337786815, + "grad_norm": 0.6246311664581299, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 12894 + }, + { + "epoch": 1.498112111530642, + "grad_norm": 0.5842698216438293, + "learning_rate": 0.0001, + "loss": 1.365, + "step": 12895 + }, + { + "epoch": 1.4982282892826024, + "grad_norm": 0.6111376881599426, + "learning_rate": 0.0001, + "loss": 1.3392, + "step": 12896 + }, + { + "epoch": 1.498344467034563, + "grad_norm": 0.5652458667755127, + "learning_rate": 0.0001, + "loss": 1.2803, + "step": 12897 + }, + { + "epoch": 1.4984606447865234, + "grad_norm": 0.600919246673584, + "learning_rate": 0.0001, + "loss": 1.4203, + "step": 12898 + }, + { + "epoch": 1.498576822538484, + "grad_norm": 0.5875718593597412, + "learning_rate": 0.0001, + "loss": 1.2227, + "step": 12899 + }, + { + "epoch": 1.4986930002904444, + "grad_norm": 0.6111353635787964, + "learning_rate": 0.0001, + "loss": 1.3412, + "step": 12900 + }, + { + "epoch": 1.4988091780424049, + "grad_norm": 0.6344109177589417, + "learning_rate": 0.0001, + "loss": 1.7502, + "step": 12901 + }, + { + "epoch": 1.4989253557943654, + "grad_norm": 0.6019986867904663, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 12902 + }, + { + "epoch": 1.4990415335463259, + "grad_norm": 0.6078724265098572, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 12903 + }, + { + "epoch": 1.4991577112982863, + "grad_norm": 0.6435593962669373, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 12904 + }, + { + "epoch": 1.4992738890502468, + "grad_norm": 0.661933422088623, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 12905 + }, + { + "epoch": 1.4993900668022073, + "grad_norm": 0.6397930383682251, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 12906 + }, + { + "epoch": 1.4995062445541678, + "grad_norm": 0.5697651505470276, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 12907 + }, + { + "epoch": 1.4996224223061283, + "grad_norm": 0.5920810103416443, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 12908 + }, + { + "epoch": 1.4997386000580888, + "grad_norm": 0.6010572910308838, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 12909 + }, + { + "epoch": 1.4998547778100493, + "grad_norm": 0.5660801529884338, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 12910 + }, + { + "epoch": 1.4999709555620098, + "grad_norm": 0.5613225102424622, + "learning_rate": 0.0001, + "loss": 1.3346, + "step": 12911 + }, + { + "epoch": 1.5000871333139703, + "grad_norm": 0.6039864420890808, + "learning_rate": 0.0001, + "loss": 1.495, + "step": 12912 + }, + { + "epoch": 1.5002033110659307, + "grad_norm": 0.6184871196746826, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 12913 + }, + { + "epoch": 1.5003194888178912, + "grad_norm": 0.602561891078949, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 12914 + }, + { + "epoch": 1.5004356665698517, + "grad_norm": 0.597723126411438, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 12915 + }, + { + "epoch": 1.5005518443218122, + "grad_norm": 0.6435630917549133, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 12916 + }, + { + "epoch": 1.500668022073773, + "grad_norm": 0.6061123013496399, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 12917 + }, + { + "epoch": 1.5007841998257334, + "grad_norm": 0.5902183055877686, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 12918 + }, + { + "epoch": 1.500900377577694, + "grad_norm": 0.6924389600753784, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 12919 + }, + { + "epoch": 1.5010165553296544, + "grad_norm": 0.6402321457862854, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 12920 + }, + { + "epoch": 1.5011327330816149, + "grad_norm": 0.6038404703140259, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 12921 + }, + { + "epoch": 1.5012489108335754, + "grad_norm": 0.5710585713386536, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 12922 + }, + { + "epoch": 1.5013650885855359, + "grad_norm": 0.6276952624320984, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 12923 + }, + { + "epoch": 1.5014812663374963, + "grad_norm": 0.6179443001747131, + "learning_rate": 0.0001, + "loss": 1.4969, + "step": 12924 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 0.648830235004425, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 12925 + }, + { + "epoch": 1.5017136218414173, + "grad_norm": 0.6248328685760498, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 12926 + }, + { + "epoch": 1.501829799593378, + "grad_norm": 0.6510769724845886, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 12927 + }, + { + "epoch": 1.5019459773453385, + "grad_norm": 0.6090167760848999, + "learning_rate": 0.0001, + "loss": 1.3349, + "step": 12928 + }, + { + "epoch": 1.502062155097299, + "grad_norm": 0.5935208201408386, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 12929 + }, + { + "epoch": 1.5021783328492595, + "grad_norm": 0.6401615142822266, + "learning_rate": 0.0001, + "loss": 1.4513, + "step": 12930 + }, + { + "epoch": 1.50229451060122, + "grad_norm": 0.6500523090362549, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 12931 + }, + { + "epoch": 1.5024106883531805, + "grad_norm": 0.6160486936569214, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 12932 + }, + { + "epoch": 1.502526866105141, + "grad_norm": 0.6370952129364014, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 12933 + }, + { + "epoch": 1.5026430438571015, + "grad_norm": 0.6076738834381104, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 12934 + }, + { + "epoch": 1.502759221609062, + "grad_norm": 0.5521711707115173, + "learning_rate": 0.0001, + "loss": 1.3567, + "step": 12935 + }, + { + "epoch": 1.5028753993610224, + "grad_norm": 0.571445643901825, + "learning_rate": 0.0001, + "loss": 1.198, + "step": 12936 + }, + { + "epoch": 1.502991577112983, + "grad_norm": 0.6402273178100586, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 12937 + }, + { + "epoch": 1.5031077548649434, + "grad_norm": 0.6269349455833435, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 12938 + }, + { + "epoch": 1.503223932616904, + "grad_norm": 0.6312558054924011, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 12939 + }, + { + "epoch": 1.5033401103688644, + "grad_norm": 0.6417350172996521, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 12940 + }, + { + "epoch": 1.5034562881208249, + "grad_norm": 0.6578287482261658, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 12941 + }, + { + "epoch": 1.5035724658727854, + "grad_norm": 0.5819112062454224, + "learning_rate": 0.0001, + "loss": 1.3784, + "step": 12942 + }, + { + "epoch": 1.5036886436247459, + "grad_norm": 0.6250476837158203, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 12943 + }, + { + "epoch": 1.5038048213767063, + "grad_norm": 0.7036281824111938, + "learning_rate": 0.0001, + "loss": 1.5268, + "step": 12944 + }, + { + "epoch": 1.5039209991286668, + "grad_norm": 0.5976950526237488, + "learning_rate": 0.0001, + "loss": 1.3829, + "step": 12945 + }, + { + "epoch": 1.5040371768806273, + "grad_norm": 0.6492063403129578, + "learning_rate": 0.0001, + "loss": 1.5779, + "step": 12946 + }, + { + "epoch": 1.5041533546325878, + "grad_norm": 0.5875254273414612, + "learning_rate": 0.0001, + "loss": 1.4128, + "step": 12947 + }, + { + "epoch": 1.5042695323845483, + "grad_norm": 0.6553013324737549, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 12948 + }, + { + "epoch": 1.5043857101365088, + "grad_norm": 0.6260391473770142, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 12949 + }, + { + "epoch": 1.5045018878884693, + "grad_norm": 0.7011594772338867, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 12950 + }, + { + "epoch": 1.5046180656404298, + "grad_norm": 0.6216688752174377, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 12951 + }, + { + "epoch": 1.5047342433923903, + "grad_norm": 0.6175325512886047, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 12952 + }, + { + "epoch": 1.5048504211443507, + "grad_norm": 0.657291829586029, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 12953 + }, + { + "epoch": 1.5049665988963112, + "grad_norm": 0.6169841289520264, + "learning_rate": 0.0001, + "loss": 1.4183, + "step": 12954 + }, + { + "epoch": 1.5050827766482717, + "grad_norm": 0.6225051879882812, + "learning_rate": 0.0001, + "loss": 1.3909, + "step": 12955 + }, + { + "epoch": 1.5051989544002322, + "grad_norm": 0.7212052345275879, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 12956 + }, + { + "epoch": 1.5053151321521927, + "grad_norm": 0.6326794028282166, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 12957 + }, + { + "epoch": 1.5054313099041532, + "grad_norm": 0.6327944397926331, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 12958 + }, + { + "epoch": 1.505547487656114, + "grad_norm": 0.580574095249176, + "learning_rate": 0.0001, + "loss": 1.3895, + "step": 12959 + }, + { + "epoch": 1.5056636654080744, + "grad_norm": 0.7401102781295776, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 12960 + }, + { + "epoch": 1.5057798431600349, + "grad_norm": 0.5871754288673401, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 12961 + }, + { + "epoch": 1.5058960209119954, + "grad_norm": 0.6001573801040649, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 12962 + }, + { + "epoch": 1.5060121986639559, + "grad_norm": 0.537681519985199, + "learning_rate": 0.0001, + "loss": 1.371, + "step": 12963 + }, + { + "epoch": 1.5061283764159163, + "grad_norm": 0.6001483201980591, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 12964 + }, + { + "epoch": 1.5062445541678768, + "grad_norm": 0.6050352454185486, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 12965 + }, + { + "epoch": 1.5063607319198373, + "grad_norm": 0.5874537825584412, + "learning_rate": 0.0001, + "loss": 1.3447, + "step": 12966 + }, + { + "epoch": 1.5064769096717978, + "grad_norm": 0.6636204123497009, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 12967 + }, + { + "epoch": 1.5065930874237583, + "grad_norm": 0.6525164246559143, + "learning_rate": 0.0001, + "loss": 1.7726, + "step": 12968 + }, + { + "epoch": 1.506709265175719, + "grad_norm": 0.6085329055786133, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 12969 + }, + { + "epoch": 1.5068254429276795, + "grad_norm": 0.6257724165916443, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 12970 + }, + { + "epoch": 1.50694162067964, + "grad_norm": 0.6245921850204468, + "learning_rate": 0.0001, + "loss": 1.4173, + "step": 12971 + }, + { + "epoch": 1.5070577984316005, + "grad_norm": 0.6131578087806702, + "learning_rate": 0.0001, + "loss": 1.3337, + "step": 12972 + }, + { + "epoch": 1.507173976183561, + "grad_norm": 0.6182470321655273, + "learning_rate": 0.0001, + "loss": 1.5111, + "step": 12973 + }, + { + "epoch": 1.5072901539355215, + "grad_norm": 0.6009563207626343, + "learning_rate": 0.0001, + "loss": 1.4442, + "step": 12974 + }, + { + "epoch": 1.507406331687482, + "grad_norm": 0.5851460099220276, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 12975 + }, + { + "epoch": 1.5075225094394424, + "grad_norm": 0.5955979824066162, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 12976 + }, + { + "epoch": 1.507638687191403, + "grad_norm": 0.6064378619194031, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 12977 + }, + { + "epoch": 1.5077548649433634, + "grad_norm": 0.5852058529853821, + "learning_rate": 0.0001, + "loss": 1.2852, + "step": 12978 + }, + { + "epoch": 1.507871042695324, + "grad_norm": 0.5747120380401611, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 12979 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 0.566041886806488, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 12980 + }, + { + "epoch": 1.5081033981992449, + "grad_norm": 0.6108693480491638, + "learning_rate": 0.0001, + "loss": 1.3197, + "step": 12981 + }, + { + "epoch": 1.5082195759512054, + "grad_norm": 0.6155402660369873, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 12982 + }, + { + "epoch": 1.5083357537031659, + "grad_norm": 0.6545661687850952, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 12983 + }, + { + "epoch": 1.5084519314551263, + "grad_norm": 0.6387633085250854, + "learning_rate": 0.0001, + "loss": 1.3534, + "step": 12984 + }, + { + "epoch": 1.5085681092070868, + "grad_norm": 0.6138156056404114, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 12985 + }, + { + "epoch": 1.5086842869590473, + "grad_norm": 0.6812984347343445, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 12986 + }, + { + "epoch": 1.5088004647110078, + "grad_norm": 0.5979045033454895, + "learning_rate": 0.0001, + "loss": 1.3903, + "step": 12987 + }, + { + "epoch": 1.5089166424629683, + "grad_norm": 0.5794659852981567, + "learning_rate": 0.0001, + "loss": 1.4501, + "step": 12988 + }, + { + "epoch": 1.5090328202149288, + "grad_norm": 0.6213316321372986, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 12989 + }, + { + "epoch": 1.5091489979668893, + "grad_norm": 0.6868020296096802, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 12990 + }, + { + "epoch": 1.5092651757188498, + "grad_norm": 0.6285677552223206, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 12991 + }, + { + "epoch": 1.5093813534708103, + "grad_norm": 0.6092104315757751, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 12992 + }, + { + "epoch": 1.5094975312227707, + "grad_norm": 0.5800586342811584, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 12993 + }, + { + "epoch": 1.5096137089747312, + "grad_norm": 0.6098979711532593, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 12994 + }, + { + "epoch": 1.5097298867266917, + "grad_norm": 0.5925366878509521, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 12995 + }, + { + "epoch": 1.5098460644786522, + "grad_norm": 0.5455504059791565, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 12996 + }, + { + "epoch": 1.5099622422306127, + "grad_norm": 0.6356263160705566, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 12997 + }, + { + "epoch": 1.5100784199825732, + "grad_norm": 0.6532312035560608, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 12998 + }, + { + "epoch": 1.5101945977345337, + "grad_norm": 0.6413785219192505, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 12999 + }, + { + "epoch": 1.5103107754864942, + "grad_norm": 0.5889772176742554, + "learning_rate": 0.0001, + "loss": 1.3492, + "step": 13000 + }, + { + "epoch": 1.5104269532384549, + "grad_norm": 0.5992516279220581, + "learning_rate": 0.0001, + "loss": 1.4146, + "step": 13001 + }, + { + "epoch": 1.5105431309904154, + "grad_norm": 0.5487164855003357, + "learning_rate": 0.0001, + "loss": 1.393, + "step": 13002 + }, + { + "epoch": 1.5106593087423759, + "grad_norm": 0.6081210970878601, + "learning_rate": 0.0001, + "loss": 1.3813, + "step": 13003 + }, + { + "epoch": 1.5107754864943364, + "grad_norm": 0.6229928731918335, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 13004 + }, + { + "epoch": 1.5108916642462968, + "grad_norm": 0.6931246519088745, + "learning_rate": 0.0001, + "loss": 1.8908, + "step": 13005 + }, + { + "epoch": 1.5110078419982573, + "grad_norm": 0.5892548561096191, + "learning_rate": 0.0001, + "loss": 1.4452, + "step": 13006 + }, + { + "epoch": 1.5111240197502178, + "grad_norm": 0.6359273791313171, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 13007 + }, + { + "epoch": 1.5112401975021783, + "grad_norm": 0.6264612078666687, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 13008 + }, + { + "epoch": 1.5113563752541388, + "grad_norm": 0.6442233324050903, + "learning_rate": 0.0001, + "loss": 1.3834, + "step": 13009 + }, + { + "epoch": 1.5114725530060993, + "grad_norm": 0.6219121217727661, + "learning_rate": 0.0001, + "loss": 1.2729, + "step": 13010 + }, + { + "epoch": 1.51158873075806, + "grad_norm": 0.6267338991165161, + "learning_rate": 0.0001, + "loss": 1.4941, + "step": 13011 + }, + { + "epoch": 1.5117049085100205, + "grad_norm": 0.5808162689208984, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 13012 + }, + { + "epoch": 1.511821086261981, + "grad_norm": 0.6579829454421997, + "learning_rate": 0.0001, + "loss": 1.4468, + "step": 13013 + }, + { + "epoch": 1.5119372640139415, + "grad_norm": 0.593537449836731, + "learning_rate": 0.0001, + "loss": 1.4952, + "step": 13014 + }, + { + "epoch": 1.512053441765902, + "grad_norm": 0.538985550403595, + "learning_rate": 0.0001, + "loss": 1.2365, + "step": 13015 + }, + { + "epoch": 1.5121696195178624, + "grad_norm": 0.6082078814506531, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 13016 + }, + { + "epoch": 1.512285797269823, + "grad_norm": 0.6273752450942993, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 13017 + }, + { + "epoch": 1.5124019750217834, + "grad_norm": 0.6424401998519897, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 13018 + }, + { + "epoch": 1.512518152773744, + "grad_norm": 0.5700364708900452, + "learning_rate": 0.0001, + "loss": 1.2343, + "step": 13019 + }, + { + "epoch": 1.5126343305257044, + "grad_norm": 0.6058292388916016, + "learning_rate": 0.0001, + "loss": 1.3065, + "step": 13020 + }, + { + "epoch": 1.5127505082776649, + "grad_norm": 0.5938413143157959, + "learning_rate": 0.0001, + "loss": 1.4116, + "step": 13021 + }, + { + "epoch": 1.5128666860296254, + "grad_norm": 0.5882897973060608, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 13022 + }, + { + "epoch": 1.5129828637815859, + "grad_norm": 0.6097550988197327, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 13023 + }, + { + "epoch": 1.5130990415335464, + "grad_norm": 0.6150256395339966, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 13024 + }, + { + "epoch": 1.5132152192855068, + "grad_norm": 0.5661260485649109, + "learning_rate": 0.0001, + "loss": 1.3101, + "step": 13025 + }, + { + "epoch": 1.5133313970374673, + "grad_norm": 0.6101265549659729, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 13026 + }, + { + "epoch": 1.5134475747894278, + "grad_norm": 0.5510701537132263, + "learning_rate": 0.0001, + "loss": 1.2253, + "step": 13027 + }, + { + "epoch": 1.5135637525413883, + "grad_norm": 0.5895761847496033, + "learning_rate": 0.0001, + "loss": 1.2103, + "step": 13028 + }, + { + "epoch": 1.5136799302933488, + "grad_norm": 0.6410093903541565, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 13029 + }, + { + "epoch": 1.5137961080453093, + "grad_norm": 0.6049452424049377, + "learning_rate": 0.0001, + "loss": 1.4493, + "step": 13030 + }, + { + "epoch": 1.5139122857972698, + "grad_norm": 0.5991563200950623, + "learning_rate": 0.0001, + "loss": 1.369, + "step": 13031 + }, + { + "epoch": 1.5140284635492303, + "grad_norm": 0.6251417398452759, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 13032 + }, + { + "epoch": 1.5141446413011908, + "grad_norm": 0.6023422479629517, + "learning_rate": 0.0001, + "loss": 1.3901, + "step": 13033 + }, + { + "epoch": 1.5142608190531512, + "grad_norm": 0.6075770854949951, + "learning_rate": 0.0001, + "loss": 1.4216, + "step": 13034 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 0.5965530872344971, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 13035 + }, + { + "epoch": 1.5144931745570722, + "grad_norm": 0.5827516913414001, + "learning_rate": 0.0001, + "loss": 1.3238, + "step": 13036 + }, + { + "epoch": 1.5146093523090327, + "grad_norm": 0.5984660387039185, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 13037 + }, + { + "epoch": 1.5147255300609932, + "grad_norm": 0.6355912685394287, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 13038 + }, + { + "epoch": 1.5148417078129537, + "grad_norm": 0.6122537851333618, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 13039 + }, + { + "epoch": 1.5149578855649142, + "grad_norm": 0.5791593790054321, + "learning_rate": 0.0001, + "loss": 1.3258, + "step": 13040 + }, + { + "epoch": 1.5150740633168747, + "grad_norm": 0.604680061340332, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 13041 + }, + { + "epoch": 1.5151902410688352, + "grad_norm": 0.610745370388031, + "learning_rate": 0.0001, + "loss": 1.2757, + "step": 13042 + }, + { + "epoch": 1.5153064188207959, + "grad_norm": 0.5982139706611633, + "learning_rate": 0.0001, + "loss": 1.5444, + "step": 13043 + }, + { + "epoch": 1.5154225965727564, + "grad_norm": 0.5831255316734314, + "learning_rate": 0.0001, + "loss": 1.366, + "step": 13044 + }, + { + "epoch": 1.5155387743247168, + "grad_norm": 0.6067568063735962, + "learning_rate": 0.0001, + "loss": 1.3268, + "step": 13045 + }, + { + "epoch": 1.5156549520766773, + "grad_norm": 0.575416088104248, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 13046 + }, + { + "epoch": 1.5157711298286378, + "grad_norm": 0.6012206673622131, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 13047 + }, + { + "epoch": 1.5158873075805983, + "grad_norm": 0.5732565522193909, + "learning_rate": 0.0001, + "loss": 1.3186, + "step": 13048 + }, + { + "epoch": 1.5160034853325588, + "grad_norm": 0.5650144815444946, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 13049 + }, + { + "epoch": 1.5161196630845193, + "grad_norm": 0.6107412576675415, + "learning_rate": 0.0001, + "loss": 1.388, + "step": 13050 + }, + { + "epoch": 1.5162358408364798, + "grad_norm": 0.6341642737388611, + "learning_rate": 0.0001, + "loss": 1.4583, + "step": 13051 + }, + { + "epoch": 1.5163520185884403, + "grad_norm": 0.6973785758018494, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 13052 + }, + { + "epoch": 1.516468196340401, + "grad_norm": 0.6058968901634216, + "learning_rate": 0.0001, + "loss": 1.1763, + "step": 13053 + }, + { + "epoch": 1.5165843740923615, + "grad_norm": 0.6408393383026123, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 13054 + }, + { + "epoch": 1.516700551844322, + "grad_norm": 0.6071977019309998, + "learning_rate": 0.0001, + "loss": 1.3627, + "step": 13055 + }, + { + "epoch": 1.5168167295962824, + "grad_norm": 0.6299028396606445, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 13056 + }, + { + "epoch": 1.516932907348243, + "grad_norm": 0.6032760143280029, + "learning_rate": 0.0001, + "loss": 1.4244, + "step": 13057 + }, + { + "epoch": 1.5170490851002034, + "grad_norm": 0.5748342871665955, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 13058 + }, + { + "epoch": 1.517165262852164, + "grad_norm": 0.5807141661643982, + "learning_rate": 0.0001, + "loss": 1.4516, + "step": 13059 + }, + { + "epoch": 1.5172814406041244, + "grad_norm": 0.6238056421279907, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 13060 + }, + { + "epoch": 1.5173976183560849, + "grad_norm": 0.6162986159324646, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 13061 + }, + { + "epoch": 1.5175137961080454, + "grad_norm": 0.6011757254600525, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 13062 + }, + { + "epoch": 1.5176299738600059, + "grad_norm": 0.6143328547477722, + "learning_rate": 0.0001, + "loss": 1.4704, + "step": 13063 + }, + { + "epoch": 1.5177461516119664, + "grad_norm": 0.5976557731628418, + "learning_rate": 0.0001, + "loss": 1.3611, + "step": 13064 + }, + { + "epoch": 1.5178623293639268, + "grad_norm": 0.6380662322044373, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 13065 + }, + { + "epoch": 1.5179785071158873, + "grad_norm": 0.6185310482978821, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 13066 + }, + { + "epoch": 1.5180946848678478, + "grad_norm": 0.5588987469673157, + "learning_rate": 0.0001, + "loss": 1.2781, + "step": 13067 + }, + { + "epoch": 1.5182108626198083, + "grad_norm": 0.6087438464164734, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 13068 + }, + { + "epoch": 1.5183270403717688, + "grad_norm": 0.6939107775688171, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 13069 + }, + { + "epoch": 1.5184432181237293, + "grad_norm": 0.5763146877288818, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 13070 + }, + { + "epoch": 1.5185593958756898, + "grad_norm": 0.5791681408882141, + "learning_rate": 0.0001, + "loss": 1.3638, + "step": 13071 + }, + { + "epoch": 1.5186755736276503, + "grad_norm": 0.6180105209350586, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 13072 + }, + { + "epoch": 1.5187917513796108, + "grad_norm": 0.6183565258979797, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 13073 + }, + { + "epoch": 1.5189079291315712, + "grad_norm": 0.6181787848472595, + "learning_rate": 0.0001, + "loss": 1.4906, + "step": 13074 + }, + { + "epoch": 1.5190241068835317, + "grad_norm": 0.6255152225494385, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 13075 + }, + { + "epoch": 1.5191402846354922, + "grad_norm": 0.57309889793396, + "learning_rate": 0.0001, + "loss": 1.3557, + "step": 13076 + }, + { + "epoch": 1.5192564623874527, + "grad_norm": 0.5992699861526489, + "learning_rate": 0.0001, + "loss": 1.2967, + "step": 13077 + }, + { + "epoch": 1.5193726401394132, + "grad_norm": 0.6498000025749207, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 13078 + }, + { + "epoch": 1.5194888178913737, + "grad_norm": 0.6142323017120361, + "learning_rate": 0.0001, + "loss": 1.3953, + "step": 13079 + }, + { + "epoch": 1.5196049956433342, + "grad_norm": 0.5629411935806274, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 13080 + }, + { + "epoch": 1.5197211733952947, + "grad_norm": 0.6026464104652405, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 13081 + }, + { + "epoch": 1.5198373511472552, + "grad_norm": 0.5876480340957642, + "learning_rate": 0.0001, + "loss": 1.42, + "step": 13082 + }, + { + "epoch": 1.5199535288992156, + "grad_norm": 0.6540488600730896, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 13083 + }, + { + "epoch": 1.5200697066511761, + "grad_norm": 0.5790266990661621, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 13084 + }, + { + "epoch": 1.5201858844031368, + "grad_norm": 0.5915881991386414, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 13085 + }, + { + "epoch": 1.5203020621550973, + "grad_norm": 0.601015567779541, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 13086 + }, + { + "epoch": 1.5204182399070578, + "grad_norm": 0.607069730758667, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 13087 + }, + { + "epoch": 1.5205344176590183, + "grad_norm": 0.5643183588981628, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 13088 + }, + { + "epoch": 1.5206505954109788, + "grad_norm": 0.6405245065689087, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 13089 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 0.618649959564209, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 13090 + }, + { + "epoch": 1.5208829509148998, + "grad_norm": 0.596312403678894, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 13091 + }, + { + "epoch": 1.5209991286668603, + "grad_norm": 0.6227704286575317, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 13092 + }, + { + "epoch": 1.5211153064188208, + "grad_norm": 0.5877798795700073, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 13093 + }, + { + "epoch": 1.5212314841707812, + "grad_norm": 0.5980353355407715, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 13094 + }, + { + "epoch": 1.521347661922742, + "grad_norm": 0.6470990777015686, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 13095 + }, + { + "epoch": 1.5214638396747024, + "grad_norm": 0.6021008491516113, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 13096 + }, + { + "epoch": 1.521580017426663, + "grad_norm": 0.5772629380226135, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 13097 + }, + { + "epoch": 1.5216961951786234, + "grad_norm": 0.5909640192985535, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 13098 + }, + { + "epoch": 1.521812372930584, + "grad_norm": 0.6111878752708435, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 13099 + }, + { + "epoch": 1.5219285506825444, + "grad_norm": 0.5904512405395508, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 13100 + }, + { + "epoch": 1.522044728434505, + "grad_norm": 0.5810577869415283, + "learning_rate": 0.0001, + "loss": 1.3642, + "step": 13101 + }, + { + "epoch": 1.5221609061864654, + "grad_norm": 0.6800036430358887, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 13102 + }, + { + "epoch": 1.5222770839384259, + "grad_norm": 0.6236268877983093, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 13103 + }, + { + "epoch": 1.5223932616903864, + "grad_norm": 0.6043499708175659, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 13104 + }, + { + "epoch": 1.5225094394423468, + "grad_norm": 0.609677255153656, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 13105 + }, + { + "epoch": 1.5226256171943073, + "grad_norm": 0.5774016976356506, + "learning_rate": 0.0001, + "loss": 1.4479, + "step": 13106 + }, + { + "epoch": 1.5227417949462678, + "grad_norm": 0.6218743324279785, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 13107 + }, + { + "epoch": 1.5228579726982283, + "grad_norm": 0.5599686503410339, + "learning_rate": 0.0001, + "loss": 1.3028, + "step": 13108 + }, + { + "epoch": 1.5229741504501888, + "grad_norm": 0.5793375372886658, + "learning_rate": 0.0001, + "loss": 1.3442, + "step": 13109 + }, + { + "epoch": 1.5230903282021493, + "grad_norm": 0.5650421380996704, + "learning_rate": 0.0001, + "loss": 1.1701, + "step": 13110 + }, + { + "epoch": 1.5232065059541098, + "grad_norm": 0.628027617931366, + "learning_rate": 0.0001, + "loss": 1.4018, + "step": 13111 + }, + { + "epoch": 1.5233226837060703, + "grad_norm": 0.6404212117195129, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 13112 + }, + { + "epoch": 1.5234388614580308, + "grad_norm": 0.6501476168632507, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 13113 + }, + { + "epoch": 1.5235550392099912, + "grad_norm": 0.6205679774284363, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 13114 + }, + { + "epoch": 1.5236712169619517, + "grad_norm": 0.6271020174026489, + "learning_rate": 0.0001, + "loss": 1.4695, + "step": 13115 + }, + { + "epoch": 1.5237873947139122, + "grad_norm": 0.6364188194274902, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 13116 + }, + { + "epoch": 1.5239035724658727, + "grad_norm": 0.6291508674621582, + "learning_rate": 0.0001, + "loss": 1.4255, + "step": 13117 + }, + { + "epoch": 1.5240197502178332, + "grad_norm": 0.6832680106163025, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 13118 + }, + { + "epoch": 1.5241359279697937, + "grad_norm": 0.6287549138069153, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 13119 + }, + { + "epoch": 1.5242521057217542, + "grad_norm": 0.6037324070930481, + "learning_rate": 0.0001, + "loss": 1.3475, + "step": 13120 + }, + { + "epoch": 1.5243682834737147, + "grad_norm": 0.6374009847640991, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 13121 + }, + { + "epoch": 1.5244844612256752, + "grad_norm": 0.5533431172370911, + "learning_rate": 0.0001, + "loss": 1.4256, + "step": 13122 + }, + { + "epoch": 1.5246006389776356, + "grad_norm": 0.6008569002151489, + "learning_rate": 0.0001, + "loss": 1.4351, + "step": 13123 + }, + { + "epoch": 1.5247168167295961, + "grad_norm": 0.5492884516716003, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 13124 + }, + { + "epoch": 1.5248329944815566, + "grad_norm": 0.6348086595535278, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 13125 + }, + { + "epoch": 1.5249491722335173, + "grad_norm": 0.5985419750213623, + "learning_rate": 0.0001, + "loss": 1.3882, + "step": 13126 + }, + { + "epoch": 1.5250653499854778, + "grad_norm": 0.588337242603302, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 13127 + }, + { + "epoch": 1.5251815277374383, + "grad_norm": 0.6453753113746643, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 13128 + }, + { + "epoch": 1.5252977054893988, + "grad_norm": 0.6236133575439453, + "learning_rate": 0.0001, + "loss": 1.3412, + "step": 13129 + }, + { + "epoch": 1.5254138832413593, + "grad_norm": 0.6044201850891113, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 13130 + }, + { + "epoch": 1.5255300609933198, + "grad_norm": 0.5731863975524902, + "learning_rate": 0.0001, + "loss": 1.3612, + "step": 13131 + }, + { + "epoch": 1.5256462387452803, + "grad_norm": 0.6380339860916138, + "learning_rate": 0.0001, + "loss": 1.5127, + "step": 13132 + }, + { + "epoch": 1.5257624164972408, + "grad_norm": 0.6082126498222351, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 13133 + }, + { + "epoch": 1.5258785942492012, + "grad_norm": 0.6267526745796204, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 13134 + }, + { + "epoch": 1.5259947720011617, + "grad_norm": 0.5912509560585022, + "learning_rate": 0.0001, + "loss": 1.2835, + "step": 13135 + }, + { + "epoch": 1.5261109497531222, + "grad_norm": 0.6406939029693604, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 13136 + }, + { + "epoch": 1.526227127505083, + "grad_norm": 0.6724519729614258, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 13137 + }, + { + "epoch": 1.5263433052570434, + "grad_norm": 0.6085923314094543, + "learning_rate": 0.0001, + "loss": 1.4962, + "step": 13138 + }, + { + "epoch": 1.526459483009004, + "grad_norm": 0.5566753149032593, + "learning_rate": 0.0001, + "loss": 1.1431, + "step": 13139 + }, + { + "epoch": 1.5265756607609644, + "grad_norm": 0.5759755969047546, + "learning_rate": 0.0001, + "loss": 1.3858, + "step": 13140 + }, + { + "epoch": 1.526691838512925, + "grad_norm": 0.5898522138595581, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 13141 + }, + { + "epoch": 1.5268080162648854, + "grad_norm": 0.614996075630188, + "learning_rate": 0.0001, + "loss": 1.5029, + "step": 13142 + }, + { + "epoch": 1.5269241940168459, + "grad_norm": 0.5954774022102356, + "learning_rate": 0.0001, + "loss": 1.4092, + "step": 13143 + }, + { + "epoch": 1.5270403717688064, + "grad_norm": 0.6323831081390381, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 13144 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 0.6423251628875732, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 13145 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 0.6106740236282349, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 13146 + }, + { + "epoch": 1.5273889050246878, + "grad_norm": 0.5833632349967957, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 13147 + }, + { + "epoch": 1.5275050827766483, + "grad_norm": 0.580480694770813, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 13148 + }, + { + "epoch": 1.5276212605286088, + "grad_norm": 0.6620073318481445, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 13149 + }, + { + "epoch": 1.5277374382805693, + "grad_norm": 0.6580905914306641, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 13150 + }, + { + "epoch": 1.5278536160325298, + "grad_norm": 0.5711208581924438, + "learning_rate": 0.0001, + "loss": 1.4917, + "step": 13151 + }, + { + "epoch": 1.5279697937844903, + "grad_norm": 0.5903010964393616, + "learning_rate": 0.0001, + "loss": 1.3864, + "step": 13152 + }, + { + "epoch": 1.5280859715364508, + "grad_norm": 0.5728796720504761, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 13153 + }, + { + "epoch": 1.5282021492884112, + "grad_norm": 0.591552197933197, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 13154 + }, + { + "epoch": 1.5283183270403717, + "grad_norm": 0.5879071950912476, + "learning_rate": 0.0001, + "loss": 1.2898, + "step": 13155 + }, + { + "epoch": 1.5284345047923322, + "grad_norm": 0.6146317720413208, + "learning_rate": 0.0001, + "loss": 1.4951, + "step": 13156 + }, + { + "epoch": 1.5285506825442927, + "grad_norm": 0.6422489285469055, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 13157 + }, + { + "epoch": 1.5286668602962532, + "grad_norm": 0.6233516335487366, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 13158 + }, + { + "epoch": 1.5287830380482137, + "grad_norm": 0.7108170986175537, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 13159 + }, + { + "epoch": 1.5288992158001742, + "grad_norm": 0.6578541398048401, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 13160 + }, + { + "epoch": 1.5290153935521347, + "grad_norm": 0.6104475259780884, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 13161 + }, + { + "epoch": 1.5291315713040952, + "grad_norm": 0.6033722758293152, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 13162 + }, + { + "epoch": 1.5292477490560556, + "grad_norm": 0.5982484817504883, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 13163 + }, + { + "epoch": 1.5293639268080161, + "grad_norm": 0.5699301958084106, + "learning_rate": 0.0001, + "loss": 1.3097, + "step": 13164 + }, + { + "epoch": 1.5294801045599766, + "grad_norm": 0.5736358165740967, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 13165 + }, + { + "epoch": 1.5295962823119371, + "grad_norm": 0.5809690952301025, + "learning_rate": 0.0001, + "loss": 1.2927, + "step": 13166 + }, + { + "epoch": 1.5297124600638976, + "grad_norm": 0.64310622215271, + "learning_rate": 0.0001, + "loss": 1.6676, + "step": 13167 + }, + { + "epoch": 1.5298286378158583, + "grad_norm": 0.6130411624908447, + "learning_rate": 0.0001, + "loss": 1.3156, + "step": 13168 + }, + { + "epoch": 1.5299448155678188, + "grad_norm": 0.611953854560852, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 13169 + }, + { + "epoch": 1.5300609933197793, + "grad_norm": 0.6218355894088745, + "learning_rate": 0.0001, + "loss": 1.3941, + "step": 13170 + }, + { + "epoch": 1.5301771710717398, + "grad_norm": 0.6101202964782715, + "learning_rate": 0.0001, + "loss": 1.405, + "step": 13171 + }, + { + "epoch": 1.5302933488237003, + "grad_norm": 0.6130668520927429, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 13172 + }, + { + "epoch": 1.5304095265756608, + "grad_norm": 0.5833741426467896, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 13173 + }, + { + "epoch": 1.5305257043276213, + "grad_norm": 0.5890129208564758, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 13174 + }, + { + "epoch": 1.5306418820795817, + "grad_norm": 0.5778335928916931, + "learning_rate": 0.0001, + "loss": 1.3429, + "step": 13175 + }, + { + "epoch": 1.5307580598315422, + "grad_norm": 0.6423119306564331, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 13176 + }, + { + "epoch": 1.5308742375835027, + "grad_norm": 0.594713032245636, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 13177 + }, + { + "epoch": 1.5309904153354632, + "grad_norm": 0.662578821182251, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 13178 + }, + { + "epoch": 1.531106593087424, + "grad_norm": 0.6052879095077515, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 13179 + }, + { + "epoch": 1.5312227708393844, + "grad_norm": 0.580970287322998, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 13180 + }, + { + "epoch": 1.531338948591345, + "grad_norm": 0.6133270859718323, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 13181 + }, + { + "epoch": 1.5314551263433054, + "grad_norm": 0.5756634473800659, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 13182 + }, + { + "epoch": 1.5315713040952659, + "grad_norm": 0.588685154914856, + "learning_rate": 0.0001, + "loss": 1.3341, + "step": 13183 + }, + { + "epoch": 1.5316874818472264, + "grad_norm": 0.6559744477272034, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 13184 + }, + { + "epoch": 1.5318036595991869, + "grad_norm": 0.6924941539764404, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 13185 + }, + { + "epoch": 1.5319198373511473, + "grad_norm": 0.5851516723632812, + "learning_rate": 0.0001, + "loss": 1.4279, + "step": 13186 + }, + { + "epoch": 1.5320360151031078, + "grad_norm": 0.6089817881584167, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 13187 + }, + { + "epoch": 1.5321521928550683, + "grad_norm": 0.6336711049079895, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 13188 + }, + { + "epoch": 1.5322683706070288, + "grad_norm": 0.5794046521186829, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 13189 + }, + { + "epoch": 1.5323845483589893, + "grad_norm": 0.614953875541687, + "learning_rate": 0.0001, + "loss": 1.4362, + "step": 13190 + }, + { + "epoch": 1.5325007261109498, + "grad_norm": 0.6357384324073792, + "learning_rate": 0.0001, + "loss": 1.3925, + "step": 13191 + }, + { + "epoch": 1.5326169038629103, + "grad_norm": 0.617169201374054, + "learning_rate": 0.0001, + "loss": 1.4264, + "step": 13192 + }, + { + "epoch": 1.5327330816148708, + "grad_norm": 0.5910585522651672, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 13193 + }, + { + "epoch": 1.5328492593668313, + "grad_norm": 0.5722532868385315, + "learning_rate": 0.0001, + "loss": 1.3988, + "step": 13194 + }, + { + "epoch": 1.5329654371187917, + "grad_norm": 0.5971508622169495, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 13195 + }, + { + "epoch": 1.5330816148707522, + "grad_norm": 0.6081166863441467, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 13196 + }, + { + "epoch": 1.5331977926227127, + "grad_norm": 0.6401707530021667, + "learning_rate": 0.0001, + "loss": 1.3302, + "step": 13197 + }, + { + "epoch": 1.5333139703746732, + "grad_norm": 0.6085423827171326, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 13198 + }, + { + "epoch": 1.5334301481266337, + "grad_norm": 0.6252439618110657, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 13199 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 0.6168279051780701, + "learning_rate": 0.0001, + "loss": 1.4045, + "step": 13200 + }, + { + "epoch": 1.5336625036305547, + "grad_norm": 0.5795454382896423, + "learning_rate": 0.0001, + "loss": 1.4236, + "step": 13201 + }, + { + "epoch": 1.5337786813825152, + "grad_norm": 0.6378520131111145, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 13202 + }, + { + "epoch": 1.5338948591344757, + "grad_norm": 0.616070568561554, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 13203 + }, + { + "epoch": 1.5340110368864361, + "grad_norm": 0.670217752456665, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 13204 + }, + { + "epoch": 1.5341272146383966, + "grad_norm": 0.6514032483100891, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 13205 + }, + { + "epoch": 1.5342433923903571, + "grad_norm": 0.5641196966171265, + "learning_rate": 0.0001, + "loss": 1.4328, + "step": 13206 + }, + { + "epoch": 1.5343595701423176, + "grad_norm": 0.5916898250579834, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 13207 + }, + { + "epoch": 1.534475747894278, + "grad_norm": 0.5729571580886841, + "learning_rate": 0.0001, + "loss": 1.3437, + "step": 13208 + }, + { + "epoch": 1.5345919256462386, + "grad_norm": 0.6877140402793884, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 13209 + }, + { + "epoch": 1.5347081033981993, + "grad_norm": 0.628453254699707, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 13210 + }, + { + "epoch": 1.5348242811501598, + "grad_norm": 0.5816517472267151, + "learning_rate": 0.0001, + "loss": 1.3589, + "step": 13211 + }, + { + "epoch": 1.5349404589021203, + "grad_norm": 0.5863345861434937, + "learning_rate": 0.0001, + "loss": 1.4668, + "step": 13212 + }, + { + "epoch": 1.5350566366540808, + "grad_norm": 0.5848140120506287, + "learning_rate": 0.0001, + "loss": 1.4771, + "step": 13213 + }, + { + "epoch": 1.5351728144060413, + "grad_norm": 0.6075370907783508, + "learning_rate": 0.0001, + "loss": 1.4385, + "step": 13214 + }, + { + "epoch": 1.5352889921580017, + "grad_norm": 0.6305372714996338, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 13215 + }, + { + "epoch": 1.5354051699099622, + "grad_norm": 0.5978954434394836, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 13216 + }, + { + "epoch": 1.5355213476619227, + "grad_norm": 0.6035651564598083, + "learning_rate": 0.0001, + "loss": 1.5109, + "step": 13217 + }, + { + "epoch": 1.5356375254138832, + "grad_norm": 0.5722531080245972, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 13218 + }, + { + "epoch": 1.5357537031658437, + "grad_norm": 0.5960158705711365, + "learning_rate": 0.0001, + "loss": 1.4061, + "step": 13219 + }, + { + "epoch": 1.5358698809178042, + "grad_norm": 0.6274595260620117, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 13220 + }, + { + "epoch": 1.535986058669765, + "grad_norm": 0.6137160062789917, + "learning_rate": 0.0001, + "loss": 1.1475, + "step": 13221 + }, + { + "epoch": 1.5361022364217254, + "grad_norm": 0.6096351742744446, + "learning_rate": 0.0001, + "loss": 1.2318, + "step": 13222 + }, + { + "epoch": 1.5362184141736859, + "grad_norm": 0.642091691493988, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 13223 + }, + { + "epoch": 1.5363345919256464, + "grad_norm": 0.6222463250160217, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 13224 + }, + { + "epoch": 1.5364507696776069, + "grad_norm": 0.5922936201095581, + "learning_rate": 0.0001, + "loss": 1.3679, + "step": 13225 + }, + { + "epoch": 1.5365669474295673, + "grad_norm": 0.565569281578064, + "learning_rate": 0.0001, + "loss": 1.2068, + "step": 13226 + }, + { + "epoch": 1.5366831251815278, + "grad_norm": 0.6304237842559814, + "learning_rate": 0.0001, + "loss": 1.4687, + "step": 13227 + }, + { + "epoch": 1.5367993029334883, + "grad_norm": 0.6497960686683655, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 13228 + }, + { + "epoch": 1.5369154806854488, + "grad_norm": 0.6448734402656555, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 13229 + }, + { + "epoch": 1.5370316584374093, + "grad_norm": 0.5980844497680664, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 13230 + }, + { + "epoch": 1.5371478361893698, + "grad_norm": 0.5994159579277039, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 13231 + }, + { + "epoch": 1.5372640139413303, + "grad_norm": 0.58444744348526, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 13232 + }, + { + "epoch": 1.5373801916932908, + "grad_norm": 0.6289106607437134, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 13233 + }, + { + "epoch": 1.5374963694452513, + "grad_norm": 0.6375524997711182, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 13234 + }, + { + "epoch": 1.5376125471972117, + "grad_norm": 0.6511433720588684, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 13235 + }, + { + "epoch": 1.5377287249491722, + "grad_norm": 0.5981631875038147, + "learning_rate": 0.0001, + "loss": 1.4202, + "step": 13236 + }, + { + "epoch": 1.5378449027011327, + "grad_norm": 0.5719684958457947, + "learning_rate": 0.0001, + "loss": 1.4421, + "step": 13237 + }, + { + "epoch": 1.5379610804530932, + "grad_norm": 0.6303753852844238, + "learning_rate": 0.0001, + "loss": 1.2953, + "step": 13238 + }, + { + "epoch": 1.5380772582050537, + "grad_norm": 0.6014121174812317, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 13239 + }, + { + "epoch": 1.5381934359570142, + "grad_norm": 0.5993248820304871, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 13240 + }, + { + "epoch": 1.5383096137089747, + "grad_norm": 0.6195975542068481, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 13241 + }, + { + "epoch": 1.5384257914609352, + "grad_norm": 0.5984340906143188, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 13242 + }, + { + "epoch": 1.5385419692128957, + "grad_norm": 0.610115647315979, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 13243 + }, + { + "epoch": 1.5386581469648561, + "grad_norm": 0.6320290565490723, + "learning_rate": 0.0001, + "loss": 1.5107, + "step": 13244 + }, + { + "epoch": 1.5387743247168166, + "grad_norm": 0.6114384531974792, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 13245 + }, + { + "epoch": 1.5388905024687771, + "grad_norm": 0.6696942448616028, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 13246 + }, + { + "epoch": 1.5390066802207376, + "grad_norm": 0.6278484463691711, + "learning_rate": 0.0001, + "loss": 1.3396, + "step": 13247 + }, + { + "epoch": 1.539122857972698, + "grad_norm": 0.5679048895835876, + "learning_rate": 0.0001, + "loss": 1.3174, + "step": 13248 + }, + { + "epoch": 1.5392390357246586, + "grad_norm": 0.6171903610229492, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 13249 + }, + { + "epoch": 1.539355213476619, + "grad_norm": 0.6222214698791504, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 13250 + }, + { + "epoch": 1.5394713912285796, + "grad_norm": 0.5891657471656799, + "learning_rate": 0.0001, + "loss": 1.4478, + "step": 13251 + }, + { + "epoch": 1.5395875689805403, + "grad_norm": 0.6132618188858032, + "learning_rate": 0.0001, + "loss": 1.4395, + "step": 13252 + }, + { + "epoch": 1.5397037467325008, + "grad_norm": 0.5770408511161804, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 13253 + }, + { + "epoch": 1.5398199244844613, + "grad_norm": 0.6150637269020081, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 13254 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 0.6168022155761719, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 13255 + }, + { + "epoch": 1.5400522799883822, + "grad_norm": 0.6376373767852783, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 13256 + }, + { + "epoch": 1.5401684577403427, + "grad_norm": 0.5829399824142456, + "learning_rate": 0.0001, + "loss": 1.2685, + "step": 13257 + }, + { + "epoch": 1.5402846354923032, + "grad_norm": 0.6367193460464478, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 13258 + }, + { + "epoch": 1.5404008132442637, + "grad_norm": 0.5960088968276978, + "learning_rate": 0.0001, + "loss": 1.3126, + "step": 13259 + }, + { + "epoch": 1.5405169909962242, + "grad_norm": 0.6211891770362854, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 13260 + }, + { + "epoch": 1.5406331687481847, + "grad_norm": 0.5986420512199402, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 13261 + }, + { + "epoch": 1.5407493465001452, + "grad_norm": 0.6235995888710022, + "learning_rate": 0.0001, + "loss": 1.4277, + "step": 13262 + }, + { + "epoch": 1.5408655242521059, + "grad_norm": 0.613019585609436, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 13263 + }, + { + "epoch": 1.5409817020040664, + "grad_norm": 0.630034327507019, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 13264 + }, + { + "epoch": 1.5410978797560269, + "grad_norm": 0.5918728709220886, + "learning_rate": 0.0001, + "loss": 1.4057, + "step": 13265 + }, + { + "epoch": 1.5412140575079873, + "grad_norm": 0.5738247036933899, + "learning_rate": 0.0001, + "loss": 1.4588, + "step": 13266 + }, + { + "epoch": 1.5413302352599478, + "grad_norm": 0.6175058484077454, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 13267 + }, + { + "epoch": 1.5414464130119083, + "grad_norm": 0.5984748601913452, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 13268 + }, + { + "epoch": 1.5415625907638688, + "grad_norm": 0.6336756944656372, + "learning_rate": 0.0001, + "loss": 1.5041, + "step": 13269 + }, + { + "epoch": 1.5416787685158293, + "grad_norm": 0.6561615467071533, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 13270 + }, + { + "epoch": 1.5417949462677898, + "grad_norm": 0.643224835395813, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 13271 + }, + { + "epoch": 1.5419111240197503, + "grad_norm": 0.5880582928657532, + "learning_rate": 0.0001, + "loss": 1.3817, + "step": 13272 + }, + { + "epoch": 1.5420273017717108, + "grad_norm": 0.6055521368980408, + "learning_rate": 0.0001, + "loss": 1.3452, + "step": 13273 + }, + { + "epoch": 1.5421434795236713, + "grad_norm": 0.590501070022583, + "learning_rate": 0.0001, + "loss": 1.3997, + "step": 13274 + }, + { + "epoch": 1.5422596572756317, + "grad_norm": 0.5980383157730103, + "learning_rate": 0.0001, + "loss": 1.5425, + "step": 13275 + }, + { + "epoch": 1.5423758350275922, + "grad_norm": 0.5758526921272278, + "learning_rate": 0.0001, + "loss": 1.348, + "step": 13276 + }, + { + "epoch": 1.5424920127795527, + "grad_norm": 0.5835914611816406, + "learning_rate": 0.0001, + "loss": 1.2751, + "step": 13277 + }, + { + "epoch": 1.5426081905315132, + "grad_norm": 0.5609144568443298, + "learning_rate": 0.0001, + "loss": 1.2751, + "step": 13278 + }, + { + "epoch": 1.5427243682834737, + "grad_norm": 0.5979585647583008, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 13279 + }, + { + "epoch": 1.5428405460354342, + "grad_norm": 0.6307063698768616, + "learning_rate": 0.0001, + "loss": 1.4208, + "step": 13280 + }, + { + "epoch": 1.5429567237873947, + "grad_norm": 0.6205479502677917, + "learning_rate": 0.0001, + "loss": 1.4094, + "step": 13281 + }, + { + "epoch": 1.5430729015393552, + "grad_norm": 0.6695151925086975, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 13282 + }, + { + "epoch": 1.5431890792913157, + "grad_norm": 0.5992878675460815, + "learning_rate": 0.0001, + "loss": 1.3733, + "step": 13283 + }, + { + "epoch": 1.5433052570432761, + "grad_norm": 0.5989102125167847, + "learning_rate": 0.0001, + "loss": 1.3398, + "step": 13284 + }, + { + "epoch": 1.5434214347952366, + "grad_norm": 0.6177709698677063, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 13285 + }, + { + "epoch": 1.5435376125471971, + "grad_norm": 0.5789451599121094, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 13286 + }, + { + "epoch": 1.5436537902991576, + "grad_norm": 0.6461418271064758, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 13287 + }, + { + "epoch": 1.543769968051118, + "grad_norm": 0.5908834934234619, + "learning_rate": 0.0001, + "loss": 1.4299, + "step": 13288 + }, + { + "epoch": 1.5438861458030786, + "grad_norm": 0.6150990724563599, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 13289 + }, + { + "epoch": 1.544002323555039, + "grad_norm": 0.5878303050994873, + "learning_rate": 0.0001, + "loss": 1.2382, + "step": 13290 + }, + { + "epoch": 1.5441185013069996, + "grad_norm": 0.6185941100120544, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 13291 + }, + { + "epoch": 1.54423467905896, + "grad_norm": 0.7063065767288208, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 13292 + }, + { + "epoch": 1.5443508568109205, + "grad_norm": 0.6151609420776367, + "learning_rate": 0.0001, + "loss": 1.4068, + "step": 13293 + }, + { + "epoch": 1.5444670345628813, + "grad_norm": 0.5908089280128479, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 13294 + }, + { + "epoch": 1.5445832123148417, + "grad_norm": 0.5625834465026855, + "learning_rate": 0.0001, + "loss": 1.3472, + "step": 13295 + }, + { + "epoch": 1.5446993900668022, + "grad_norm": 0.6102145314216614, + "learning_rate": 0.0001, + "loss": 1.4495, + "step": 13296 + }, + { + "epoch": 1.5448155678187627, + "grad_norm": 0.6253021359443665, + "learning_rate": 0.0001, + "loss": 1.4255, + "step": 13297 + }, + { + "epoch": 1.5449317455707232, + "grad_norm": 0.588062047958374, + "learning_rate": 0.0001, + "loss": 1.3395, + "step": 13298 + }, + { + "epoch": 1.5450479233226837, + "grad_norm": 0.6264815926551819, + "learning_rate": 0.0001, + "loss": 1.3836, + "step": 13299 + }, + { + "epoch": 1.5451641010746442, + "grad_norm": 0.6290935277938843, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 13300 + }, + { + "epoch": 1.5452802788266047, + "grad_norm": 0.6513729691505432, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 13301 + }, + { + "epoch": 1.5453964565785652, + "grad_norm": 0.6446203589439392, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 13302 + }, + { + "epoch": 1.5455126343305257, + "grad_norm": 0.6368615031242371, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 13303 + }, + { + "epoch": 1.5456288120824864, + "grad_norm": 0.5730301141738892, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 13304 + }, + { + "epoch": 1.5457449898344469, + "grad_norm": 0.5296486020088196, + "learning_rate": 0.0001, + "loss": 1.4107, + "step": 13305 + }, + { + "epoch": 1.5458611675864073, + "grad_norm": 0.6226763725280762, + "learning_rate": 0.0001, + "loss": 1.4552, + "step": 13306 + }, + { + "epoch": 1.5459773453383678, + "grad_norm": 0.6259723901748657, + "learning_rate": 0.0001, + "loss": 1.4037, + "step": 13307 + }, + { + "epoch": 1.5460935230903283, + "grad_norm": 0.5425748825073242, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 13308 + }, + { + "epoch": 1.5462097008422888, + "grad_norm": 0.6267858743667603, + "learning_rate": 0.0001, + "loss": 1.2792, + "step": 13309 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 0.5841253399848938, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 13310 + }, + { + "epoch": 1.5464420563462098, + "grad_norm": 0.660430908203125, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 13311 + }, + { + "epoch": 1.5465582340981703, + "grad_norm": 0.5816801190376282, + "learning_rate": 0.0001, + "loss": 1.3198, + "step": 13312 + }, + { + "epoch": 1.5466744118501308, + "grad_norm": 0.6387170553207397, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 13313 + }, + { + "epoch": 1.5467905896020913, + "grad_norm": 0.6249637603759766, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 13314 + }, + { + "epoch": 1.5469067673540517, + "grad_norm": 0.6024580597877502, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 13315 + }, + { + "epoch": 1.5470229451060122, + "grad_norm": 0.6384835243225098, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 13316 + }, + { + "epoch": 1.5471391228579727, + "grad_norm": 0.5695124864578247, + "learning_rate": 0.0001, + "loss": 1.364, + "step": 13317 + }, + { + "epoch": 1.5472553006099332, + "grad_norm": 0.6180372834205627, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 13318 + }, + { + "epoch": 1.5473714783618937, + "grad_norm": 0.5775349140167236, + "learning_rate": 0.0001, + "loss": 1.3818, + "step": 13319 + }, + { + "epoch": 1.5474876561138542, + "grad_norm": 0.6106045842170715, + "learning_rate": 0.0001, + "loss": 1.3383, + "step": 13320 + }, + { + "epoch": 1.5476038338658147, + "grad_norm": 0.6502373814582825, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 13321 + }, + { + "epoch": 1.5477200116177752, + "grad_norm": 0.6484719514846802, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 13322 + }, + { + "epoch": 1.5478361893697357, + "grad_norm": 0.6394400000572205, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 13323 + }, + { + "epoch": 1.5479523671216961, + "grad_norm": 0.5985630750656128, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 13324 + }, + { + "epoch": 1.5480685448736566, + "grad_norm": 0.616834282875061, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 13325 + }, + { + "epoch": 1.5481847226256171, + "grad_norm": 0.6069583892822266, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 13326 + }, + { + "epoch": 1.5483009003775776, + "grad_norm": 0.6398360133171082, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 13327 + }, + { + "epoch": 1.548417078129538, + "grad_norm": 0.5988773107528687, + "learning_rate": 0.0001, + "loss": 1.3364, + "step": 13328 + }, + { + "epoch": 1.5485332558814986, + "grad_norm": 0.6219581365585327, + "learning_rate": 0.0001, + "loss": 1.3877, + "step": 13329 + }, + { + "epoch": 1.548649433633459, + "grad_norm": 0.5958396792411804, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 13330 + }, + { + "epoch": 1.5487656113854196, + "grad_norm": 0.5914923548698425, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 13331 + }, + { + "epoch": 1.54888178913738, + "grad_norm": 0.6485493779182434, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 13332 + }, + { + "epoch": 1.5489979668893405, + "grad_norm": 0.6255422830581665, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 13333 + }, + { + "epoch": 1.549114144641301, + "grad_norm": 0.6094285845756531, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 13334 + }, + { + "epoch": 1.5492303223932615, + "grad_norm": 0.5891302824020386, + "learning_rate": 0.0001, + "loss": 1.4598, + "step": 13335 + }, + { + "epoch": 1.5493465001452222, + "grad_norm": 0.6349563002586365, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 13336 + }, + { + "epoch": 1.5494626778971827, + "grad_norm": 0.6088860034942627, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 13337 + }, + { + "epoch": 1.5495788556491432, + "grad_norm": 0.7181675434112549, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 13338 + }, + { + "epoch": 1.5496950334011037, + "grad_norm": 0.6011210083961487, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 13339 + }, + { + "epoch": 1.5498112111530642, + "grad_norm": 0.6319872140884399, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 13340 + }, + { + "epoch": 1.5499273889050247, + "grad_norm": 0.6266433596611023, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 13341 + }, + { + "epoch": 1.5500435666569852, + "grad_norm": 0.6468966603279114, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 13342 + }, + { + "epoch": 1.5501597444089457, + "grad_norm": 0.5942563414573669, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 13343 + }, + { + "epoch": 1.5502759221609062, + "grad_norm": 0.5953789949417114, + "learning_rate": 0.0001, + "loss": 1.2761, + "step": 13344 + }, + { + "epoch": 1.5503920999128666, + "grad_norm": 0.5696388483047485, + "learning_rate": 0.0001, + "loss": 1.3045, + "step": 13345 + }, + { + "epoch": 1.5505082776648274, + "grad_norm": 0.5958232879638672, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 13346 + }, + { + "epoch": 1.5506244554167878, + "grad_norm": 0.6455214619636536, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 13347 + }, + { + "epoch": 1.5507406331687483, + "grad_norm": 0.7005032896995544, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 13348 + }, + { + "epoch": 1.5508568109207088, + "grad_norm": 0.6717994809150696, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 13349 + }, + { + "epoch": 1.5509729886726693, + "grad_norm": 0.5751558542251587, + "learning_rate": 0.0001, + "loss": 1.3095, + "step": 13350 + }, + { + "epoch": 1.5510891664246298, + "grad_norm": 0.5958260297775269, + "learning_rate": 0.0001, + "loss": 1.4298, + "step": 13351 + }, + { + "epoch": 1.5512053441765903, + "grad_norm": 0.5926149487495422, + "learning_rate": 0.0001, + "loss": 1.4678, + "step": 13352 + }, + { + "epoch": 1.5513215219285508, + "grad_norm": 0.621460497379303, + "learning_rate": 0.0001, + "loss": 1.3896, + "step": 13353 + }, + { + "epoch": 1.5514376996805113, + "grad_norm": 0.6164433360099792, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 13354 + }, + { + "epoch": 1.5515538774324718, + "grad_norm": 0.6684780120849609, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 13355 + }, + { + "epoch": 1.5516700551844322, + "grad_norm": 0.6255985498428345, + "learning_rate": 0.0001, + "loss": 1.3428, + "step": 13356 + }, + { + "epoch": 1.5517862329363927, + "grad_norm": 0.571209728717804, + "learning_rate": 0.0001, + "loss": 1.3975, + "step": 13357 + }, + { + "epoch": 1.5519024106883532, + "grad_norm": 0.5906187891960144, + "learning_rate": 0.0001, + "loss": 1.4001, + "step": 13358 + }, + { + "epoch": 1.5520185884403137, + "grad_norm": 0.6360796093940735, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 13359 + }, + { + "epoch": 1.5521347661922742, + "grad_norm": 0.5564336180686951, + "learning_rate": 0.0001, + "loss": 1.2847, + "step": 13360 + }, + { + "epoch": 1.5522509439442347, + "grad_norm": 0.6803907155990601, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 13361 + }, + { + "epoch": 1.5523671216961952, + "grad_norm": 0.6163176894187927, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 13362 + }, + { + "epoch": 1.5524832994481557, + "grad_norm": 0.5890237092971802, + "learning_rate": 0.0001, + "loss": 1.3819, + "step": 13363 + }, + { + "epoch": 1.5525994772001162, + "grad_norm": 0.6154002547264099, + "learning_rate": 0.0001, + "loss": 1.2708, + "step": 13364 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 0.6088365316390991, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 13365 + }, + { + "epoch": 1.5528318327040371, + "grad_norm": 0.5878404378890991, + "learning_rate": 0.0001, + "loss": 1.3299, + "step": 13366 + }, + { + "epoch": 1.5529480104559976, + "grad_norm": 0.6358885765075684, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 13367 + }, + { + "epoch": 1.553064188207958, + "grad_norm": 0.6308332681655884, + "learning_rate": 0.0001, + "loss": 1.4362, + "step": 13368 + }, + { + "epoch": 1.5531803659599186, + "grad_norm": 0.5902178287506104, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 13369 + }, + { + "epoch": 1.553296543711879, + "grad_norm": 0.6629870533943176, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 13370 + }, + { + "epoch": 1.5534127214638396, + "grad_norm": 0.6822353601455688, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 13371 + }, + { + "epoch": 1.5535288992158, + "grad_norm": 0.5877573490142822, + "learning_rate": 0.0001, + "loss": 1.2979, + "step": 13372 + }, + { + "epoch": 1.5536450769677606, + "grad_norm": 0.5882243514060974, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 13373 + }, + { + "epoch": 1.553761254719721, + "grad_norm": 0.6118847727775574, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 13374 + }, + { + "epoch": 1.5538774324716815, + "grad_norm": 0.6584449410438538, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 13375 + }, + { + "epoch": 1.553993610223642, + "grad_norm": 0.6288265585899353, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 13376 + }, + { + "epoch": 1.5541097879756025, + "grad_norm": 0.637592077255249, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 13377 + }, + { + "epoch": 1.5542259657275632, + "grad_norm": 0.642940104007721, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 13378 + }, + { + "epoch": 1.5543421434795237, + "grad_norm": 0.6421939730644226, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 13379 + }, + { + "epoch": 1.5544583212314842, + "grad_norm": 0.5893421173095703, + "learning_rate": 0.0001, + "loss": 1.4721, + "step": 13380 + }, + { + "epoch": 1.5545744989834447, + "grad_norm": 0.6120336055755615, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 13381 + }, + { + "epoch": 1.5546906767354052, + "grad_norm": 0.6369379162788391, + "learning_rate": 0.0001, + "loss": 1.4525, + "step": 13382 + }, + { + "epoch": 1.5548068544873657, + "grad_norm": 0.6311773657798767, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 13383 + }, + { + "epoch": 1.5549230322393262, + "grad_norm": 0.6140812039375305, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 13384 + }, + { + "epoch": 1.5550392099912866, + "grad_norm": 0.6031202673912048, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 13385 + }, + { + "epoch": 1.5551553877432471, + "grad_norm": 0.6218429803848267, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 13386 + }, + { + "epoch": 1.5552715654952076, + "grad_norm": 0.6086384654045105, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 13387 + }, + { + "epoch": 1.5553877432471683, + "grad_norm": 0.6052895188331604, + "learning_rate": 0.0001, + "loss": 1.4336, + "step": 13388 + }, + { + "epoch": 1.5555039209991288, + "grad_norm": 0.6349013447761536, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 13389 + }, + { + "epoch": 1.5556200987510893, + "grad_norm": 0.6184483766555786, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 13390 + }, + { + "epoch": 1.5557362765030498, + "grad_norm": 0.601908266544342, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 13391 + }, + { + "epoch": 1.5558524542550103, + "grad_norm": 0.6823452115058899, + "learning_rate": 0.0001, + "loss": 1.7363, + "step": 13392 + }, + { + "epoch": 1.5559686320069708, + "grad_norm": 0.6129056811332703, + "learning_rate": 0.0001, + "loss": 1.41, + "step": 13393 + }, + { + "epoch": 1.5560848097589313, + "grad_norm": 0.5627692937850952, + "learning_rate": 0.0001, + "loss": 1.3706, + "step": 13394 + }, + { + "epoch": 1.5562009875108918, + "grad_norm": 0.5908641815185547, + "learning_rate": 0.0001, + "loss": 1.3288, + "step": 13395 + }, + { + "epoch": 1.5563171652628522, + "grad_norm": 0.6511195302009583, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 13396 + }, + { + "epoch": 1.5564333430148127, + "grad_norm": 0.619094729423523, + "learning_rate": 0.0001, + "loss": 1.4406, + "step": 13397 + }, + { + "epoch": 1.5565495207667732, + "grad_norm": 0.6720662117004395, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 13398 + }, + { + "epoch": 1.5566656985187337, + "grad_norm": 0.7280980944633484, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 13399 + }, + { + "epoch": 1.5567818762706942, + "grad_norm": 0.612970232963562, + "learning_rate": 0.0001, + "loss": 1.2731, + "step": 13400 + }, + { + "epoch": 1.5568980540226547, + "grad_norm": 0.6179816722869873, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 13401 + }, + { + "epoch": 1.5570142317746152, + "grad_norm": 0.6261540651321411, + "learning_rate": 0.0001, + "loss": 1.3036, + "step": 13402 + }, + { + "epoch": 1.5571304095265757, + "grad_norm": 0.6171663403511047, + "learning_rate": 0.0001, + "loss": 1.5464, + "step": 13403 + }, + { + "epoch": 1.5572465872785362, + "grad_norm": 0.5964109301567078, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 13404 + }, + { + "epoch": 1.5573627650304966, + "grad_norm": 0.6069731712341309, + "learning_rate": 0.0001, + "loss": 1.3542, + "step": 13405 + }, + { + "epoch": 1.5574789427824571, + "grad_norm": 0.61017906665802, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 13406 + }, + { + "epoch": 1.5575951205344176, + "grad_norm": 0.6241382956504822, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 13407 + }, + { + "epoch": 1.557711298286378, + "grad_norm": 0.6097595691680908, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 13408 + }, + { + "epoch": 1.5578274760383386, + "grad_norm": 0.6461699604988098, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 13409 + }, + { + "epoch": 1.557943653790299, + "grad_norm": 0.6478632688522339, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 13410 + }, + { + "epoch": 1.5580598315422596, + "grad_norm": 0.5949918627738953, + "learning_rate": 0.0001, + "loss": 1.3795, + "step": 13411 + }, + { + "epoch": 1.55817600929422, + "grad_norm": 0.6056655645370483, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 13412 + }, + { + "epoch": 1.5582921870461806, + "grad_norm": 0.5610243082046509, + "learning_rate": 0.0001, + "loss": 1.3303, + "step": 13413 + }, + { + "epoch": 1.558408364798141, + "grad_norm": 0.5745115280151367, + "learning_rate": 0.0001, + "loss": 1.3136, + "step": 13414 + }, + { + "epoch": 1.5585245425501015, + "grad_norm": 0.587050199508667, + "learning_rate": 0.0001, + "loss": 1.4801, + "step": 13415 + }, + { + "epoch": 1.558640720302062, + "grad_norm": 0.6332523226737976, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 13416 + }, + { + "epoch": 1.5587568980540225, + "grad_norm": 0.6112111210823059, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 13417 + }, + { + "epoch": 1.558873075805983, + "grad_norm": 0.6478002667427063, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 13418 + }, + { + "epoch": 1.5589892535579435, + "grad_norm": 0.6234623789787292, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 13419 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 0.6207946538925171, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 13420 + }, + { + "epoch": 1.5592216090618647, + "grad_norm": 0.6182363629341125, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 13421 + }, + { + "epoch": 1.5593377868138252, + "grad_norm": 0.6172978281974792, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 13422 + }, + { + "epoch": 1.5594539645657857, + "grad_norm": 0.59696364402771, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 13423 + }, + { + "epoch": 1.5595701423177462, + "grad_norm": 0.58649742603302, + "learning_rate": 0.0001, + "loss": 1.4482, + "step": 13424 + }, + { + "epoch": 1.5596863200697066, + "grad_norm": 0.6304432153701782, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 13425 + }, + { + "epoch": 1.5598024978216671, + "grad_norm": 0.5944958329200745, + "learning_rate": 0.0001, + "loss": 1.4783, + "step": 13426 + }, + { + "epoch": 1.5599186755736276, + "grad_norm": 0.5643553137779236, + "learning_rate": 0.0001, + "loss": 1.4529, + "step": 13427 + }, + { + "epoch": 1.5600348533255881, + "grad_norm": 0.6088239550590515, + "learning_rate": 0.0001, + "loss": 1.4445, + "step": 13428 + }, + { + "epoch": 1.5601510310775486, + "grad_norm": 0.6248722672462463, + "learning_rate": 0.0001, + "loss": 1.4446, + "step": 13429 + }, + { + "epoch": 1.5602672088295093, + "grad_norm": 0.6209125518798828, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 13430 + }, + { + "epoch": 1.5603833865814698, + "grad_norm": 0.6119517683982849, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 13431 + }, + { + "epoch": 1.5604995643334303, + "grad_norm": 0.6059420704841614, + "learning_rate": 0.0001, + "loss": 1.3094, + "step": 13432 + }, + { + "epoch": 1.5606157420853908, + "grad_norm": 0.635593593120575, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 13433 + }, + { + "epoch": 1.5607319198373513, + "grad_norm": 0.63407963514328, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 13434 + }, + { + "epoch": 1.5608480975893118, + "grad_norm": 0.5982295274734497, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 13435 + }, + { + "epoch": 1.5609642753412722, + "grad_norm": 0.6256076097488403, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 13436 + }, + { + "epoch": 1.5610804530932327, + "grad_norm": 0.600486159324646, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 13437 + }, + { + "epoch": 1.5611966308451932, + "grad_norm": 0.6342299580574036, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 13438 + }, + { + "epoch": 1.5613128085971537, + "grad_norm": 0.625291109085083, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 13439 + }, + { + "epoch": 1.5614289863491142, + "grad_norm": 0.6260406970977783, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 13440 + }, + { + "epoch": 1.5615451641010747, + "grad_norm": 0.6225281953811646, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 13441 + }, + { + "epoch": 1.5616613418530352, + "grad_norm": 0.5835169553756714, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 13442 + }, + { + "epoch": 1.5617775196049957, + "grad_norm": 0.6170945167541504, + "learning_rate": 0.0001, + "loss": 1.53, + "step": 13443 + }, + { + "epoch": 1.5618936973569562, + "grad_norm": 0.594443142414093, + "learning_rate": 0.0001, + "loss": 1.3711, + "step": 13444 + }, + { + "epoch": 1.5620098751089166, + "grad_norm": 0.6158294081687927, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 13445 + }, + { + "epoch": 1.5621260528608771, + "grad_norm": 0.6171496510505676, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 13446 + }, + { + "epoch": 1.5622422306128376, + "grad_norm": 0.6249264478683472, + "learning_rate": 0.0001, + "loss": 1.4543, + "step": 13447 + }, + { + "epoch": 1.5623584083647981, + "grad_norm": 0.6244696974754333, + "learning_rate": 0.0001, + "loss": 1.4413, + "step": 13448 + }, + { + "epoch": 1.5624745861167586, + "grad_norm": 0.6428230404853821, + "learning_rate": 0.0001, + "loss": 1.5077, + "step": 13449 + }, + { + "epoch": 1.562590763868719, + "grad_norm": 0.6612055897712708, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 13450 + }, + { + "epoch": 1.5627069416206796, + "grad_norm": 0.6030257344245911, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 13451 + }, + { + "epoch": 1.56282311937264, + "grad_norm": 0.5865484476089478, + "learning_rate": 0.0001, + "loss": 1.2867, + "step": 13452 + }, + { + "epoch": 1.5629392971246006, + "grad_norm": 0.6694547533988953, + "learning_rate": 0.0001, + "loss": 1.5695, + "step": 13453 + }, + { + "epoch": 1.563055474876561, + "grad_norm": 0.6105194687843323, + "learning_rate": 0.0001, + "loss": 1.3006, + "step": 13454 + }, + { + "epoch": 1.5631716526285215, + "grad_norm": 0.5813074707984924, + "learning_rate": 0.0001, + "loss": 1.3504, + "step": 13455 + }, + { + "epoch": 1.563287830380482, + "grad_norm": 0.6537028551101685, + "learning_rate": 0.0001, + "loss": 1.3702, + "step": 13456 + }, + { + "epoch": 1.5634040081324425, + "grad_norm": 0.6405291557312012, + "learning_rate": 0.0001, + "loss": 1.3698, + "step": 13457 + }, + { + "epoch": 1.563520185884403, + "grad_norm": 0.644826352596283, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 13458 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 0.6287594437599182, + "learning_rate": 0.0001, + "loss": 1.4445, + "step": 13459 + }, + { + "epoch": 1.563752541388324, + "grad_norm": 0.6116755604743958, + "learning_rate": 0.0001, + "loss": 1.42, + "step": 13460 + }, + { + "epoch": 1.5638687191402847, + "grad_norm": 0.6694396734237671, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 13461 + }, + { + "epoch": 1.5639848968922452, + "grad_norm": 0.6518230438232422, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 13462 + }, + { + "epoch": 1.5641010746442057, + "grad_norm": 0.5904227495193481, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 13463 + }, + { + "epoch": 1.5642172523961662, + "grad_norm": 0.6056810617446899, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 13464 + }, + { + "epoch": 1.5643334301481266, + "grad_norm": 0.606863260269165, + "learning_rate": 0.0001, + "loss": 1.4822, + "step": 13465 + }, + { + "epoch": 1.5644496079000871, + "grad_norm": 0.61651211977005, + "learning_rate": 0.0001, + "loss": 1.4381, + "step": 13466 + }, + { + "epoch": 1.5645657856520476, + "grad_norm": 0.6035972833633423, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 13467 + }, + { + "epoch": 1.5646819634040081, + "grad_norm": 0.6274685859680176, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 13468 + }, + { + "epoch": 1.5647981411559686, + "grad_norm": 0.6115575432777405, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 13469 + }, + { + "epoch": 1.564914318907929, + "grad_norm": 0.6543807983398438, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 13470 + }, + { + "epoch": 1.5650304966598896, + "grad_norm": 0.6324952244758606, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 13471 + }, + { + "epoch": 1.5651466744118503, + "grad_norm": 0.6065995693206787, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 13472 + }, + { + "epoch": 1.5652628521638108, + "grad_norm": 0.6735640168190002, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 13473 + }, + { + "epoch": 1.5653790299157713, + "grad_norm": 0.6395497918128967, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 13474 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 0.6460095643997192, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 13475 + }, + { + "epoch": 1.5656113854196922, + "grad_norm": 0.6567825078964233, + "learning_rate": 0.0001, + "loss": 1.6167, + "step": 13476 + }, + { + "epoch": 1.5657275631716527, + "grad_norm": 0.6820036768913269, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 13477 + }, + { + "epoch": 1.5658437409236132, + "grad_norm": 0.6071247458457947, + "learning_rate": 0.0001, + "loss": 1.3688, + "step": 13478 + }, + { + "epoch": 1.5659599186755737, + "grad_norm": 0.608846127986908, + "learning_rate": 0.0001, + "loss": 1.3601, + "step": 13479 + }, + { + "epoch": 1.5660760964275342, + "grad_norm": 0.7005351781845093, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 13480 + }, + { + "epoch": 1.5661922741794947, + "grad_norm": 0.6527760624885559, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 13481 + }, + { + "epoch": 1.5663084519314552, + "grad_norm": 0.593977153301239, + "learning_rate": 0.0001, + "loss": 1.4874, + "step": 13482 + }, + { + "epoch": 1.5664246296834157, + "grad_norm": 0.6437960267066956, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 13483 + }, + { + "epoch": 1.5665408074353762, + "grad_norm": 0.608905553817749, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 13484 + }, + { + "epoch": 1.5666569851873366, + "grad_norm": 0.5913368463516235, + "learning_rate": 0.0001, + "loss": 1.3551, + "step": 13485 + }, + { + "epoch": 1.5667731629392971, + "grad_norm": 0.6186370849609375, + "learning_rate": 0.0001, + "loss": 1.4108, + "step": 13486 + }, + { + "epoch": 1.5668893406912576, + "grad_norm": 0.6638941168785095, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 13487 + }, + { + "epoch": 1.5670055184432181, + "grad_norm": 0.7029228806495667, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 13488 + }, + { + "epoch": 1.5671216961951786, + "grad_norm": 0.6374664902687073, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 13489 + }, + { + "epoch": 1.567237873947139, + "grad_norm": 0.6297853589057922, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 13490 + }, + { + "epoch": 1.5673540516990996, + "grad_norm": 0.6086907386779785, + "learning_rate": 0.0001, + "loss": 1.3929, + "step": 13491 + }, + { + "epoch": 1.56747022945106, + "grad_norm": 0.6632092595100403, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 13492 + }, + { + "epoch": 1.5675864072030206, + "grad_norm": 0.6230993270874023, + "learning_rate": 0.0001, + "loss": 1.497, + "step": 13493 + }, + { + "epoch": 1.567702584954981, + "grad_norm": 0.6114823818206787, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 13494 + }, + { + "epoch": 1.5678187627069415, + "grad_norm": 0.645427405834198, + "learning_rate": 0.0001, + "loss": 1.3659, + "step": 13495 + }, + { + "epoch": 1.567934940458902, + "grad_norm": 0.5928539633750916, + "learning_rate": 0.0001, + "loss": 1.3554, + "step": 13496 + }, + { + "epoch": 1.5680511182108625, + "grad_norm": 0.6389637589454651, + "learning_rate": 0.0001, + "loss": 1.3468, + "step": 13497 + }, + { + "epoch": 1.568167295962823, + "grad_norm": 0.6235938668251038, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 13498 + }, + { + "epoch": 1.5682834737147835, + "grad_norm": 0.6196651458740234, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 13499 + }, + { + "epoch": 1.568399651466744, + "grad_norm": 0.6908482313156128, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 13500 + }, + { + "epoch": 1.5685158292187045, + "grad_norm": 0.5792660713195801, + "learning_rate": 0.0001, + "loss": 1.4432, + "step": 13501 + }, + { + "epoch": 1.568632006970665, + "grad_norm": 0.6126808524131775, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 13502 + }, + { + "epoch": 1.5687481847226257, + "grad_norm": 0.6406491994857788, + "learning_rate": 0.0001, + "loss": 1.484, + "step": 13503 + }, + { + "epoch": 1.5688643624745862, + "grad_norm": 0.6253640055656433, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 13504 + }, + { + "epoch": 1.5689805402265466, + "grad_norm": 0.7741358280181885, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 13505 + }, + { + "epoch": 1.5690967179785071, + "grad_norm": 0.5788688659667969, + "learning_rate": 0.0001, + "loss": 1.397, + "step": 13506 + }, + { + "epoch": 1.5692128957304676, + "grad_norm": 0.6431933045387268, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 13507 + }, + { + "epoch": 1.5693290734824281, + "grad_norm": 0.6005233526229858, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 13508 + }, + { + "epoch": 1.5694452512343886, + "grad_norm": 0.6058092713356018, + "learning_rate": 0.0001, + "loss": 1.4367, + "step": 13509 + }, + { + "epoch": 1.569561428986349, + "grad_norm": 0.6385200023651123, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 13510 + }, + { + "epoch": 1.5696776067383096, + "grad_norm": 0.5717422366142273, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 13511 + }, + { + "epoch": 1.56979378449027, + "grad_norm": 0.5632378458976746, + "learning_rate": 0.0001, + "loss": 1.324, + "step": 13512 + }, + { + "epoch": 1.5699099622422306, + "grad_norm": 0.5892371535301208, + "learning_rate": 0.0001, + "loss": 1.4671, + "step": 13513 + }, + { + "epoch": 1.5700261399941913, + "grad_norm": 0.5963519811630249, + "learning_rate": 0.0001, + "loss": 1.4721, + "step": 13514 + }, + { + "epoch": 1.5701423177461518, + "grad_norm": 0.6520595550537109, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 13515 + }, + { + "epoch": 1.5702584954981123, + "grad_norm": 0.6093313097953796, + "learning_rate": 0.0001, + "loss": 1.3183, + "step": 13516 + }, + { + "epoch": 1.5703746732500727, + "grad_norm": 0.6415348052978516, + "learning_rate": 0.0001, + "loss": 1.5086, + "step": 13517 + }, + { + "epoch": 1.5704908510020332, + "grad_norm": 0.6013089418411255, + "learning_rate": 0.0001, + "loss": 1.4176, + "step": 13518 + }, + { + "epoch": 1.5706070287539937, + "grad_norm": 0.6108344197273254, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 13519 + }, + { + "epoch": 1.5707232065059542, + "grad_norm": 0.6556594371795654, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 13520 + }, + { + "epoch": 1.5708393842579147, + "grad_norm": 0.6048537492752075, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 13521 + }, + { + "epoch": 1.5709555620098752, + "grad_norm": 0.5792707800865173, + "learning_rate": 0.0001, + "loss": 1.3146, + "step": 13522 + }, + { + "epoch": 1.5710717397618357, + "grad_norm": 0.6412919163703918, + "learning_rate": 0.0001, + "loss": 1.4789, + "step": 13523 + }, + { + "epoch": 1.5711879175137962, + "grad_norm": 0.6530932188034058, + "learning_rate": 0.0001, + "loss": 1.4638, + "step": 13524 + }, + { + "epoch": 1.5713040952657567, + "grad_norm": 0.6409628391265869, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 13525 + }, + { + "epoch": 1.5714202730177171, + "grad_norm": 0.6507309675216675, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 13526 + }, + { + "epoch": 1.5715364507696776, + "grad_norm": 0.6563456654548645, + "learning_rate": 0.0001, + "loss": 1.3904, + "step": 13527 + }, + { + "epoch": 1.5716526285216381, + "grad_norm": 0.6711177825927734, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 13528 + }, + { + "epoch": 1.5717688062735986, + "grad_norm": 0.6010395884513855, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 13529 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 0.609821617603302, + "learning_rate": 0.0001, + "loss": 1.3046, + "step": 13530 + }, + { + "epoch": 1.5720011617775196, + "grad_norm": 0.6318057179450989, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 13531 + }, + { + "epoch": 1.57211733952948, + "grad_norm": 0.6473124027252197, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 13532 + }, + { + "epoch": 1.5722335172814406, + "grad_norm": 0.6723019480705261, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 13533 + }, + { + "epoch": 1.572349695033401, + "grad_norm": 0.6439656019210815, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 13534 + }, + { + "epoch": 1.5724658727853615, + "grad_norm": 0.5960809588432312, + "learning_rate": 0.0001, + "loss": 1.4667, + "step": 13535 + }, + { + "epoch": 1.572582050537322, + "grad_norm": 0.6096289753913879, + "learning_rate": 0.0001, + "loss": 1.4864, + "step": 13536 + }, + { + "epoch": 1.5726982282892825, + "grad_norm": 0.5937155485153198, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 13537 + }, + { + "epoch": 1.572814406041243, + "grad_norm": 0.6248606443405151, + "learning_rate": 0.0001, + "loss": 1.4783, + "step": 13538 + }, + { + "epoch": 1.5729305837932035, + "grad_norm": 0.6048620343208313, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 13539 + }, + { + "epoch": 1.573046761545164, + "grad_norm": 0.5833286643028259, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 13540 + }, + { + "epoch": 1.5731629392971245, + "grad_norm": 0.6231339573860168, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 13541 + }, + { + "epoch": 1.573279117049085, + "grad_norm": 0.641281247138977, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 13542 + }, + { + "epoch": 1.5733952948010455, + "grad_norm": 0.6249711513519287, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 13543 + }, + { + "epoch": 1.573511472553006, + "grad_norm": 0.5779685378074646, + "learning_rate": 0.0001, + "loss": 1.3116, + "step": 13544 + }, + { + "epoch": 1.5736276503049667, + "grad_norm": 0.6345327496528625, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 13545 + }, + { + "epoch": 1.5737438280569271, + "grad_norm": 0.6199609637260437, + "learning_rate": 0.0001, + "loss": 1.2796, + "step": 13546 + }, + { + "epoch": 1.5738600058088876, + "grad_norm": 0.5937657952308655, + "learning_rate": 0.0001, + "loss": 1.408, + "step": 13547 + }, + { + "epoch": 1.5739761835608481, + "grad_norm": 0.6459031701087952, + "learning_rate": 0.0001, + "loss": 1.4237, + "step": 13548 + }, + { + "epoch": 1.5740923613128086, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 13549 + }, + { + "epoch": 1.574208539064769, + "grad_norm": 0.6067149043083191, + "learning_rate": 0.0001, + "loss": 1.4518, + "step": 13550 + }, + { + "epoch": 1.5743247168167296, + "grad_norm": 0.5723419785499573, + "learning_rate": 0.0001, + "loss": 1.3533, + "step": 13551 + }, + { + "epoch": 1.57444089456869, + "grad_norm": 0.5707582831382751, + "learning_rate": 0.0001, + "loss": 1.255, + "step": 13552 + }, + { + "epoch": 1.5745570723206506, + "grad_norm": 0.6069430112838745, + "learning_rate": 0.0001, + "loss": 1.3843, + "step": 13553 + }, + { + "epoch": 1.574673250072611, + "grad_norm": 0.6458742022514343, + "learning_rate": 0.0001, + "loss": 1.4396, + "step": 13554 + }, + { + "epoch": 1.5747894278245715, + "grad_norm": 0.6966086030006409, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 13555 + }, + { + "epoch": 1.5749056055765323, + "grad_norm": 0.6121847033500671, + "learning_rate": 0.0001, + "loss": 1.5018, + "step": 13556 + }, + { + "epoch": 1.5750217833284927, + "grad_norm": 0.6447102427482605, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 13557 + }, + { + "epoch": 1.5751379610804532, + "grad_norm": 0.6316473484039307, + "learning_rate": 0.0001, + "loss": 1.4707, + "step": 13558 + }, + { + "epoch": 1.5752541388324137, + "grad_norm": 0.623142659664154, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 13559 + }, + { + "epoch": 1.5753703165843742, + "grad_norm": 0.5784343481063843, + "learning_rate": 0.0001, + "loss": 1.3394, + "step": 13560 + }, + { + "epoch": 1.5754864943363347, + "grad_norm": 0.5949280261993408, + "learning_rate": 0.0001, + "loss": 1.3922, + "step": 13561 + }, + { + "epoch": 1.5756026720882952, + "grad_norm": 0.5954093933105469, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 13562 + }, + { + "epoch": 1.5757188498402557, + "grad_norm": 0.6469232439994812, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 13563 + }, + { + "epoch": 1.5758350275922162, + "grad_norm": 0.5921434164047241, + "learning_rate": 0.0001, + "loss": 1.302, + "step": 13564 + }, + { + "epoch": 1.5759512053441767, + "grad_norm": 0.5979151725769043, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 13565 + }, + { + "epoch": 1.5760673830961371, + "grad_norm": 0.5948922634124756, + "learning_rate": 0.0001, + "loss": 1.367, + "step": 13566 + }, + { + "epoch": 1.5761835608480976, + "grad_norm": 0.6355971097946167, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 13567 + }, + { + "epoch": 1.5762997386000581, + "grad_norm": 0.654316782951355, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 13568 + }, + { + "epoch": 1.5764159163520186, + "grad_norm": 0.6081578731536865, + "learning_rate": 0.0001, + "loss": 1.4942, + "step": 13569 + }, + { + "epoch": 1.576532094103979, + "grad_norm": 0.6357074975967407, + "learning_rate": 0.0001, + "loss": 1.3794, + "step": 13570 + }, + { + "epoch": 1.5766482718559396, + "grad_norm": 0.6234276294708252, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 13571 + }, + { + "epoch": 1.5767644496079, + "grad_norm": 0.6558229327201843, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 13572 + }, + { + "epoch": 1.5768806273598606, + "grad_norm": 0.6135905981063843, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 13573 + }, + { + "epoch": 1.576996805111821, + "grad_norm": 0.5907683372497559, + "learning_rate": 0.0001, + "loss": 1.3218, + "step": 13574 + }, + { + "epoch": 1.5771129828637815, + "grad_norm": 0.5935102701187134, + "learning_rate": 0.0001, + "loss": 1.4558, + "step": 13575 + }, + { + "epoch": 1.577229160615742, + "grad_norm": 0.5960171222686768, + "learning_rate": 0.0001, + "loss": 1.3446, + "step": 13576 + }, + { + "epoch": 1.5773453383677025, + "grad_norm": 0.617599606513977, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 13577 + }, + { + "epoch": 1.577461516119663, + "grad_norm": 0.611723005771637, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 13578 + }, + { + "epoch": 1.5775776938716235, + "grad_norm": 0.6049669981002808, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 13579 + }, + { + "epoch": 1.577693871623584, + "grad_norm": 0.6668212413787842, + "learning_rate": 0.0001, + "loss": 1.5093, + "step": 13580 + }, + { + "epoch": 1.5778100493755445, + "grad_norm": 0.5572206377983093, + "learning_rate": 0.0001, + "loss": 1.1698, + "step": 13581 + }, + { + "epoch": 1.577926227127505, + "grad_norm": 0.6364170908927917, + "learning_rate": 0.0001, + "loss": 1.3022, + "step": 13582 + }, + { + "epoch": 1.5780424048794655, + "grad_norm": 0.6507368087768555, + "learning_rate": 0.0001, + "loss": 1.5794, + "step": 13583 + }, + { + "epoch": 1.578158582631426, + "grad_norm": 0.6161065697669983, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 13584 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 0.6118278503417969, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 13585 + }, + { + "epoch": 1.578390938135347, + "grad_norm": 0.6194911599159241, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 13586 + }, + { + "epoch": 1.5785071158873076, + "grad_norm": 0.6314056515693665, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 13587 + }, + { + "epoch": 1.5786232936392681, + "grad_norm": 0.6135434508323669, + "learning_rate": 0.0001, + "loss": 1.3764, + "step": 13588 + }, + { + "epoch": 1.5787394713912286, + "grad_norm": 0.6144410371780396, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 13589 + }, + { + "epoch": 1.578855649143189, + "grad_norm": 0.6232594847679138, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 13590 + }, + { + "epoch": 1.5789718268951496, + "grad_norm": 0.6463111639022827, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 13591 + }, + { + "epoch": 1.57908800464711, + "grad_norm": 0.6009104251861572, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 13592 + }, + { + "epoch": 1.5792041823990706, + "grad_norm": 0.6553391218185425, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 13593 + }, + { + "epoch": 1.579320360151031, + "grad_norm": 0.5963552594184875, + "learning_rate": 0.0001, + "loss": 1.221, + "step": 13594 + }, + { + "epoch": 1.5794365379029915, + "grad_norm": 0.5866657495498657, + "learning_rate": 0.0001, + "loss": 1.3527, + "step": 13595 + }, + { + "epoch": 1.579552715654952, + "grad_norm": 0.6316760182380676, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 13596 + }, + { + "epoch": 1.5796688934069125, + "grad_norm": 0.5814557671546936, + "learning_rate": 0.0001, + "loss": 1.257, + "step": 13597 + }, + { + "epoch": 1.5797850711588732, + "grad_norm": 0.5807773470878601, + "learning_rate": 0.0001, + "loss": 1.29, + "step": 13598 + }, + { + "epoch": 1.5799012489108337, + "grad_norm": 0.6384397745132446, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 13599 + }, + { + "epoch": 1.5800174266627942, + "grad_norm": 0.5958534479141235, + "learning_rate": 0.0001, + "loss": 1.378, + "step": 13600 + }, + { + "epoch": 1.5801336044147547, + "grad_norm": 0.633190929889679, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 13601 + }, + { + "epoch": 1.5802497821667152, + "grad_norm": 0.6206573843955994, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 13602 + }, + { + "epoch": 1.5803659599186757, + "grad_norm": 0.6325139999389648, + "learning_rate": 0.0001, + "loss": 1.3249, + "step": 13603 + }, + { + "epoch": 1.5804821376706362, + "grad_norm": 0.618319571018219, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 13604 + }, + { + "epoch": 1.5805983154225967, + "grad_norm": 0.6615894436836243, + "learning_rate": 0.0001, + "loss": 1.4182, + "step": 13605 + }, + { + "epoch": 1.5807144931745571, + "grad_norm": 0.632692813873291, + "learning_rate": 0.0001, + "loss": 1.4902, + "step": 13606 + }, + { + "epoch": 1.5808306709265176, + "grad_norm": 0.6061463356018066, + "learning_rate": 0.0001, + "loss": 1.3473, + "step": 13607 + }, + { + "epoch": 1.5809468486784781, + "grad_norm": 0.6018291711807251, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 13608 + }, + { + "epoch": 1.5810630264304386, + "grad_norm": 0.6714172959327698, + "learning_rate": 0.0001, + "loss": 1.4028, + "step": 13609 + }, + { + "epoch": 1.581179204182399, + "grad_norm": 0.5730470418930054, + "learning_rate": 0.0001, + "loss": 1.3864, + "step": 13610 + }, + { + "epoch": 1.5812953819343596, + "grad_norm": 0.5907166004180908, + "learning_rate": 0.0001, + "loss": 1.4745, + "step": 13611 + }, + { + "epoch": 1.58141155968632, + "grad_norm": 0.6612714529037476, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 13612 + }, + { + "epoch": 1.5815277374382806, + "grad_norm": 0.6622298359870911, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 13613 + }, + { + "epoch": 1.581643915190241, + "grad_norm": 0.6330438852310181, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 13614 + }, + { + "epoch": 1.5817600929422015, + "grad_norm": 0.6070156693458557, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 13615 + }, + { + "epoch": 1.581876270694162, + "grad_norm": 0.6398180723190308, + "learning_rate": 0.0001, + "loss": 1.3261, + "step": 13616 + }, + { + "epoch": 1.5819924484461225, + "grad_norm": 0.6727312803268433, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 13617 + }, + { + "epoch": 1.582108626198083, + "grad_norm": 0.5788384079933167, + "learning_rate": 0.0001, + "loss": 1.3515, + "step": 13618 + }, + { + "epoch": 1.5822248039500435, + "grad_norm": 0.5994517803192139, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 13619 + }, + { + "epoch": 1.582340981702004, + "grad_norm": 0.661238431930542, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 13620 + }, + { + "epoch": 1.5824571594539645, + "grad_norm": 0.6708731055259705, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 13621 + }, + { + "epoch": 1.582573337205925, + "grad_norm": 0.6710119843482971, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 13622 + }, + { + "epoch": 1.5826895149578855, + "grad_norm": 0.632266640663147, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 13623 + }, + { + "epoch": 1.582805692709846, + "grad_norm": 0.6530042290687561, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 13624 + }, + { + "epoch": 1.5829218704618064, + "grad_norm": 0.5772076845169067, + "learning_rate": 0.0001, + "loss": 1.4532, + "step": 13625 + }, + { + "epoch": 1.583038048213767, + "grad_norm": 0.6056697368621826, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 13626 + }, + { + "epoch": 1.5831542259657274, + "grad_norm": 0.6514211893081665, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 13627 + }, + { + "epoch": 1.583270403717688, + "grad_norm": 0.5848867297172546, + "learning_rate": 0.0001, + "loss": 1.2466, + "step": 13628 + }, + { + "epoch": 1.5833865814696486, + "grad_norm": 0.5798253417015076, + "learning_rate": 0.0001, + "loss": 1.334, + "step": 13629 + }, + { + "epoch": 1.583502759221609, + "grad_norm": 0.6590627431869507, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 13630 + }, + { + "epoch": 1.5836189369735696, + "grad_norm": 0.5774937272071838, + "learning_rate": 0.0001, + "loss": 1.4867, + "step": 13631 + }, + { + "epoch": 1.58373511472553, + "grad_norm": 0.6019846200942993, + "learning_rate": 0.0001, + "loss": 1.3748, + "step": 13632 + }, + { + "epoch": 1.5838512924774906, + "grad_norm": 0.581792414188385, + "learning_rate": 0.0001, + "loss": 1.3973, + "step": 13633 + }, + { + "epoch": 1.583967470229451, + "grad_norm": 0.6952236890792847, + "learning_rate": 0.0001, + "loss": 1.7609, + "step": 13634 + }, + { + "epoch": 1.5840836479814115, + "grad_norm": 0.5895935297012329, + "learning_rate": 0.0001, + "loss": 1.4348, + "step": 13635 + }, + { + "epoch": 1.584199825733372, + "grad_norm": 0.6527734994888306, + "learning_rate": 0.0001, + "loss": 1.4164, + "step": 13636 + }, + { + "epoch": 1.5843160034853325, + "grad_norm": 0.5872225761413574, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 13637 + }, + { + "epoch": 1.584432181237293, + "grad_norm": 0.5942081212997437, + "learning_rate": 0.0001, + "loss": 1.3844, + "step": 13638 + }, + { + "epoch": 1.5845483589892535, + "grad_norm": 0.5980642437934875, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 13639 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 0.5560755729675293, + "learning_rate": 0.0001, + "loss": 1.2702, + "step": 13640 + }, + { + "epoch": 1.5847807144931747, + "grad_norm": 0.6350087523460388, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 13641 + }, + { + "epoch": 1.5848968922451352, + "grad_norm": 0.6073043346405029, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 13642 + }, + { + "epoch": 1.5850130699970957, + "grad_norm": 0.6224149465560913, + "learning_rate": 0.0001, + "loss": 1.3986, + "step": 13643 + }, + { + "epoch": 1.5851292477490562, + "grad_norm": 0.6422401070594788, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 13644 + }, + { + "epoch": 1.5852454255010167, + "grad_norm": 0.59529048204422, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 13645 + }, + { + "epoch": 1.5853616032529771, + "grad_norm": 0.6929322481155396, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 13646 + }, + { + "epoch": 1.5854777810049376, + "grad_norm": 0.5858911871910095, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 13647 + }, + { + "epoch": 1.5855939587568981, + "grad_norm": 0.6204400658607483, + "learning_rate": 0.0001, + "loss": 1.3553, + "step": 13648 + }, + { + "epoch": 1.5857101365088586, + "grad_norm": 0.5746094584465027, + "learning_rate": 0.0001, + "loss": 1.2365, + "step": 13649 + }, + { + "epoch": 1.585826314260819, + "grad_norm": 0.6758047938346863, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 13650 + }, + { + "epoch": 1.5859424920127796, + "grad_norm": 0.6562144756317139, + "learning_rate": 0.0001, + "loss": 1.4617, + "step": 13651 + }, + { + "epoch": 1.58605866976474, + "grad_norm": 0.6426905989646912, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 13652 + }, + { + "epoch": 1.5861748475167006, + "grad_norm": 0.5699962973594666, + "learning_rate": 0.0001, + "loss": 1.3537, + "step": 13653 + }, + { + "epoch": 1.586291025268661, + "grad_norm": 0.5969005227088928, + "learning_rate": 0.0001, + "loss": 1.3264, + "step": 13654 + }, + { + "epoch": 1.5864072030206215, + "grad_norm": 0.6157596111297607, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 13655 + }, + { + "epoch": 1.586523380772582, + "grad_norm": 0.5986966490745544, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 13656 + }, + { + "epoch": 1.5866395585245425, + "grad_norm": 0.6195567846298218, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 13657 + }, + { + "epoch": 1.586755736276503, + "grad_norm": 0.6283451914787292, + "learning_rate": 0.0001, + "loss": 1.3878, + "step": 13658 + }, + { + "epoch": 1.5868719140284635, + "grad_norm": 0.6001632809638977, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 13659 + }, + { + "epoch": 1.586988091780424, + "grad_norm": 0.6226193308830261, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 13660 + }, + { + "epoch": 1.5871042695323845, + "grad_norm": 0.6030400991439819, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 13661 + }, + { + "epoch": 1.587220447284345, + "grad_norm": 0.5935747623443604, + "learning_rate": 0.0001, + "loss": 1.337, + "step": 13662 + }, + { + "epoch": 1.5873366250363055, + "grad_norm": 0.6464823484420776, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 13663 + }, + { + "epoch": 1.587452802788266, + "grad_norm": 0.6798169612884521, + "learning_rate": 0.0001, + "loss": 1.277, + "step": 13664 + }, + { + "epoch": 1.5875689805402264, + "grad_norm": 0.5947486162185669, + "learning_rate": 0.0001, + "loss": 1.3965, + "step": 13665 + }, + { + "epoch": 1.587685158292187, + "grad_norm": 0.6533806324005127, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 13666 + }, + { + "epoch": 1.5878013360441474, + "grad_norm": 0.6578819155693054, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 13667 + }, + { + "epoch": 1.587917513796108, + "grad_norm": 0.6176968216896057, + "learning_rate": 0.0001, + "loss": 1.413, + "step": 13668 + }, + { + "epoch": 1.5880336915480684, + "grad_norm": 0.6410322785377502, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 13669 + }, + { + "epoch": 1.5881498693000289, + "grad_norm": 0.6015816330909729, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 13670 + }, + { + "epoch": 1.5882660470519896, + "grad_norm": 0.6452057957649231, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 13671 + }, + { + "epoch": 1.58838222480395, + "grad_norm": 0.6403473019599915, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 13672 + }, + { + "epoch": 1.5884984025559106, + "grad_norm": 0.5736103653907776, + "learning_rate": 0.0001, + "loss": 1.3283, + "step": 13673 + }, + { + "epoch": 1.588614580307871, + "grad_norm": 0.5940598845481873, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 13674 + }, + { + "epoch": 1.5887307580598315, + "grad_norm": 0.6314423084259033, + "learning_rate": 0.0001, + "loss": 1.4163, + "step": 13675 + }, + { + "epoch": 1.588846935811792, + "grad_norm": 0.5856815576553345, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 13676 + }, + { + "epoch": 1.5889631135637525, + "grad_norm": 0.6191335916519165, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 13677 + }, + { + "epoch": 1.589079291315713, + "grad_norm": 0.5804166793823242, + "learning_rate": 0.0001, + "loss": 1.3609, + "step": 13678 + }, + { + "epoch": 1.5891954690676735, + "grad_norm": 0.6077446937561035, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 13679 + }, + { + "epoch": 1.589311646819634, + "grad_norm": 0.668630838394165, + "learning_rate": 0.0001, + "loss": 1.3604, + "step": 13680 + }, + { + "epoch": 1.5894278245715947, + "grad_norm": 0.6589301824569702, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 13681 + }, + { + "epoch": 1.5895440023235552, + "grad_norm": 0.62638920545578, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 13682 + }, + { + "epoch": 1.5896601800755157, + "grad_norm": 0.626444935798645, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 13683 + }, + { + "epoch": 1.5897763578274762, + "grad_norm": 0.6382818818092346, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 13684 + }, + { + "epoch": 1.5898925355794367, + "grad_norm": 0.6530130505561829, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 13685 + }, + { + "epoch": 1.5900087133313971, + "grad_norm": 0.5917280316352844, + "learning_rate": 0.0001, + "loss": 1.2945, + "step": 13686 + }, + { + "epoch": 1.5901248910833576, + "grad_norm": 0.601091206073761, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 13687 + }, + { + "epoch": 1.5902410688353181, + "grad_norm": 0.6319622993469238, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 13688 + }, + { + "epoch": 1.5903572465872786, + "grad_norm": 0.5939752459526062, + "learning_rate": 0.0001, + "loss": 1.3826, + "step": 13689 + }, + { + "epoch": 1.590473424339239, + "grad_norm": 0.6311963200569153, + "learning_rate": 0.0001, + "loss": 1.3094, + "step": 13690 + }, + { + "epoch": 1.5905896020911996, + "grad_norm": 0.6844353079795837, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 13691 + }, + { + "epoch": 1.59070577984316, + "grad_norm": 0.5649846196174622, + "learning_rate": 0.0001, + "loss": 1.4596, + "step": 13692 + }, + { + "epoch": 1.5908219575951206, + "grad_norm": 0.6137500405311584, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 13693 + }, + { + "epoch": 1.590938135347081, + "grad_norm": 0.6590669751167297, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 13694 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 0.6377047896385193, + "learning_rate": 0.0001, + "loss": 1.5466, + "step": 13695 + }, + { + "epoch": 1.591170490851002, + "grad_norm": 0.5739707946777344, + "learning_rate": 0.0001, + "loss": 1.4131, + "step": 13696 + }, + { + "epoch": 1.5912866686029625, + "grad_norm": 0.5823712944984436, + "learning_rate": 0.0001, + "loss": 1.3342, + "step": 13697 + }, + { + "epoch": 1.591402846354923, + "grad_norm": 0.6388639807701111, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 13698 + }, + { + "epoch": 1.5915190241068835, + "grad_norm": 0.5976583361625671, + "learning_rate": 0.0001, + "loss": 1.4843, + "step": 13699 + }, + { + "epoch": 1.591635201858844, + "grad_norm": 0.5898172855377197, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 13700 + }, + { + "epoch": 1.5917513796108045, + "grad_norm": 0.6254405379295349, + "learning_rate": 0.0001, + "loss": 1.4318, + "step": 13701 + }, + { + "epoch": 1.591867557362765, + "grad_norm": 0.6013739109039307, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 13702 + }, + { + "epoch": 1.5919837351147255, + "grad_norm": 0.6028892993927002, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 13703 + }, + { + "epoch": 1.592099912866686, + "grad_norm": 0.5645247101783752, + "learning_rate": 0.0001, + "loss": 1.3645, + "step": 13704 + }, + { + "epoch": 1.5922160906186464, + "grad_norm": 0.5704538822174072, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 13705 + }, + { + "epoch": 1.592332268370607, + "grad_norm": 0.6046001315116882, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 13706 + }, + { + "epoch": 1.5924484461225674, + "grad_norm": 0.6235845685005188, + "learning_rate": 0.0001, + "loss": 1.4548, + "step": 13707 + }, + { + "epoch": 1.592564623874528, + "grad_norm": 0.6004418730735779, + "learning_rate": 0.0001, + "loss": 1.3711, + "step": 13708 + }, + { + "epoch": 1.5926808016264884, + "grad_norm": 0.6089489459991455, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 13709 + }, + { + "epoch": 1.5927969793784489, + "grad_norm": 0.5987474322319031, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 13710 + }, + { + "epoch": 1.5929131571304094, + "grad_norm": 0.6150535345077515, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 13711 + }, + { + "epoch": 1.5930293348823699, + "grad_norm": 0.6323658227920532, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 13712 + }, + { + "epoch": 1.5931455126343306, + "grad_norm": 0.6383227705955505, + "learning_rate": 0.0001, + "loss": 1.4719, + "step": 13713 + }, + { + "epoch": 1.593261690386291, + "grad_norm": 0.6205024719238281, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 13714 + }, + { + "epoch": 1.5933778681382516, + "grad_norm": 0.6311460733413696, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 13715 + }, + { + "epoch": 1.593494045890212, + "grad_norm": 0.650108277797699, + "learning_rate": 0.0001, + "loss": 1.3354, + "step": 13716 + }, + { + "epoch": 1.5936102236421725, + "grad_norm": 0.5857402682304382, + "learning_rate": 0.0001, + "loss": 1.3433, + "step": 13717 + }, + { + "epoch": 1.593726401394133, + "grad_norm": 0.5971586108207703, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 13718 + }, + { + "epoch": 1.5938425791460935, + "grad_norm": 0.6388098001480103, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 13719 + }, + { + "epoch": 1.593958756898054, + "grad_norm": 0.6415640115737915, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 13720 + }, + { + "epoch": 1.5940749346500145, + "grad_norm": 0.5820626616477966, + "learning_rate": 0.0001, + "loss": 1.4049, + "step": 13721 + }, + { + "epoch": 1.594191112401975, + "grad_norm": 0.569054126739502, + "learning_rate": 0.0001, + "loss": 1.3599, + "step": 13722 + }, + { + "epoch": 1.5943072901539357, + "grad_norm": 0.6303136348724365, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 13723 + }, + { + "epoch": 1.5944234679058962, + "grad_norm": 0.5948778986930847, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 13724 + }, + { + "epoch": 1.5945396456578567, + "grad_norm": 0.6140435338020325, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 13725 + }, + { + "epoch": 1.5946558234098172, + "grad_norm": 0.5883561372756958, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 13726 + }, + { + "epoch": 1.5947720011617776, + "grad_norm": 0.5896515846252441, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 13727 + }, + { + "epoch": 1.5948881789137381, + "grad_norm": 0.5697752833366394, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 13728 + }, + { + "epoch": 1.5950043566656986, + "grad_norm": 0.6240254640579224, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 13729 + }, + { + "epoch": 1.595120534417659, + "grad_norm": 0.6201429963111877, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 13730 + }, + { + "epoch": 1.5952367121696196, + "grad_norm": 0.590234637260437, + "learning_rate": 0.0001, + "loss": 1.3998, + "step": 13731 + }, + { + "epoch": 1.59535288992158, + "grad_norm": 0.5969529747962952, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 13732 + }, + { + "epoch": 1.5954690676735406, + "grad_norm": 0.6133432984352112, + "learning_rate": 0.0001, + "loss": 1.1459, + "step": 13733 + }, + { + "epoch": 1.595585245425501, + "grad_norm": 0.6062743067741394, + "learning_rate": 0.0001, + "loss": 1.4189, + "step": 13734 + }, + { + "epoch": 1.5957014231774616, + "grad_norm": 0.658184826374054, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 13735 + }, + { + "epoch": 1.595817600929422, + "grad_norm": 0.599326491355896, + "learning_rate": 0.0001, + "loss": 1.3884, + "step": 13736 + }, + { + "epoch": 1.5959337786813825, + "grad_norm": 0.6448878049850464, + "learning_rate": 0.0001, + "loss": 1.4524, + "step": 13737 + }, + { + "epoch": 1.596049956433343, + "grad_norm": 0.6201961636543274, + "learning_rate": 0.0001, + "loss": 1.3097, + "step": 13738 + }, + { + "epoch": 1.5961661341853035, + "grad_norm": 0.6305376291275024, + "learning_rate": 0.0001, + "loss": 1.4291, + "step": 13739 + }, + { + "epoch": 1.596282311937264, + "grad_norm": 0.6596425175666809, + "learning_rate": 0.0001, + "loss": 1.557, + "step": 13740 + }, + { + "epoch": 1.5963984896892245, + "grad_norm": 0.6207994818687439, + "learning_rate": 0.0001, + "loss": 1.4566, + "step": 13741 + }, + { + "epoch": 1.596514667441185, + "grad_norm": 0.6085087060928345, + "learning_rate": 0.0001, + "loss": 1.5105, + "step": 13742 + }, + { + "epoch": 1.5966308451931455, + "grad_norm": 0.6169493794441223, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 13743 + }, + { + "epoch": 1.596747022945106, + "grad_norm": 0.6025657653808594, + "learning_rate": 0.0001, + "loss": 1.28, + "step": 13744 + }, + { + "epoch": 1.5968632006970664, + "grad_norm": 0.6102682948112488, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 13745 + }, + { + "epoch": 1.596979378449027, + "grad_norm": 0.5944964289665222, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 13746 + }, + { + "epoch": 1.5970955562009874, + "grad_norm": 0.6884152889251709, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 13747 + }, + { + "epoch": 1.597211733952948, + "grad_norm": 0.600307285785675, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 13748 + }, + { + "epoch": 1.5973279117049084, + "grad_norm": 0.5742710828781128, + "learning_rate": 0.0001, + "loss": 1.2788, + "step": 13749 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 0.5742681622505188, + "learning_rate": 0.0001, + "loss": 1.4084, + "step": 13750 + }, + { + "epoch": 1.5975602672088294, + "grad_norm": 0.5895428657531738, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 13751 + }, + { + "epoch": 1.5976764449607899, + "grad_norm": 0.6041052937507629, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 13752 + }, + { + "epoch": 1.5977926227127504, + "grad_norm": 0.6442006826400757, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 13753 + }, + { + "epoch": 1.5979088004647108, + "grad_norm": 0.5974199175834656, + "learning_rate": 0.0001, + "loss": 1.3967, + "step": 13754 + }, + { + "epoch": 1.5980249782166716, + "grad_norm": 0.6257326602935791, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 13755 + }, + { + "epoch": 1.598141155968632, + "grad_norm": 0.6209074258804321, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 13756 + }, + { + "epoch": 1.5982573337205925, + "grad_norm": 0.6362159252166748, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 13757 + }, + { + "epoch": 1.598373511472553, + "grad_norm": 0.611994743347168, + "learning_rate": 0.0001, + "loss": 1.4347, + "step": 13758 + }, + { + "epoch": 1.5984896892245135, + "grad_norm": 0.6070793867111206, + "learning_rate": 0.0001, + "loss": 1.3342, + "step": 13759 + }, + { + "epoch": 1.598605866976474, + "grad_norm": 0.6359820365905762, + "learning_rate": 0.0001, + "loss": 1.4627, + "step": 13760 + }, + { + "epoch": 1.5987220447284345, + "grad_norm": 0.6544227004051208, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 13761 + }, + { + "epoch": 1.598838222480395, + "grad_norm": 0.669750452041626, + "learning_rate": 0.0001, + "loss": 1.3736, + "step": 13762 + }, + { + "epoch": 1.5989544002323555, + "grad_norm": 0.6785888075828552, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 13763 + }, + { + "epoch": 1.599070577984316, + "grad_norm": 0.6282570958137512, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 13764 + }, + { + "epoch": 1.5991867557362767, + "grad_norm": 0.5843489170074463, + "learning_rate": 0.0001, + "loss": 1.4549, + "step": 13765 + }, + { + "epoch": 1.5993029334882372, + "grad_norm": 0.6315165758132935, + "learning_rate": 0.0001, + "loss": 1.4352, + "step": 13766 + }, + { + "epoch": 1.5994191112401976, + "grad_norm": 0.6136773824691772, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 13767 + }, + { + "epoch": 1.5995352889921581, + "grad_norm": 0.6138652563095093, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 13768 + }, + { + "epoch": 1.5996514667441186, + "grad_norm": 0.6372418999671936, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 13769 + }, + { + "epoch": 1.599767644496079, + "grad_norm": 0.6061596870422363, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 13770 + }, + { + "epoch": 1.5998838222480396, + "grad_norm": 0.6062629818916321, + "learning_rate": 0.0001, + "loss": 1.2875, + "step": 13771 + }, + { + "epoch": 1.6, + "grad_norm": 0.6032814979553223, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 13772 + }, + { + "epoch": 1.6001161777519606, + "grad_norm": 0.6498891115188599, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 13773 + }, + { + "epoch": 1.600232355503921, + "grad_norm": 0.6103582978248596, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 13774 + }, + { + "epoch": 1.6003485332558816, + "grad_norm": 0.5915868282318115, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 13775 + }, + { + "epoch": 1.600464711007842, + "grad_norm": 0.5753716826438904, + "learning_rate": 0.0001, + "loss": 1.2423, + "step": 13776 + }, + { + "epoch": 1.6005808887598025, + "grad_norm": 0.5940613746643066, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 13777 + }, + { + "epoch": 1.600697066511763, + "grad_norm": 0.5621521472930908, + "learning_rate": 0.0001, + "loss": 1.3927, + "step": 13778 + }, + { + "epoch": 1.6008132442637235, + "grad_norm": 0.6004639267921448, + "learning_rate": 0.0001, + "loss": 1.3794, + "step": 13779 + }, + { + "epoch": 1.600929422015684, + "grad_norm": 0.6216353178024292, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 13780 + }, + { + "epoch": 1.6010455997676445, + "grad_norm": 0.5929364562034607, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 13781 + }, + { + "epoch": 1.601161777519605, + "grad_norm": 0.6302357316017151, + "learning_rate": 0.0001, + "loss": 1.3182, + "step": 13782 + }, + { + "epoch": 1.6012779552715655, + "grad_norm": 0.6794602274894714, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 13783 + }, + { + "epoch": 1.601394133023526, + "grad_norm": 0.5782144069671631, + "learning_rate": 0.0001, + "loss": 1.36, + "step": 13784 + }, + { + "epoch": 1.6015103107754864, + "grad_norm": 0.659773588180542, + "learning_rate": 0.0001, + "loss": 1.4868, + "step": 13785 + }, + { + "epoch": 1.601626488527447, + "grad_norm": 0.6725043058395386, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 13786 + }, + { + "epoch": 1.6017426662794074, + "grad_norm": 0.6366990208625793, + "learning_rate": 0.0001, + "loss": 1.4486, + "step": 13787 + }, + { + "epoch": 1.601858844031368, + "grad_norm": 0.6560510993003845, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 13788 + }, + { + "epoch": 1.6019750217833284, + "grad_norm": 0.6620891690254211, + "learning_rate": 0.0001, + "loss": 1.3972, + "step": 13789 + }, + { + "epoch": 1.602091199535289, + "grad_norm": 0.6221482157707214, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 13790 + }, + { + "epoch": 1.6022073772872494, + "grad_norm": 0.6068333983421326, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 13791 + }, + { + "epoch": 1.6023235550392099, + "grad_norm": 0.6149522066116333, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 13792 + }, + { + "epoch": 1.6024397327911704, + "grad_norm": 0.6315502524375916, + "learning_rate": 0.0001, + "loss": 1.3288, + "step": 13793 + }, + { + "epoch": 1.6025559105431308, + "grad_norm": 0.6193204522132874, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 13794 + }, + { + "epoch": 1.6026720882950913, + "grad_norm": 0.6118963956832886, + "learning_rate": 0.0001, + "loss": 1.6071, + "step": 13795 + }, + { + "epoch": 1.6027882660470518, + "grad_norm": 0.5986089110374451, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 13796 + }, + { + "epoch": 1.6029044437990125, + "grad_norm": 0.5849127769470215, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 13797 + }, + { + "epoch": 1.603020621550973, + "grad_norm": 0.6488816738128662, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 13798 + }, + { + "epoch": 1.6031367993029335, + "grad_norm": 0.5963447690010071, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 13799 + }, + { + "epoch": 1.603252977054894, + "grad_norm": 0.5969744920730591, + "learning_rate": 0.0001, + "loss": 1.4197, + "step": 13800 + }, + { + "epoch": 1.6033691548068545, + "grad_norm": 0.5776385068893433, + "learning_rate": 0.0001, + "loss": 1.4639, + "step": 13801 + }, + { + "epoch": 1.603485332558815, + "grad_norm": 0.568734347820282, + "learning_rate": 0.0001, + "loss": 1.384, + "step": 13802 + }, + { + "epoch": 1.6036015103107755, + "grad_norm": 0.6355451345443726, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 13803 + }, + { + "epoch": 1.603717688062736, + "grad_norm": 0.5923163294792175, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 13804 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 0.6059244275093079, + "learning_rate": 0.0001, + "loss": 1.4256, + "step": 13805 + }, + { + "epoch": 1.603950043566657, + "grad_norm": 0.5898007750511169, + "learning_rate": 0.0001, + "loss": 1.4724, + "step": 13806 + }, + { + "epoch": 1.6040662213186176, + "grad_norm": 0.5829443335533142, + "learning_rate": 0.0001, + "loss": 1.3609, + "step": 13807 + }, + { + "epoch": 1.6041823990705781, + "grad_norm": 0.6079580783843994, + "learning_rate": 0.0001, + "loss": 1.3347, + "step": 13808 + }, + { + "epoch": 1.6042985768225386, + "grad_norm": 0.6549359560012817, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 13809 + }, + { + "epoch": 1.6044147545744991, + "grad_norm": 0.5998852252960205, + "learning_rate": 0.0001, + "loss": 1.3346, + "step": 13810 + }, + { + "epoch": 1.6045309323264596, + "grad_norm": 0.6005247831344604, + "learning_rate": 0.0001, + "loss": 1.4291, + "step": 13811 + }, + { + "epoch": 1.60464711007842, + "grad_norm": 0.6408963799476624, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 13812 + }, + { + "epoch": 1.6047632878303806, + "grad_norm": 0.589667022228241, + "learning_rate": 0.0001, + "loss": 1.334, + "step": 13813 + }, + { + "epoch": 1.604879465582341, + "grad_norm": 0.5643407702445984, + "learning_rate": 0.0001, + "loss": 1.3689, + "step": 13814 + }, + { + "epoch": 1.6049956433343016, + "grad_norm": 0.659737229347229, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 13815 + }, + { + "epoch": 1.605111821086262, + "grad_norm": 0.6060417890548706, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 13816 + }, + { + "epoch": 1.6052279988382225, + "grad_norm": 0.6752500534057617, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 13817 + }, + { + "epoch": 1.605344176590183, + "grad_norm": 0.6170288324356079, + "learning_rate": 0.0001, + "loss": 1.2098, + "step": 13818 + }, + { + "epoch": 1.6054603543421435, + "grad_norm": 0.6077424883842468, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 13819 + }, + { + "epoch": 1.605576532094104, + "grad_norm": 0.5606176257133484, + "learning_rate": 0.0001, + "loss": 1.3151, + "step": 13820 + }, + { + "epoch": 1.6056927098460645, + "grad_norm": 0.6182757019996643, + "learning_rate": 0.0001, + "loss": 1.503, + "step": 13821 + }, + { + "epoch": 1.605808887598025, + "grad_norm": 0.6296382546424866, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 13822 + }, + { + "epoch": 1.6059250653499855, + "grad_norm": 0.6217150688171387, + "learning_rate": 0.0001, + "loss": 1.3125, + "step": 13823 + }, + { + "epoch": 1.606041243101946, + "grad_norm": 0.6524258852005005, + "learning_rate": 0.0001, + "loss": 1.5611, + "step": 13824 + }, + { + "epoch": 1.6061574208539064, + "grad_norm": 0.6134025454521179, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 13825 + }, + { + "epoch": 1.606273598605867, + "grad_norm": 0.6530399918556213, + "learning_rate": 0.0001, + "loss": 1.4917, + "step": 13826 + }, + { + "epoch": 1.6063897763578274, + "grad_norm": 0.5791024565696716, + "learning_rate": 0.0001, + "loss": 1.2731, + "step": 13827 + }, + { + "epoch": 1.606505954109788, + "grad_norm": 0.6186858415603638, + "learning_rate": 0.0001, + "loss": 1.3552, + "step": 13828 + }, + { + "epoch": 1.6066221318617484, + "grad_norm": 0.5937260985374451, + "learning_rate": 0.0001, + "loss": 1.3172, + "step": 13829 + }, + { + "epoch": 1.606738309613709, + "grad_norm": 0.6377929449081421, + "learning_rate": 0.0001, + "loss": 1.5637, + "step": 13830 + }, + { + "epoch": 1.6068544873656694, + "grad_norm": 0.5948269367218018, + "learning_rate": 0.0001, + "loss": 1.3598, + "step": 13831 + }, + { + "epoch": 1.6069706651176299, + "grad_norm": 0.5641445517539978, + "learning_rate": 0.0001, + "loss": 1.2045, + "step": 13832 + }, + { + "epoch": 1.6070868428695904, + "grad_norm": 0.638268232345581, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 13833 + }, + { + "epoch": 1.6072030206215508, + "grad_norm": 0.5887224674224854, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 13834 + }, + { + "epoch": 1.6073191983735113, + "grad_norm": 0.6423467397689819, + "learning_rate": 0.0001, + "loss": 1.7015, + "step": 13835 + }, + { + "epoch": 1.6074353761254718, + "grad_norm": 0.6366564631462097, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 13836 + }, + { + "epoch": 1.6075515538774323, + "grad_norm": 0.5624208450317383, + "learning_rate": 0.0001, + "loss": 1.3148, + "step": 13837 + }, + { + "epoch": 1.607667731629393, + "grad_norm": 0.5844243168830872, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 13838 + }, + { + "epoch": 1.6077839093813535, + "grad_norm": 0.6397031545639038, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 13839 + }, + { + "epoch": 1.607900087133314, + "grad_norm": 0.6005331873893738, + "learning_rate": 0.0001, + "loss": 1.3997, + "step": 13840 + }, + { + "epoch": 1.6080162648852745, + "grad_norm": 0.6438605189323425, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 13841 + }, + { + "epoch": 1.608132442637235, + "grad_norm": 0.5851279497146606, + "learning_rate": 0.0001, + "loss": 1.2065, + "step": 13842 + }, + { + "epoch": 1.6082486203891955, + "grad_norm": 0.567348301410675, + "learning_rate": 0.0001, + "loss": 1.3517, + "step": 13843 + }, + { + "epoch": 1.608364798141156, + "grad_norm": 0.6039325594902039, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 13844 + }, + { + "epoch": 1.6084809758931164, + "grad_norm": 0.5768235921859741, + "learning_rate": 0.0001, + "loss": 1.3531, + "step": 13845 + }, + { + "epoch": 1.608597153645077, + "grad_norm": 0.5796616077423096, + "learning_rate": 0.0001, + "loss": 1.3958, + "step": 13846 + }, + { + "epoch": 1.6087133313970374, + "grad_norm": 0.603740930557251, + "learning_rate": 0.0001, + "loss": 1.4034, + "step": 13847 + }, + { + "epoch": 1.608829509148998, + "grad_norm": 0.6364991068840027, + "learning_rate": 0.0001, + "loss": 1.5609, + "step": 13848 + }, + { + "epoch": 1.6089456869009586, + "grad_norm": 0.6351603269577026, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 13849 + }, + { + "epoch": 1.6090618646529191, + "grad_norm": 0.6068862676620483, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 13850 + }, + { + "epoch": 1.6091780424048796, + "grad_norm": 0.642614483833313, + "learning_rate": 0.0001, + "loss": 1.4754, + "step": 13851 + }, + { + "epoch": 1.60929422015684, + "grad_norm": 0.6429895162582397, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 13852 + }, + { + "epoch": 1.6094103979088006, + "grad_norm": 0.5926845073699951, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 13853 + }, + { + "epoch": 1.609526575660761, + "grad_norm": 0.658845067024231, + "learning_rate": 0.0001, + "loss": 1.3906, + "step": 13854 + }, + { + "epoch": 1.6096427534127216, + "grad_norm": 0.5958040952682495, + "learning_rate": 0.0001, + "loss": 1.3695, + "step": 13855 + }, + { + "epoch": 1.609758931164682, + "grad_norm": 0.6442962288856506, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 13856 + }, + { + "epoch": 1.6098751089166425, + "grad_norm": 0.5802430510520935, + "learning_rate": 0.0001, + "loss": 1.4404, + "step": 13857 + }, + { + "epoch": 1.609991286668603, + "grad_norm": 0.6123525500297546, + "learning_rate": 0.0001, + "loss": 1.3439, + "step": 13858 + }, + { + "epoch": 1.6101074644205635, + "grad_norm": 0.6517568230628967, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 13859 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 0.634818434715271, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 13860 + }, + { + "epoch": 1.6103398199244845, + "grad_norm": 0.5639699697494507, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 13861 + }, + { + "epoch": 1.610455997676445, + "grad_norm": 0.5527640581130981, + "learning_rate": 0.0001, + "loss": 1.39, + "step": 13862 + }, + { + "epoch": 1.6105721754284055, + "grad_norm": 0.6287981271743774, + "learning_rate": 0.0001, + "loss": 1.4817, + "step": 13863 + }, + { + "epoch": 1.610688353180366, + "grad_norm": 0.5936501622200012, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 13864 + }, + { + "epoch": 1.6108045309323265, + "grad_norm": 0.6397016644477844, + "learning_rate": 0.0001, + "loss": 1.3037, + "step": 13865 + }, + { + "epoch": 1.610920708684287, + "grad_norm": 0.6394860744476318, + "learning_rate": 0.0001, + "loss": 1.4609, + "step": 13866 + }, + { + "epoch": 1.6110368864362474, + "grad_norm": 0.6565226316452026, + "learning_rate": 0.0001, + "loss": 1.5441, + "step": 13867 + }, + { + "epoch": 1.611153064188208, + "grad_norm": 0.6310607194900513, + "learning_rate": 0.0001, + "loss": 1.4312, + "step": 13868 + }, + { + "epoch": 1.6112692419401684, + "grad_norm": 0.6256000995635986, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 13869 + }, + { + "epoch": 1.611385419692129, + "grad_norm": 0.6283453106880188, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 13870 + }, + { + "epoch": 1.6115015974440894, + "grad_norm": 0.6144470572471619, + "learning_rate": 0.0001, + "loss": 1.3494, + "step": 13871 + }, + { + "epoch": 1.6116177751960499, + "grad_norm": 0.6374574303627014, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 13872 + }, + { + "epoch": 1.6117339529480104, + "grad_norm": 0.6335989236831665, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 13873 + }, + { + "epoch": 1.6118501306999709, + "grad_norm": 0.6003391146659851, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 13874 + }, + { + "epoch": 1.6119663084519313, + "grad_norm": 0.6658709645271301, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 13875 + }, + { + "epoch": 1.6120824862038918, + "grad_norm": 0.6419057250022888, + "learning_rate": 0.0001, + "loss": 1.4394, + "step": 13876 + }, + { + "epoch": 1.6121986639558523, + "grad_norm": 0.737707793712616, + "learning_rate": 0.0001, + "loss": 1.3447, + "step": 13877 + }, + { + "epoch": 1.6123148417078128, + "grad_norm": 0.6373167634010315, + "learning_rate": 0.0001, + "loss": 1.5011, + "step": 13878 + }, + { + "epoch": 1.6124310194597733, + "grad_norm": 0.6646495461463928, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 13879 + }, + { + "epoch": 1.612547197211734, + "grad_norm": 0.6386552453041077, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 13880 + }, + { + "epoch": 1.6126633749636945, + "grad_norm": 0.5788254141807556, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 13881 + }, + { + "epoch": 1.612779552715655, + "grad_norm": 0.6018937826156616, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 13882 + }, + { + "epoch": 1.6128957304676155, + "grad_norm": 0.6025969386100769, + "learning_rate": 0.0001, + "loss": 1.3986, + "step": 13883 + }, + { + "epoch": 1.613011908219576, + "grad_norm": 0.5800029635429382, + "learning_rate": 0.0001, + "loss": 1.4766, + "step": 13884 + }, + { + "epoch": 1.6131280859715365, + "grad_norm": 0.5896215438842773, + "learning_rate": 0.0001, + "loss": 1.3036, + "step": 13885 + }, + { + "epoch": 1.613244263723497, + "grad_norm": 0.6163521409034729, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 13886 + }, + { + "epoch": 1.6133604414754574, + "grad_norm": 0.6672067642211914, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 13887 + }, + { + "epoch": 1.613476619227418, + "grad_norm": 0.5820779204368591, + "learning_rate": 0.0001, + "loss": 1.3377, + "step": 13888 + }, + { + "epoch": 1.6135927969793784, + "grad_norm": 0.6226430535316467, + "learning_rate": 0.0001, + "loss": 1.4441, + "step": 13889 + }, + { + "epoch": 1.613708974731339, + "grad_norm": 0.6409672498703003, + "learning_rate": 0.0001, + "loss": 1.5232, + "step": 13890 + }, + { + "epoch": 1.6138251524832996, + "grad_norm": 0.6150544881820679, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 13891 + }, + { + "epoch": 1.61394133023526, + "grad_norm": 0.6505727767944336, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 13892 + }, + { + "epoch": 1.6140575079872206, + "grad_norm": 0.6071352958679199, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 13893 + }, + { + "epoch": 1.614173685739181, + "grad_norm": 0.5638280510902405, + "learning_rate": 0.0001, + "loss": 1.4476, + "step": 13894 + }, + { + "epoch": 1.6142898634911416, + "grad_norm": 0.6587737798690796, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 13895 + }, + { + "epoch": 1.614406041243102, + "grad_norm": 0.6675108075141907, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 13896 + }, + { + "epoch": 1.6145222189950625, + "grad_norm": 0.6368352770805359, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 13897 + }, + { + "epoch": 1.614638396747023, + "grad_norm": 0.6211996674537659, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 13898 + }, + { + "epoch": 1.6147545744989835, + "grad_norm": 0.6183685660362244, + "learning_rate": 0.0001, + "loss": 1.4282, + "step": 13899 + }, + { + "epoch": 1.614870752250944, + "grad_norm": 0.615385115146637, + "learning_rate": 0.0001, + "loss": 1.4632, + "step": 13900 + }, + { + "epoch": 1.6149869300029045, + "grad_norm": 0.6182390451431274, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 13901 + }, + { + "epoch": 1.615103107754865, + "grad_norm": 0.6461263298988342, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 13902 + }, + { + "epoch": 1.6152192855068255, + "grad_norm": 0.5660906434059143, + "learning_rate": 0.0001, + "loss": 1.2598, + "step": 13903 + }, + { + "epoch": 1.615335463258786, + "grad_norm": 0.6474366188049316, + "learning_rate": 0.0001, + "loss": 1.4905, + "step": 13904 + }, + { + "epoch": 1.6154516410107465, + "grad_norm": 0.6442524790763855, + "learning_rate": 0.0001, + "loss": 1.4119, + "step": 13905 + }, + { + "epoch": 1.615567818762707, + "grad_norm": 0.6138772964477539, + "learning_rate": 0.0001, + "loss": 1.4201, + "step": 13906 + }, + { + "epoch": 1.6156839965146674, + "grad_norm": 0.6902275085449219, + "learning_rate": 0.0001, + "loss": 1.7217, + "step": 13907 + }, + { + "epoch": 1.615800174266628, + "grad_norm": 0.5896263122558594, + "learning_rate": 0.0001, + "loss": 1.3702, + "step": 13908 + }, + { + "epoch": 1.6159163520185884, + "grad_norm": 0.6327207684516907, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 13909 + }, + { + "epoch": 1.616032529770549, + "grad_norm": 0.6263841986656189, + "learning_rate": 0.0001, + "loss": 1.371, + "step": 13910 + }, + { + "epoch": 1.6161487075225094, + "grad_norm": 0.6256563067436218, + "learning_rate": 0.0001, + "loss": 1.3766, + "step": 13911 + }, + { + "epoch": 1.6162648852744699, + "grad_norm": 0.6188286542892456, + "learning_rate": 0.0001, + "loss": 1.3662, + "step": 13912 + }, + { + "epoch": 1.6163810630264304, + "grad_norm": 0.5888686180114746, + "learning_rate": 0.0001, + "loss": 1.328, + "step": 13913 + }, + { + "epoch": 1.6164972407783909, + "grad_norm": 0.5912812352180481, + "learning_rate": 0.0001, + "loss": 1.3469, + "step": 13914 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 0.58259516954422, + "learning_rate": 0.0001, + "loss": 1.3371, + "step": 13915 + }, + { + "epoch": 1.6167295962823118, + "grad_norm": 0.636556088924408, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 13916 + }, + { + "epoch": 1.6168457740342723, + "grad_norm": 0.5752967596054077, + "learning_rate": 0.0001, + "loss": 1.3001, + "step": 13917 + }, + { + "epoch": 1.6169619517862328, + "grad_norm": 0.6357734203338623, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 13918 + }, + { + "epoch": 1.6170781295381933, + "grad_norm": 0.6322953701019287, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 13919 + }, + { + "epoch": 1.6171943072901538, + "grad_norm": 0.6408698558807373, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 13920 + }, + { + "epoch": 1.6173104850421143, + "grad_norm": 0.6438761949539185, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 13921 + }, + { + "epoch": 1.617426662794075, + "grad_norm": 0.6261228322982788, + "learning_rate": 0.0001, + "loss": 1.4357, + "step": 13922 + }, + { + "epoch": 1.6175428405460355, + "grad_norm": 0.6390764713287354, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 13923 + }, + { + "epoch": 1.617659018297996, + "grad_norm": 0.5800904631614685, + "learning_rate": 0.0001, + "loss": 1.2348, + "step": 13924 + }, + { + "epoch": 1.6177751960499565, + "grad_norm": 0.5801419615745544, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 13925 + }, + { + "epoch": 1.617891373801917, + "grad_norm": 0.6229808926582336, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 13926 + }, + { + "epoch": 1.6180075515538774, + "grad_norm": 0.5984506011009216, + "learning_rate": 0.0001, + "loss": 1.3765, + "step": 13927 + }, + { + "epoch": 1.618123729305838, + "grad_norm": 0.6117129325866699, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 13928 + }, + { + "epoch": 1.6182399070577984, + "grad_norm": 0.6036126613616943, + "learning_rate": 0.0001, + "loss": 1.3457, + "step": 13929 + }, + { + "epoch": 1.618356084809759, + "grad_norm": 0.6293709874153137, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 13930 + }, + { + "epoch": 1.6184722625617194, + "grad_norm": 0.6404412388801575, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 13931 + }, + { + "epoch": 1.6185884403136799, + "grad_norm": 0.6405262351036072, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 13932 + }, + { + "epoch": 1.6187046180656406, + "grad_norm": 0.6383447647094727, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 13933 + }, + { + "epoch": 1.618820795817601, + "grad_norm": 0.6693515181541443, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 13934 + }, + { + "epoch": 1.6189369735695616, + "grad_norm": 0.6073635220527649, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 13935 + }, + { + "epoch": 1.619053151321522, + "grad_norm": 0.6150029301643372, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 13936 + }, + { + "epoch": 1.6191693290734825, + "grad_norm": 0.6156079173088074, + "learning_rate": 0.0001, + "loss": 1.3389, + "step": 13937 + }, + { + "epoch": 1.619285506825443, + "grad_norm": 0.6327203512191772, + "learning_rate": 0.0001, + "loss": 1.5262, + "step": 13938 + }, + { + "epoch": 1.6194016845774035, + "grad_norm": 0.5985980033874512, + "learning_rate": 0.0001, + "loss": 1.3391, + "step": 13939 + }, + { + "epoch": 1.619517862329364, + "grad_norm": 0.5909581780433655, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 13940 + }, + { + "epoch": 1.6196340400813245, + "grad_norm": 0.6321226358413696, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 13941 + }, + { + "epoch": 1.619750217833285, + "grad_norm": 0.6735860109329224, + "learning_rate": 0.0001, + "loss": 1.3241, + "step": 13942 + }, + { + "epoch": 1.6198663955852455, + "grad_norm": 0.6376870274543762, + "learning_rate": 0.0001, + "loss": 1.463, + "step": 13943 + }, + { + "epoch": 1.619982573337206, + "grad_norm": 0.6346738338470459, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 13944 + }, + { + "epoch": 1.6200987510891665, + "grad_norm": 0.6026695966720581, + "learning_rate": 0.0001, + "loss": 1.3668, + "step": 13945 + }, + { + "epoch": 1.620214928841127, + "grad_norm": 0.5611673593521118, + "learning_rate": 0.0001, + "loss": 1.3694, + "step": 13946 + }, + { + "epoch": 1.6203311065930874, + "grad_norm": 0.6367985606193542, + "learning_rate": 0.0001, + "loss": 1.5537, + "step": 13947 + }, + { + "epoch": 1.620447284345048, + "grad_norm": 0.6661987900733948, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 13948 + }, + { + "epoch": 1.6205634620970084, + "grad_norm": 0.6102688908576965, + "learning_rate": 0.0001, + "loss": 1.4665, + "step": 13949 + }, + { + "epoch": 1.620679639848969, + "grad_norm": 0.5848194360733032, + "learning_rate": 0.0001, + "loss": 1.4065, + "step": 13950 + }, + { + "epoch": 1.6207958176009294, + "grad_norm": 0.5830379128456116, + "learning_rate": 0.0001, + "loss": 1.513, + "step": 13951 + }, + { + "epoch": 1.6209119953528899, + "grad_norm": 0.6410690546035767, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 13952 + }, + { + "epoch": 1.6210281731048504, + "grad_norm": 0.6042836904525757, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 13953 + }, + { + "epoch": 1.6211443508568109, + "grad_norm": 0.6346921920776367, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 13954 + }, + { + "epoch": 1.6212605286087713, + "grad_norm": 0.5908709764480591, + "learning_rate": 0.0001, + "loss": 1.3898, + "step": 13955 + }, + { + "epoch": 1.6213767063607318, + "grad_norm": 0.6253523230552673, + "learning_rate": 0.0001, + "loss": 1.455, + "step": 13956 + }, + { + "epoch": 1.6214928841126923, + "grad_norm": 0.6313303112983704, + "learning_rate": 0.0001, + "loss": 1.4671, + "step": 13957 + }, + { + "epoch": 1.6216090618646528, + "grad_norm": 0.6551414728164673, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 13958 + }, + { + "epoch": 1.6217252396166133, + "grad_norm": 0.602879524230957, + "learning_rate": 0.0001, + "loss": 1.4072, + "step": 13959 + }, + { + "epoch": 1.6218414173685738, + "grad_norm": 0.6179625988006592, + "learning_rate": 0.0001, + "loss": 1.4204, + "step": 13960 + }, + { + "epoch": 1.6219575951205343, + "grad_norm": 0.5867196321487427, + "learning_rate": 0.0001, + "loss": 1.3604, + "step": 13961 + }, + { + "epoch": 1.6220737728724948, + "grad_norm": 0.6081652045249939, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 13962 + }, + { + "epoch": 1.6221899506244553, + "grad_norm": 0.5890710353851318, + "learning_rate": 0.0001, + "loss": 1.3407, + "step": 13963 + }, + { + "epoch": 1.622306128376416, + "grad_norm": 0.6529542207717896, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 13964 + }, + { + "epoch": 1.6224223061283765, + "grad_norm": 0.6106531023979187, + "learning_rate": 0.0001, + "loss": 1.4651, + "step": 13965 + }, + { + "epoch": 1.622538483880337, + "grad_norm": 0.6239684224128723, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 13966 + }, + { + "epoch": 1.6226546616322974, + "grad_norm": 0.6258800625801086, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 13967 + }, + { + "epoch": 1.622770839384258, + "grad_norm": 0.5880980491638184, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 13968 + }, + { + "epoch": 1.6228870171362184, + "grad_norm": 0.588895320892334, + "learning_rate": 0.0001, + "loss": 1.2776, + "step": 13969 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 0.5825314521789551, + "learning_rate": 0.0001, + "loss": 1.3614, + "step": 13970 + }, + { + "epoch": 1.6231193726401394, + "grad_norm": 0.6523398756980896, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 13971 + }, + { + "epoch": 1.6232355503920999, + "grad_norm": 0.5812602639198303, + "learning_rate": 0.0001, + "loss": 1.3427, + "step": 13972 + }, + { + "epoch": 1.6233517281440604, + "grad_norm": 0.6783197522163391, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 13973 + }, + { + "epoch": 1.6234679058960209, + "grad_norm": 0.6416046619415283, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 13974 + }, + { + "epoch": 1.6235840836479816, + "grad_norm": 0.6453458666801453, + "learning_rate": 0.0001, + "loss": 1.5991, + "step": 13975 + }, + { + "epoch": 1.623700261399942, + "grad_norm": 0.6448045969009399, + "learning_rate": 0.0001, + "loss": 1.4237, + "step": 13976 + }, + { + "epoch": 1.6238164391519025, + "grad_norm": 0.6527649760246277, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 13977 + }, + { + "epoch": 1.623932616903863, + "grad_norm": 0.6334816217422485, + "learning_rate": 0.0001, + "loss": 1.323, + "step": 13978 + }, + { + "epoch": 1.6240487946558235, + "grad_norm": 0.6312617659568787, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 13979 + }, + { + "epoch": 1.624164972407784, + "grad_norm": 0.6112910509109497, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 13980 + }, + { + "epoch": 1.6242811501597445, + "grad_norm": 0.6557388305664062, + "learning_rate": 0.0001, + "loss": 1.582, + "step": 13981 + }, + { + "epoch": 1.624397327911705, + "grad_norm": 0.6169906854629517, + "learning_rate": 0.0001, + "loss": 1.4738, + "step": 13982 + }, + { + "epoch": 1.6245135056636655, + "grad_norm": 0.6463947296142578, + "learning_rate": 0.0001, + "loss": 1.4123, + "step": 13983 + }, + { + "epoch": 1.624629683415626, + "grad_norm": 0.5652637481689453, + "learning_rate": 0.0001, + "loss": 1.2282, + "step": 13984 + }, + { + "epoch": 1.6247458611675865, + "grad_norm": 0.6067284941673279, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 13985 + }, + { + "epoch": 1.624862038919547, + "grad_norm": 0.6332645416259766, + "learning_rate": 0.0001, + "loss": 1.4048, + "step": 13986 + }, + { + "epoch": 1.6249782166715074, + "grad_norm": 0.5808968544006348, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 13987 + }, + { + "epoch": 1.625094394423468, + "grad_norm": 0.5867791771888733, + "learning_rate": 0.0001, + "loss": 1.44, + "step": 13988 + }, + { + "epoch": 1.6252105721754284, + "grad_norm": 0.643679678440094, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 13989 + }, + { + "epoch": 1.625326749927389, + "grad_norm": 0.6203275322914124, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 13990 + }, + { + "epoch": 1.6254429276793494, + "grad_norm": 0.5824859738349915, + "learning_rate": 0.0001, + "loss": 1.3436, + "step": 13991 + }, + { + "epoch": 1.6255591054313099, + "grad_norm": 0.6150936484336853, + "learning_rate": 0.0001, + "loss": 1.3999, + "step": 13992 + }, + { + "epoch": 1.6256752831832704, + "grad_norm": 0.579694926738739, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 13993 + }, + { + "epoch": 1.6257914609352309, + "grad_norm": 0.5742935538291931, + "learning_rate": 0.0001, + "loss": 1.4251, + "step": 13994 + }, + { + "epoch": 1.6259076386871913, + "grad_norm": 0.5996615886688232, + "learning_rate": 0.0001, + "loss": 1.4744, + "step": 13995 + }, + { + "epoch": 1.6260238164391518, + "grad_norm": 0.7094278931617737, + "learning_rate": 0.0001, + "loss": 1.4994, + "step": 13996 + }, + { + "epoch": 1.6261399941911123, + "grad_norm": 0.6233917474746704, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 13997 + }, + { + "epoch": 1.6262561719430728, + "grad_norm": 0.6166983246803284, + "learning_rate": 0.0001, + "loss": 1.375, + "step": 13998 + }, + { + "epoch": 1.6263723496950333, + "grad_norm": 0.638420581817627, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 13999 + }, + { + "epoch": 1.6264885274469938, + "grad_norm": 0.6417514681816101, + "learning_rate": 0.0001, + "loss": 1.4279, + "step": 14000 + } + ], + "logging_steps": 1.0, + "max_steps": 17214, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1119190102441984e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}