{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8972644376899696, "eval_steps": 500, "global_step": 1476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006079027355623101, "grad_norm": 44.818572998046875, "learning_rate": 0.0, "loss": 7.186539173126221, "step": 1 }, { "epoch": 0.0012158054711246201, "grad_norm": 47.259071350097656, "learning_rate": 1e-05, "loss": 7.313593864440918, "step": 2 }, { "epoch": 0.00182370820668693, "grad_norm": 23.298837661743164, "learning_rate": 2e-05, "loss": 7.087122917175293, "step": 3 }, { "epoch": 0.0024316109422492403, "grad_norm": 13.535771369934082, "learning_rate": 3e-05, "loss": 6.942234992980957, "step": 4 }, { "epoch": 0.00303951367781155, "grad_norm": 11.997403144836426, "learning_rate": 4e-05, "loss": 6.6411614418029785, "step": 5 }, { "epoch": 0.00364741641337386, "grad_norm": 13.242263793945312, "learning_rate": 5e-05, "loss": 6.319230079650879, "step": 6 }, { "epoch": 0.00425531914893617, "grad_norm": 10.080074310302734, "learning_rate": 6e-05, "loss": 6.251328468322754, "step": 7 }, { "epoch": 0.004863221884498481, "grad_norm": 14.386478424072266, "learning_rate": 7.000000000000001e-05, "loss": 6.372805595397949, "step": 8 }, { "epoch": 0.00547112462006079, "grad_norm": 6.731114387512207, "learning_rate": 8e-05, "loss": 6.32672119140625, "step": 9 }, { "epoch": 0.0060790273556231, "grad_norm": 7.430361747741699, "learning_rate": 8.999999999999999e-05, "loss": 5.981637954711914, "step": 10 }, { "epoch": 0.006686930091185411, "grad_norm": 6.817004680633545, "learning_rate": 0.0001, "loss": 6.182029724121094, "step": 11 }, { "epoch": 0.00729483282674772, "grad_norm": 6.540442943572998, "learning_rate": 0.00011, "loss": 6.224725723266602, "step": 12 }, { "epoch": 0.007902735562310031, "grad_norm": 6.224416255950928, "learning_rate": 0.00012, "loss": 6.106351852416992, "step": 13 }, { "epoch": 0.00851063829787234, "grad_norm": 5.954357624053955, "learning_rate": 0.00013000000000000002, "loss": 6.050826072692871, "step": 14 }, { "epoch": 0.00911854103343465, "grad_norm": 5.7734551429748535, "learning_rate": 0.00014000000000000001, "loss": 6.147342681884766, "step": 15 }, { "epoch": 0.009726443768996961, "grad_norm": 6.399932861328125, "learning_rate": 0.00015, "loss": 6.284224510192871, "step": 16 }, { "epoch": 0.01033434650455927, "grad_norm": 4.2578558921813965, "learning_rate": 0.00016, "loss": 5.968033790588379, "step": 17 }, { "epoch": 0.01094224924012158, "grad_norm": 3.9558868408203125, "learning_rate": 0.00017, "loss": 5.909118175506592, "step": 18 }, { "epoch": 0.011550151975683891, "grad_norm": 3.4882659912109375, "learning_rate": 0.00017999999999999998, "loss": 6.045907974243164, "step": 19 }, { "epoch": 0.0121580547112462, "grad_norm": 6.301029682159424, "learning_rate": 0.00019, "loss": 5.905165672302246, "step": 20 }, { "epoch": 0.01276595744680851, "grad_norm": 3.891385078430176, "learning_rate": 0.0002, "loss": 5.9485931396484375, "step": 21 }, { "epoch": 0.013373860182370821, "grad_norm": 4.277671813964844, "learning_rate": 0.00021, "loss": 5.995012283325195, "step": 22 }, { "epoch": 0.01398176291793313, "grad_norm": 3.7930500507354736, "learning_rate": 0.00022, "loss": 6.081092834472656, "step": 23 }, { "epoch": 0.01458966565349544, "grad_norm": 5.02017879486084, "learning_rate": 0.00023, "loss": 6.232627868652344, "step": 24 }, { "epoch": 0.015197568389057751, "grad_norm": 3.485990285873413, "learning_rate": 0.00024, "loss": 6.189592361450195, "step": 25 }, { "epoch": 0.015805471124620062, "grad_norm": 4.133285999298096, "learning_rate": 0.00025, "loss": 5.953710079193115, "step": 26 }, { "epoch": 0.01641337386018237, "grad_norm": 4.140801429748535, "learning_rate": 0.00026000000000000003, "loss": 5.926338195800781, "step": 27 }, { "epoch": 0.01702127659574468, "grad_norm": 3.4010164737701416, "learning_rate": 0.00027, "loss": 5.7254462242126465, "step": 28 }, { "epoch": 0.01762917933130699, "grad_norm": 10.262829780578613, "learning_rate": 0.00028000000000000003, "loss": 6.183866500854492, "step": 29 }, { "epoch": 0.0182370820668693, "grad_norm": 4.732674598693848, "learning_rate": 0.00029, "loss": 5.899426460266113, "step": 30 }, { "epoch": 0.01884498480243161, "grad_norm": 4.868585109710693, "learning_rate": 0.0003, "loss": 5.8833699226379395, "step": 31 }, { "epoch": 0.019452887537993922, "grad_norm": 4.654231071472168, "learning_rate": 0.00031, "loss": 5.967190265655518, "step": 32 }, { "epoch": 0.02006079027355623, "grad_norm": 4.583294868469238, "learning_rate": 0.00032, "loss": 6.027661323547363, "step": 33 }, { "epoch": 0.02066869300911854, "grad_norm": 4.038606643676758, "learning_rate": 0.00033, "loss": 6.06468391418457, "step": 34 }, { "epoch": 0.02127659574468085, "grad_norm": 3.1677229404449463, "learning_rate": 0.00034, "loss": 5.97524881362915, "step": 35 }, { "epoch": 0.02188449848024316, "grad_norm": 4.171515941619873, "learning_rate": 0.00035, "loss": 5.981804370880127, "step": 36 }, { "epoch": 0.022492401215805473, "grad_norm": 5.382990837097168, "learning_rate": 0.00035999999999999997, "loss": 6.05380916595459, "step": 37 }, { "epoch": 0.023100303951367782, "grad_norm": 4.436893463134766, "learning_rate": 0.00037, "loss": 6.156210899353027, "step": 38 }, { "epoch": 0.02370820668693009, "grad_norm": 4.104293346405029, "learning_rate": 0.00038, "loss": 5.963473320007324, "step": 39 }, { "epoch": 0.0243161094224924, "grad_norm": 7.8225202560424805, "learning_rate": 0.00039000000000000005, "loss": 5.945594310760498, "step": 40 }, { "epoch": 0.02492401215805471, "grad_norm": 3.7115426063537598, "learning_rate": 0.0004, "loss": 5.866631984710693, "step": 41 }, { "epoch": 0.02553191489361702, "grad_norm": 3.377136468887329, "learning_rate": 0.00041, "loss": 5.87300968170166, "step": 42 }, { "epoch": 0.026139817629179333, "grad_norm": 3.0676238536834717, "learning_rate": 0.00042, "loss": 5.819428443908691, "step": 43 }, { "epoch": 0.026747720364741642, "grad_norm": 3.4088737964630127, "learning_rate": 0.00043, "loss": 5.686548709869385, "step": 44 }, { "epoch": 0.02735562310030395, "grad_norm": 4.599688529968262, "learning_rate": 0.00044, "loss": 6.143298149108887, "step": 45 }, { "epoch": 0.02796352583586626, "grad_norm": 3.1253559589385986, "learning_rate": 0.00045000000000000004, "loss": 5.965961933135986, "step": 46 }, { "epoch": 0.02857142857142857, "grad_norm": 3.3107733726501465, "learning_rate": 0.00046, "loss": 5.744629859924316, "step": 47 }, { "epoch": 0.02917933130699088, "grad_norm": 3.4835944175720215, "learning_rate": 0.00047, "loss": 5.963787078857422, "step": 48 }, { "epoch": 0.029787234042553193, "grad_norm": 4.766516208648682, "learning_rate": 0.00048, "loss": 5.903127670288086, "step": 49 }, { "epoch": 0.030395136778115502, "grad_norm": 3.4444823265075684, "learning_rate": 0.00049, "loss": 5.898875713348389, "step": 50 }, { "epoch": 0.03100303951367781, "grad_norm": 3.4199633598327637, "learning_rate": 0.0005, "loss": 5.995363235473633, "step": 51 }, { "epoch": 0.031610942249240125, "grad_norm": 4.609949111938477, "learning_rate": 0.0005, "loss": 5.867133140563965, "step": 52 }, { "epoch": 0.03221884498480243, "grad_norm": 2.445003032684326, "learning_rate": 0.0005, "loss": 5.596291542053223, "step": 53 }, { "epoch": 0.03282674772036474, "grad_norm": 7.065042972564697, "learning_rate": 0.0005, "loss": 5.764184951782227, "step": 54 }, { "epoch": 0.03343465045592705, "grad_norm": 3.3624749183654785, "learning_rate": 0.0005, "loss": 5.835771560668945, "step": 55 }, { "epoch": 0.03404255319148936, "grad_norm": 2.667015790939331, "learning_rate": 0.0005, "loss": 5.9446611404418945, "step": 56 }, { "epoch": 0.034650455927051675, "grad_norm": 3.2562549114227295, "learning_rate": 0.0005, "loss": 6.190652370452881, "step": 57 }, { "epoch": 0.03525835866261398, "grad_norm": 3.5651185512542725, "learning_rate": 0.0005, "loss": 5.877089500427246, "step": 58 }, { "epoch": 0.035866261398176294, "grad_norm": 2.6607139110565186, "learning_rate": 0.0005, "loss": 5.947436332702637, "step": 59 }, { "epoch": 0.0364741641337386, "grad_norm": 2.5586416721343994, "learning_rate": 0.0005, "loss": 6.041194915771484, "step": 60 }, { "epoch": 0.03708206686930091, "grad_norm": 3.5156543254852295, "learning_rate": 0.0005, "loss": 5.8784284591674805, "step": 61 }, { "epoch": 0.03768996960486322, "grad_norm": 2.013105630874634, "learning_rate": 0.0005, "loss": 5.705929756164551, "step": 62 }, { "epoch": 0.03829787234042553, "grad_norm": 2.2044196128845215, "learning_rate": 0.0005, "loss": 5.775040626525879, "step": 63 }, { "epoch": 0.038905775075987845, "grad_norm": 3.8432488441467285, "learning_rate": 0.0005, "loss": 5.757482528686523, "step": 64 }, { "epoch": 0.03951367781155015, "grad_norm": 2.794318437576294, "learning_rate": 0.0005, "loss": 5.4956865310668945, "step": 65 }, { "epoch": 0.04012158054711246, "grad_norm": 5.635376930236816, "learning_rate": 0.0005, "loss": 5.950571060180664, "step": 66 }, { "epoch": 0.04072948328267477, "grad_norm": 2.8366096019744873, "learning_rate": 0.0005, "loss": 5.937989711761475, "step": 67 }, { "epoch": 0.04133738601823708, "grad_norm": 4.0585455894470215, "learning_rate": 0.0005, "loss": 6.175616264343262, "step": 68 }, { "epoch": 0.041945288753799395, "grad_norm": 2.4633665084838867, "learning_rate": 0.0005, "loss": 5.856078147888184, "step": 69 }, { "epoch": 0.0425531914893617, "grad_norm": 2.900541305541992, "learning_rate": 0.0005, "loss": 5.562302112579346, "step": 70 }, { "epoch": 0.043161094224924014, "grad_norm": 2.1582231521606445, "learning_rate": 0.0005, "loss": 5.853466033935547, "step": 71 }, { "epoch": 0.04376899696048632, "grad_norm": 2.823076009750366, "learning_rate": 0.0005, "loss": 5.676411151885986, "step": 72 }, { "epoch": 0.04437689969604863, "grad_norm": 3.4227182865142822, "learning_rate": 0.0005, "loss": 5.687357425689697, "step": 73 }, { "epoch": 0.044984802431610946, "grad_norm": 2.4039175510406494, "learning_rate": 0.0005, "loss": 5.892976760864258, "step": 74 }, { "epoch": 0.04559270516717325, "grad_norm": 2.6830098628997803, "learning_rate": 0.0005, "loss": 5.66058349609375, "step": 75 }, { "epoch": 0.046200607902735565, "grad_norm": 2.413268566131592, "learning_rate": 0.0005, "loss": 5.7166547775268555, "step": 76 }, { "epoch": 0.04680851063829787, "grad_norm": 2.110560894012451, "learning_rate": 0.0005, "loss": 5.578657150268555, "step": 77 }, { "epoch": 0.04741641337386018, "grad_norm": 2.293944835662842, "learning_rate": 0.0005, "loss": 5.830209732055664, "step": 78 }, { "epoch": 0.04802431610942249, "grad_norm": 2.3141164779663086, "learning_rate": 0.0005, "loss": 5.730184555053711, "step": 79 }, { "epoch": 0.0486322188449848, "grad_norm": 2.4202141761779785, "learning_rate": 0.0005, "loss": 5.657958030700684, "step": 80 }, { "epoch": 0.049240121580547115, "grad_norm": 2.1450300216674805, "learning_rate": 0.0005, "loss": 5.734421253204346, "step": 81 }, { "epoch": 0.04984802431610942, "grad_norm": 2.340426206588745, "learning_rate": 0.0005, "loss": 5.912275314331055, "step": 82 }, { "epoch": 0.050455927051671734, "grad_norm": 2.2572286128997803, "learning_rate": 0.0005, "loss": 6.227065086364746, "step": 83 }, { "epoch": 0.05106382978723404, "grad_norm": 1.9745402336120605, "learning_rate": 0.0005, "loss": 5.538962364196777, "step": 84 }, { "epoch": 0.05167173252279635, "grad_norm": 1.8350422382354736, "learning_rate": 0.0005, "loss": 5.68572998046875, "step": 85 }, { "epoch": 0.052279635258358666, "grad_norm": 1.4099390506744385, "learning_rate": 0.0005, "loss": 5.548061370849609, "step": 86 }, { "epoch": 0.05288753799392097, "grad_norm": 1.7324459552764893, "learning_rate": 0.0005, "loss": 5.791088104248047, "step": 87 }, { "epoch": 0.053495440729483285, "grad_norm": 2.2765917778015137, "learning_rate": 0.0005, "loss": 5.66319465637207, "step": 88 }, { "epoch": 0.05410334346504559, "grad_norm": 1.8931759595870972, "learning_rate": 0.0005, "loss": 5.931559085845947, "step": 89 }, { "epoch": 0.0547112462006079, "grad_norm": 3.1260805130004883, "learning_rate": 0.0005, "loss": 5.887214183807373, "step": 90 }, { "epoch": 0.05531914893617021, "grad_norm": 2.076260805130005, "learning_rate": 0.0005, "loss": 5.837953567504883, "step": 91 }, { "epoch": 0.05592705167173252, "grad_norm": 2.6507105827331543, "learning_rate": 0.0005, "loss": 5.720830917358398, "step": 92 }, { "epoch": 0.056534954407294835, "grad_norm": 1.761267900466919, "learning_rate": 0.0005, "loss": 5.8046417236328125, "step": 93 }, { "epoch": 0.05714285714285714, "grad_norm": 2.158432722091675, "learning_rate": 0.0005, "loss": 5.530825614929199, "step": 94 }, { "epoch": 0.057750759878419454, "grad_norm": 1.8743107318878174, "learning_rate": 0.0005, "loss": 5.851261138916016, "step": 95 }, { "epoch": 0.05835866261398176, "grad_norm": 2.2951159477233887, "learning_rate": 0.0005, "loss": 5.754410743713379, "step": 96 }, { "epoch": 0.05896656534954407, "grad_norm": 1.6710808277130127, "learning_rate": 0.0005, "loss": 5.511685371398926, "step": 97 }, { "epoch": 0.059574468085106386, "grad_norm": 2.4671308994293213, "learning_rate": 0.0005, "loss": 5.762502193450928, "step": 98 }, { "epoch": 0.06018237082066869, "grad_norm": 1.7344735860824585, "learning_rate": 0.0005, "loss": 5.726058006286621, "step": 99 }, { "epoch": 0.060790273556231005, "grad_norm": 1.9786497354507446, "learning_rate": 0.0005, "loss": 5.570637226104736, "step": 100 }, { "epoch": 0.06139817629179331, "grad_norm": 1.672898769378662, "learning_rate": 0.0005, "loss": 5.4022722244262695, "step": 101 }, { "epoch": 0.06200607902735562, "grad_norm": 1.975422978401184, "learning_rate": 0.0005, "loss": 5.58085823059082, "step": 102 }, { "epoch": 0.06261398176291794, "grad_norm": 1.6185539960861206, "learning_rate": 0.0005, "loss": 5.551645755767822, "step": 103 }, { "epoch": 0.06322188449848025, "grad_norm": 1.6963152885437012, "learning_rate": 0.0005, "loss": 5.634788990020752, "step": 104 }, { "epoch": 0.06382978723404255, "grad_norm": 1.6010147333145142, "learning_rate": 0.0005, "loss": 5.439291954040527, "step": 105 }, { "epoch": 0.06443768996960486, "grad_norm": 1.4918285608291626, "learning_rate": 0.0005, "loss": 5.595495700836182, "step": 106 }, { "epoch": 0.06504559270516717, "grad_norm": 1.7921746969223022, "learning_rate": 0.0005, "loss": 5.7882080078125, "step": 107 }, { "epoch": 0.06565349544072949, "grad_norm": 1.6905741691589355, "learning_rate": 0.0005, "loss": 5.6724653244018555, "step": 108 }, { "epoch": 0.0662613981762918, "grad_norm": 1.5293573141098022, "learning_rate": 0.0005, "loss": 5.407555103302002, "step": 109 }, { "epoch": 0.0668693009118541, "grad_norm": 1.3903565406799316, "learning_rate": 0.0005, "loss": 5.763338565826416, "step": 110 }, { "epoch": 0.06747720364741641, "grad_norm": 1.6731656789779663, "learning_rate": 0.0005, "loss": 5.656299591064453, "step": 111 }, { "epoch": 0.06808510638297872, "grad_norm": 1.6174890995025635, "learning_rate": 0.0005, "loss": 5.728058815002441, "step": 112 }, { "epoch": 0.06869300911854104, "grad_norm": 1.9111192226409912, "learning_rate": 0.0005, "loss": 5.569175720214844, "step": 113 }, { "epoch": 0.06930091185410335, "grad_norm": 1.397756576538086, "learning_rate": 0.0005, "loss": 5.692349433898926, "step": 114 }, { "epoch": 0.06990881458966565, "grad_norm": 1.4280520677566528, "learning_rate": 0.0005, "loss": 5.366017818450928, "step": 115 }, { "epoch": 0.07051671732522796, "grad_norm": 2.1756176948547363, "learning_rate": 0.0005, "loss": 5.529537677764893, "step": 116 }, { "epoch": 0.07112462006079028, "grad_norm": 1.6855345964431763, "learning_rate": 0.0005, "loss": 5.3663010597229, "step": 117 }, { "epoch": 0.07173252279635259, "grad_norm": 1.3849018812179565, "learning_rate": 0.0005, "loss": 5.661293983459473, "step": 118 }, { "epoch": 0.07234042553191489, "grad_norm": 1.5399678945541382, "learning_rate": 0.0005, "loss": 5.681015968322754, "step": 119 }, { "epoch": 0.0729483282674772, "grad_norm": 1.3474847078323364, "learning_rate": 0.0005, "loss": 5.404428482055664, "step": 120 }, { "epoch": 0.07355623100303951, "grad_norm": 1.4353671073913574, "learning_rate": 0.0005, "loss": 5.621041297912598, "step": 121 }, { "epoch": 0.07416413373860183, "grad_norm": 1.385099172592163, "learning_rate": 0.0005, "loss": 5.410789489746094, "step": 122 }, { "epoch": 0.07477203647416414, "grad_norm": 1.5382664203643799, "learning_rate": 0.0005, "loss": 5.401933670043945, "step": 123 }, { "epoch": 0.07537993920972644, "grad_norm": 1.48553466796875, "learning_rate": 0.0005, "loss": 5.547571182250977, "step": 124 }, { "epoch": 0.07598784194528875, "grad_norm": 1.3798505067825317, "learning_rate": 0.0005, "loss": 5.5776872634887695, "step": 125 }, { "epoch": 0.07659574468085106, "grad_norm": 1.863465428352356, "learning_rate": 0.0005, "loss": 5.570428371429443, "step": 126 }, { "epoch": 0.07720364741641338, "grad_norm": 1.7337578535079956, "learning_rate": 0.0005, "loss": 5.60271692276001, "step": 127 }, { "epoch": 0.07781155015197569, "grad_norm": 1.7129346132278442, "learning_rate": 0.0005, "loss": 5.655090808868408, "step": 128 }, { "epoch": 0.07841945288753799, "grad_norm": 1.8253934383392334, "learning_rate": 0.0005, "loss": 5.726884841918945, "step": 129 }, { "epoch": 0.0790273556231003, "grad_norm": 1.493262529373169, "learning_rate": 0.0005, "loss": 5.307271957397461, "step": 130 }, { "epoch": 0.07963525835866261, "grad_norm": 1.9851430654525757, "learning_rate": 0.0005, "loss": 5.40402889251709, "step": 131 }, { "epoch": 0.08024316109422493, "grad_norm": 1.4382926225662231, "learning_rate": 0.0005, "loss": 5.55129337310791, "step": 132 }, { "epoch": 0.08085106382978724, "grad_norm": 2.1384055614471436, "learning_rate": 0.0005, "loss": 5.42939567565918, "step": 133 }, { "epoch": 0.08145896656534954, "grad_norm": 1.5483143329620361, "learning_rate": 0.0005, "loss": 5.495145797729492, "step": 134 }, { "epoch": 0.08206686930091185, "grad_norm": 1.6180500984191895, "learning_rate": 0.0005, "loss": 5.596287727355957, "step": 135 }, { "epoch": 0.08267477203647416, "grad_norm": 1.6833781003952026, "learning_rate": 0.0005, "loss": 5.704960346221924, "step": 136 }, { "epoch": 0.08328267477203648, "grad_norm": 1.731799602508545, "learning_rate": 0.0005, "loss": 5.343502998352051, "step": 137 }, { "epoch": 0.08389057750759879, "grad_norm": 1.7854918241500854, "learning_rate": 0.0005, "loss": 5.647939205169678, "step": 138 }, { "epoch": 0.08449848024316109, "grad_norm": 1.2474077939987183, "learning_rate": 0.0005, "loss": 5.360551834106445, "step": 139 }, { "epoch": 0.0851063829787234, "grad_norm": 5.299109935760498, "learning_rate": 0.0005, "loss": 5.383178234100342, "step": 140 }, { "epoch": 0.08571428571428572, "grad_norm": 2.591733694076538, "learning_rate": 0.0005, "loss": 5.623793601989746, "step": 141 }, { "epoch": 0.08632218844984803, "grad_norm": 1.5868524312973022, "learning_rate": 0.0005, "loss": 5.522441864013672, "step": 142 }, { "epoch": 0.08693009118541034, "grad_norm": 1.752677083015442, "learning_rate": 0.0005, "loss": 5.5086774826049805, "step": 143 }, { "epoch": 0.08753799392097264, "grad_norm": 1.5863618850708008, "learning_rate": 0.0005, "loss": 5.492759704589844, "step": 144 }, { "epoch": 0.08814589665653495, "grad_norm": 1.4941948652267456, "learning_rate": 0.0005, "loss": 5.475063323974609, "step": 145 }, { "epoch": 0.08875379939209727, "grad_norm": 1.5351965427398682, "learning_rate": 0.0005, "loss": 5.511392593383789, "step": 146 }, { "epoch": 0.08936170212765958, "grad_norm": 1.5566837787628174, "learning_rate": 0.0005, "loss": 5.4525909423828125, "step": 147 }, { "epoch": 0.08996960486322189, "grad_norm": 1.5408483743667603, "learning_rate": 0.0005, "loss": 5.592557430267334, "step": 148 }, { "epoch": 0.09057750759878419, "grad_norm": 1.3915044069290161, "learning_rate": 0.0005, "loss": 5.68109130859375, "step": 149 }, { "epoch": 0.0911854103343465, "grad_norm": 1.4081814289093018, "learning_rate": 0.0005, "loss": 5.310542106628418, "step": 150 }, { "epoch": 0.09179331306990882, "grad_norm": 1.368977427482605, "learning_rate": 0.0005, "loss": 5.590452194213867, "step": 151 }, { "epoch": 0.09240121580547113, "grad_norm": 1.7604471445083618, "learning_rate": 0.0005, "loss": 5.2881550788879395, "step": 152 }, { "epoch": 0.09300911854103343, "grad_norm": 1.2718323469161987, "learning_rate": 0.0005, "loss": 5.228243827819824, "step": 153 }, { "epoch": 0.09361702127659574, "grad_norm": 1.853657841682434, "learning_rate": 0.0005, "loss": 5.344303131103516, "step": 154 }, { "epoch": 0.09422492401215805, "grad_norm": 1.2742729187011719, "learning_rate": 0.0005, "loss": 5.602327346801758, "step": 155 }, { "epoch": 0.09483282674772037, "grad_norm": 1.3428983688354492, "learning_rate": 0.0005, "loss": 5.564847469329834, "step": 156 }, { "epoch": 0.09544072948328268, "grad_norm": 1.307673454284668, "learning_rate": 0.0005, "loss": 5.5293378829956055, "step": 157 }, { "epoch": 0.09604863221884498, "grad_norm": 1.2413536310195923, "learning_rate": 0.0005, "loss": 5.751148223876953, "step": 158 }, { "epoch": 0.09665653495440729, "grad_norm": 1.5207955837249756, "learning_rate": 0.0005, "loss": 5.464879989624023, "step": 159 }, { "epoch": 0.0972644376899696, "grad_norm": 1.2123122215270996, "learning_rate": 0.0005, "loss": 5.438077926635742, "step": 160 }, { "epoch": 0.09787234042553192, "grad_norm": 1.420456051826477, "learning_rate": 0.0005, "loss": 5.586366176605225, "step": 161 }, { "epoch": 0.09848024316109423, "grad_norm": 1.2411231994628906, "learning_rate": 0.0005, "loss": 5.465837478637695, "step": 162 }, { "epoch": 0.09908814589665653, "grad_norm": 1.4124112129211426, "learning_rate": 0.0005, "loss": 5.58890438079834, "step": 163 }, { "epoch": 0.09969604863221884, "grad_norm": 1.421832799911499, "learning_rate": 0.0005, "loss": 5.211925029754639, "step": 164 }, { "epoch": 0.10030395136778116, "grad_norm": 1.4735937118530273, "learning_rate": 0.0005, "loss": 5.542084693908691, "step": 165 }, { "epoch": 0.10091185410334347, "grad_norm": 1.2726881504058838, "learning_rate": 0.0005, "loss": 5.566733360290527, "step": 166 }, { "epoch": 0.10151975683890578, "grad_norm": 1.3275830745697021, "learning_rate": 0.0005, "loss": 5.730228424072266, "step": 167 }, { "epoch": 0.10212765957446808, "grad_norm": 1.6597068309783936, "learning_rate": 0.0005, "loss": 5.339101791381836, "step": 168 }, { "epoch": 0.10273556231003039, "grad_norm": 1.46490478515625, "learning_rate": 0.0005, "loss": 5.410638809204102, "step": 169 }, { "epoch": 0.1033434650455927, "grad_norm": 1.3094699382781982, "learning_rate": 0.0005, "loss": 5.219968318939209, "step": 170 }, { "epoch": 0.10395136778115502, "grad_norm": 1.4983205795288086, "learning_rate": 0.0005, "loss": 5.392378330230713, "step": 171 }, { "epoch": 0.10455927051671733, "grad_norm": 1.517512559890747, "learning_rate": 0.0005, "loss": 5.38358736038208, "step": 172 }, { "epoch": 0.10516717325227963, "grad_norm": 1.5345962047576904, "learning_rate": 0.0005, "loss": 5.368213653564453, "step": 173 }, { "epoch": 0.10577507598784194, "grad_norm": 1.1318706274032593, "learning_rate": 0.0005, "loss": 5.639193534851074, "step": 174 }, { "epoch": 0.10638297872340426, "grad_norm": 1.3089977502822876, "learning_rate": 0.0005, "loss": 5.508517265319824, "step": 175 }, { "epoch": 0.10699088145896657, "grad_norm": 1.16405189037323, "learning_rate": 0.0005, "loss": 5.238767623901367, "step": 176 }, { "epoch": 0.10759878419452888, "grad_norm": 1.318361759185791, "learning_rate": 0.0005, "loss": 5.591005325317383, "step": 177 }, { "epoch": 0.10820668693009118, "grad_norm": 1.7068839073181152, "learning_rate": 0.0005, "loss": 5.138769149780273, "step": 178 }, { "epoch": 0.1088145896656535, "grad_norm": 1.4426335096359253, "learning_rate": 0.0005, "loss": 5.406965255737305, "step": 179 }, { "epoch": 0.1094224924012158, "grad_norm": 1.3298251628875732, "learning_rate": 0.0005, "loss": 5.486334323883057, "step": 180 }, { "epoch": 0.11003039513677812, "grad_norm": 1.2703888416290283, "learning_rate": 0.0005, "loss": 5.543169021606445, "step": 181 }, { "epoch": 0.11063829787234042, "grad_norm": 1.0853707790374756, "learning_rate": 0.0005, "loss": 5.2396135330200195, "step": 182 }, { "epoch": 0.11124620060790273, "grad_norm": 1.283922553062439, "learning_rate": 0.0005, "loss": 5.168734550476074, "step": 183 }, { "epoch": 0.11185410334346504, "grad_norm": 1.4008558988571167, "learning_rate": 0.0005, "loss": 5.464504241943359, "step": 184 }, { "epoch": 0.11246200607902736, "grad_norm": 1.6104100942611694, "learning_rate": 0.0005, "loss": 5.350894927978516, "step": 185 }, { "epoch": 0.11306990881458967, "grad_norm": 1.1095637083053589, "learning_rate": 0.0005, "loss": 5.330683708190918, "step": 186 }, { "epoch": 0.11367781155015197, "grad_norm": 1.3298522233963013, "learning_rate": 0.0005, "loss": 5.376528739929199, "step": 187 }, { "epoch": 0.11428571428571428, "grad_norm": 1.4511582851409912, "learning_rate": 0.0005, "loss": 5.49576473236084, "step": 188 }, { "epoch": 0.1148936170212766, "grad_norm": 1.4968204498291016, "learning_rate": 0.0005, "loss": 5.232635021209717, "step": 189 }, { "epoch": 0.11550151975683891, "grad_norm": 1.2423769235610962, "learning_rate": 0.0005, "loss": 5.456453323364258, "step": 190 }, { "epoch": 0.11610942249240122, "grad_norm": 1.2642461061477661, "learning_rate": 0.0005, "loss": 5.673423767089844, "step": 191 }, { "epoch": 0.11671732522796352, "grad_norm": 1.6604862213134766, "learning_rate": 0.0005, "loss": 5.230939865112305, "step": 192 }, { "epoch": 0.11732522796352583, "grad_norm": 1.4601672887802124, "learning_rate": 0.0005, "loss": 5.308025360107422, "step": 193 }, { "epoch": 0.11793313069908815, "grad_norm": 1.66468346118927, "learning_rate": 0.0005, "loss": 5.50089168548584, "step": 194 }, { "epoch": 0.11854103343465046, "grad_norm": 1.4034700393676758, "learning_rate": 0.0005, "loss": 5.4229583740234375, "step": 195 }, { "epoch": 0.11914893617021277, "grad_norm": 1.3911566734313965, "learning_rate": 0.0005, "loss": 5.266064643859863, "step": 196 }, { "epoch": 0.11975683890577507, "grad_norm": 1.5582391023635864, "learning_rate": 0.0005, "loss": 5.215412616729736, "step": 197 }, { "epoch": 0.12036474164133738, "grad_norm": 1.4908430576324463, "learning_rate": 0.0005, "loss": 5.305833339691162, "step": 198 }, { "epoch": 0.1209726443768997, "grad_norm": 1.4207631349563599, "learning_rate": 0.0005, "loss": 5.2746734619140625, "step": 199 }, { "epoch": 0.12158054711246201, "grad_norm": 1.5322375297546387, "learning_rate": 0.0005, "loss": 5.160092353820801, "step": 200 }, { "epoch": 0.12218844984802432, "grad_norm": 1.538822889328003, "learning_rate": 0.0005, "loss": 5.2349467277526855, "step": 201 }, { "epoch": 0.12279635258358662, "grad_norm": 1.487720251083374, "learning_rate": 0.0005, "loss": 5.305604934692383, "step": 202 }, { "epoch": 0.12340425531914893, "grad_norm": 1.402201771736145, "learning_rate": 0.0005, "loss": 5.271785736083984, "step": 203 }, { "epoch": 0.12401215805471125, "grad_norm": 1.4523091316223145, "learning_rate": 0.0005, "loss": 5.260416030883789, "step": 204 }, { "epoch": 0.12462006079027356, "grad_norm": 1.3056803941726685, "learning_rate": 0.0005, "loss": 5.221076488494873, "step": 205 }, { "epoch": 0.12522796352583587, "grad_norm": 1.4249091148376465, "learning_rate": 0.0005, "loss": 5.13364839553833, "step": 206 }, { "epoch": 0.12583586626139817, "grad_norm": 1.417321801185608, "learning_rate": 0.0005, "loss": 5.294346332550049, "step": 207 }, { "epoch": 0.1264437689969605, "grad_norm": 1.3512288331985474, "learning_rate": 0.0005, "loss": 5.273685455322266, "step": 208 }, { "epoch": 0.1270516717325228, "grad_norm": 1.53708016872406, "learning_rate": 0.0005, "loss": 5.160931587219238, "step": 209 }, { "epoch": 0.1276595744680851, "grad_norm": 1.3125845193862915, "learning_rate": 0.0005, "loss": 5.472460746765137, "step": 210 }, { "epoch": 0.12826747720364742, "grad_norm": 1.6518676280975342, "learning_rate": 0.0005, "loss": 5.4825568199157715, "step": 211 }, { "epoch": 0.12887537993920972, "grad_norm": 1.203003168106079, "learning_rate": 0.0005, "loss": 5.11652946472168, "step": 212 }, { "epoch": 0.12948328267477205, "grad_norm": 1.3805352449417114, "learning_rate": 0.0005, "loss": 5.366741180419922, "step": 213 }, { "epoch": 0.13009118541033435, "grad_norm": 1.8709197044372559, "learning_rate": 0.0005, "loss": 5.435246467590332, "step": 214 }, { "epoch": 0.13069908814589665, "grad_norm": 1.7283586263656616, "learning_rate": 0.0005, "loss": 5.202251434326172, "step": 215 }, { "epoch": 0.13130699088145897, "grad_norm": 1.2809170484542847, "learning_rate": 0.0005, "loss": 5.283895492553711, "step": 216 }, { "epoch": 0.13191489361702127, "grad_norm": 1.249645709991455, "learning_rate": 0.0005, "loss": 5.123793601989746, "step": 217 }, { "epoch": 0.1325227963525836, "grad_norm": 1.3356451988220215, "learning_rate": 0.0005, "loss": 5.174809455871582, "step": 218 }, { "epoch": 0.1331306990881459, "grad_norm": 1.139381766319275, "learning_rate": 0.0005, "loss": 5.0811967849731445, "step": 219 }, { "epoch": 0.1337386018237082, "grad_norm": 1.2006030082702637, "learning_rate": 0.0005, "loss": 5.268994331359863, "step": 220 }, { "epoch": 0.13434650455927052, "grad_norm": 1.2994015216827393, "learning_rate": 0.0005, "loss": 5.426079750061035, "step": 221 }, { "epoch": 0.13495440729483282, "grad_norm": 1.0793324708938599, "learning_rate": 0.0005, "loss": 5.424633979797363, "step": 222 }, { "epoch": 0.13556231003039515, "grad_norm": 1.1271226406097412, "learning_rate": 0.0005, "loss": 5.310846328735352, "step": 223 }, { "epoch": 0.13617021276595745, "grad_norm": 1.1775165796279907, "learning_rate": 0.0005, "loss": 5.071159839630127, "step": 224 }, { "epoch": 0.13677811550151975, "grad_norm": 1.1077218055725098, "learning_rate": 0.0005, "loss": 5.208876609802246, "step": 225 }, { "epoch": 0.13738601823708207, "grad_norm": 1.3281017541885376, "learning_rate": 0.0005, "loss": 5.371927261352539, "step": 226 }, { "epoch": 0.13799392097264437, "grad_norm": 1.4999650716781616, "learning_rate": 0.0005, "loss": 5.17914342880249, "step": 227 }, { "epoch": 0.1386018237082067, "grad_norm": 1.2213531732559204, "learning_rate": 0.0005, "loss": 5.079235076904297, "step": 228 }, { "epoch": 0.139209726443769, "grad_norm": 1.409624695777893, "learning_rate": 0.0005, "loss": 5.218929767608643, "step": 229 }, { "epoch": 0.1398176291793313, "grad_norm": 1.2914072275161743, "learning_rate": 0.0005, "loss": 5.254355430603027, "step": 230 }, { "epoch": 0.14042553191489363, "grad_norm": 1.27825927734375, "learning_rate": 0.0005, "loss": 5.02869987487793, "step": 231 }, { "epoch": 0.14103343465045592, "grad_norm": 1.367679238319397, "learning_rate": 0.0005, "loss": 5.032447814941406, "step": 232 }, { "epoch": 0.14164133738601822, "grad_norm": 1.1813191175460815, "learning_rate": 0.0005, "loss": 5.181385040283203, "step": 233 }, { "epoch": 0.14224924012158055, "grad_norm": 1.385109305381775, "learning_rate": 0.0005, "loss": 5.294610977172852, "step": 234 }, { "epoch": 0.14285714285714285, "grad_norm": 1.2544500827789307, "learning_rate": 0.0005, "loss": 5.046303749084473, "step": 235 }, { "epoch": 0.14346504559270518, "grad_norm": 1.487121820449829, "learning_rate": 0.0005, "loss": 5.523983001708984, "step": 236 }, { "epoch": 0.14407294832826747, "grad_norm": 1.263445258140564, "learning_rate": 0.0005, "loss": 5.192383289337158, "step": 237 }, { "epoch": 0.14468085106382977, "grad_norm": 1.0454970598220825, "learning_rate": 0.0005, "loss": 5.0029120445251465, "step": 238 }, { "epoch": 0.1452887537993921, "grad_norm": 1.131041407585144, "learning_rate": 0.0005, "loss": 5.140591144561768, "step": 239 }, { "epoch": 0.1458966565349544, "grad_norm": 1.3271952867507935, "learning_rate": 0.0005, "loss": 5.232538223266602, "step": 240 }, { "epoch": 0.14650455927051673, "grad_norm": 1.2867931127548218, "learning_rate": 0.0005, "loss": 5.288295745849609, "step": 241 }, { "epoch": 0.14711246200607903, "grad_norm": 1.2857162952423096, "learning_rate": 0.0005, "loss": 4.999725341796875, "step": 242 }, { "epoch": 0.14772036474164132, "grad_norm": 1.308387279510498, "learning_rate": 0.0005, "loss": 5.332901477813721, "step": 243 }, { "epoch": 0.14832826747720365, "grad_norm": 1.431774377822876, "learning_rate": 0.0005, "loss": 5.33701753616333, "step": 244 }, { "epoch": 0.14893617021276595, "grad_norm": 1.2257990837097168, "learning_rate": 0.0005, "loss": 5.286837100982666, "step": 245 }, { "epoch": 0.14954407294832828, "grad_norm": 1.2497832775115967, "learning_rate": 0.0005, "loss": 5.060267448425293, "step": 246 }, { "epoch": 0.15015197568389058, "grad_norm": 1.3174192905426025, "learning_rate": 0.0005, "loss": 5.460453987121582, "step": 247 }, { "epoch": 0.15075987841945288, "grad_norm": 1.2937954664230347, "learning_rate": 0.0005, "loss": 5.300616264343262, "step": 248 }, { "epoch": 0.1513677811550152, "grad_norm": 1.1722848415374756, "learning_rate": 0.0005, "loss": 5.289948463439941, "step": 249 }, { "epoch": 0.1519756838905775, "grad_norm": 1.365752100944519, "learning_rate": 0.0005, "loss": 5.077818870544434, "step": 250 }, { "epoch": 0.15258358662613983, "grad_norm": 1.2099617719650269, "learning_rate": 0.0005, "loss": 5.033614635467529, "step": 251 }, { "epoch": 0.15319148936170213, "grad_norm": 1.3854937553405762, "learning_rate": 0.0005, "loss": 5.019617080688477, "step": 252 }, { "epoch": 0.15379939209726443, "grad_norm": 1.3792158365249634, "learning_rate": 0.0005, "loss": 5.079125881195068, "step": 253 }, { "epoch": 0.15440729483282675, "grad_norm": 1.1149134635925293, "learning_rate": 0.0005, "loss": 5.06775426864624, "step": 254 }, { "epoch": 0.15501519756838905, "grad_norm": 1.4162288904190063, "learning_rate": 0.0005, "loss": 5.29591178894043, "step": 255 }, { "epoch": 0.15562310030395138, "grad_norm": 1.298060417175293, "learning_rate": 0.0005, "loss": 5.090610504150391, "step": 256 }, { "epoch": 0.15623100303951368, "grad_norm": 1.1845481395721436, "learning_rate": 0.0005, "loss": 5.00084114074707, "step": 257 }, { "epoch": 0.15683890577507598, "grad_norm": 1.1649361848831177, "learning_rate": 0.0005, "loss": 5.0191168785095215, "step": 258 }, { "epoch": 0.1574468085106383, "grad_norm": 1.1649863719940186, "learning_rate": 0.0005, "loss": 4.924384117126465, "step": 259 }, { "epoch": 0.1580547112462006, "grad_norm": 1.305981159210205, "learning_rate": 0.0005, "loss": 5.208071708679199, "step": 260 }, { "epoch": 0.15866261398176293, "grad_norm": 1.1375975608825684, "learning_rate": 0.0005, "loss": 5.07304048538208, "step": 261 }, { "epoch": 0.15927051671732523, "grad_norm": 1.570008635520935, "learning_rate": 0.0005, "loss": 5.2816667556762695, "step": 262 }, { "epoch": 0.15987841945288753, "grad_norm": 1.168481469154358, "learning_rate": 0.0005, "loss": 5.156436920166016, "step": 263 }, { "epoch": 0.16048632218844985, "grad_norm": 1.17093026638031, "learning_rate": 0.0005, "loss": 5.264464378356934, "step": 264 }, { "epoch": 0.16109422492401215, "grad_norm": 1.1767195463180542, "learning_rate": 0.0005, "loss": 5.278616905212402, "step": 265 }, { "epoch": 0.16170212765957448, "grad_norm": 1.2456096410751343, "learning_rate": 0.0005, "loss": 5.296989440917969, "step": 266 }, { "epoch": 0.16231003039513678, "grad_norm": 1.260128140449524, "learning_rate": 0.0005, "loss": 5.161136150360107, "step": 267 }, { "epoch": 0.16291793313069908, "grad_norm": 1.3702967166900635, "learning_rate": 0.0005, "loss": 5.2522077560424805, "step": 268 }, { "epoch": 0.1635258358662614, "grad_norm": 1.1898664236068726, "learning_rate": 0.0005, "loss": 5.138284683227539, "step": 269 }, { "epoch": 0.1641337386018237, "grad_norm": 1.586888074874878, "learning_rate": 0.0005, "loss": 4.960643291473389, "step": 270 }, { "epoch": 0.16474164133738603, "grad_norm": 1.2508625984191895, "learning_rate": 0.0005, "loss": 5.2589569091796875, "step": 271 }, { "epoch": 0.16534954407294833, "grad_norm": 1.1662089824676514, "learning_rate": 0.0005, "loss": 5.264585494995117, "step": 272 }, { "epoch": 0.16595744680851063, "grad_norm": 1.2917591333389282, "learning_rate": 0.0005, "loss": 4.975507736206055, "step": 273 }, { "epoch": 0.16656534954407295, "grad_norm": 1.0556538105010986, "learning_rate": 0.0005, "loss": 5.047136306762695, "step": 274 }, { "epoch": 0.16717325227963525, "grad_norm": 1.0959351062774658, "learning_rate": 0.0005, "loss": 5.063904762268066, "step": 275 }, { "epoch": 0.16778115501519758, "grad_norm": 1.0194965600967407, "learning_rate": 0.0005, "loss": 5.230169296264648, "step": 276 }, { "epoch": 0.16838905775075988, "grad_norm": 1.326802372932434, "learning_rate": 0.0005, "loss": 5.127433776855469, "step": 277 }, { "epoch": 0.16899696048632218, "grad_norm": 1.17707097530365, "learning_rate": 0.0005, "loss": 5.209277153015137, "step": 278 }, { "epoch": 0.1696048632218845, "grad_norm": 0.9115813970565796, "learning_rate": 0.0005, "loss": 5.025136470794678, "step": 279 }, { "epoch": 0.1702127659574468, "grad_norm": 1.1245434284210205, "learning_rate": 0.0005, "loss": 5.057619094848633, "step": 280 }, { "epoch": 0.17082066869300913, "grad_norm": 1.3757452964782715, "learning_rate": 0.0005, "loss": 4.920927047729492, "step": 281 }, { "epoch": 0.17142857142857143, "grad_norm": 1.4696053266525269, "learning_rate": 0.0005, "loss": 5.1536760330200195, "step": 282 }, { "epoch": 0.17203647416413373, "grad_norm": 1.2874000072479248, "learning_rate": 0.0005, "loss": 5.050880432128906, "step": 283 }, { "epoch": 0.17264437689969606, "grad_norm": 1.2090721130371094, "learning_rate": 0.0005, "loss": 5.024714469909668, "step": 284 }, { "epoch": 0.17325227963525835, "grad_norm": 1.3489820957183838, "learning_rate": 0.0005, "loss": 5.124329090118408, "step": 285 }, { "epoch": 0.17386018237082068, "grad_norm": 1.055483102798462, "learning_rate": 0.0005, "loss": 4.890225887298584, "step": 286 }, { "epoch": 0.17446808510638298, "grad_norm": 1.2479093074798584, "learning_rate": 0.0005, "loss": 4.835631370544434, "step": 287 }, { "epoch": 0.17507598784194528, "grad_norm": 1.1899778842926025, "learning_rate": 0.0005, "loss": 5.027457237243652, "step": 288 }, { "epoch": 0.1756838905775076, "grad_norm": 1.1618897914886475, "learning_rate": 0.0005, "loss": 5.145232677459717, "step": 289 }, { "epoch": 0.1762917933130699, "grad_norm": 1.2332507371902466, "learning_rate": 0.0005, "loss": 5.138116359710693, "step": 290 }, { "epoch": 0.17689969604863223, "grad_norm": 1.1276404857635498, "learning_rate": 0.0005, "loss": 5.094466209411621, "step": 291 }, { "epoch": 0.17750759878419453, "grad_norm": 1.4890656471252441, "learning_rate": 0.0005, "loss": 4.797001838684082, "step": 292 }, { "epoch": 0.17811550151975683, "grad_norm": 1.0490905046463013, "learning_rate": 0.0005, "loss": 5.235766410827637, "step": 293 }, { "epoch": 0.17872340425531916, "grad_norm": 1.1675019264221191, "learning_rate": 0.0005, "loss": 4.964472770690918, "step": 294 }, { "epoch": 0.17933130699088146, "grad_norm": 0.9588620662689209, "learning_rate": 0.0005, "loss": 5.124715805053711, "step": 295 }, { "epoch": 0.17993920972644378, "grad_norm": 1.3892091512680054, "learning_rate": 0.0005, "loss": 4.847377300262451, "step": 296 }, { "epoch": 0.18054711246200608, "grad_norm": 1.1051721572875977, "learning_rate": 0.0005, "loss": 5.199601173400879, "step": 297 }, { "epoch": 0.18115501519756838, "grad_norm": 1.0869505405426025, "learning_rate": 0.0005, "loss": 5.3870697021484375, "step": 298 }, { "epoch": 0.1817629179331307, "grad_norm": 1.111187219619751, "learning_rate": 0.0005, "loss": 5.190181732177734, "step": 299 }, { "epoch": 0.182370820668693, "grad_norm": 1.2440016269683838, "learning_rate": 0.0005, "loss": 5.041322231292725, "step": 300 }, { "epoch": 0.1829787234042553, "grad_norm": 1.2418692111968994, "learning_rate": 0.0005, "loss": 5.212306022644043, "step": 301 }, { "epoch": 0.18358662613981763, "grad_norm": 1.2612659931182861, "learning_rate": 0.0005, "loss": 4.961835861206055, "step": 302 }, { "epoch": 0.18419452887537993, "grad_norm": 1.1162973642349243, "learning_rate": 0.0005, "loss": 4.950830936431885, "step": 303 }, { "epoch": 0.18480243161094226, "grad_norm": 1.144067406654358, "learning_rate": 0.0005, "loss": 4.8998637199401855, "step": 304 }, { "epoch": 0.18541033434650456, "grad_norm": 1.2814747095108032, "learning_rate": 0.0005, "loss": 5.224381446838379, "step": 305 }, { "epoch": 0.18601823708206686, "grad_norm": 1.3770310878753662, "learning_rate": 0.0005, "loss": 5.05579137802124, "step": 306 }, { "epoch": 0.18662613981762918, "grad_norm": 1.5116229057312012, "learning_rate": 0.0005, "loss": 5.082482814788818, "step": 307 }, { "epoch": 0.18723404255319148, "grad_norm": 1.0909713506698608, "learning_rate": 0.0005, "loss": 4.967124938964844, "step": 308 }, { "epoch": 0.1878419452887538, "grad_norm": 1.1027607917785645, "learning_rate": 0.0005, "loss": 5.00374698638916, "step": 309 }, { "epoch": 0.1884498480243161, "grad_norm": 1.238652229309082, "learning_rate": 0.0005, "loss": 4.993183135986328, "step": 310 }, { "epoch": 0.1890577507598784, "grad_norm": 1.0609782934188843, "learning_rate": 0.0005, "loss": 5.019218444824219, "step": 311 }, { "epoch": 0.18966565349544073, "grad_norm": 1.1945058107376099, "learning_rate": 0.0005, "loss": 5.068751335144043, "step": 312 }, { "epoch": 0.19027355623100303, "grad_norm": 1.2640782594680786, "learning_rate": 0.0005, "loss": 5.185402870178223, "step": 313 }, { "epoch": 0.19088145896656536, "grad_norm": 1.0532907247543335, "learning_rate": 0.0005, "loss": 5.222114562988281, "step": 314 }, { "epoch": 0.19148936170212766, "grad_norm": 1.0423952341079712, "learning_rate": 0.0005, "loss": 5.1693806648254395, "step": 315 }, { "epoch": 0.19209726443768996, "grad_norm": 1.0700887441635132, "learning_rate": 0.0005, "loss": 5.0217485427856445, "step": 316 }, { "epoch": 0.19270516717325228, "grad_norm": 1.2595866918563843, "learning_rate": 0.0005, "loss": 5.231429576873779, "step": 317 }, { "epoch": 0.19331306990881458, "grad_norm": 1.1495158672332764, "learning_rate": 0.0005, "loss": 5.015372276306152, "step": 318 }, { "epoch": 0.1939209726443769, "grad_norm": 1.3977763652801514, "learning_rate": 0.0005, "loss": 5.323009490966797, "step": 319 }, { "epoch": 0.1945288753799392, "grad_norm": 1.4009697437286377, "learning_rate": 0.0005, "loss": 5.2833638191223145, "step": 320 }, { "epoch": 0.1951367781155015, "grad_norm": 1.1618447303771973, "learning_rate": 0.0005, "loss": 5.064535140991211, "step": 321 }, { "epoch": 0.19574468085106383, "grad_norm": 1.1447522640228271, "learning_rate": 0.0005, "loss": 4.99235725402832, "step": 322 }, { "epoch": 0.19635258358662613, "grad_norm": 1.2342157363891602, "learning_rate": 0.0005, "loss": 5.036558151245117, "step": 323 }, { "epoch": 0.19696048632218846, "grad_norm": 1.2487186193466187, "learning_rate": 0.0005, "loss": 5.207220077514648, "step": 324 }, { "epoch": 0.19756838905775076, "grad_norm": 1.4693067073822021, "learning_rate": 0.0005, "loss": 5.096504211425781, "step": 325 }, { "epoch": 0.19817629179331306, "grad_norm": 1.1707696914672852, "learning_rate": 0.0005, "loss": 5.003598213195801, "step": 326 }, { "epoch": 0.19878419452887539, "grad_norm": 0.9728778600692749, "learning_rate": 0.0005, "loss": 4.8744659423828125, "step": 327 }, { "epoch": 0.19939209726443768, "grad_norm": 1.383410096168518, "learning_rate": 0.0005, "loss": 5.1511383056640625, "step": 328 }, { "epoch": 0.2, "grad_norm": 1.0482876300811768, "learning_rate": 0.0005, "loss": 5.014847755432129, "step": 329 }, { "epoch": 0.2006079027355623, "grad_norm": 1.2320209741592407, "learning_rate": 0.0005, "loss": 4.923969745635986, "step": 330 }, { "epoch": 0.2012158054711246, "grad_norm": 2.013617753982544, "learning_rate": 0.0005, "loss": 4.876163482666016, "step": 331 }, { "epoch": 0.20182370820668694, "grad_norm": 1.4123047590255737, "learning_rate": 0.0005, "loss": 4.870320796966553, "step": 332 }, { "epoch": 0.20243161094224923, "grad_norm": 0.9998598694801331, "learning_rate": 0.0005, "loss": 4.8142805099487305, "step": 333 }, { "epoch": 0.20303951367781156, "grad_norm": 1.255579948425293, "learning_rate": 0.0005, "loss": 5.134385108947754, "step": 334 }, { "epoch": 0.20364741641337386, "grad_norm": 1.1863816976547241, "learning_rate": 0.0005, "loss": 4.943517208099365, "step": 335 }, { "epoch": 0.20425531914893616, "grad_norm": 1.3125497102737427, "learning_rate": 0.0005, "loss": 4.835733413696289, "step": 336 }, { "epoch": 0.2048632218844985, "grad_norm": 1.330944538116455, "learning_rate": 0.0005, "loss": 4.996496200561523, "step": 337 }, { "epoch": 0.20547112462006079, "grad_norm": 1.4103339910507202, "learning_rate": 0.0005, "loss": 5.215001106262207, "step": 338 }, { "epoch": 0.2060790273556231, "grad_norm": 1.1276763677597046, "learning_rate": 0.0005, "loss": 5.080985069274902, "step": 339 }, { "epoch": 0.2066869300911854, "grad_norm": 1.2522611618041992, "learning_rate": 0.0005, "loss": 5.1337480545043945, "step": 340 }, { "epoch": 0.2072948328267477, "grad_norm": 1.0622775554656982, "learning_rate": 0.0005, "loss": 5.139281272888184, "step": 341 }, { "epoch": 0.20790273556231004, "grad_norm": 1.2667897939682007, "learning_rate": 0.0005, "loss": 4.985269546508789, "step": 342 }, { "epoch": 0.20851063829787234, "grad_norm": 1.2665342092514038, "learning_rate": 0.0005, "loss": 4.907642841339111, "step": 343 }, { "epoch": 0.20911854103343466, "grad_norm": 1.2670104503631592, "learning_rate": 0.0005, "loss": 4.9238739013671875, "step": 344 }, { "epoch": 0.20972644376899696, "grad_norm": 1.3876585960388184, "learning_rate": 0.0005, "loss": 5.280843734741211, "step": 345 }, { "epoch": 0.21033434650455926, "grad_norm": 1.172425389289856, "learning_rate": 0.0005, "loss": 5.018771171569824, "step": 346 }, { "epoch": 0.2109422492401216, "grad_norm": 1.057332158088684, "learning_rate": 0.0005, "loss": 4.957630157470703, "step": 347 }, { "epoch": 0.2115501519756839, "grad_norm": 1.2106921672821045, "learning_rate": 0.0005, "loss": 5.079224109649658, "step": 348 }, { "epoch": 0.2121580547112462, "grad_norm": 1.2184040546417236, "learning_rate": 0.0005, "loss": 4.923876762390137, "step": 349 }, { "epoch": 0.2127659574468085, "grad_norm": 1.3889566659927368, "learning_rate": 0.0005, "loss": 5.0445098876953125, "step": 350 }, { "epoch": 0.2133738601823708, "grad_norm": 1.1836071014404297, "learning_rate": 0.0005, "loss": 4.762534141540527, "step": 351 }, { "epoch": 0.21398176291793314, "grad_norm": 1.2222967147827148, "learning_rate": 0.0005, "loss": 5.045120716094971, "step": 352 }, { "epoch": 0.21458966565349544, "grad_norm": 1.203317403793335, "learning_rate": 0.0005, "loss": 5.027883052825928, "step": 353 }, { "epoch": 0.21519756838905776, "grad_norm": 1.118275761604309, "learning_rate": 0.0005, "loss": 5.153387069702148, "step": 354 }, { "epoch": 0.21580547112462006, "grad_norm": 1.1502918004989624, "learning_rate": 0.0005, "loss": 4.907447814941406, "step": 355 }, { "epoch": 0.21641337386018236, "grad_norm": 0.916477620601654, "learning_rate": 0.0005, "loss": 4.913633346557617, "step": 356 }, { "epoch": 0.2170212765957447, "grad_norm": 0.9976673722267151, "learning_rate": 0.0005, "loss": 4.855230331420898, "step": 357 }, { "epoch": 0.217629179331307, "grad_norm": 1.2301874160766602, "learning_rate": 0.0005, "loss": 5.274983882904053, "step": 358 }, { "epoch": 0.21823708206686931, "grad_norm": 1.268349051475525, "learning_rate": 0.0005, "loss": 4.990891933441162, "step": 359 }, { "epoch": 0.2188449848024316, "grad_norm": 1.7098944187164307, "learning_rate": 0.0005, "loss": 5.0019989013671875, "step": 360 }, { "epoch": 0.2194528875379939, "grad_norm": 1.3171290159225464, "learning_rate": 0.0005, "loss": 5.091225624084473, "step": 361 }, { "epoch": 0.22006079027355624, "grad_norm": 1.1964459419250488, "learning_rate": 0.0005, "loss": 4.942023754119873, "step": 362 }, { "epoch": 0.22066869300911854, "grad_norm": 1.212193250656128, "learning_rate": 0.0005, "loss": 4.842243194580078, "step": 363 }, { "epoch": 0.22127659574468084, "grad_norm": 1.2447597980499268, "learning_rate": 0.0005, "loss": 4.891105651855469, "step": 364 }, { "epoch": 0.22188449848024316, "grad_norm": 1.0322506427764893, "learning_rate": 0.0005, "loss": 5.083103179931641, "step": 365 }, { "epoch": 0.22249240121580546, "grad_norm": 1.1431292295455933, "learning_rate": 0.0005, "loss": 5.104142189025879, "step": 366 }, { "epoch": 0.2231003039513678, "grad_norm": 1.1028327941894531, "learning_rate": 0.0005, "loss": 4.933050632476807, "step": 367 }, { "epoch": 0.2237082066869301, "grad_norm": 0.9712069630622864, "learning_rate": 0.0005, "loss": 4.821019172668457, "step": 368 }, { "epoch": 0.2243161094224924, "grad_norm": 1.063249111175537, "learning_rate": 0.0005, "loss": 4.972682476043701, "step": 369 }, { "epoch": 0.22492401215805471, "grad_norm": 1.1715357303619385, "learning_rate": 0.0005, "loss": 5.0836591720581055, "step": 370 }, { "epoch": 0.225531914893617, "grad_norm": 1.128483772277832, "learning_rate": 0.0005, "loss": 5.094054698944092, "step": 371 }, { "epoch": 0.22613981762917934, "grad_norm": 1.2616199254989624, "learning_rate": 0.0005, "loss": 4.991359710693359, "step": 372 }, { "epoch": 0.22674772036474164, "grad_norm": 1.2140382528305054, "learning_rate": 0.0005, "loss": 4.7401838302612305, "step": 373 }, { "epoch": 0.22735562310030394, "grad_norm": 1.1435750722885132, "learning_rate": 0.0005, "loss": 5.093307971954346, "step": 374 }, { "epoch": 0.22796352583586627, "grad_norm": 1.0213854312896729, "learning_rate": 0.0005, "loss": 4.898110389709473, "step": 375 }, { "epoch": 0.22857142857142856, "grad_norm": 1.6159358024597168, "learning_rate": 0.0005, "loss": 4.884780406951904, "step": 376 }, { "epoch": 0.2291793313069909, "grad_norm": 1.0451385974884033, "learning_rate": 0.0005, "loss": 5.046623229980469, "step": 377 }, { "epoch": 0.2297872340425532, "grad_norm": 1.0726312398910522, "learning_rate": 0.0005, "loss": 5.3511962890625, "step": 378 }, { "epoch": 0.2303951367781155, "grad_norm": 1.1179200410842896, "learning_rate": 0.0005, "loss": 4.847324371337891, "step": 379 }, { "epoch": 0.23100303951367782, "grad_norm": 1.1474509239196777, "learning_rate": 0.0005, "loss": 4.830921173095703, "step": 380 }, { "epoch": 0.23161094224924011, "grad_norm": 1.0454329252243042, "learning_rate": 0.0005, "loss": 4.962401390075684, "step": 381 }, { "epoch": 0.23221884498480244, "grad_norm": 1.214348316192627, "learning_rate": 0.0005, "loss": 4.800313472747803, "step": 382 }, { "epoch": 0.23282674772036474, "grad_norm": 1.18563973903656, "learning_rate": 0.0005, "loss": 4.8629655838012695, "step": 383 }, { "epoch": 0.23343465045592704, "grad_norm": 1.0595086812973022, "learning_rate": 0.0005, "loss": 4.9949750900268555, "step": 384 }, { "epoch": 0.23404255319148937, "grad_norm": 1.0595086812973022, "learning_rate": 0.0005, "loss": 4.926072597503662, "step": 385 }, { "epoch": 0.23465045592705167, "grad_norm": 1.1770035028457642, "learning_rate": 0.0005, "loss": 4.766304969787598, "step": 386 }, { "epoch": 0.235258358662614, "grad_norm": 1.1117204427719116, "learning_rate": 0.0005, "loss": 4.896605968475342, "step": 387 }, { "epoch": 0.2358662613981763, "grad_norm": 1.2087441682815552, "learning_rate": 0.0005, "loss": 4.892548084259033, "step": 388 }, { "epoch": 0.2364741641337386, "grad_norm": 0.9041852355003357, "learning_rate": 0.0005, "loss": 4.948829650878906, "step": 389 }, { "epoch": 0.23708206686930092, "grad_norm": 0.94862300157547, "learning_rate": 0.0005, "loss": 4.8753533363342285, "step": 390 }, { "epoch": 0.23768996960486322, "grad_norm": 1.055679202079773, "learning_rate": 0.0005, "loss": 4.816287994384766, "step": 391 }, { "epoch": 0.23829787234042554, "grad_norm": 1.413857340812683, "learning_rate": 0.0005, "loss": 4.809457778930664, "step": 392 }, { "epoch": 0.23890577507598784, "grad_norm": 1.326051950454712, "learning_rate": 0.0005, "loss": 5.0313568115234375, "step": 393 }, { "epoch": 0.23951367781155014, "grad_norm": 1.2621649503707886, "learning_rate": 0.0005, "loss": 4.906643867492676, "step": 394 }, { "epoch": 0.24012158054711247, "grad_norm": 1.2217754125595093, "learning_rate": 0.0005, "loss": 4.929527759552002, "step": 395 }, { "epoch": 0.24072948328267477, "grad_norm": 1.1450992822647095, "learning_rate": 0.0005, "loss": 4.908195495605469, "step": 396 }, { "epoch": 0.2413373860182371, "grad_norm": 1.4507970809936523, "learning_rate": 0.0005, "loss": 5.079260349273682, "step": 397 }, { "epoch": 0.2419452887537994, "grad_norm": 1.086036205291748, "learning_rate": 0.0005, "loss": 4.996855735778809, "step": 398 }, { "epoch": 0.2425531914893617, "grad_norm": 1.0666170120239258, "learning_rate": 0.0005, "loss": 5.002256393432617, "step": 399 }, { "epoch": 0.24316109422492402, "grad_norm": 1.199183702468872, "learning_rate": 0.0005, "loss": 5.217647552490234, "step": 400 }, { "epoch": 0.24376899696048632, "grad_norm": 1.156293511390686, "learning_rate": 0.0005, "loss": 4.900952339172363, "step": 401 }, { "epoch": 0.24437689969604864, "grad_norm": 1.3151594400405884, "learning_rate": 0.0005, "loss": 4.980197906494141, "step": 402 }, { "epoch": 0.24498480243161094, "grad_norm": 1.0817885398864746, "learning_rate": 0.0005, "loss": 4.745031356811523, "step": 403 }, { "epoch": 0.24559270516717324, "grad_norm": 1.0003957748413086, "learning_rate": 0.0005, "loss": 4.599782466888428, "step": 404 }, { "epoch": 0.24620060790273557, "grad_norm": 0.95441734790802, "learning_rate": 0.0005, "loss": 4.928730010986328, "step": 405 }, { "epoch": 0.24680851063829787, "grad_norm": 1.1539515256881714, "learning_rate": 0.0005, "loss": 5.01755428314209, "step": 406 }, { "epoch": 0.2474164133738602, "grad_norm": 1.1274021863937378, "learning_rate": 0.0005, "loss": 4.92464542388916, "step": 407 }, { "epoch": 0.2480243161094225, "grad_norm": 1.075126051902771, "learning_rate": 0.0005, "loss": 4.842813014984131, "step": 408 }, { "epoch": 0.2486322188449848, "grad_norm": 1.1200828552246094, "learning_rate": 0.0005, "loss": 4.701647758483887, "step": 409 }, { "epoch": 0.24924012158054712, "grad_norm": 1.349135398864746, "learning_rate": 0.0005, "loss": 5.124917030334473, "step": 410 }, { "epoch": 0.24984802431610942, "grad_norm": 1.403590440750122, "learning_rate": 0.0005, "loss": 5.070537567138672, "step": 411 }, { "epoch": 0.25045592705167175, "grad_norm": 0.9664301872253418, "learning_rate": 0.0005, "loss": 4.846314430236816, "step": 412 }, { "epoch": 0.251063829787234, "grad_norm": 1.1642309427261353, "learning_rate": 0.0005, "loss": 4.933165550231934, "step": 413 }, { "epoch": 0.25167173252279634, "grad_norm": 1.1649516820907593, "learning_rate": 0.0005, "loss": 4.789491653442383, "step": 414 }, { "epoch": 0.25227963525835867, "grad_norm": 1.1041150093078613, "learning_rate": 0.0005, "loss": 4.580702781677246, "step": 415 }, { "epoch": 0.252887537993921, "grad_norm": 1.0078331232070923, "learning_rate": 0.0005, "loss": 4.77386999130249, "step": 416 }, { "epoch": 0.25349544072948327, "grad_norm": 1.0907591581344604, "learning_rate": 0.0005, "loss": 4.774503707885742, "step": 417 }, { "epoch": 0.2541033434650456, "grad_norm": 1.3880425691604614, "learning_rate": 0.0005, "loss": 4.793880462646484, "step": 418 }, { "epoch": 0.2547112462006079, "grad_norm": 1.2313039302825928, "learning_rate": 0.0005, "loss": 4.7932891845703125, "step": 419 }, { "epoch": 0.2553191489361702, "grad_norm": 0.9940412044525146, "learning_rate": 0.0005, "loss": 5.119372367858887, "step": 420 }, { "epoch": 0.2559270516717325, "grad_norm": 1.0474408864974976, "learning_rate": 0.0005, "loss": 4.940298080444336, "step": 421 }, { "epoch": 0.25653495440729485, "grad_norm": 1.091572642326355, "learning_rate": 0.0005, "loss": 4.824063777923584, "step": 422 }, { "epoch": 0.2571428571428571, "grad_norm": 0.9919223189353943, "learning_rate": 0.0005, "loss": 4.823666572570801, "step": 423 }, { "epoch": 0.25775075987841944, "grad_norm": 0.9640527963638306, "learning_rate": 0.0005, "loss": 4.798361778259277, "step": 424 }, { "epoch": 0.25835866261398177, "grad_norm": 1.0292719602584839, "learning_rate": 0.0005, "loss": 4.69101619720459, "step": 425 }, { "epoch": 0.2589665653495441, "grad_norm": 1.2390789985656738, "learning_rate": 0.0005, "loss": 4.671029090881348, "step": 426 }, { "epoch": 0.25957446808510637, "grad_norm": 1.2008142471313477, "learning_rate": 0.0005, "loss": 4.796487331390381, "step": 427 }, { "epoch": 0.2601823708206687, "grad_norm": 1.0405327081680298, "learning_rate": 0.0005, "loss": 4.8557820320129395, "step": 428 }, { "epoch": 0.260790273556231, "grad_norm": 1.042792558670044, "learning_rate": 0.0005, "loss": 4.805086135864258, "step": 429 }, { "epoch": 0.2613981762917933, "grad_norm": 1.6039878129959106, "learning_rate": 0.0005, "loss": 4.892642974853516, "step": 430 }, { "epoch": 0.2620060790273556, "grad_norm": 1.0221588611602783, "learning_rate": 0.0005, "loss": 4.868304252624512, "step": 431 }, { "epoch": 0.26261398176291795, "grad_norm": 1.0673880577087402, "learning_rate": 0.0005, "loss": 4.52126932144165, "step": 432 }, { "epoch": 0.2632218844984802, "grad_norm": 1.1782925128936768, "learning_rate": 0.0005, "loss": 4.9915618896484375, "step": 433 }, { "epoch": 0.26382978723404255, "grad_norm": 0.9004169702529907, "learning_rate": 0.0005, "loss": 5.040285110473633, "step": 434 }, { "epoch": 0.26443768996960487, "grad_norm": 1.1495839357376099, "learning_rate": 0.0005, "loss": 4.991700172424316, "step": 435 }, { "epoch": 0.2650455927051672, "grad_norm": 1.4188427925109863, "learning_rate": 0.0005, "loss": 4.851819038391113, "step": 436 }, { "epoch": 0.26565349544072947, "grad_norm": 1.1886249780654907, "learning_rate": 0.0005, "loss": 4.819738388061523, "step": 437 }, { "epoch": 0.2662613981762918, "grad_norm": 1.0886558294296265, "learning_rate": 0.0005, "loss": 4.889862537384033, "step": 438 }, { "epoch": 0.2668693009118541, "grad_norm": 1.215423822402954, "learning_rate": 0.0005, "loss": 4.66435432434082, "step": 439 }, { "epoch": 0.2674772036474164, "grad_norm": 1.2564237117767334, "learning_rate": 0.0005, "loss": 4.840651512145996, "step": 440 }, { "epoch": 0.2680851063829787, "grad_norm": 0.9406836628913879, "learning_rate": 0.0005, "loss": 4.836145401000977, "step": 441 }, { "epoch": 0.26869300911854105, "grad_norm": 0.9963774085044861, "learning_rate": 0.0005, "loss": 4.879360675811768, "step": 442 }, { "epoch": 0.2693009118541033, "grad_norm": 1.349959135055542, "learning_rate": 0.0005, "loss": 5.149614334106445, "step": 443 }, { "epoch": 0.26990881458966565, "grad_norm": 1.0401732921600342, "learning_rate": 0.0005, "loss": 4.831120491027832, "step": 444 }, { "epoch": 0.270516717325228, "grad_norm": 1.0176857709884644, "learning_rate": 0.0005, "loss": 4.795515060424805, "step": 445 }, { "epoch": 0.2711246200607903, "grad_norm": 1.025748610496521, "learning_rate": 0.0005, "loss": 4.850000381469727, "step": 446 }, { "epoch": 0.27173252279635257, "grad_norm": 1.179107904434204, "learning_rate": 0.0005, "loss": 4.714792728424072, "step": 447 }, { "epoch": 0.2723404255319149, "grad_norm": 1.0913288593292236, "learning_rate": 0.0005, "loss": 4.713229656219482, "step": 448 }, { "epoch": 0.2729483282674772, "grad_norm": 1.2143056392669678, "learning_rate": 0.0005, "loss": 4.776023864746094, "step": 449 }, { "epoch": 0.2735562310030395, "grad_norm": 1.0799494981765747, "learning_rate": 0.0005, "loss": 4.930194854736328, "step": 450 }, { "epoch": 0.2741641337386018, "grad_norm": 1.108874797821045, "learning_rate": 0.0005, "loss": 4.798364162445068, "step": 451 }, { "epoch": 0.27477203647416415, "grad_norm": 1.023545742034912, "learning_rate": 0.0005, "loss": 4.951462745666504, "step": 452 }, { "epoch": 0.2753799392097264, "grad_norm": 1.109633207321167, "learning_rate": 0.0005, "loss": 4.775464057922363, "step": 453 }, { "epoch": 0.27598784194528875, "grad_norm": 1.3409186601638794, "learning_rate": 0.0005, "loss": 4.637991905212402, "step": 454 }, { "epoch": 0.2765957446808511, "grad_norm": 1.3562052249908447, "learning_rate": 0.0005, "loss": 4.67308235168457, "step": 455 }, { "epoch": 0.2772036474164134, "grad_norm": 1.0121145248413086, "learning_rate": 0.0005, "loss": 4.8010430335998535, "step": 456 }, { "epoch": 0.2778115501519757, "grad_norm": 1.1394174098968506, "learning_rate": 0.0005, "loss": 4.878546237945557, "step": 457 }, { "epoch": 0.278419452887538, "grad_norm": 1.2403444051742554, "learning_rate": 0.0005, "loss": 4.8740434646606445, "step": 458 }, { "epoch": 0.2790273556231003, "grad_norm": 1.242672085762024, "learning_rate": 0.0005, "loss": 4.854490280151367, "step": 459 }, { "epoch": 0.2796352583586626, "grad_norm": 1.1986356973648071, "learning_rate": 0.0005, "loss": 4.629700660705566, "step": 460 }, { "epoch": 0.2802431610942249, "grad_norm": 1.0786645412445068, "learning_rate": 0.0005, "loss": 4.87874698638916, "step": 461 }, { "epoch": 0.28085106382978725, "grad_norm": 1.1056885719299316, "learning_rate": 0.0005, "loss": 4.816555023193359, "step": 462 }, { "epoch": 0.2814589665653495, "grad_norm": 1.2329976558685303, "learning_rate": 0.0005, "loss": 4.837638854980469, "step": 463 }, { "epoch": 0.28206686930091185, "grad_norm": 1.0028218030929565, "learning_rate": 0.0005, "loss": 4.760637283325195, "step": 464 }, { "epoch": 0.2826747720364742, "grad_norm": 2.1149895191192627, "learning_rate": 0.0005, "loss": 4.90034818649292, "step": 465 }, { "epoch": 0.28328267477203645, "grad_norm": 1.1582082509994507, "learning_rate": 0.0005, "loss": 4.943870544433594, "step": 466 }, { "epoch": 0.2838905775075988, "grad_norm": 1.069417119026184, "learning_rate": 0.0005, "loss": 4.872045993804932, "step": 467 }, { "epoch": 0.2844984802431611, "grad_norm": 1.0112608671188354, "learning_rate": 0.0005, "loss": 4.7598490715026855, "step": 468 }, { "epoch": 0.2851063829787234, "grad_norm": 1.2075181007385254, "learning_rate": 0.0005, "loss": 4.731328010559082, "step": 469 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2083991765975952, "learning_rate": 0.0005, "loss": 4.927289962768555, "step": 470 }, { "epoch": 0.286322188449848, "grad_norm": 1.1168643236160278, "learning_rate": 0.0005, "loss": 4.864751815795898, "step": 471 }, { "epoch": 0.28693009118541035, "grad_norm": 1.078041434288025, "learning_rate": 0.0005, "loss": 4.8492431640625, "step": 472 }, { "epoch": 0.2875379939209726, "grad_norm": 1.1274940967559814, "learning_rate": 0.0005, "loss": 4.937112808227539, "step": 473 }, { "epoch": 0.28814589665653495, "grad_norm": 1.0653259754180908, "learning_rate": 0.0005, "loss": 4.594569683074951, "step": 474 }, { "epoch": 0.2887537993920973, "grad_norm": 1.1258432865142822, "learning_rate": 0.0005, "loss": 4.773998260498047, "step": 475 }, { "epoch": 0.28936170212765955, "grad_norm": 1.0394357442855835, "learning_rate": 0.0005, "loss": 4.6821393966674805, "step": 476 }, { "epoch": 0.2899696048632219, "grad_norm": 0.9899529218673706, "learning_rate": 0.0005, "loss": 4.887704849243164, "step": 477 }, { "epoch": 0.2905775075987842, "grad_norm": 1.1077382564544678, "learning_rate": 0.0005, "loss": 4.747071266174316, "step": 478 }, { "epoch": 0.29118541033434653, "grad_norm": 1.1913772821426392, "learning_rate": 0.0005, "loss": 4.718881607055664, "step": 479 }, { "epoch": 0.2917933130699088, "grad_norm": 1.0459861755371094, "learning_rate": 0.0005, "loss": 4.841939926147461, "step": 480 }, { "epoch": 0.2924012158054711, "grad_norm": 1.0120186805725098, "learning_rate": 0.0005, "loss": 4.599112510681152, "step": 481 }, { "epoch": 0.29300911854103345, "grad_norm": 1.195823073387146, "learning_rate": 0.0005, "loss": 4.728496551513672, "step": 482 }, { "epoch": 0.2936170212765957, "grad_norm": 1.3696142435073853, "learning_rate": 0.0005, "loss": 4.8885321617126465, "step": 483 }, { "epoch": 0.29422492401215805, "grad_norm": 1.0792248249053955, "learning_rate": 0.0005, "loss": 4.971987724304199, "step": 484 }, { "epoch": 0.2948328267477204, "grad_norm": 1.1619709730148315, "learning_rate": 0.0005, "loss": 4.571520805358887, "step": 485 }, { "epoch": 0.29544072948328265, "grad_norm": 1.0330854654312134, "learning_rate": 0.0005, "loss": 4.9688520431518555, "step": 486 }, { "epoch": 0.296048632218845, "grad_norm": 1.0170172452926636, "learning_rate": 0.0005, "loss": 4.837705135345459, "step": 487 }, { "epoch": 0.2966565349544073, "grad_norm": 0.9504514932632446, "learning_rate": 0.0005, "loss": 4.930578231811523, "step": 488 }, { "epoch": 0.29726443768996963, "grad_norm": 1.0397839546203613, "learning_rate": 0.0005, "loss": 4.835279941558838, "step": 489 }, { "epoch": 0.2978723404255319, "grad_norm": 1.1507797241210938, "learning_rate": 0.0005, "loss": 4.659822463989258, "step": 490 }, { "epoch": 0.2984802431610942, "grad_norm": 1.0850329399108887, "learning_rate": 0.0005, "loss": 4.845378875732422, "step": 491 }, { "epoch": 0.29908814589665655, "grad_norm": 0.9977235794067383, "learning_rate": 0.0005, "loss": 4.6792449951171875, "step": 492 }, { "epoch": 0.2996960486322188, "grad_norm": 1.1023447513580322, "learning_rate": 0.0005, "loss": 4.397878646850586, "step": 493 }, { "epoch": 0.30030395136778115, "grad_norm": 1.151859998703003, "learning_rate": 0.0005, "loss": 4.909426689147949, "step": 494 }, { "epoch": 0.3009118541033435, "grad_norm": 0.9461018443107605, "learning_rate": 0.0005, "loss": 4.778614044189453, "step": 495 }, { "epoch": 0.30151975683890575, "grad_norm": 1.0753334760665894, "learning_rate": 0.0005, "loss": 4.747906684875488, "step": 496 }, { "epoch": 0.3021276595744681, "grad_norm": 1.1790133714675903, "learning_rate": 0.0005, "loss": 4.932548522949219, "step": 497 }, { "epoch": 0.3027355623100304, "grad_norm": 0.9537319540977478, "learning_rate": 0.0005, "loss": 4.962670803070068, "step": 498 }, { "epoch": 0.30334346504559273, "grad_norm": 1.0915073156356812, "learning_rate": 0.0005, "loss": 4.60493278503418, "step": 499 }, { "epoch": 0.303951367781155, "grad_norm": 1.1177006959915161, "learning_rate": 0.0005, "loss": 4.69853401184082, "step": 500 }, { "epoch": 0.30455927051671733, "grad_norm": 1.297899842262268, "learning_rate": 0.0005, "loss": 4.779489517211914, "step": 501 }, { "epoch": 0.30516717325227966, "grad_norm": 1.0834105014801025, "learning_rate": 0.0005, "loss": 4.795891761779785, "step": 502 }, { "epoch": 0.3057750759878419, "grad_norm": 1.345795750617981, "learning_rate": 0.0005, "loss": 4.725937843322754, "step": 503 }, { "epoch": 0.30638297872340425, "grad_norm": 1.0314546823501587, "learning_rate": 0.0005, "loss": 4.679283142089844, "step": 504 }, { "epoch": 0.3069908814589666, "grad_norm": 1.0348689556121826, "learning_rate": 0.0005, "loss": 4.620650291442871, "step": 505 }, { "epoch": 0.30759878419452885, "grad_norm": 1.266882061958313, "learning_rate": 0.0005, "loss": 4.773314476013184, "step": 506 }, { "epoch": 0.3082066869300912, "grad_norm": 1.1243505477905273, "learning_rate": 0.0005, "loss": 4.748200416564941, "step": 507 }, { "epoch": 0.3088145896656535, "grad_norm": 1.1018924713134766, "learning_rate": 0.0005, "loss": 4.68126106262207, "step": 508 }, { "epoch": 0.30942249240121583, "grad_norm": 0.9563927054405212, "learning_rate": 0.0005, "loss": 4.857057094573975, "step": 509 }, { "epoch": 0.3100303951367781, "grad_norm": 0.9670454263687134, "learning_rate": 0.0005, "loss": 4.659792900085449, "step": 510 }, { "epoch": 0.31063829787234043, "grad_norm": 1.3360145092010498, "learning_rate": 0.0005, "loss": 4.829246520996094, "step": 511 }, { "epoch": 0.31124620060790276, "grad_norm": 1.2123932838439941, "learning_rate": 0.0005, "loss": 4.866283416748047, "step": 512 }, { "epoch": 0.31185410334346503, "grad_norm": 1.1718541383743286, "learning_rate": 0.0005, "loss": 4.582745552062988, "step": 513 }, { "epoch": 0.31246200607902735, "grad_norm": 1.0925103425979614, "learning_rate": 0.0005, "loss": 4.792252540588379, "step": 514 }, { "epoch": 0.3130699088145897, "grad_norm": 1.1929430961608887, "learning_rate": 0.0005, "loss": 5.072274208068848, "step": 515 }, { "epoch": 0.31367781155015195, "grad_norm": 1.1033862829208374, "learning_rate": 0.0005, "loss": 5.100406646728516, "step": 516 }, { "epoch": 0.3142857142857143, "grad_norm": 1.0984266996383667, "learning_rate": 0.0005, "loss": 4.652458190917969, "step": 517 }, { "epoch": 0.3148936170212766, "grad_norm": 1.1322665214538574, "learning_rate": 0.0005, "loss": 4.757636070251465, "step": 518 }, { "epoch": 0.31550151975683893, "grad_norm": 1.062367558479309, "learning_rate": 0.0005, "loss": 4.769024848937988, "step": 519 }, { "epoch": 0.3161094224924012, "grad_norm": 1.2141786813735962, "learning_rate": 0.0005, "loss": 4.795253753662109, "step": 520 }, { "epoch": 0.31671732522796353, "grad_norm": 1.0612986087799072, "learning_rate": 0.0005, "loss": 4.869831562042236, "step": 521 }, { "epoch": 0.31732522796352586, "grad_norm": 1.0063875913619995, "learning_rate": 0.0005, "loss": 4.789008617401123, "step": 522 }, { "epoch": 0.31793313069908813, "grad_norm": 1.1345361471176147, "learning_rate": 0.0005, "loss": 4.858623504638672, "step": 523 }, { "epoch": 0.31854103343465046, "grad_norm": 1.0883427858352661, "learning_rate": 0.0005, "loss": 4.6939568519592285, "step": 524 }, { "epoch": 0.3191489361702128, "grad_norm": 1.210877776145935, "learning_rate": 0.0005, "loss": 4.860000133514404, "step": 525 }, { "epoch": 0.31975683890577505, "grad_norm": 0.9779753088951111, "learning_rate": 0.0005, "loss": 4.710822582244873, "step": 526 }, { "epoch": 0.3203647416413374, "grad_norm": 1.130603313446045, "learning_rate": 0.0005, "loss": 4.8572678565979, "step": 527 }, { "epoch": 0.3209726443768997, "grad_norm": 1.0674115419387817, "learning_rate": 0.0005, "loss": 4.597178936004639, "step": 528 }, { "epoch": 0.321580547112462, "grad_norm": 1.2021600008010864, "learning_rate": 0.0005, "loss": 4.564465045928955, "step": 529 }, { "epoch": 0.3221884498480243, "grad_norm": 1.018747329711914, "learning_rate": 0.0005, "loss": 4.791827201843262, "step": 530 }, { "epoch": 0.32279635258358663, "grad_norm": 0.847745418548584, "learning_rate": 0.0005, "loss": 4.538583278656006, "step": 531 }, { "epoch": 0.32340425531914896, "grad_norm": 1.0722301006317139, "learning_rate": 0.0005, "loss": 4.728479385375977, "step": 532 }, { "epoch": 0.32401215805471123, "grad_norm": 1.0908275842666626, "learning_rate": 0.0005, "loss": 4.7406721115112305, "step": 533 }, { "epoch": 0.32462006079027356, "grad_norm": 1.0944693088531494, "learning_rate": 0.0005, "loss": 4.56569242477417, "step": 534 }, { "epoch": 0.3252279635258359, "grad_norm": 1.2364919185638428, "learning_rate": 0.0005, "loss": 4.977725028991699, "step": 535 }, { "epoch": 0.32583586626139815, "grad_norm": 0.9999113082885742, "learning_rate": 0.0005, "loss": 4.493361473083496, "step": 536 }, { "epoch": 0.3264437689969605, "grad_norm": 1.3366332054138184, "learning_rate": 0.0005, "loss": 4.634256362915039, "step": 537 }, { "epoch": 0.3270516717325228, "grad_norm": 1.1342191696166992, "learning_rate": 0.0005, "loss": 4.737150192260742, "step": 538 }, { "epoch": 0.3276595744680851, "grad_norm": 1.582653284072876, "learning_rate": 0.0005, "loss": 4.870404243469238, "step": 539 }, { "epoch": 0.3282674772036474, "grad_norm": 1.1713464260101318, "learning_rate": 0.0005, "loss": 4.6230669021606445, "step": 540 }, { "epoch": 0.32887537993920973, "grad_norm": 1.4178698062896729, "learning_rate": 0.0005, "loss": 4.764198303222656, "step": 541 }, { "epoch": 0.32948328267477206, "grad_norm": 1.2060075998306274, "learning_rate": 0.0005, "loss": 4.675044059753418, "step": 542 }, { "epoch": 0.33009118541033433, "grad_norm": 1.1698312759399414, "learning_rate": 0.0005, "loss": 4.706038475036621, "step": 543 }, { "epoch": 0.33069908814589666, "grad_norm": 1.23035728931427, "learning_rate": 0.0005, "loss": 4.638150215148926, "step": 544 }, { "epoch": 0.331306990881459, "grad_norm": 1.2109099626541138, "learning_rate": 0.0005, "loss": 4.521143436431885, "step": 545 }, { "epoch": 0.33191489361702126, "grad_norm": 1.0906360149383545, "learning_rate": 0.0005, "loss": 4.71769380569458, "step": 546 }, { "epoch": 0.3325227963525836, "grad_norm": 0.9782645106315613, "learning_rate": 0.0005, "loss": 4.610015869140625, "step": 547 }, { "epoch": 0.3331306990881459, "grad_norm": 0.9349035620689392, "learning_rate": 0.0005, "loss": 4.59166955947876, "step": 548 }, { "epoch": 0.3337386018237082, "grad_norm": 0.987219512462616, "learning_rate": 0.0005, "loss": 4.769125938415527, "step": 549 }, { "epoch": 0.3343465045592705, "grad_norm": 1.1204229593276978, "learning_rate": 0.0005, "loss": 4.561359405517578, "step": 550 }, { "epoch": 0.33495440729483283, "grad_norm": 0.9658718109130859, "learning_rate": 0.0005, "loss": 4.64151668548584, "step": 551 }, { "epoch": 0.33556231003039516, "grad_norm": 0.9612642526626587, "learning_rate": 0.0005, "loss": 4.750694274902344, "step": 552 }, { "epoch": 0.33617021276595743, "grad_norm": 1.215868592262268, "learning_rate": 0.0005, "loss": 4.788500785827637, "step": 553 }, { "epoch": 0.33677811550151976, "grad_norm": 1.1488007307052612, "learning_rate": 0.0005, "loss": 4.708594799041748, "step": 554 }, { "epoch": 0.3373860182370821, "grad_norm": 1.7407371997833252, "learning_rate": 0.0005, "loss": 4.751000881195068, "step": 555 }, { "epoch": 0.33799392097264436, "grad_norm": 1.0364381074905396, "learning_rate": 0.0005, "loss": 4.5454301834106445, "step": 556 }, { "epoch": 0.3386018237082067, "grad_norm": 1.0255850553512573, "learning_rate": 0.0005, "loss": 4.67049503326416, "step": 557 }, { "epoch": 0.339209726443769, "grad_norm": 1.1722489595413208, "learning_rate": 0.0005, "loss": 4.762301445007324, "step": 558 }, { "epoch": 0.3398176291793313, "grad_norm": 0.9487795829772949, "learning_rate": 0.0005, "loss": 4.537074089050293, "step": 559 }, { "epoch": 0.3404255319148936, "grad_norm": 1.0322198867797852, "learning_rate": 0.0005, "loss": 4.325550079345703, "step": 560 }, { "epoch": 0.34103343465045594, "grad_norm": 1.1969901323318481, "learning_rate": 0.0005, "loss": 4.897404670715332, "step": 561 }, { "epoch": 0.34164133738601826, "grad_norm": 0.9366703629493713, "learning_rate": 0.0005, "loss": 4.552170753479004, "step": 562 }, { "epoch": 0.34224924012158053, "grad_norm": 0.9916586875915527, "learning_rate": 0.0005, "loss": 4.596172332763672, "step": 563 }, { "epoch": 0.34285714285714286, "grad_norm": 1.1367878913879395, "learning_rate": 0.0005, "loss": 4.745723724365234, "step": 564 }, { "epoch": 0.3434650455927052, "grad_norm": 1.0490455627441406, "learning_rate": 0.0005, "loss": 4.605084419250488, "step": 565 }, { "epoch": 0.34407294832826746, "grad_norm": 1.2300151586532593, "learning_rate": 0.0005, "loss": 4.680173397064209, "step": 566 }, { "epoch": 0.3446808510638298, "grad_norm": 0.9747954607009888, "learning_rate": 0.0005, "loss": 4.755300521850586, "step": 567 }, { "epoch": 0.3452887537993921, "grad_norm": 1.2195698022842407, "learning_rate": 0.0005, "loss": 4.678683280944824, "step": 568 }, { "epoch": 0.3458966565349544, "grad_norm": 1.1122758388519287, "learning_rate": 0.0005, "loss": 4.55827522277832, "step": 569 }, { "epoch": 0.3465045592705167, "grad_norm": 1.1671665906906128, "learning_rate": 0.0005, "loss": 4.6204071044921875, "step": 570 }, { "epoch": 0.34711246200607904, "grad_norm": 0.912133514881134, "learning_rate": 0.0005, "loss": 4.619932174682617, "step": 571 }, { "epoch": 0.34772036474164136, "grad_norm": 1.0673686265945435, "learning_rate": 0.0005, "loss": 4.7417120933532715, "step": 572 }, { "epoch": 0.34832826747720363, "grad_norm": 1.0796691179275513, "learning_rate": 0.0005, "loss": 4.666133880615234, "step": 573 }, { "epoch": 0.34893617021276596, "grad_norm": 1.177518367767334, "learning_rate": 0.0005, "loss": 4.443113803863525, "step": 574 }, { "epoch": 0.3495440729483283, "grad_norm": 0.9157246351242065, "learning_rate": 0.0005, "loss": 4.578097343444824, "step": 575 }, { "epoch": 0.35015197568389056, "grad_norm": 1.034294843673706, "learning_rate": 0.0005, "loss": 4.393146514892578, "step": 576 }, { "epoch": 0.3507598784194529, "grad_norm": 0.9026995301246643, "learning_rate": 0.0005, "loss": 4.868537425994873, "step": 577 }, { "epoch": 0.3513677811550152, "grad_norm": 1.1576241254806519, "learning_rate": 0.0005, "loss": 4.755158424377441, "step": 578 }, { "epoch": 0.3519756838905775, "grad_norm": 1.061812400817871, "learning_rate": 0.0005, "loss": 4.48585319519043, "step": 579 }, { "epoch": 0.3525835866261398, "grad_norm": 0.9842910170555115, "learning_rate": 0.0005, "loss": 4.865891456604004, "step": 580 }, { "epoch": 0.35319148936170214, "grad_norm": 1.0243335962295532, "learning_rate": 0.0005, "loss": 4.523388862609863, "step": 581 }, { "epoch": 0.35379939209726446, "grad_norm": 1.2581957578659058, "learning_rate": 0.0005, "loss": 4.821706771850586, "step": 582 }, { "epoch": 0.35440729483282674, "grad_norm": 1.1777689456939697, "learning_rate": 0.0005, "loss": 4.600160121917725, "step": 583 }, { "epoch": 0.35501519756838906, "grad_norm": 0.9623486995697021, "learning_rate": 0.0005, "loss": 4.775470733642578, "step": 584 }, { "epoch": 0.3556231003039514, "grad_norm": 1.302804708480835, "learning_rate": 0.0005, "loss": 4.704485893249512, "step": 585 }, { "epoch": 0.35623100303951366, "grad_norm": 1.15083646774292, "learning_rate": 0.0005, "loss": 4.685108184814453, "step": 586 }, { "epoch": 0.356838905775076, "grad_norm": 1.0529240369796753, "learning_rate": 0.0005, "loss": 4.762598991394043, "step": 587 }, { "epoch": 0.3574468085106383, "grad_norm": 1.008600115776062, "learning_rate": 0.0005, "loss": 4.711298942565918, "step": 588 }, { "epoch": 0.3580547112462006, "grad_norm": 1.1591368913650513, "learning_rate": 0.0005, "loss": 4.836706638336182, "step": 589 }, { "epoch": 0.3586626139817629, "grad_norm": 1.0372366905212402, "learning_rate": 0.0005, "loss": 4.753532409667969, "step": 590 }, { "epoch": 0.35927051671732524, "grad_norm": 0.9533773064613342, "learning_rate": 0.0005, "loss": 4.787997245788574, "step": 591 }, { "epoch": 0.35987841945288757, "grad_norm": 1.3395041227340698, "learning_rate": 0.0005, "loss": 4.700077533721924, "step": 592 }, { "epoch": 0.36048632218844984, "grad_norm": 1.0645594596862793, "learning_rate": 0.0005, "loss": 4.607672691345215, "step": 593 }, { "epoch": 0.36109422492401216, "grad_norm": 1.2142505645751953, "learning_rate": 0.0005, "loss": 4.6179375648498535, "step": 594 }, { "epoch": 0.3617021276595745, "grad_norm": 1.2730581760406494, "learning_rate": 0.0005, "loss": 4.555119514465332, "step": 595 }, { "epoch": 0.36231003039513676, "grad_norm": 1.0680732727050781, "learning_rate": 0.0005, "loss": 4.700529098510742, "step": 596 }, { "epoch": 0.3629179331306991, "grad_norm": 1.055757761001587, "learning_rate": 0.0005, "loss": 4.544746398925781, "step": 597 }, { "epoch": 0.3635258358662614, "grad_norm": 1.2012107372283936, "learning_rate": 0.0005, "loss": 4.614580154418945, "step": 598 }, { "epoch": 0.3641337386018237, "grad_norm": 1.0662033557891846, "learning_rate": 0.0005, "loss": 4.880558967590332, "step": 599 }, { "epoch": 0.364741641337386, "grad_norm": 1.0305242538452148, "learning_rate": 0.0005, "loss": 4.462358474731445, "step": 600 }, { "epoch": 0.36534954407294834, "grad_norm": 1.0423706769943237, "learning_rate": 0.0005, "loss": 4.591382026672363, "step": 601 }, { "epoch": 0.3659574468085106, "grad_norm": 1.2076576948165894, "learning_rate": 0.0005, "loss": 4.7383599281311035, "step": 602 }, { "epoch": 0.36656534954407294, "grad_norm": 1.0415648221969604, "learning_rate": 0.0005, "loss": 4.586676597595215, "step": 603 }, { "epoch": 0.36717325227963526, "grad_norm": 0.9548492431640625, "learning_rate": 0.0005, "loss": 4.836339950561523, "step": 604 }, { "epoch": 0.3677811550151976, "grad_norm": 1.1116399765014648, "learning_rate": 0.0005, "loss": 4.634486198425293, "step": 605 }, { "epoch": 0.36838905775075986, "grad_norm": 0.9329056739807129, "learning_rate": 0.0005, "loss": 4.806420803070068, "step": 606 }, { "epoch": 0.3689969604863222, "grad_norm": 1.167823314666748, "learning_rate": 0.0005, "loss": 4.594254493713379, "step": 607 }, { "epoch": 0.3696048632218845, "grad_norm": 1.0034370422363281, "learning_rate": 0.0005, "loss": 4.6151347160339355, "step": 608 }, { "epoch": 0.3702127659574468, "grad_norm": 1.0906440019607544, "learning_rate": 0.0005, "loss": 4.540549278259277, "step": 609 }, { "epoch": 0.3708206686930091, "grad_norm": 1.0491790771484375, "learning_rate": 0.0005, "loss": 4.600298881530762, "step": 610 }, { "epoch": 0.37142857142857144, "grad_norm": 1.2935380935668945, "learning_rate": 0.0005, "loss": 4.646307945251465, "step": 611 }, { "epoch": 0.3720364741641337, "grad_norm": 1.1572242975234985, "learning_rate": 0.0005, "loss": 4.820685863494873, "step": 612 }, { "epoch": 0.37264437689969604, "grad_norm": 1.0526167154312134, "learning_rate": 0.0005, "loss": 4.463221549987793, "step": 613 }, { "epoch": 0.37325227963525837, "grad_norm": 1.0142046213150024, "learning_rate": 0.0005, "loss": 4.979160308837891, "step": 614 }, { "epoch": 0.3738601823708207, "grad_norm": 1.0886595249176025, "learning_rate": 0.0005, "loss": 4.659153461456299, "step": 615 }, { "epoch": 0.37446808510638296, "grad_norm": 1.0294383764266968, "learning_rate": 0.0005, "loss": 4.511576175689697, "step": 616 }, { "epoch": 0.3750759878419453, "grad_norm": 1.220738172531128, "learning_rate": 0.0005, "loss": 4.640242576599121, "step": 617 }, { "epoch": 0.3756838905775076, "grad_norm": 0.976274311542511, "learning_rate": 0.0005, "loss": 4.557078838348389, "step": 618 }, { "epoch": 0.3762917933130699, "grad_norm": 1.1121824979782104, "learning_rate": 0.0005, "loss": 4.412234306335449, "step": 619 }, { "epoch": 0.3768996960486322, "grad_norm": 1.0940440893173218, "learning_rate": 0.0005, "loss": 4.597440242767334, "step": 620 }, { "epoch": 0.37750759878419454, "grad_norm": 1.1758757829666138, "learning_rate": 0.0005, "loss": 4.729987144470215, "step": 621 }, { "epoch": 0.3781155015197568, "grad_norm": 0.979016900062561, "learning_rate": 0.0005, "loss": 4.656641960144043, "step": 622 }, { "epoch": 0.37872340425531914, "grad_norm": 1.1017565727233887, "learning_rate": 0.0005, "loss": 4.587738037109375, "step": 623 }, { "epoch": 0.37933130699088147, "grad_norm": 1.0581464767456055, "learning_rate": 0.0005, "loss": 4.452451705932617, "step": 624 }, { "epoch": 0.3799392097264438, "grad_norm": 1.0750993490219116, "learning_rate": 0.0005, "loss": 4.531889915466309, "step": 625 }, { "epoch": 0.38054711246200607, "grad_norm": 0.9821625351905823, "learning_rate": 0.0005, "loss": 4.488890171051025, "step": 626 }, { "epoch": 0.3811550151975684, "grad_norm": 1.0691367387771606, "learning_rate": 0.0005, "loss": 4.62428617477417, "step": 627 }, { "epoch": 0.3817629179331307, "grad_norm": 1.0314120054244995, "learning_rate": 0.0005, "loss": 4.533023834228516, "step": 628 }, { "epoch": 0.382370820668693, "grad_norm": 0.9268558025360107, "learning_rate": 0.0005, "loss": 4.565212249755859, "step": 629 }, { "epoch": 0.3829787234042553, "grad_norm": 1.0632472038269043, "learning_rate": 0.0005, "loss": 4.5511980056762695, "step": 630 }, { "epoch": 0.38358662613981764, "grad_norm": 0.9516937732696533, "learning_rate": 0.0005, "loss": 4.546860694885254, "step": 631 }, { "epoch": 0.3841945288753799, "grad_norm": 0.8885926008224487, "learning_rate": 0.0005, "loss": 4.540233612060547, "step": 632 }, { "epoch": 0.38480243161094224, "grad_norm": 0.9631567001342773, "learning_rate": 0.0005, "loss": 4.552545070648193, "step": 633 }, { "epoch": 0.38541033434650457, "grad_norm": 1.0189249515533447, "learning_rate": 0.0005, "loss": 4.413745880126953, "step": 634 }, { "epoch": 0.3860182370820669, "grad_norm": 1.0094175338745117, "learning_rate": 0.0005, "loss": 4.266282081604004, "step": 635 }, { "epoch": 0.38662613981762917, "grad_norm": 1.1108192205429077, "learning_rate": 0.0005, "loss": 4.169710159301758, "step": 636 }, { "epoch": 0.3872340425531915, "grad_norm": 1.1999133825302124, "learning_rate": 0.0005, "loss": 4.5471391677856445, "step": 637 }, { "epoch": 0.3878419452887538, "grad_norm": 1.047059178352356, "learning_rate": 0.0005, "loss": 4.793215751647949, "step": 638 }, { "epoch": 0.3884498480243161, "grad_norm": 1.1927613019943237, "learning_rate": 0.0005, "loss": 4.474370002746582, "step": 639 }, { "epoch": 0.3890577507598784, "grad_norm": 1.0722092390060425, "learning_rate": 0.0005, "loss": 4.685356140136719, "step": 640 }, { "epoch": 0.38966565349544074, "grad_norm": 1.0422673225402832, "learning_rate": 0.0005, "loss": 4.5289201736450195, "step": 641 }, { "epoch": 0.390273556231003, "grad_norm": 0.9556507468223572, "learning_rate": 0.0005, "loss": 4.421667098999023, "step": 642 }, { "epoch": 0.39088145896656534, "grad_norm": 1.0354868173599243, "learning_rate": 0.0005, "loss": 4.573639869689941, "step": 643 }, { "epoch": 0.39148936170212767, "grad_norm": 1.0089163780212402, "learning_rate": 0.0005, "loss": 4.505742073059082, "step": 644 }, { "epoch": 0.39209726443769, "grad_norm": 1.098516821861267, "learning_rate": 0.0005, "loss": 4.61726713180542, "step": 645 }, { "epoch": 0.39270516717325227, "grad_norm": 1.0022438764572144, "learning_rate": 0.0005, "loss": 4.8146162033081055, "step": 646 }, { "epoch": 0.3933130699088146, "grad_norm": 1.219514012336731, "learning_rate": 0.0005, "loss": 4.5992279052734375, "step": 647 }, { "epoch": 0.3939209726443769, "grad_norm": 1.0511285066604614, "learning_rate": 0.0005, "loss": 4.65933895111084, "step": 648 }, { "epoch": 0.3945288753799392, "grad_norm": 1.0481231212615967, "learning_rate": 0.0005, "loss": 4.405591011047363, "step": 649 }, { "epoch": 0.3951367781155015, "grad_norm": 1.1169630289077759, "learning_rate": 0.0005, "loss": 4.621652603149414, "step": 650 }, { "epoch": 0.39574468085106385, "grad_norm": 1.031966209411621, "learning_rate": 0.0005, "loss": 4.5710320472717285, "step": 651 }, { "epoch": 0.3963525835866261, "grad_norm": 1.1107763051986694, "learning_rate": 0.0005, "loss": 4.537693023681641, "step": 652 }, { "epoch": 0.39696048632218844, "grad_norm": 0.9889346957206726, "learning_rate": 0.0005, "loss": 4.518610000610352, "step": 653 }, { "epoch": 0.39756838905775077, "grad_norm": 1.1640068292617798, "learning_rate": 0.0005, "loss": 4.595146179199219, "step": 654 }, { "epoch": 0.3981762917933131, "grad_norm": 1.2929025888442993, "learning_rate": 0.0005, "loss": 4.559798240661621, "step": 655 }, { "epoch": 0.39878419452887537, "grad_norm": 1.098781943321228, "learning_rate": 0.0005, "loss": 4.602121353149414, "step": 656 }, { "epoch": 0.3993920972644377, "grad_norm": 1.0199748277664185, "learning_rate": 0.0005, "loss": 4.460375785827637, "step": 657 }, { "epoch": 0.4, "grad_norm": 1.4516689777374268, "learning_rate": 0.0005, "loss": 4.583429336547852, "step": 658 }, { "epoch": 0.4006079027355623, "grad_norm": 1.0523816347122192, "learning_rate": 0.0005, "loss": 4.602944374084473, "step": 659 }, { "epoch": 0.4012158054711246, "grad_norm": 1.052711844444275, "learning_rate": 0.0005, "loss": 4.508934020996094, "step": 660 }, { "epoch": 0.40182370820668695, "grad_norm": 1.0846177339553833, "learning_rate": 0.0005, "loss": 4.532805442810059, "step": 661 }, { "epoch": 0.4024316109422492, "grad_norm": 0.9877490401268005, "learning_rate": 0.0005, "loss": 4.644316673278809, "step": 662 }, { "epoch": 0.40303951367781155, "grad_norm": 1.04659104347229, "learning_rate": 0.0005, "loss": 4.376730918884277, "step": 663 }, { "epoch": 0.40364741641337387, "grad_norm": 1.250658392906189, "learning_rate": 0.0005, "loss": 4.553335666656494, "step": 664 }, { "epoch": 0.40425531914893614, "grad_norm": 1.1647439002990723, "learning_rate": 0.0005, "loss": 4.282361030578613, "step": 665 }, { "epoch": 0.40486322188449847, "grad_norm": 1.086575984954834, "learning_rate": 0.0005, "loss": 4.545602798461914, "step": 666 }, { "epoch": 0.4054711246200608, "grad_norm": 1.0094430446624756, "learning_rate": 0.0005, "loss": 4.514423370361328, "step": 667 }, { "epoch": 0.4060790273556231, "grad_norm": 1.1341593265533447, "learning_rate": 0.0005, "loss": 4.359306812286377, "step": 668 }, { "epoch": 0.4066869300911854, "grad_norm": 1.0556292533874512, "learning_rate": 0.0005, "loss": 4.663166046142578, "step": 669 }, { "epoch": 0.4072948328267477, "grad_norm": 0.9918414950370789, "learning_rate": 0.0005, "loss": 4.348359107971191, "step": 670 }, { "epoch": 0.40790273556231005, "grad_norm": 1.2771086692810059, "learning_rate": 0.0005, "loss": 4.380928993225098, "step": 671 }, { "epoch": 0.4085106382978723, "grad_norm": 1.2792952060699463, "learning_rate": 0.0005, "loss": 4.493129253387451, "step": 672 }, { "epoch": 0.40911854103343465, "grad_norm": 1.115451693534851, "learning_rate": 0.0005, "loss": 4.5493903160095215, "step": 673 }, { "epoch": 0.409726443768997, "grad_norm": 1.02188241481781, "learning_rate": 0.0005, "loss": 4.540634632110596, "step": 674 }, { "epoch": 0.41033434650455924, "grad_norm": 1.1881492137908936, "learning_rate": 0.0005, "loss": 4.6216325759887695, "step": 675 }, { "epoch": 0.41094224924012157, "grad_norm": 1.1510716676712036, "learning_rate": 0.0005, "loss": 4.753006935119629, "step": 676 }, { "epoch": 0.4115501519756839, "grad_norm": 0.9409204125404358, "learning_rate": 0.0005, "loss": 4.558671951293945, "step": 677 }, { "epoch": 0.4121580547112462, "grad_norm": 0.9652894735336304, "learning_rate": 0.0005, "loss": 4.586430549621582, "step": 678 }, { "epoch": 0.4127659574468085, "grad_norm": 1.0625907182693481, "learning_rate": 0.0005, "loss": 4.467252254486084, "step": 679 }, { "epoch": 0.4133738601823708, "grad_norm": 1.078682780265808, "learning_rate": 0.0005, "loss": 4.66164493560791, "step": 680 }, { "epoch": 0.41398176291793315, "grad_norm": 1.0304362773895264, "learning_rate": 0.0005, "loss": 4.765620231628418, "step": 681 }, { "epoch": 0.4145896656534954, "grad_norm": 0.9225407242774963, "learning_rate": 0.0005, "loss": 4.550148010253906, "step": 682 }, { "epoch": 0.41519756838905775, "grad_norm": 1.0196508169174194, "learning_rate": 0.0005, "loss": 4.9098100662231445, "step": 683 }, { "epoch": 0.4158054711246201, "grad_norm": 0.9961191415786743, "learning_rate": 0.0005, "loss": 4.4087114334106445, "step": 684 }, { "epoch": 0.41641337386018235, "grad_norm": 1.0987764596939087, "learning_rate": 0.0005, "loss": 4.60486364364624, "step": 685 }, { "epoch": 0.41702127659574467, "grad_norm": 1.3485429286956787, "learning_rate": 0.0005, "loss": 4.509698390960693, "step": 686 }, { "epoch": 0.417629179331307, "grad_norm": 1.0834795236587524, "learning_rate": 0.0005, "loss": 4.131223678588867, "step": 687 }, { "epoch": 0.4182370820668693, "grad_norm": 1.2778581380844116, "learning_rate": 0.0005, "loss": 4.530914306640625, "step": 688 }, { "epoch": 0.4188449848024316, "grad_norm": 0.9555144309997559, "learning_rate": 0.0005, "loss": 4.773101806640625, "step": 689 }, { "epoch": 0.4194528875379939, "grad_norm": 1.0608127117156982, "learning_rate": 0.0005, "loss": 4.457843780517578, "step": 690 }, { "epoch": 0.42006079027355625, "grad_norm": 1.2380342483520508, "learning_rate": 0.0005, "loss": 4.438450813293457, "step": 691 }, { "epoch": 0.4206686930091185, "grad_norm": 1.0234472751617432, "learning_rate": 0.0005, "loss": 4.412363052368164, "step": 692 }, { "epoch": 0.42127659574468085, "grad_norm": 1.0774229764938354, "learning_rate": 0.0005, "loss": 4.687466144561768, "step": 693 }, { "epoch": 0.4218844984802432, "grad_norm": 0.9822944402694702, "learning_rate": 0.0005, "loss": 4.798013687133789, "step": 694 }, { "epoch": 0.42249240121580545, "grad_norm": 1.1232951879501343, "learning_rate": 0.0005, "loss": 4.548072814941406, "step": 695 }, { "epoch": 0.4231003039513678, "grad_norm": 1.5027856826782227, "learning_rate": 0.0005, "loss": 4.7048797607421875, "step": 696 }, { "epoch": 0.4237082066869301, "grad_norm": 1.036541223526001, "learning_rate": 0.0005, "loss": 4.6969709396362305, "step": 697 }, { "epoch": 0.4243161094224924, "grad_norm": 1.1823787689208984, "learning_rate": 0.0005, "loss": 4.457941055297852, "step": 698 }, { "epoch": 0.4249240121580547, "grad_norm": 0.9230678081512451, "learning_rate": 0.0005, "loss": 4.421998500823975, "step": 699 }, { "epoch": 0.425531914893617, "grad_norm": 1.7750741243362427, "learning_rate": 0.0005, "loss": 4.76076602935791, "step": 700 }, { "epoch": 0.42613981762917935, "grad_norm": 1.0719808340072632, "learning_rate": 0.0005, "loss": 4.580799102783203, "step": 701 }, { "epoch": 0.4267477203647416, "grad_norm": 1.0799646377563477, "learning_rate": 0.0005, "loss": 4.311610221862793, "step": 702 }, { "epoch": 0.42735562310030395, "grad_norm": 0.8947767019271851, "learning_rate": 0.0005, "loss": 4.4494123458862305, "step": 703 }, { "epoch": 0.4279635258358663, "grad_norm": 1.0298351049423218, "learning_rate": 0.0005, "loss": 4.393129348754883, "step": 704 }, { "epoch": 0.42857142857142855, "grad_norm": 1.098189115524292, "learning_rate": 0.0005, "loss": 4.199446678161621, "step": 705 }, { "epoch": 0.4291793313069909, "grad_norm": 1.112589955329895, "learning_rate": 0.0005, "loss": 4.471273422241211, "step": 706 }, { "epoch": 0.4297872340425532, "grad_norm": 1.2152529954910278, "learning_rate": 0.0005, "loss": 4.727916240692139, "step": 707 }, { "epoch": 0.43039513677811553, "grad_norm": 1.1162065267562866, "learning_rate": 0.0005, "loss": 4.282822132110596, "step": 708 }, { "epoch": 0.4310030395136778, "grad_norm": 1.2259479761123657, "learning_rate": 0.0005, "loss": 4.1524977684021, "step": 709 }, { "epoch": 0.4316109422492401, "grad_norm": 1.0089929103851318, "learning_rate": 0.0005, "loss": 4.150537490844727, "step": 710 }, { "epoch": 0.43221884498480245, "grad_norm": 0.9101129770278931, "learning_rate": 0.0005, "loss": 4.379437446594238, "step": 711 }, { "epoch": 0.4328267477203647, "grad_norm": 0.9849691390991211, "learning_rate": 0.0005, "loss": 4.299429893493652, "step": 712 }, { "epoch": 0.43343465045592705, "grad_norm": 0.9956537485122681, "learning_rate": 0.0005, "loss": 4.439446926116943, "step": 713 }, { "epoch": 0.4340425531914894, "grad_norm": 1.0646576881408691, "learning_rate": 0.0005, "loss": 4.680734634399414, "step": 714 }, { "epoch": 0.43465045592705165, "grad_norm": 1.1268900632858276, "learning_rate": 0.0005, "loss": 4.390021324157715, "step": 715 }, { "epoch": 0.435258358662614, "grad_norm": 1.1238709688186646, "learning_rate": 0.0005, "loss": 4.414492607116699, "step": 716 }, { "epoch": 0.4358662613981763, "grad_norm": 1.0272475481033325, "learning_rate": 0.0005, "loss": 4.48759651184082, "step": 717 }, { "epoch": 0.43647416413373863, "grad_norm": 0.9443128108978271, "learning_rate": 0.0005, "loss": 4.241964340209961, "step": 718 }, { "epoch": 0.4370820668693009, "grad_norm": 0.8795979022979736, "learning_rate": 0.0005, "loss": 4.438322067260742, "step": 719 }, { "epoch": 0.4376899696048632, "grad_norm": 1.0388433933258057, "learning_rate": 0.0005, "loss": 4.499500274658203, "step": 720 }, { "epoch": 0.43829787234042555, "grad_norm": 1.0285965204238892, "learning_rate": 0.0005, "loss": 4.458085060119629, "step": 721 }, { "epoch": 0.4389057750759878, "grad_norm": 1.0486245155334473, "learning_rate": 0.0005, "loss": 4.3121843338012695, "step": 722 }, { "epoch": 0.43951367781155015, "grad_norm": 0.974229633808136, "learning_rate": 0.0005, "loss": 4.484938621520996, "step": 723 }, { "epoch": 0.4401215805471125, "grad_norm": 1.028061032295227, "learning_rate": 0.0005, "loss": 4.343748092651367, "step": 724 }, { "epoch": 0.44072948328267475, "grad_norm": 1.247310757637024, "learning_rate": 0.0005, "loss": 4.43183708190918, "step": 725 }, { "epoch": 0.4413373860182371, "grad_norm": 1.07508385181427, "learning_rate": 0.0005, "loss": 4.473773956298828, "step": 726 }, { "epoch": 0.4419452887537994, "grad_norm": 1.0861989259719849, "learning_rate": 0.0005, "loss": 4.50743293762207, "step": 727 }, { "epoch": 0.4425531914893617, "grad_norm": 1.043446660041809, "learning_rate": 0.0005, "loss": 4.65224027633667, "step": 728 }, { "epoch": 0.443161094224924, "grad_norm": 1.1153486967086792, "learning_rate": 0.0005, "loss": 4.275899887084961, "step": 729 }, { "epoch": 0.44376899696048633, "grad_norm": 1.0387423038482666, "learning_rate": 0.0005, "loss": 4.571664333343506, "step": 730 }, { "epoch": 0.44437689969604866, "grad_norm": 1.1121833324432373, "learning_rate": 0.0005, "loss": 4.472873687744141, "step": 731 }, { "epoch": 0.4449848024316109, "grad_norm": 1.110357642173767, "learning_rate": 0.0005, "loss": 4.507586479187012, "step": 732 }, { "epoch": 0.44559270516717325, "grad_norm": 1.0192921161651611, "learning_rate": 0.0005, "loss": 4.614180564880371, "step": 733 }, { "epoch": 0.4462006079027356, "grad_norm": 1.2011562585830688, "learning_rate": 0.0005, "loss": 4.410806655883789, "step": 734 }, { "epoch": 0.44680851063829785, "grad_norm": 1.045922040939331, "learning_rate": 0.0005, "loss": 4.522254943847656, "step": 735 }, { "epoch": 0.4474164133738602, "grad_norm": 1.1084001064300537, "learning_rate": 0.0005, "loss": 4.473600387573242, "step": 736 }, { "epoch": 0.4480243161094225, "grad_norm": 1.0580531358718872, "learning_rate": 0.0005, "loss": 4.495148658752441, "step": 737 }, { "epoch": 0.4486322188449848, "grad_norm": 1.0791500806808472, "learning_rate": 0.0005, "loss": 4.559470176696777, "step": 738 }, { "epoch": 0.4492401215805471, "grad_norm": 0.9919356107711792, "learning_rate": 0.0005, "loss": 4.445730209350586, "step": 739 }, { "epoch": 0.44984802431610943, "grad_norm": 0.9215476512908936, "learning_rate": 0.0005, "loss": 4.360682487487793, "step": 740 }, { "epoch": 0.45045592705167176, "grad_norm": 1.1767232418060303, "learning_rate": 0.0005, "loss": 4.51902437210083, "step": 741 }, { "epoch": 0.451063829787234, "grad_norm": 1.1746350526809692, "learning_rate": 0.0005, "loss": 4.362285614013672, "step": 742 }, { "epoch": 0.45167173252279635, "grad_norm": 1.0243946313858032, "learning_rate": 0.0005, "loss": 4.443662166595459, "step": 743 }, { "epoch": 0.4522796352583587, "grad_norm": 1.034515619277954, "learning_rate": 0.0005, "loss": 4.329188346862793, "step": 744 }, { "epoch": 0.45288753799392095, "grad_norm": 1.1209111213684082, "learning_rate": 0.0005, "loss": 4.6534223556518555, "step": 745 }, { "epoch": 0.4534954407294833, "grad_norm": 1.0455032587051392, "learning_rate": 0.0005, "loss": 4.511608600616455, "step": 746 }, { "epoch": 0.4541033434650456, "grad_norm": 1.002439022064209, "learning_rate": 0.0005, "loss": 4.4008378982543945, "step": 747 }, { "epoch": 0.4547112462006079, "grad_norm": 0.9780976176261902, "learning_rate": 0.0005, "loss": 4.478031158447266, "step": 748 }, { "epoch": 0.4553191489361702, "grad_norm": 1.0394052267074585, "learning_rate": 0.0005, "loss": 4.431166648864746, "step": 749 }, { "epoch": 0.45592705167173253, "grad_norm": 1.0838037729263306, "learning_rate": 0.0005, "loss": 4.38276481628418, "step": 750 }, { "epoch": 0.45653495440729486, "grad_norm": 1.2306514978408813, "learning_rate": 0.0005, "loss": 4.427013874053955, "step": 751 }, { "epoch": 0.45714285714285713, "grad_norm": 0.8942012190818787, "learning_rate": 0.0005, "loss": 4.463613986968994, "step": 752 }, { "epoch": 0.45775075987841946, "grad_norm": 1.0273581743240356, "learning_rate": 0.0005, "loss": 4.331677436828613, "step": 753 }, { "epoch": 0.4583586626139818, "grad_norm": 1.061225414276123, "learning_rate": 0.0005, "loss": 4.360611438751221, "step": 754 }, { "epoch": 0.45896656534954405, "grad_norm": 0.9954508543014526, "learning_rate": 0.0005, "loss": 4.37364387512207, "step": 755 }, { "epoch": 0.4595744680851064, "grad_norm": 0.9806733131408691, "learning_rate": 0.0005, "loss": 4.469931602478027, "step": 756 }, { "epoch": 0.4601823708206687, "grad_norm": 1.131806492805481, "learning_rate": 0.0005, "loss": 4.487429618835449, "step": 757 }, { "epoch": 0.460790273556231, "grad_norm": 0.9451801776885986, "learning_rate": 0.0005, "loss": 4.476114749908447, "step": 758 }, { "epoch": 0.4613981762917933, "grad_norm": 1.064634084701538, "learning_rate": 0.0005, "loss": 4.607744216918945, "step": 759 }, { "epoch": 0.46200607902735563, "grad_norm": 1.0846835374832153, "learning_rate": 0.0005, "loss": 4.312438011169434, "step": 760 }, { "epoch": 0.46261398176291796, "grad_norm": 0.9688083529472351, "learning_rate": 0.0005, "loss": 4.376931667327881, "step": 761 }, { "epoch": 0.46322188449848023, "grad_norm": 1.1652911901474, "learning_rate": 0.0005, "loss": 4.416962623596191, "step": 762 }, { "epoch": 0.46382978723404256, "grad_norm": 1.147851586341858, "learning_rate": 0.0005, "loss": 4.349986553192139, "step": 763 }, { "epoch": 0.4644376899696049, "grad_norm": 0.9702697992324829, "learning_rate": 0.0005, "loss": 4.497674942016602, "step": 764 }, { "epoch": 0.46504559270516715, "grad_norm": 1.0843515396118164, "learning_rate": 0.0005, "loss": 4.49877405166626, "step": 765 }, { "epoch": 0.4656534954407295, "grad_norm": 0.9171056747436523, "learning_rate": 0.0005, "loss": 4.1539201736450195, "step": 766 }, { "epoch": 0.4662613981762918, "grad_norm": 1.164944052696228, "learning_rate": 0.0005, "loss": 4.509303092956543, "step": 767 }, { "epoch": 0.4668693009118541, "grad_norm": 1.0968433618545532, "learning_rate": 0.0005, "loss": 4.4588727951049805, "step": 768 }, { "epoch": 0.4674772036474164, "grad_norm": 1.0154880285263062, "learning_rate": 0.0005, "loss": 4.611554145812988, "step": 769 }, { "epoch": 0.46808510638297873, "grad_norm": 0.9653189778327942, "learning_rate": 0.0005, "loss": 4.324926376342773, "step": 770 }, { "epoch": 0.46869300911854106, "grad_norm": 1.1051913499832153, "learning_rate": 0.0005, "loss": 4.4647111892700195, "step": 771 }, { "epoch": 0.46930091185410333, "grad_norm": 0.9223854541778564, "learning_rate": 0.0005, "loss": 4.7103400230407715, "step": 772 }, { "epoch": 0.46990881458966566, "grad_norm": 1.124935507774353, "learning_rate": 0.0005, "loss": 4.453402519226074, "step": 773 }, { "epoch": 0.470516717325228, "grad_norm": 1.3314533233642578, "learning_rate": 0.0005, "loss": 4.297192573547363, "step": 774 }, { "epoch": 0.47112462006079026, "grad_norm": 1.0218007564544678, "learning_rate": 0.0005, "loss": 4.496466159820557, "step": 775 }, { "epoch": 0.4717325227963526, "grad_norm": 1.0308325290679932, "learning_rate": 0.0005, "loss": 4.3223772048950195, "step": 776 }, { "epoch": 0.4723404255319149, "grad_norm": 1.1283831596374512, "learning_rate": 0.0005, "loss": 4.398843288421631, "step": 777 }, { "epoch": 0.4729483282674772, "grad_norm": 1.1089282035827637, "learning_rate": 0.0005, "loss": 4.226986408233643, "step": 778 }, { "epoch": 0.4735562310030395, "grad_norm": 0.9950074553489685, "learning_rate": 0.0005, "loss": 4.401683807373047, "step": 779 }, { "epoch": 0.47416413373860183, "grad_norm": 1.1220934391021729, "learning_rate": 0.0005, "loss": 4.23845100402832, "step": 780 }, { "epoch": 0.47477203647416416, "grad_norm": 1.1314822435379028, "learning_rate": 0.0005, "loss": 4.648829936981201, "step": 781 }, { "epoch": 0.47537993920972643, "grad_norm": 1.0067565441131592, "learning_rate": 0.0005, "loss": 4.342182159423828, "step": 782 }, { "epoch": 0.47598784194528876, "grad_norm": 1.4291990995407104, "learning_rate": 0.0005, "loss": 4.222455978393555, "step": 783 }, { "epoch": 0.4765957446808511, "grad_norm": 1.0664339065551758, "learning_rate": 0.0005, "loss": 4.533761978149414, "step": 784 }, { "epoch": 0.47720364741641336, "grad_norm": 0.837992787361145, "learning_rate": 0.0005, "loss": 4.583135604858398, "step": 785 }, { "epoch": 0.4778115501519757, "grad_norm": 1.0775222778320312, "learning_rate": 0.0005, "loss": 4.407233238220215, "step": 786 }, { "epoch": 0.478419452887538, "grad_norm": 1.1260716915130615, "learning_rate": 0.0005, "loss": 4.408687114715576, "step": 787 }, { "epoch": 0.4790273556231003, "grad_norm": 1.1476800441741943, "learning_rate": 0.0005, "loss": 4.5264692306518555, "step": 788 }, { "epoch": 0.4796352583586626, "grad_norm": 1.0624704360961914, "learning_rate": 0.0005, "loss": 4.47670316696167, "step": 789 }, { "epoch": 0.48024316109422494, "grad_norm": 1.4008615016937256, "learning_rate": 0.0005, "loss": 4.542054653167725, "step": 790 }, { "epoch": 0.4808510638297872, "grad_norm": 1.6348981857299805, "learning_rate": 0.0005, "loss": 4.272322654724121, "step": 791 }, { "epoch": 0.48145896656534953, "grad_norm": 1.110823154449463, "learning_rate": 0.0005, "loss": 4.32360315322876, "step": 792 }, { "epoch": 0.48206686930091186, "grad_norm": 0.9771617650985718, "learning_rate": 0.0005, "loss": 4.4510321617126465, "step": 793 }, { "epoch": 0.4826747720364742, "grad_norm": 1.0948632955551147, "learning_rate": 0.0005, "loss": 4.335118293762207, "step": 794 }, { "epoch": 0.48328267477203646, "grad_norm": 1.2692338228225708, "learning_rate": 0.0005, "loss": 4.3776655197143555, "step": 795 }, { "epoch": 0.4838905775075988, "grad_norm": 0.8474439978599548, "learning_rate": 0.0005, "loss": 4.24397087097168, "step": 796 }, { "epoch": 0.4844984802431611, "grad_norm": 0.9258842468261719, "learning_rate": 0.0005, "loss": 4.602321624755859, "step": 797 }, { "epoch": 0.4851063829787234, "grad_norm": 1.1678420305252075, "learning_rate": 0.0005, "loss": 4.5578203201293945, "step": 798 }, { "epoch": 0.4857142857142857, "grad_norm": 1.0839719772338867, "learning_rate": 0.0005, "loss": 4.4719109535217285, "step": 799 }, { "epoch": 0.48632218844984804, "grad_norm": 1.0721313953399658, "learning_rate": 0.0005, "loss": 4.1971516609191895, "step": 800 }, { "epoch": 0.4869300911854103, "grad_norm": 1.077587366104126, "learning_rate": 0.0005, "loss": 4.452859401702881, "step": 801 }, { "epoch": 0.48753799392097263, "grad_norm": 0.9456436038017273, "learning_rate": 0.0005, "loss": 4.417455673217773, "step": 802 }, { "epoch": 0.48814589665653496, "grad_norm": 0.9326696991920471, "learning_rate": 0.0005, "loss": 4.389290809631348, "step": 803 }, { "epoch": 0.4887537993920973, "grad_norm": 1.0423635244369507, "learning_rate": 0.0005, "loss": 4.4252448081970215, "step": 804 }, { "epoch": 0.48936170212765956, "grad_norm": 1.0106087923049927, "learning_rate": 0.0005, "loss": 4.29632043838501, "step": 805 }, { "epoch": 0.4899696048632219, "grad_norm": 0.8635157942771912, "learning_rate": 0.0005, "loss": 4.45654296875, "step": 806 }, { "epoch": 0.4905775075987842, "grad_norm": 0.9637815952301025, "learning_rate": 0.0005, "loss": 4.305363655090332, "step": 807 }, { "epoch": 0.4911854103343465, "grad_norm": 0.9523938298225403, "learning_rate": 0.0005, "loss": 4.561666965484619, "step": 808 }, { "epoch": 0.4917933130699088, "grad_norm": 1.1045883893966675, "learning_rate": 0.0005, "loss": 4.385721206665039, "step": 809 }, { "epoch": 0.49240121580547114, "grad_norm": 0.951117992401123, "learning_rate": 0.0005, "loss": 4.302276611328125, "step": 810 }, { "epoch": 0.4930091185410334, "grad_norm": 1.091933250427246, "learning_rate": 0.0005, "loss": 4.64669132232666, "step": 811 }, { "epoch": 0.49361702127659574, "grad_norm": 1.0813966989517212, "learning_rate": 0.0005, "loss": 4.266849517822266, "step": 812 }, { "epoch": 0.49422492401215806, "grad_norm": 0.9683962464332581, "learning_rate": 0.0005, "loss": 4.304372787475586, "step": 813 }, { "epoch": 0.4948328267477204, "grad_norm": 0.960382342338562, "learning_rate": 0.0005, "loss": 4.221304416656494, "step": 814 }, { "epoch": 0.49544072948328266, "grad_norm": 0.9746182560920715, "learning_rate": 0.0005, "loss": 4.392333030700684, "step": 815 }, { "epoch": 0.496048632218845, "grad_norm": 0.9449917078018188, "learning_rate": 0.0005, "loss": 4.274685859680176, "step": 816 }, { "epoch": 0.4966565349544073, "grad_norm": 0.8899694085121155, "learning_rate": 0.0005, "loss": 4.206332206726074, "step": 817 }, { "epoch": 0.4972644376899696, "grad_norm": 0.9504559636116028, "learning_rate": 0.0005, "loss": 4.2690534591674805, "step": 818 }, { "epoch": 0.4978723404255319, "grad_norm": 0.9823598265647888, "learning_rate": 0.0005, "loss": 4.379746437072754, "step": 819 }, { "epoch": 0.49848024316109424, "grad_norm": 1.0227431058883667, "learning_rate": 0.0005, "loss": 4.233619213104248, "step": 820 }, { "epoch": 0.4990881458966565, "grad_norm": 0.9714612364768982, "learning_rate": 0.0005, "loss": 4.607011795043945, "step": 821 }, { "epoch": 0.49969604863221884, "grad_norm": 0.9920446276664734, "learning_rate": 0.0005, "loss": 4.5199127197265625, "step": 822 }, { "epoch": 0.5003039513677812, "grad_norm": 1.0052610635757446, "learning_rate": 0.0005, "loss": 4.538883209228516, "step": 823 }, { "epoch": 0.5009118541033435, "grad_norm": 0.961460292339325, "learning_rate": 0.0005, "loss": 4.37430477142334, "step": 824 }, { "epoch": 0.5015197568389058, "grad_norm": 0.9705450534820557, "learning_rate": 0.0005, "loss": 4.36405611038208, "step": 825 }, { "epoch": 0.502127659574468, "grad_norm": 1.0589666366577148, "learning_rate": 0.0005, "loss": 4.532018661499023, "step": 826 }, { "epoch": 0.5027355623100304, "grad_norm": 1.0190895795822144, "learning_rate": 0.0005, "loss": 4.366916656494141, "step": 827 }, { "epoch": 0.5033434650455927, "grad_norm": 1.2047783136367798, "learning_rate": 0.0005, "loss": 4.332704544067383, "step": 828 }, { "epoch": 0.503951367781155, "grad_norm": 0.9100733995437622, "learning_rate": 0.0005, "loss": 4.1280975341796875, "step": 829 }, { "epoch": 0.5045592705167173, "grad_norm": 1.0953924655914307, "learning_rate": 0.0005, "loss": 4.338841438293457, "step": 830 }, { "epoch": 0.5051671732522797, "grad_norm": 1.2325948476791382, "learning_rate": 0.0005, "loss": 4.425684928894043, "step": 831 }, { "epoch": 0.505775075987842, "grad_norm": 1.0776824951171875, "learning_rate": 0.0005, "loss": 4.238302230834961, "step": 832 }, { "epoch": 0.5063829787234042, "grad_norm": 1.002465009689331, "learning_rate": 0.0005, "loss": 4.1673173904418945, "step": 833 }, { "epoch": 0.5069908814589665, "grad_norm": 1.0070068836212158, "learning_rate": 0.0005, "loss": 4.502063751220703, "step": 834 }, { "epoch": 0.5075987841945289, "grad_norm": 0.9460301995277405, "learning_rate": 0.0005, "loss": 4.266294479370117, "step": 835 }, { "epoch": 0.5082066869300912, "grad_norm": 0.9609605669975281, "learning_rate": 0.0005, "loss": 4.49836540222168, "step": 836 }, { "epoch": 0.5088145896656535, "grad_norm": 1.0298100709915161, "learning_rate": 0.0005, "loss": 4.342093467712402, "step": 837 }, { "epoch": 0.5094224924012158, "grad_norm": 1.102327585220337, "learning_rate": 0.0005, "loss": 4.25087833404541, "step": 838 }, { "epoch": 0.5100303951367782, "grad_norm": 1.2569550275802612, "learning_rate": 0.0005, "loss": 4.285090446472168, "step": 839 }, { "epoch": 0.5106382978723404, "grad_norm": 1.0138150453567505, "learning_rate": 0.0005, "loss": 4.334506034851074, "step": 840 }, { "epoch": 0.5112462006079027, "grad_norm": 1.0152983665466309, "learning_rate": 0.0005, "loss": 4.283235549926758, "step": 841 }, { "epoch": 0.511854103343465, "grad_norm": 1.1372138261795044, "learning_rate": 0.0005, "loss": 4.07025146484375, "step": 842 }, { "epoch": 0.5124620060790274, "grad_norm": 1.1843246221542358, "learning_rate": 0.0005, "loss": 4.353334426879883, "step": 843 }, { "epoch": 0.5130699088145897, "grad_norm": 1.1458396911621094, "learning_rate": 0.0005, "loss": 4.34335994720459, "step": 844 }, { "epoch": 0.513677811550152, "grad_norm": 1.0594899654388428, "learning_rate": 0.0005, "loss": 4.31781005859375, "step": 845 }, { "epoch": 0.5142857142857142, "grad_norm": 0.844513475894928, "learning_rate": 0.0005, "loss": 4.4846577644348145, "step": 846 }, { "epoch": 0.5148936170212766, "grad_norm": 2.6839306354522705, "learning_rate": 0.0005, "loss": 4.262670993804932, "step": 847 }, { "epoch": 0.5155015197568389, "grad_norm": 1.0088754892349243, "learning_rate": 0.0005, "loss": 4.266050338745117, "step": 848 }, { "epoch": 0.5161094224924012, "grad_norm": 1.0849522352218628, "learning_rate": 0.0005, "loss": 4.108889102935791, "step": 849 }, { "epoch": 0.5167173252279635, "grad_norm": 1.0903068780899048, "learning_rate": 0.0005, "loss": 4.313821315765381, "step": 850 }, { "epoch": 0.5173252279635259, "grad_norm": 1.1618335247039795, "learning_rate": 0.0005, "loss": 4.295135498046875, "step": 851 }, { "epoch": 0.5179331306990882, "grad_norm": 0.9828124046325684, "learning_rate": 0.0005, "loss": 4.440587043762207, "step": 852 }, { "epoch": 0.5185410334346504, "grad_norm": 1.131939172744751, "learning_rate": 0.0005, "loss": 4.306354522705078, "step": 853 }, { "epoch": 0.5191489361702127, "grad_norm": 1.3951880931854248, "learning_rate": 0.0005, "loss": 4.395257949829102, "step": 854 }, { "epoch": 0.5197568389057751, "grad_norm": 1.28059983253479, "learning_rate": 0.0005, "loss": 4.033473968505859, "step": 855 }, { "epoch": 0.5203647416413374, "grad_norm": 0.9717862606048584, "learning_rate": 0.0005, "loss": 4.356355667114258, "step": 856 }, { "epoch": 0.5209726443768997, "grad_norm": 1.043353796005249, "learning_rate": 0.0005, "loss": 4.250835418701172, "step": 857 }, { "epoch": 0.521580547112462, "grad_norm": 1.016579508781433, "learning_rate": 0.0005, "loss": 4.286150932312012, "step": 858 }, { "epoch": 0.5221884498480243, "grad_norm": 1.112782597541809, "learning_rate": 0.0005, "loss": 4.598012924194336, "step": 859 }, { "epoch": 0.5227963525835866, "grad_norm": 1.1940479278564453, "learning_rate": 0.0005, "loss": 4.4383955001831055, "step": 860 }, { "epoch": 0.5234042553191489, "grad_norm": 1.254970669746399, "learning_rate": 0.0005, "loss": 4.322863578796387, "step": 861 }, { "epoch": 0.5240121580547112, "grad_norm": 1.0700422525405884, "learning_rate": 0.0005, "loss": 4.244253158569336, "step": 862 }, { "epoch": 0.5246200607902736, "grad_norm": 1.0553544759750366, "learning_rate": 0.0005, "loss": 4.310792446136475, "step": 863 }, { "epoch": 0.5252279635258359, "grad_norm": 1.0288846492767334, "learning_rate": 0.0005, "loss": 4.3274383544921875, "step": 864 }, { "epoch": 0.5258358662613982, "grad_norm": 1.0445955991744995, "learning_rate": 0.0005, "loss": 4.45347261428833, "step": 865 }, { "epoch": 0.5264437689969604, "grad_norm": 1.1357736587524414, "learning_rate": 0.0005, "loss": 4.4809064865112305, "step": 866 }, { "epoch": 0.5270516717325228, "grad_norm": 1.109326720237732, "learning_rate": 0.0005, "loss": 4.253253936767578, "step": 867 }, { "epoch": 0.5276595744680851, "grad_norm": 1.1890736818313599, "learning_rate": 0.0005, "loss": 4.426365852355957, "step": 868 }, { "epoch": 0.5282674772036474, "grad_norm": 1.0840505361557007, "learning_rate": 0.0005, "loss": 4.321274280548096, "step": 869 }, { "epoch": 0.5288753799392097, "grad_norm": 1.2200610637664795, "learning_rate": 0.0005, "loss": 4.557803153991699, "step": 870 }, { "epoch": 0.5294832826747721, "grad_norm": 0.9972710609436035, "learning_rate": 0.0005, "loss": 4.23234748840332, "step": 871 }, { "epoch": 0.5300911854103344, "grad_norm": 1.0316972732543945, "learning_rate": 0.0005, "loss": 4.139028549194336, "step": 872 }, { "epoch": 0.5306990881458966, "grad_norm": 1.0380617380142212, "learning_rate": 0.0005, "loss": 4.348488807678223, "step": 873 }, { "epoch": 0.5313069908814589, "grad_norm": 0.9867698550224304, "learning_rate": 0.0005, "loss": 4.302568435668945, "step": 874 }, { "epoch": 0.5319148936170213, "grad_norm": 1.0779541730880737, "learning_rate": 0.0005, "loss": 4.425013542175293, "step": 875 }, { "epoch": 0.5325227963525836, "grad_norm": 1.2543246746063232, "learning_rate": 0.0005, "loss": 4.724435806274414, "step": 876 }, { "epoch": 0.5331306990881459, "grad_norm": 1.2280689477920532, "learning_rate": 0.0005, "loss": 4.2406415939331055, "step": 877 }, { "epoch": 0.5337386018237082, "grad_norm": 1.3842073678970337, "learning_rate": 0.0005, "loss": 4.396044731140137, "step": 878 }, { "epoch": 0.5343465045592705, "grad_norm": 1.0350067615509033, "learning_rate": 0.0005, "loss": 4.17176628112793, "step": 879 }, { "epoch": 0.5349544072948328, "grad_norm": 0.9484389424324036, "learning_rate": 0.0005, "loss": 4.430863380432129, "step": 880 }, { "epoch": 0.5355623100303951, "grad_norm": 1.1557071208953857, "learning_rate": 0.0005, "loss": 4.12956428527832, "step": 881 }, { "epoch": 0.5361702127659574, "grad_norm": 0.9079960584640503, "learning_rate": 0.0005, "loss": 4.4100542068481445, "step": 882 }, { "epoch": 0.5367781155015198, "grad_norm": 0.9755933880805969, "learning_rate": 0.0005, "loss": 4.136897563934326, "step": 883 }, { "epoch": 0.5373860182370821, "grad_norm": 1.0319873094558716, "learning_rate": 0.0005, "loss": 4.440415859222412, "step": 884 }, { "epoch": 0.5379939209726444, "grad_norm": 0.8542789220809937, "learning_rate": 0.0005, "loss": 4.413039207458496, "step": 885 }, { "epoch": 0.5386018237082066, "grad_norm": 1.0158871412277222, "learning_rate": 0.0005, "loss": 4.379025459289551, "step": 886 }, { "epoch": 0.539209726443769, "grad_norm": 0.8926265835762024, "learning_rate": 0.0005, "loss": 4.344198226928711, "step": 887 }, { "epoch": 0.5398176291793313, "grad_norm": 0.8857081532478333, "learning_rate": 0.0005, "loss": 4.38722562789917, "step": 888 }, { "epoch": 0.5404255319148936, "grad_norm": 0.9595281481742859, "learning_rate": 0.0005, "loss": 4.3452959060668945, "step": 889 }, { "epoch": 0.541033434650456, "grad_norm": 0.9428173303604126, "learning_rate": 0.0005, "loss": 4.258479118347168, "step": 890 }, { "epoch": 0.5416413373860183, "grad_norm": 1.5479097366333008, "learning_rate": 0.0005, "loss": 4.245420455932617, "step": 891 }, { "epoch": 0.5422492401215806, "grad_norm": 1.1619681119918823, "learning_rate": 0.0005, "loss": 4.385200500488281, "step": 892 }, { "epoch": 0.5428571428571428, "grad_norm": 0.9958190321922302, "learning_rate": 0.0005, "loss": 4.102227687835693, "step": 893 }, { "epoch": 0.5434650455927051, "grad_norm": 1.0156055688858032, "learning_rate": 0.0005, "loss": 4.067695617675781, "step": 894 }, { "epoch": 0.5440729483282675, "grad_norm": 1.1579831838607788, "learning_rate": 0.0005, "loss": 4.48448371887207, "step": 895 }, { "epoch": 0.5446808510638298, "grad_norm": 1.23504638671875, "learning_rate": 0.0005, "loss": 4.5317583084106445, "step": 896 }, { "epoch": 0.5452887537993921, "grad_norm": 1.167401909828186, "learning_rate": 0.0005, "loss": 4.3551435470581055, "step": 897 }, { "epoch": 0.5458966565349544, "grad_norm": 1.4126181602478027, "learning_rate": 0.0005, "loss": 4.373387813568115, "step": 898 }, { "epoch": 0.5465045592705167, "grad_norm": 1.152944564819336, "learning_rate": 0.0005, "loss": 3.9645819664001465, "step": 899 }, { "epoch": 0.547112462006079, "grad_norm": 1.5390210151672363, "learning_rate": 0.0005, "loss": 4.454073429107666, "step": 900 }, { "epoch": 0.5477203647416413, "grad_norm": 1.0349818468093872, "learning_rate": 0.0005, "loss": 4.398721694946289, "step": 901 }, { "epoch": 0.5483282674772036, "grad_norm": 1.0963656902313232, "learning_rate": 0.0005, "loss": 4.0993242263793945, "step": 902 }, { "epoch": 0.548936170212766, "grad_norm": 1.1737645864486694, "learning_rate": 0.0005, "loss": 4.228819370269775, "step": 903 }, { "epoch": 0.5495440729483283, "grad_norm": 1.1499532461166382, "learning_rate": 0.0005, "loss": 4.329497337341309, "step": 904 }, { "epoch": 0.5501519756838906, "grad_norm": 1.1188825368881226, "learning_rate": 0.0005, "loss": 4.631438255310059, "step": 905 }, { "epoch": 0.5507598784194528, "grad_norm": 1.0337425470352173, "learning_rate": 0.0005, "loss": 4.373821258544922, "step": 906 }, { "epoch": 0.5513677811550152, "grad_norm": 1.098497986793518, "learning_rate": 0.0005, "loss": 4.344779014587402, "step": 907 }, { "epoch": 0.5519756838905775, "grad_norm": 1.0316400527954102, "learning_rate": 0.0005, "loss": 4.097405910491943, "step": 908 }, { "epoch": 0.5525835866261398, "grad_norm": 1.0182708501815796, "learning_rate": 0.0005, "loss": 4.353028297424316, "step": 909 }, { "epoch": 0.5531914893617021, "grad_norm": 1.2190346717834473, "learning_rate": 0.0005, "loss": 4.165225028991699, "step": 910 }, { "epoch": 0.5537993920972645, "grad_norm": 1.017309546470642, "learning_rate": 0.0005, "loss": 4.318220138549805, "step": 911 }, { "epoch": 0.5544072948328268, "grad_norm": 1.1314797401428223, "learning_rate": 0.0005, "loss": 4.121149063110352, "step": 912 }, { "epoch": 0.555015197568389, "grad_norm": 1.0844316482543945, "learning_rate": 0.0005, "loss": 4.213968276977539, "step": 913 }, { "epoch": 0.5556231003039513, "grad_norm": 0.9382945895195007, "learning_rate": 0.0005, "loss": 4.214629650115967, "step": 914 }, { "epoch": 0.5562310030395137, "grad_norm": 1.245742678642273, "learning_rate": 0.0005, "loss": 4.122774124145508, "step": 915 }, { "epoch": 0.556838905775076, "grad_norm": 1.095625877380371, "learning_rate": 0.0005, "loss": 4.296173095703125, "step": 916 }, { "epoch": 0.5574468085106383, "grad_norm": 1.0720239877700806, "learning_rate": 0.0005, "loss": 4.165585994720459, "step": 917 }, { "epoch": 0.5580547112462007, "grad_norm": 1.1082829236984253, "learning_rate": 0.0005, "loss": 4.3951921463012695, "step": 918 }, { "epoch": 0.5586626139817629, "grad_norm": 1.1302635669708252, "learning_rate": 0.0005, "loss": 4.336912155151367, "step": 919 }, { "epoch": 0.5592705167173252, "grad_norm": 0.9658374786376953, "learning_rate": 0.0005, "loss": 4.145001411437988, "step": 920 }, { "epoch": 0.5598784194528875, "grad_norm": 1.2869893312454224, "learning_rate": 0.0005, "loss": 4.438281536102295, "step": 921 }, { "epoch": 0.5604863221884498, "grad_norm": 0.9351769089698792, "learning_rate": 0.0005, "loss": 4.342588424682617, "step": 922 }, { "epoch": 0.5610942249240122, "grad_norm": 1.075165867805481, "learning_rate": 0.0005, "loss": 4.3024187088012695, "step": 923 }, { "epoch": 0.5617021276595745, "grad_norm": 1.0462286472320557, "learning_rate": 0.0005, "loss": 4.308043479919434, "step": 924 }, { "epoch": 0.5623100303951368, "grad_norm": 1.1331902742385864, "learning_rate": 0.0005, "loss": 4.122361183166504, "step": 925 }, { "epoch": 0.562917933130699, "grad_norm": 1.0483379364013672, "learning_rate": 0.0005, "loss": 4.140399932861328, "step": 926 }, { "epoch": 0.5635258358662614, "grad_norm": 1.0775599479675293, "learning_rate": 0.0005, "loss": 4.258686065673828, "step": 927 }, { "epoch": 0.5641337386018237, "grad_norm": 1.1621100902557373, "learning_rate": 0.0005, "loss": 4.177057266235352, "step": 928 }, { "epoch": 0.564741641337386, "grad_norm": 1.144015908241272, "learning_rate": 0.0005, "loss": 3.854235887527466, "step": 929 }, { "epoch": 0.5653495440729484, "grad_norm": 1.0188685655593872, "learning_rate": 0.0005, "loss": 4.226658821105957, "step": 930 }, { "epoch": 0.5659574468085107, "grad_norm": 1.214069128036499, "learning_rate": 0.0005, "loss": 4.558093070983887, "step": 931 }, { "epoch": 0.5665653495440729, "grad_norm": 1.0221775770187378, "learning_rate": 0.0005, "loss": 4.362860202789307, "step": 932 }, { "epoch": 0.5671732522796352, "grad_norm": 1.1003692150115967, "learning_rate": 0.0005, "loss": 4.412820339202881, "step": 933 }, { "epoch": 0.5677811550151975, "grad_norm": 1.0189692974090576, "learning_rate": 0.0005, "loss": 4.141862392425537, "step": 934 }, { "epoch": 0.5683890577507599, "grad_norm": 1.1275514364242554, "learning_rate": 0.0005, "loss": 4.0759077072143555, "step": 935 }, { "epoch": 0.5689969604863222, "grad_norm": 1.0595769882202148, "learning_rate": 0.0005, "loss": 4.234007835388184, "step": 936 }, { "epoch": 0.5696048632218845, "grad_norm": 1.0620779991149902, "learning_rate": 0.0005, "loss": 4.242690086364746, "step": 937 }, { "epoch": 0.5702127659574469, "grad_norm": 1.0344425439834595, "learning_rate": 0.0005, "loss": 4.393516540527344, "step": 938 }, { "epoch": 0.5708206686930091, "grad_norm": 1.1058911085128784, "learning_rate": 0.0005, "loss": 4.163288116455078, "step": 939 }, { "epoch": 0.5714285714285714, "grad_norm": 1.38120698928833, "learning_rate": 0.0005, "loss": 4.052524566650391, "step": 940 }, { "epoch": 0.5720364741641337, "grad_norm": 1.0876317024230957, "learning_rate": 0.0005, "loss": 4.03524112701416, "step": 941 }, { "epoch": 0.572644376899696, "grad_norm": 1.0367745161056519, "learning_rate": 0.0005, "loss": 4.183863639831543, "step": 942 }, { "epoch": 0.5732522796352584, "grad_norm": 1.008543848991394, "learning_rate": 0.0005, "loss": 4.219581127166748, "step": 943 }, { "epoch": 0.5738601823708207, "grad_norm": 1.0349946022033691, "learning_rate": 0.0005, "loss": 4.3019561767578125, "step": 944 }, { "epoch": 0.574468085106383, "grad_norm": 1.5670639276504517, "learning_rate": 0.0005, "loss": 4.330730438232422, "step": 945 }, { "epoch": 0.5750759878419452, "grad_norm": 1.0402114391326904, "learning_rate": 0.0005, "loss": 4.131731033325195, "step": 946 }, { "epoch": 0.5756838905775076, "grad_norm": 1.092549204826355, "learning_rate": 0.0005, "loss": 4.311880111694336, "step": 947 }, { "epoch": 0.5762917933130699, "grad_norm": 0.968338668346405, "learning_rate": 0.0005, "loss": 4.174734115600586, "step": 948 }, { "epoch": 0.5768996960486322, "grad_norm": 1.0552120208740234, "learning_rate": 0.0005, "loss": 3.9222404956817627, "step": 949 }, { "epoch": 0.5775075987841946, "grad_norm": 1.0390139818191528, "learning_rate": 0.0005, "loss": 4.248430252075195, "step": 950 }, { "epoch": 0.5781155015197569, "grad_norm": 1.1151931285858154, "learning_rate": 0.0005, "loss": 4.177859306335449, "step": 951 }, { "epoch": 0.5787234042553191, "grad_norm": 1.0557676553726196, "learning_rate": 0.0005, "loss": 4.173686981201172, "step": 952 }, { "epoch": 0.5793313069908814, "grad_norm": 1.133589506149292, "learning_rate": 0.0005, "loss": 4.227142333984375, "step": 953 }, { "epoch": 0.5799392097264437, "grad_norm": 1.12785804271698, "learning_rate": 0.0005, "loss": 4.077308654785156, "step": 954 }, { "epoch": 0.5805471124620061, "grad_norm": 1.0380632877349854, "learning_rate": 0.0005, "loss": 4.485074996948242, "step": 955 }, { "epoch": 0.5811550151975684, "grad_norm": 1.0573036670684814, "learning_rate": 0.0005, "loss": 4.045351028442383, "step": 956 }, { "epoch": 0.5817629179331307, "grad_norm": 1.0433647632598877, "learning_rate": 0.0005, "loss": 3.9277734756469727, "step": 957 }, { "epoch": 0.5823708206686931, "grad_norm": 1.077911376953125, "learning_rate": 0.0005, "loss": 4.329649448394775, "step": 958 }, { "epoch": 0.5829787234042553, "grad_norm": 0.9521039128303528, "learning_rate": 0.0005, "loss": 4.175987720489502, "step": 959 }, { "epoch": 0.5835866261398176, "grad_norm": 1.0778512954711914, "learning_rate": 0.0005, "loss": 4.32703971862793, "step": 960 }, { "epoch": 0.5841945288753799, "grad_norm": 1.048074722290039, "learning_rate": 0.0005, "loss": 4.146064758300781, "step": 961 }, { "epoch": 0.5848024316109423, "grad_norm": 1.0995032787322998, "learning_rate": 0.0005, "loss": 4.317961692810059, "step": 962 }, { "epoch": 0.5854103343465046, "grad_norm": 1.1812586784362793, "learning_rate": 0.0005, "loss": 4.266629219055176, "step": 963 }, { "epoch": 0.5860182370820669, "grad_norm": 1.2058099508285522, "learning_rate": 0.0005, "loss": 4.350966930389404, "step": 964 }, { "epoch": 0.5866261398176292, "grad_norm": 1.1499630212783813, "learning_rate": 0.0005, "loss": 4.47420072555542, "step": 965 }, { "epoch": 0.5872340425531914, "grad_norm": 1.212178111076355, "learning_rate": 0.0005, "loss": 4.201877593994141, "step": 966 }, { "epoch": 0.5878419452887538, "grad_norm": 1.0750401020050049, "learning_rate": 0.0005, "loss": 4.032867431640625, "step": 967 }, { "epoch": 0.5884498480243161, "grad_norm": 1.0766054391860962, "learning_rate": 0.0005, "loss": 4.070111274719238, "step": 968 }, { "epoch": 0.5890577507598784, "grad_norm": 1.0466876029968262, "learning_rate": 0.0005, "loss": 4.164140224456787, "step": 969 }, { "epoch": 0.5896656534954408, "grad_norm": 0.9755964875221252, "learning_rate": 0.0005, "loss": 4.061018943786621, "step": 970 }, { "epoch": 0.5902735562310031, "grad_norm": 1.0774449110031128, "learning_rate": 0.0005, "loss": 4.103540420532227, "step": 971 }, { "epoch": 0.5908814589665653, "grad_norm": 1.016599178314209, "learning_rate": 0.0005, "loss": 4.367238998413086, "step": 972 }, { "epoch": 0.5914893617021276, "grad_norm": 1.273015022277832, "learning_rate": 0.0005, "loss": 4.130205154418945, "step": 973 }, { "epoch": 0.59209726443769, "grad_norm": 1.121202826499939, "learning_rate": 0.0005, "loss": 4.058278560638428, "step": 974 }, { "epoch": 0.5927051671732523, "grad_norm": 1.782248854637146, "learning_rate": 0.0005, "loss": 4.2193732261657715, "step": 975 }, { "epoch": 0.5933130699088146, "grad_norm": 1.2525842189788818, "learning_rate": 0.0005, "loss": 4.324434757232666, "step": 976 }, { "epoch": 0.5939209726443769, "grad_norm": 0.9859209656715393, "learning_rate": 0.0005, "loss": 4.235608100891113, "step": 977 }, { "epoch": 0.5945288753799393, "grad_norm": 1.0421037673950195, "learning_rate": 0.0005, "loss": 4.312819480895996, "step": 978 }, { "epoch": 0.5951367781155015, "grad_norm": 1.2486640214920044, "learning_rate": 0.0005, "loss": 4.172330856323242, "step": 979 }, { "epoch": 0.5957446808510638, "grad_norm": 1.049641489982605, "learning_rate": 0.0005, "loss": 4.053893089294434, "step": 980 }, { "epoch": 0.5963525835866261, "grad_norm": 1.0123006105422974, "learning_rate": 0.0005, "loss": 4.453596115112305, "step": 981 }, { "epoch": 0.5969604863221885, "grad_norm": 0.9871963858604431, "learning_rate": 0.0005, "loss": 4.007091999053955, "step": 982 }, { "epoch": 0.5975683890577508, "grad_norm": 0.9984953999519348, "learning_rate": 0.0005, "loss": 4.22979736328125, "step": 983 }, { "epoch": 0.5981762917933131, "grad_norm": 1.281544804573059, "learning_rate": 0.0005, "loss": 4.074709892272949, "step": 984 }, { "epoch": 0.5987841945288754, "grad_norm": 1.1482913494110107, "learning_rate": 0.0005, "loss": 4.320782661437988, "step": 985 }, { "epoch": 0.5993920972644377, "grad_norm": 1.2105413675308228, "learning_rate": 0.0005, "loss": 4.15565299987793, "step": 986 }, { "epoch": 0.6, "grad_norm": 1.0716112852096558, "learning_rate": 0.0005, "loss": 4.429147720336914, "step": 987 }, { "epoch": 0.6006079027355623, "grad_norm": 1.1487056016921997, "learning_rate": 0.0005, "loss": 4.206772804260254, "step": 988 }, { "epoch": 0.6012158054711246, "grad_norm": 0.9919009208679199, "learning_rate": 0.0005, "loss": 4.105408191680908, "step": 989 }, { "epoch": 0.601823708206687, "grad_norm": 1.1244338750839233, "learning_rate": 0.0005, "loss": 4.034040451049805, "step": 990 }, { "epoch": 0.6024316109422493, "grad_norm": 0.9693543910980225, "learning_rate": 0.0005, "loss": 3.8358006477355957, "step": 991 }, { "epoch": 0.6030395136778115, "grad_norm": 1.147226333618164, "learning_rate": 0.0005, "loss": 4.114927291870117, "step": 992 }, { "epoch": 0.6036474164133738, "grad_norm": 1.1658263206481934, "learning_rate": 0.0005, "loss": 4.3732099533081055, "step": 993 }, { "epoch": 0.6042553191489362, "grad_norm": 1.1261506080627441, "learning_rate": 0.0005, "loss": 4.212404251098633, "step": 994 }, { "epoch": 0.6048632218844985, "grad_norm": 1.214408040046692, "learning_rate": 0.0005, "loss": 4.113962173461914, "step": 995 }, { "epoch": 0.6054711246200608, "grad_norm": 1.1703499555587769, "learning_rate": 0.0005, "loss": 3.9795780181884766, "step": 996 }, { "epoch": 0.6060790273556231, "grad_norm": 1.2819421291351318, "learning_rate": 0.0005, "loss": 3.820543050765991, "step": 997 }, { "epoch": 0.6066869300911855, "grad_norm": 1.1751822233200073, "learning_rate": 0.0005, "loss": 4.115008354187012, "step": 998 }, { "epoch": 0.6072948328267477, "grad_norm": 1.133631944656372, "learning_rate": 0.0005, "loss": 4.320215225219727, "step": 999 }, { "epoch": 0.60790273556231, "grad_norm": 1.2056914567947388, "learning_rate": 0.0005, "loss": 4.139728546142578, "step": 1000 }, { "epoch": 0.6085106382978723, "grad_norm": 1.1610949039459229, "learning_rate": 0.0005, "loss": 4.215843200683594, "step": 1001 }, { "epoch": 0.6091185410334347, "grad_norm": 1.2171114683151245, "learning_rate": 0.0005, "loss": 4.104484558105469, "step": 1002 }, { "epoch": 0.609726443768997, "grad_norm": 1.0760419368743896, "learning_rate": 0.0005, "loss": 4.106335639953613, "step": 1003 }, { "epoch": 0.6103343465045593, "grad_norm": 1.0737935304641724, "learning_rate": 0.0005, "loss": 4.18284797668457, "step": 1004 }, { "epoch": 0.6109422492401215, "grad_norm": 1.0054482221603394, "learning_rate": 0.0005, "loss": 4.185699462890625, "step": 1005 }, { "epoch": 0.6115501519756839, "grad_norm": 1.0817815065383911, "learning_rate": 0.0005, "loss": 3.9077231884002686, "step": 1006 }, { "epoch": 0.6121580547112462, "grad_norm": 1.1520154476165771, "learning_rate": 0.0005, "loss": 4.094099044799805, "step": 1007 }, { "epoch": 0.6127659574468085, "grad_norm": 0.9844207763671875, "learning_rate": 0.0005, "loss": 4.341885566711426, "step": 1008 }, { "epoch": 0.6133738601823708, "grad_norm": 1.2627776861190796, "learning_rate": 0.0005, "loss": 4.28475284576416, "step": 1009 }, { "epoch": 0.6139817629179332, "grad_norm": 0.9542902112007141, "learning_rate": 0.0005, "loss": 4.2372026443481445, "step": 1010 }, { "epoch": 0.6145896656534955, "grad_norm": 3.645486831665039, "learning_rate": 0.0005, "loss": 4.06125545501709, "step": 1011 }, { "epoch": 0.6151975683890577, "grad_norm": 1.4817546606063843, "learning_rate": 0.0005, "loss": 3.9809517860412598, "step": 1012 }, { "epoch": 0.61580547112462, "grad_norm": 1.1932374238967896, "learning_rate": 0.0005, "loss": 4.242306232452393, "step": 1013 }, { "epoch": 0.6164133738601824, "grad_norm": 0.9499757289886475, "learning_rate": 0.0005, "loss": 3.9819726943969727, "step": 1014 }, { "epoch": 0.6170212765957447, "grad_norm": 1.1981247663497925, "learning_rate": 0.0005, "loss": 4.266401290893555, "step": 1015 }, { "epoch": 0.617629179331307, "grad_norm": 1.2060346603393555, "learning_rate": 0.0005, "loss": 4.270205497741699, "step": 1016 }, { "epoch": 0.6182370820668693, "grad_norm": 1.002508282661438, "learning_rate": 0.0005, "loss": 4.509585380554199, "step": 1017 }, { "epoch": 0.6188449848024317, "grad_norm": 1.0094107389450073, "learning_rate": 0.0005, "loss": 4.162940979003906, "step": 1018 }, { "epoch": 0.6194528875379939, "grad_norm": 1.180220365524292, "learning_rate": 0.0005, "loss": 4.317109107971191, "step": 1019 }, { "epoch": 0.6200607902735562, "grad_norm": 0.980454683303833, "learning_rate": 0.0005, "loss": 4.042284965515137, "step": 1020 }, { "epoch": 0.6206686930091185, "grad_norm": 1.0461052656173706, "learning_rate": 0.0005, "loss": 4.0409698486328125, "step": 1021 }, { "epoch": 0.6212765957446809, "grad_norm": 1.0268027782440186, "learning_rate": 0.0005, "loss": 4.10588264465332, "step": 1022 }, { "epoch": 0.6218844984802432, "grad_norm": 0.9659956693649292, "learning_rate": 0.0005, "loss": 4.511264801025391, "step": 1023 }, { "epoch": 0.6224924012158055, "grad_norm": 1.0161465406417847, "learning_rate": 0.0005, "loss": 4.369597911834717, "step": 1024 }, { "epoch": 0.6231003039513677, "grad_norm": 1.145430326461792, "learning_rate": 0.0005, "loss": 4.104186058044434, "step": 1025 }, { "epoch": 0.6237082066869301, "grad_norm": 0.968573808670044, "learning_rate": 0.0005, "loss": 4.03414249420166, "step": 1026 }, { "epoch": 0.6243161094224924, "grad_norm": 1.2972266674041748, "learning_rate": 0.0005, "loss": 4.18367862701416, "step": 1027 }, { "epoch": 0.6249240121580547, "grad_norm": 0.9075741171836853, "learning_rate": 0.0005, "loss": 4.101839065551758, "step": 1028 }, { "epoch": 0.625531914893617, "grad_norm": 1.2480190992355347, "learning_rate": 0.0005, "loss": 4.170825004577637, "step": 1029 }, { "epoch": 0.6261398176291794, "grad_norm": 1.1662267446517944, "learning_rate": 0.0005, "loss": 4.132046222686768, "step": 1030 }, { "epoch": 0.6267477203647417, "grad_norm": 0.9081514477729797, "learning_rate": 0.0005, "loss": 4.023431777954102, "step": 1031 }, { "epoch": 0.6273556231003039, "grad_norm": 1.1570264101028442, "learning_rate": 0.0005, "loss": 4.246901512145996, "step": 1032 }, { "epoch": 0.6279635258358662, "grad_norm": 1.0261447429656982, "learning_rate": 0.0005, "loss": 4.251025199890137, "step": 1033 }, { "epoch": 0.6285714285714286, "grad_norm": 0.9957416653633118, "learning_rate": 0.0005, "loss": 4.112504482269287, "step": 1034 }, { "epoch": 0.6291793313069909, "grad_norm": 1.2634888887405396, "learning_rate": 0.0005, "loss": 4.397002220153809, "step": 1035 }, { "epoch": 0.6297872340425532, "grad_norm": 1.0848995447158813, "learning_rate": 0.0005, "loss": 4.163301467895508, "step": 1036 }, { "epoch": 0.6303951367781155, "grad_norm": 1.0806390047073364, "learning_rate": 0.0005, "loss": 3.78402042388916, "step": 1037 }, { "epoch": 0.6310030395136779, "grad_norm": 1.0640003681182861, "learning_rate": 0.0005, "loss": 4.0556440353393555, "step": 1038 }, { "epoch": 0.6316109422492401, "grad_norm": 0.9620634317398071, "learning_rate": 0.0005, "loss": 4.19709587097168, "step": 1039 }, { "epoch": 0.6322188449848024, "grad_norm": 1.4484918117523193, "learning_rate": 0.0005, "loss": 4.058507442474365, "step": 1040 }, { "epoch": 0.6328267477203647, "grad_norm": 1.219489574432373, "learning_rate": 0.0005, "loss": 4.1419267654418945, "step": 1041 }, { "epoch": 0.6334346504559271, "grad_norm": 1.127636194229126, "learning_rate": 0.0005, "loss": 3.892014980316162, "step": 1042 }, { "epoch": 0.6340425531914894, "grad_norm": 1.326476812362671, "learning_rate": 0.0005, "loss": 4.128079414367676, "step": 1043 }, { "epoch": 0.6346504559270517, "grad_norm": 1.1010375022888184, "learning_rate": 0.0005, "loss": 3.898940086364746, "step": 1044 }, { "epoch": 0.6352583586626139, "grad_norm": 1.1064268350601196, "learning_rate": 0.0005, "loss": 4.141763687133789, "step": 1045 }, { "epoch": 0.6358662613981763, "grad_norm": 1.24687659740448, "learning_rate": 0.0005, "loss": 4.210533618927002, "step": 1046 }, { "epoch": 0.6364741641337386, "grad_norm": 1.0071916580200195, "learning_rate": 0.0005, "loss": 4.255558013916016, "step": 1047 }, { "epoch": 0.6370820668693009, "grad_norm": 1.0620638132095337, "learning_rate": 0.0005, "loss": 4.008969306945801, "step": 1048 }, { "epoch": 0.6376899696048632, "grad_norm": 1.0604190826416016, "learning_rate": 0.0005, "loss": 4.224608421325684, "step": 1049 }, { "epoch": 0.6382978723404256, "grad_norm": 1.032774567604065, "learning_rate": 0.0005, "loss": 4.131565093994141, "step": 1050 }, { "epoch": 0.6389057750759879, "grad_norm": 0.9236063361167908, "learning_rate": 0.0005, "loss": 4.309024333953857, "step": 1051 }, { "epoch": 0.6395136778115501, "grad_norm": 1.059757947921753, "learning_rate": 0.0005, "loss": 4.041001319885254, "step": 1052 }, { "epoch": 0.6401215805471124, "grad_norm": 1.1099759340286255, "learning_rate": 0.0005, "loss": 3.9661004543304443, "step": 1053 }, { "epoch": 0.6407294832826748, "grad_norm": 1.0091055631637573, "learning_rate": 0.0005, "loss": 3.987016439437866, "step": 1054 }, { "epoch": 0.6413373860182371, "grad_norm": 1.1090649366378784, "learning_rate": 0.0005, "loss": 4.068497657775879, "step": 1055 }, { "epoch": 0.6419452887537994, "grad_norm": 1.0738252401351929, "learning_rate": 0.0005, "loss": 3.9846339225769043, "step": 1056 }, { "epoch": 0.6425531914893617, "grad_norm": 1.1196277141571045, "learning_rate": 0.0005, "loss": 4.063312530517578, "step": 1057 }, { "epoch": 0.643161094224924, "grad_norm": 1.2615549564361572, "learning_rate": 0.0005, "loss": 3.9986069202423096, "step": 1058 }, { "epoch": 0.6437689969604863, "grad_norm": 1.49628746509552, "learning_rate": 0.0005, "loss": 4.1674394607543945, "step": 1059 }, { "epoch": 0.6443768996960486, "grad_norm": 1.279189109802246, "learning_rate": 0.0005, "loss": 3.8027124404907227, "step": 1060 }, { "epoch": 0.6449848024316109, "grad_norm": 1.1228110790252686, "learning_rate": 0.0005, "loss": 3.7935433387756348, "step": 1061 }, { "epoch": 0.6455927051671733, "grad_norm": 1.082332730293274, "learning_rate": 0.0005, "loss": 4.039803981781006, "step": 1062 }, { "epoch": 0.6462006079027356, "grad_norm": 0.9758466482162476, "learning_rate": 0.0005, "loss": 4.102064609527588, "step": 1063 }, { "epoch": 0.6468085106382979, "grad_norm": 1.0097397565841675, "learning_rate": 0.0005, "loss": 4.058742523193359, "step": 1064 }, { "epoch": 0.6474164133738601, "grad_norm": 1.0726414918899536, "learning_rate": 0.0005, "loss": 4.0242133140563965, "step": 1065 }, { "epoch": 0.6480243161094225, "grad_norm": 1.107040524482727, "learning_rate": 0.0005, "loss": 3.9720733165740967, "step": 1066 }, { "epoch": 0.6486322188449848, "grad_norm": 1.258399248123169, "learning_rate": 0.0005, "loss": 3.85103178024292, "step": 1067 }, { "epoch": 0.6492401215805471, "grad_norm": 1.215524435043335, "learning_rate": 0.0005, "loss": 4.162143707275391, "step": 1068 }, { "epoch": 0.6498480243161094, "grad_norm": 1.0505629777908325, "learning_rate": 0.0005, "loss": 4.23874568939209, "step": 1069 }, { "epoch": 0.6504559270516718, "grad_norm": 1.2580337524414062, "learning_rate": 0.0005, "loss": 4.126619338989258, "step": 1070 }, { "epoch": 0.6510638297872341, "grad_norm": 1.1980527639389038, "learning_rate": 0.0005, "loss": 4.011953353881836, "step": 1071 }, { "epoch": 0.6516717325227963, "grad_norm": 1.020224690437317, "learning_rate": 0.0005, "loss": 4.2201948165893555, "step": 1072 }, { "epoch": 0.6522796352583586, "grad_norm": 1.0695855617523193, "learning_rate": 0.0005, "loss": 4.277288436889648, "step": 1073 }, { "epoch": 0.652887537993921, "grad_norm": 1.1862881183624268, "learning_rate": 0.0005, "loss": 4.1104841232299805, "step": 1074 }, { "epoch": 0.6534954407294833, "grad_norm": 1.7002424001693726, "learning_rate": 0.0005, "loss": 4.274345874786377, "step": 1075 }, { "epoch": 0.6541033434650456, "grad_norm": 1.3632254600524902, "learning_rate": 0.0005, "loss": 4.318878173828125, "step": 1076 }, { "epoch": 0.6547112462006079, "grad_norm": 1.1510448455810547, "learning_rate": 0.0005, "loss": 4.182323455810547, "step": 1077 }, { "epoch": 0.6553191489361702, "grad_norm": 1.143638014793396, "learning_rate": 0.0005, "loss": 4.174741744995117, "step": 1078 }, { "epoch": 0.6559270516717325, "grad_norm": 1.1500475406646729, "learning_rate": 0.0005, "loss": 3.9260623455047607, "step": 1079 }, { "epoch": 0.6565349544072948, "grad_norm": 1.293712854385376, "learning_rate": 0.0005, "loss": 4.087700843811035, "step": 1080 }, { "epoch": 0.6571428571428571, "grad_norm": 1.3932772874832153, "learning_rate": 0.0005, "loss": 4.118124961853027, "step": 1081 }, { "epoch": 0.6577507598784195, "grad_norm": 1.094328761100769, "learning_rate": 0.0005, "loss": 4.175318241119385, "step": 1082 }, { "epoch": 0.6583586626139818, "grad_norm": 1.467499017715454, "learning_rate": 0.0005, "loss": 4.272140979766846, "step": 1083 }, { "epoch": 0.6589665653495441, "grad_norm": 1.1503561735153198, "learning_rate": 0.0005, "loss": 4.183167934417725, "step": 1084 }, { "epoch": 0.6595744680851063, "grad_norm": 1.1912407875061035, "learning_rate": 0.0005, "loss": 4.055290222167969, "step": 1085 }, { "epoch": 0.6601823708206687, "grad_norm": 1.1428508758544922, "learning_rate": 0.0005, "loss": 4.183894157409668, "step": 1086 }, { "epoch": 0.660790273556231, "grad_norm": 1.136474609375, "learning_rate": 0.0005, "loss": 3.86468768119812, "step": 1087 }, { "epoch": 0.6613981762917933, "grad_norm": 1.0048547983169556, "learning_rate": 0.0005, "loss": 4.054813385009766, "step": 1088 }, { "epoch": 0.6620060790273556, "grad_norm": 1.021672010421753, "learning_rate": 0.0005, "loss": 3.9937808513641357, "step": 1089 }, { "epoch": 0.662613981762918, "grad_norm": 1.184766173362732, "learning_rate": 0.0005, "loss": 4.007226943969727, "step": 1090 }, { "epoch": 0.6632218844984803, "grad_norm": 1.1701700687408447, "learning_rate": 0.0005, "loss": 3.880901336669922, "step": 1091 }, { "epoch": 0.6638297872340425, "grad_norm": 1.0928300619125366, "learning_rate": 0.0005, "loss": 4.0920090675354, "step": 1092 }, { "epoch": 0.6644376899696048, "grad_norm": 1.0498013496398926, "learning_rate": 0.0005, "loss": 4.117219924926758, "step": 1093 }, { "epoch": 0.6650455927051672, "grad_norm": 1.034084439277649, "learning_rate": 0.0005, "loss": 4.0926313400268555, "step": 1094 }, { "epoch": 0.6656534954407295, "grad_norm": 0.939494788646698, "learning_rate": 0.0005, "loss": 4.018050670623779, "step": 1095 }, { "epoch": 0.6662613981762918, "grad_norm": 1.2339518070220947, "learning_rate": 0.0005, "loss": 3.9285688400268555, "step": 1096 }, { "epoch": 0.6668693009118541, "grad_norm": 1.1236822605133057, "learning_rate": 0.0005, "loss": 3.9050168991088867, "step": 1097 }, { "epoch": 0.6674772036474164, "grad_norm": 0.9875328540802002, "learning_rate": 0.0005, "loss": 4.033220291137695, "step": 1098 }, { "epoch": 0.6680851063829787, "grad_norm": 0.9468657374382019, "learning_rate": 0.0005, "loss": 4.108023643493652, "step": 1099 }, { "epoch": 0.668693009118541, "grad_norm": 1.0056613683700562, "learning_rate": 0.0005, "loss": 4.217726707458496, "step": 1100 }, { "epoch": 0.6693009118541033, "grad_norm": 1.1911637783050537, "learning_rate": 0.0005, "loss": 3.9922735691070557, "step": 1101 }, { "epoch": 0.6699088145896657, "grad_norm": 0.9524610638618469, "learning_rate": 0.0005, "loss": 3.843928337097168, "step": 1102 }, { "epoch": 0.670516717325228, "grad_norm": 1.1759804487228394, "learning_rate": 0.0005, "loss": 3.8452773094177246, "step": 1103 }, { "epoch": 0.6711246200607903, "grad_norm": 1.1534795761108398, "learning_rate": 0.0005, "loss": 4.147238731384277, "step": 1104 }, { "epoch": 0.6717325227963525, "grad_norm": 1.0438340902328491, "learning_rate": 0.0005, "loss": 3.814009666442871, "step": 1105 }, { "epoch": 0.6723404255319149, "grad_norm": 1.4943510293960571, "learning_rate": 0.0005, "loss": 4.062148571014404, "step": 1106 }, { "epoch": 0.6729483282674772, "grad_norm": 0.9739040732383728, "learning_rate": 0.0005, "loss": 4.066575050354004, "step": 1107 }, { "epoch": 0.6735562310030395, "grad_norm": 1.0727957487106323, "learning_rate": 0.0005, "loss": 3.935608386993408, "step": 1108 }, { "epoch": 0.6741641337386018, "grad_norm": 1.480692744255066, "learning_rate": 0.0005, "loss": 4.12183952331543, "step": 1109 }, { "epoch": 0.6747720364741642, "grad_norm": 1.1042070388793945, "learning_rate": 0.0005, "loss": 3.8309693336486816, "step": 1110 }, { "epoch": 0.6753799392097265, "grad_norm": 1.5949453115463257, "learning_rate": 0.0005, "loss": 4.225711822509766, "step": 1111 }, { "epoch": 0.6759878419452887, "grad_norm": 1.1404409408569336, "learning_rate": 0.0005, "loss": 3.870262384414673, "step": 1112 }, { "epoch": 0.676595744680851, "grad_norm": 1.1272308826446533, "learning_rate": 0.0005, "loss": 4.375516891479492, "step": 1113 }, { "epoch": 0.6772036474164134, "grad_norm": 1.3391433954238892, "learning_rate": 0.0005, "loss": 3.9125869274139404, "step": 1114 }, { "epoch": 0.6778115501519757, "grad_norm": 0.9406550526618958, "learning_rate": 0.0005, "loss": 4.000041961669922, "step": 1115 }, { "epoch": 0.678419452887538, "grad_norm": 1.211789846420288, "learning_rate": 0.0005, "loss": 4.146924018859863, "step": 1116 }, { "epoch": 0.6790273556231003, "grad_norm": 1.0479586124420166, "learning_rate": 0.0005, "loss": 3.77646803855896, "step": 1117 }, { "epoch": 0.6796352583586626, "grad_norm": 1.0069152116775513, "learning_rate": 0.0005, "loss": 4.110267162322998, "step": 1118 }, { "epoch": 0.6802431610942249, "grad_norm": 1.2088702917099, "learning_rate": 0.0005, "loss": 4.083201885223389, "step": 1119 }, { "epoch": 0.6808510638297872, "grad_norm": 1.3016067743301392, "learning_rate": 0.0005, "loss": 4.1130218505859375, "step": 1120 }, { "epoch": 0.6814589665653495, "grad_norm": 1.0395400524139404, "learning_rate": 0.0005, "loss": 4.012112617492676, "step": 1121 }, { "epoch": 0.6820668693009119, "grad_norm": 1.1534603834152222, "learning_rate": 0.0005, "loss": 3.8767285346984863, "step": 1122 }, { "epoch": 0.6826747720364742, "grad_norm": 1.1331707239151, "learning_rate": 0.0005, "loss": 3.8466670513153076, "step": 1123 }, { "epoch": 0.6832826747720365, "grad_norm": 1.0023419857025146, "learning_rate": 0.0005, "loss": 3.978550910949707, "step": 1124 }, { "epoch": 0.6838905775075987, "grad_norm": 1.198326826095581, "learning_rate": 0.0005, "loss": 4.160974502563477, "step": 1125 }, { "epoch": 0.6844984802431611, "grad_norm": 1.0249745845794678, "learning_rate": 0.0005, "loss": 3.961395740509033, "step": 1126 }, { "epoch": 0.6851063829787234, "grad_norm": 1.2853235006332397, "learning_rate": 0.0005, "loss": 4.322844505310059, "step": 1127 }, { "epoch": 0.6857142857142857, "grad_norm": 0.9774798154830933, "learning_rate": 0.0005, "loss": 4.034377098083496, "step": 1128 }, { "epoch": 0.686322188449848, "grad_norm": 1.1903027296066284, "learning_rate": 0.0005, "loss": 3.896298408508301, "step": 1129 }, { "epoch": 0.6869300911854104, "grad_norm": 0.9409128427505493, "learning_rate": 0.0005, "loss": 3.967303514480591, "step": 1130 }, { "epoch": 0.6875379939209726, "grad_norm": 1.0214065313339233, "learning_rate": 0.0005, "loss": 3.916367769241333, "step": 1131 }, { "epoch": 0.6881458966565349, "grad_norm": 1.3258956670761108, "learning_rate": 0.0005, "loss": 3.859543800354004, "step": 1132 }, { "epoch": 0.6887537993920972, "grad_norm": 1.0668888092041016, "learning_rate": 0.0005, "loss": 3.929979085922241, "step": 1133 }, { "epoch": 0.6893617021276596, "grad_norm": 1.0921815633773804, "learning_rate": 0.0005, "loss": 3.942767381668091, "step": 1134 }, { "epoch": 0.6899696048632219, "grad_norm": 1.1683087348937988, "learning_rate": 0.0005, "loss": 4.096218109130859, "step": 1135 }, { "epoch": 0.6905775075987842, "grad_norm": 1.150611162185669, "learning_rate": 0.0005, "loss": 4.065778732299805, "step": 1136 }, { "epoch": 0.6911854103343466, "grad_norm": 0.9955292344093323, "learning_rate": 0.0005, "loss": 3.83855938911438, "step": 1137 }, { "epoch": 0.6917933130699088, "grad_norm": 1.1191688776016235, "learning_rate": 0.0005, "loss": 4.189568519592285, "step": 1138 }, { "epoch": 0.6924012158054711, "grad_norm": 1.1021112203598022, "learning_rate": 0.0005, "loss": 4.004612445831299, "step": 1139 }, { "epoch": 0.6930091185410334, "grad_norm": 1.2468072175979614, "learning_rate": 0.0005, "loss": 3.867835283279419, "step": 1140 }, { "epoch": 0.6936170212765957, "grad_norm": 0.9965139627456665, "learning_rate": 0.0005, "loss": 3.8393120765686035, "step": 1141 }, { "epoch": 0.6942249240121581, "grad_norm": 1.2608331441879272, "learning_rate": 0.0005, "loss": 4.122796535491943, "step": 1142 }, { "epoch": 0.6948328267477204, "grad_norm": 0.9645028710365295, "learning_rate": 0.0005, "loss": 4.193379878997803, "step": 1143 }, { "epoch": 0.6954407294832827, "grad_norm": 1.103003978729248, "learning_rate": 0.0005, "loss": 3.80690860748291, "step": 1144 }, { "epoch": 0.6960486322188449, "grad_norm": 0.9812702536582947, "learning_rate": 0.0005, "loss": 3.910191059112549, "step": 1145 }, { "epoch": 0.6966565349544073, "grad_norm": 1.1629973649978638, "learning_rate": 0.0005, "loss": 3.912919282913208, "step": 1146 }, { "epoch": 0.6972644376899696, "grad_norm": 0.9559318423271179, "learning_rate": 0.0005, "loss": 4.1403703689575195, "step": 1147 }, { "epoch": 0.6978723404255319, "grad_norm": 1.187225103378296, "learning_rate": 0.0005, "loss": 4.227229595184326, "step": 1148 }, { "epoch": 0.6984802431610942, "grad_norm": 1.0893582105636597, "learning_rate": 0.0005, "loss": 4.085037708282471, "step": 1149 }, { "epoch": 0.6990881458966566, "grad_norm": 1.207614541053772, "learning_rate": 0.0005, "loss": 4.065666198730469, "step": 1150 }, { "epoch": 0.6996960486322188, "grad_norm": 1.1726024150848389, "learning_rate": 0.0005, "loss": 4.011224269866943, "step": 1151 }, { "epoch": 0.7003039513677811, "grad_norm": 1.0657603740692139, "learning_rate": 0.0005, "loss": 3.7515785694122314, "step": 1152 }, { "epoch": 0.7009118541033434, "grad_norm": 1.069787859916687, "learning_rate": 0.0005, "loss": 4.024471759796143, "step": 1153 }, { "epoch": 0.7015197568389058, "grad_norm": 1.0333293676376343, "learning_rate": 0.0005, "loss": 3.9765753746032715, "step": 1154 }, { "epoch": 0.7021276595744681, "grad_norm": 1.3091932535171509, "learning_rate": 0.0005, "loss": 4.086296081542969, "step": 1155 }, { "epoch": 0.7027355623100304, "grad_norm": 0.96831214427948, "learning_rate": 0.0005, "loss": 4.159407615661621, "step": 1156 }, { "epoch": 0.7033434650455928, "grad_norm": 1.0307363271713257, "learning_rate": 0.0005, "loss": 4.085778713226318, "step": 1157 }, { "epoch": 0.703951367781155, "grad_norm": 1.2046213150024414, "learning_rate": 0.0005, "loss": 4.149312973022461, "step": 1158 }, { "epoch": 0.7045592705167173, "grad_norm": 1.027969241142273, "learning_rate": 0.0005, "loss": 4.021113872528076, "step": 1159 }, { "epoch": 0.7051671732522796, "grad_norm": 0.886216938495636, "learning_rate": 0.0005, "loss": 4.1072492599487305, "step": 1160 }, { "epoch": 0.705775075987842, "grad_norm": 1.2814362049102783, "learning_rate": 0.0005, "loss": 4.351136207580566, "step": 1161 }, { "epoch": 0.7063829787234043, "grad_norm": 1.195614218711853, "learning_rate": 0.0005, "loss": 3.859333038330078, "step": 1162 }, { "epoch": 0.7069908814589666, "grad_norm": 1.02545964717865, "learning_rate": 0.0005, "loss": 3.993720531463623, "step": 1163 }, { "epoch": 0.7075987841945289, "grad_norm": 1.1973057985305786, "learning_rate": 0.0005, "loss": 4.022034168243408, "step": 1164 }, { "epoch": 0.7082066869300911, "grad_norm": 1.047211766242981, "learning_rate": 0.0005, "loss": 4.110629081726074, "step": 1165 }, { "epoch": 0.7088145896656535, "grad_norm": 0.9065303206443787, "learning_rate": 0.0005, "loss": 4.266797065734863, "step": 1166 }, { "epoch": 0.7094224924012158, "grad_norm": 1.0121465921401978, "learning_rate": 0.0005, "loss": 3.7510571479797363, "step": 1167 }, { "epoch": 0.7100303951367781, "grad_norm": 1.2128599882125854, "learning_rate": 0.0005, "loss": 4.190847396850586, "step": 1168 }, { "epoch": 0.7106382978723405, "grad_norm": 1.4867533445358276, "learning_rate": 0.0005, "loss": 4.125823497772217, "step": 1169 }, { "epoch": 0.7112462006079028, "grad_norm": 0.9088724851608276, "learning_rate": 0.0005, "loss": 4.092848777770996, "step": 1170 }, { "epoch": 0.711854103343465, "grad_norm": 0.980387270450592, "learning_rate": 0.0005, "loss": 3.963526725769043, "step": 1171 }, { "epoch": 0.7124620060790273, "grad_norm": 0.9671593308448792, "learning_rate": 0.0005, "loss": 3.886415958404541, "step": 1172 }, { "epoch": 0.7130699088145896, "grad_norm": 0.8448948860168457, "learning_rate": 0.0005, "loss": 3.960893154144287, "step": 1173 }, { "epoch": 0.713677811550152, "grad_norm": 1.0654000043869019, "learning_rate": 0.0005, "loss": 3.9057178497314453, "step": 1174 }, { "epoch": 0.7142857142857143, "grad_norm": 1.255560040473938, "learning_rate": 0.0005, "loss": 4.085208892822266, "step": 1175 }, { "epoch": 0.7148936170212766, "grad_norm": 1.172607183456421, "learning_rate": 0.0005, "loss": 3.8918302059173584, "step": 1176 }, { "epoch": 0.715501519756839, "grad_norm": 1.1429939270019531, "learning_rate": 0.0005, "loss": 3.8840150833129883, "step": 1177 }, { "epoch": 0.7161094224924012, "grad_norm": 1.0610404014587402, "learning_rate": 0.0005, "loss": 4.0701003074646, "step": 1178 }, { "epoch": 0.7167173252279635, "grad_norm": 1.0055387020111084, "learning_rate": 0.0005, "loss": 3.773474931716919, "step": 1179 }, { "epoch": 0.7173252279635258, "grad_norm": 1.0536381006240845, "learning_rate": 0.0005, "loss": 3.885234832763672, "step": 1180 }, { "epoch": 0.7179331306990882, "grad_norm": 1.2304924726486206, "learning_rate": 0.0005, "loss": 4.134721755981445, "step": 1181 }, { "epoch": 0.7185410334346505, "grad_norm": 1.1367759704589844, "learning_rate": 0.0005, "loss": 4.07640266418457, "step": 1182 }, { "epoch": 0.7191489361702128, "grad_norm": 0.9987047910690308, "learning_rate": 0.0005, "loss": 3.941793918609619, "step": 1183 }, { "epoch": 0.7197568389057751, "grad_norm": 0.9390357136726379, "learning_rate": 0.0005, "loss": 4.057319641113281, "step": 1184 }, { "epoch": 0.7203647416413373, "grad_norm": 1.3009685277938843, "learning_rate": 0.0005, "loss": 3.7487921714782715, "step": 1185 }, { "epoch": 0.7209726443768997, "grad_norm": 1.0107924938201904, "learning_rate": 0.0005, "loss": 3.9321677684783936, "step": 1186 }, { "epoch": 0.721580547112462, "grad_norm": 1.003091812133789, "learning_rate": 0.0005, "loss": 3.855192184448242, "step": 1187 }, { "epoch": 0.7221884498480243, "grad_norm": 1.1665643453598022, "learning_rate": 0.0005, "loss": 4.062481880187988, "step": 1188 }, { "epoch": 0.7227963525835867, "grad_norm": 1.0481219291687012, "learning_rate": 0.0005, "loss": 3.7130908966064453, "step": 1189 }, { "epoch": 0.723404255319149, "grad_norm": 1.4968420267105103, "learning_rate": 0.0005, "loss": 4.054961681365967, "step": 1190 }, { "epoch": 0.7240121580547112, "grad_norm": 1.0543270111083984, "learning_rate": 0.0005, "loss": 4.080737113952637, "step": 1191 }, { "epoch": 0.7246200607902735, "grad_norm": 1.3208811283111572, "learning_rate": 0.0005, "loss": 3.828869581222534, "step": 1192 }, { "epoch": 0.7252279635258359, "grad_norm": 1.1503605842590332, "learning_rate": 0.0005, "loss": 3.897340774536133, "step": 1193 }, { "epoch": 0.7258358662613982, "grad_norm": 0.9485260844230652, "learning_rate": 0.0005, "loss": 4.022025108337402, "step": 1194 }, { "epoch": 0.7264437689969605, "grad_norm": 1.0768346786499023, "learning_rate": 0.0005, "loss": 4.132440567016602, "step": 1195 }, { "epoch": 0.7270516717325228, "grad_norm": 1.0768530368804932, "learning_rate": 0.0005, "loss": 4.167514801025391, "step": 1196 }, { "epoch": 0.7276595744680852, "grad_norm": 1.1659386157989502, "learning_rate": 0.0005, "loss": 3.9605331420898438, "step": 1197 }, { "epoch": 0.7282674772036474, "grad_norm": 0.9825963377952576, "learning_rate": 0.0005, "loss": 3.9677042961120605, "step": 1198 }, { "epoch": 0.7288753799392097, "grad_norm": 1.200975775718689, "learning_rate": 0.0005, "loss": 4.059627056121826, "step": 1199 }, { "epoch": 0.729483282674772, "grad_norm": 1.0287483930587769, "learning_rate": 0.0005, "loss": 3.9571118354797363, "step": 1200 }, { "epoch": 0.7300911854103344, "grad_norm": 1.171775221824646, "learning_rate": 0.0005, "loss": 4.2555084228515625, "step": 1201 }, { "epoch": 0.7306990881458967, "grad_norm": 1.2075831890106201, "learning_rate": 0.0005, "loss": 3.9444262981414795, "step": 1202 }, { "epoch": 0.731306990881459, "grad_norm": 1.1258975267410278, "learning_rate": 0.0005, "loss": 4.037412643432617, "step": 1203 }, { "epoch": 0.7319148936170212, "grad_norm": 1.107055902481079, "learning_rate": 0.0005, "loss": 4.046412467956543, "step": 1204 }, { "epoch": 0.7325227963525835, "grad_norm": 1.1721580028533936, "learning_rate": 0.0005, "loss": 4.039186477661133, "step": 1205 }, { "epoch": 0.7331306990881459, "grad_norm": 1.8083940744400024, "learning_rate": 0.0005, "loss": 4.255344390869141, "step": 1206 }, { "epoch": 0.7337386018237082, "grad_norm": 1.1505194902420044, "learning_rate": 0.0005, "loss": 3.849947452545166, "step": 1207 }, { "epoch": 0.7343465045592705, "grad_norm": 1.0368176698684692, "learning_rate": 0.0005, "loss": 4.037250518798828, "step": 1208 }, { "epoch": 0.7349544072948329, "grad_norm": 1.076282262802124, "learning_rate": 0.0005, "loss": 3.8192124366760254, "step": 1209 }, { "epoch": 0.7355623100303952, "grad_norm": 0.9457529187202454, "learning_rate": 0.0005, "loss": 3.988126516342163, "step": 1210 }, { "epoch": 0.7361702127659574, "grad_norm": 1.396436333656311, "learning_rate": 0.0005, "loss": 3.8735647201538086, "step": 1211 }, { "epoch": 0.7367781155015197, "grad_norm": 1.1978737115859985, "learning_rate": 0.0005, "loss": 3.7308835983276367, "step": 1212 }, { "epoch": 0.737386018237082, "grad_norm": 1.2270631790161133, "learning_rate": 0.0005, "loss": 3.7548632621765137, "step": 1213 }, { "epoch": 0.7379939209726444, "grad_norm": 1.0319976806640625, "learning_rate": 0.0005, "loss": 4.095251083374023, "step": 1214 }, { "epoch": 0.7386018237082067, "grad_norm": 1.2742465734481812, "learning_rate": 0.0005, "loss": 3.7844438552856445, "step": 1215 }, { "epoch": 0.739209726443769, "grad_norm": 0.9936171174049377, "learning_rate": 0.0005, "loss": 3.6828408241271973, "step": 1216 }, { "epoch": 0.7398176291793314, "grad_norm": 1.0827305316925049, "learning_rate": 0.0005, "loss": 4.2918291091918945, "step": 1217 }, { "epoch": 0.7404255319148936, "grad_norm": 1.0626490116119385, "learning_rate": 0.0005, "loss": 3.8438615798950195, "step": 1218 }, { "epoch": 0.7410334346504559, "grad_norm": 1.0187205076217651, "learning_rate": 0.0005, "loss": 4.007755279541016, "step": 1219 }, { "epoch": 0.7416413373860182, "grad_norm": 0.9945427775382996, "learning_rate": 0.0005, "loss": 3.8854458332061768, "step": 1220 }, { "epoch": 0.7422492401215806, "grad_norm": 0.9728744029998779, "learning_rate": 0.0005, "loss": 3.7727737426757812, "step": 1221 }, { "epoch": 0.7428571428571429, "grad_norm": 1.0771368741989136, "learning_rate": 0.0005, "loss": 3.984614133834839, "step": 1222 }, { "epoch": 0.7434650455927052, "grad_norm": 1.0673145055770874, "learning_rate": 0.0005, "loss": 4.186018943786621, "step": 1223 }, { "epoch": 0.7440729483282674, "grad_norm": 1.0385884046554565, "learning_rate": 0.0005, "loss": 3.9700469970703125, "step": 1224 }, { "epoch": 0.7446808510638298, "grad_norm": 0.9378101229667664, "learning_rate": 0.0005, "loss": 4.093457221984863, "step": 1225 }, { "epoch": 0.7452887537993921, "grad_norm": 1.1992157697677612, "learning_rate": 0.0005, "loss": 3.8426108360290527, "step": 1226 }, { "epoch": 0.7458966565349544, "grad_norm": 0.9516767263412476, "learning_rate": 0.0005, "loss": 3.540165901184082, "step": 1227 }, { "epoch": 0.7465045592705167, "grad_norm": 0.9911203980445862, "learning_rate": 0.0005, "loss": 3.791531562805176, "step": 1228 }, { "epoch": 0.7471124620060791, "grad_norm": 1.1304718255996704, "learning_rate": 0.0005, "loss": 3.8638508319854736, "step": 1229 }, { "epoch": 0.7477203647416414, "grad_norm": 3.538874626159668, "learning_rate": 0.0005, "loss": 4.131631851196289, "step": 1230 }, { "epoch": 0.7483282674772036, "grad_norm": 1.096618413925171, "learning_rate": 0.0005, "loss": 3.782884120941162, "step": 1231 }, { "epoch": 0.7489361702127659, "grad_norm": 1.2701330184936523, "learning_rate": 0.0005, "loss": 3.992222785949707, "step": 1232 }, { "epoch": 0.7495440729483283, "grad_norm": 1.0706497430801392, "learning_rate": 0.0005, "loss": 3.908442735671997, "step": 1233 }, { "epoch": 0.7501519756838906, "grad_norm": 1.030834436416626, "learning_rate": 0.0005, "loss": 4.000621318817139, "step": 1234 }, { "epoch": 0.7507598784194529, "grad_norm": 1.3895245790481567, "learning_rate": 0.0005, "loss": 3.80660343170166, "step": 1235 }, { "epoch": 0.7513677811550152, "grad_norm": 0.9692356586456299, "learning_rate": 0.0005, "loss": 4.078845977783203, "step": 1236 }, { "epoch": 0.7519756838905776, "grad_norm": 1.1271778345108032, "learning_rate": 0.0005, "loss": 3.9555394649505615, "step": 1237 }, { "epoch": 0.7525835866261398, "grad_norm": 1.5441569089889526, "learning_rate": 0.0005, "loss": 3.963904857635498, "step": 1238 }, { "epoch": 0.7531914893617021, "grad_norm": 1.7030054330825806, "learning_rate": 0.0005, "loss": 4.024696350097656, "step": 1239 }, { "epoch": 0.7537993920972644, "grad_norm": 1.12552011013031, "learning_rate": 0.0005, "loss": 3.919168472290039, "step": 1240 }, { "epoch": 0.7544072948328268, "grad_norm": 1.0487366914749146, "learning_rate": 0.0005, "loss": 4.095437526702881, "step": 1241 }, { "epoch": 0.7550151975683891, "grad_norm": 1.0279390811920166, "learning_rate": 0.0005, "loss": 3.941718816757202, "step": 1242 }, { "epoch": 0.7556231003039514, "grad_norm": 1.080350399017334, "learning_rate": 0.0005, "loss": 3.7040228843688965, "step": 1243 }, { "epoch": 0.7562310030395136, "grad_norm": 1.0182151794433594, "learning_rate": 0.0005, "loss": 4.062251091003418, "step": 1244 }, { "epoch": 0.756838905775076, "grad_norm": 1.078009843826294, "learning_rate": 0.0005, "loss": 3.8745062351226807, "step": 1245 }, { "epoch": 0.7574468085106383, "grad_norm": 1.0222269296646118, "learning_rate": 0.0005, "loss": 3.7564854621887207, "step": 1246 }, { "epoch": 0.7580547112462006, "grad_norm": 1.329654335975647, "learning_rate": 0.0005, "loss": 3.8875160217285156, "step": 1247 }, { "epoch": 0.7586626139817629, "grad_norm": 1.0129868984222412, "learning_rate": 0.0005, "loss": 3.8748350143432617, "step": 1248 }, { "epoch": 0.7592705167173253, "grad_norm": 1.030468225479126, "learning_rate": 0.0005, "loss": 3.8655738830566406, "step": 1249 }, { "epoch": 0.7598784194528876, "grad_norm": 1.111459732055664, "learning_rate": 0.0005, "loss": 3.9891488552093506, "step": 1250 }, { "epoch": 0.7604863221884498, "grad_norm": 1.4396013021469116, "learning_rate": 0.0005, "loss": 3.9904720783233643, "step": 1251 }, { "epoch": 0.7610942249240121, "grad_norm": 1.2336925268173218, "learning_rate": 0.0005, "loss": 3.7369742393493652, "step": 1252 }, { "epoch": 0.7617021276595745, "grad_norm": 0.8990273475646973, "learning_rate": 0.0005, "loss": 3.9168124198913574, "step": 1253 }, { "epoch": 0.7623100303951368, "grad_norm": 1.2932227849960327, "learning_rate": 0.0005, "loss": 4.008082389831543, "step": 1254 }, { "epoch": 0.7629179331306991, "grad_norm": 0.9154768586158752, "learning_rate": 0.0005, "loss": 4.019550323486328, "step": 1255 }, { "epoch": 0.7635258358662614, "grad_norm": 0.9175946712493896, "learning_rate": 0.0005, "loss": 3.97037935256958, "step": 1256 }, { "epoch": 0.7641337386018237, "grad_norm": 1.067017912864685, "learning_rate": 0.0005, "loss": 4.1387038230896, "step": 1257 }, { "epoch": 0.764741641337386, "grad_norm": 1.1540616750717163, "learning_rate": 0.0005, "loss": 3.979078769683838, "step": 1258 }, { "epoch": 0.7653495440729483, "grad_norm": 0.9942051768302917, "learning_rate": 0.0005, "loss": 4.157475471496582, "step": 1259 }, { "epoch": 0.7659574468085106, "grad_norm": 1.0882611274719238, "learning_rate": 0.0005, "loss": 3.784665584564209, "step": 1260 }, { "epoch": 0.766565349544073, "grad_norm": 1.0358823537826538, "learning_rate": 0.0005, "loss": 3.8665788173675537, "step": 1261 }, { "epoch": 0.7671732522796353, "grad_norm": 0.9150176048278809, "learning_rate": 0.0005, "loss": 3.9708409309387207, "step": 1262 }, { "epoch": 0.7677811550151976, "grad_norm": 1.2305281162261963, "learning_rate": 0.0005, "loss": 3.791486978530884, "step": 1263 }, { "epoch": 0.7683890577507598, "grad_norm": 1.0246379375457764, "learning_rate": 0.0005, "loss": 3.931403875350952, "step": 1264 }, { "epoch": 0.7689969604863222, "grad_norm": 1.342997431755066, "learning_rate": 0.0005, "loss": 3.800549030303955, "step": 1265 }, { "epoch": 0.7696048632218845, "grad_norm": 1.0477383136749268, "learning_rate": 0.0005, "loss": 3.9642491340637207, "step": 1266 }, { "epoch": 0.7702127659574468, "grad_norm": 1.5231037139892578, "learning_rate": 0.0005, "loss": 3.883274555206299, "step": 1267 }, { "epoch": 0.7708206686930091, "grad_norm": 1.21817147731781, "learning_rate": 0.0005, "loss": 3.8319201469421387, "step": 1268 }, { "epoch": 0.7714285714285715, "grad_norm": 1.3139930963516235, "learning_rate": 0.0005, "loss": 3.902513027191162, "step": 1269 }, { "epoch": 0.7720364741641338, "grad_norm": 1.1108347177505493, "learning_rate": 0.0005, "loss": 4.040473937988281, "step": 1270 }, { "epoch": 0.772644376899696, "grad_norm": 0.9352411031723022, "learning_rate": 0.0005, "loss": 3.8833415508270264, "step": 1271 }, { "epoch": 0.7732522796352583, "grad_norm": 0.9234441518783569, "learning_rate": 0.0005, "loss": 4.101876258850098, "step": 1272 }, { "epoch": 0.7738601823708207, "grad_norm": 1.0629017353057861, "learning_rate": 0.0005, "loss": 3.869561195373535, "step": 1273 }, { "epoch": 0.774468085106383, "grad_norm": 1.0356484651565552, "learning_rate": 0.0005, "loss": 3.9723856449127197, "step": 1274 }, { "epoch": 0.7750759878419453, "grad_norm": 0.9600344896316528, "learning_rate": 0.0005, "loss": 3.824707508087158, "step": 1275 }, { "epoch": 0.7756838905775076, "grad_norm": 1.0315158367156982, "learning_rate": 0.0005, "loss": 4.001948356628418, "step": 1276 }, { "epoch": 0.7762917933130699, "grad_norm": 1.1866099834442139, "learning_rate": 0.0005, "loss": 3.763075828552246, "step": 1277 }, { "epoch": 0.7768996960486322, "grad_norm": 1.1227611303329468, "learning_rate": 0.0005, "loss": 3.846872329711914, "step": 1278 }, { "epoch": 0.7775075987841945, "grad_norm": 1.1628526449203491, "learning_rate": 0.0005, "loss": 3.929243564605713, "step": 1279 }, { "epoch": 0.7781155015197568, "grad_norm": 0.9936217069625854, "learning_rate": 0.0005, "loss": 3.736764907836914, "step": 1280 }, { "epoch": 0.7787234042553192, "grad_norm": 1.0325050354003906, "learning_rate": 0.0005, "loss": 3.935077667236328, "step": 1281 }, { "epoch": 0.7793313069908815, "grad_norm": 1.0567058324813843, "learning_rate": 0.0005, "loss": 3.801319122314453, "step": 1282 }, { "epoch": 0.7799392097264438, "grad_norm": 1.313740611076355, "learning_rate": 0.0005, "loss": 4.132801055908203, "step": 1283 }, { "epoch": 0.780547112462006, "grad_norm": 1.4536793231964111, "learning_rate": 0.0005, "loss": 3.88094425201416, "step": 1284 }, { "epoch": 0.7811550151975684, "grad_norm": 1.1501535177230835, "learning_rate": 0.0005, "loss": 3.8404948711395264, "step": 1285 }, { "epoch": 0.7817629179331307, "grad_norm": 1.3253229856491089, "learning_rate": 0.0005, "loss": 4.016423225402832, "step": 1286 }, { "epoch": 0.782370820668693, "grad_norm": 1.2896214723587036, "learning_rate": 0.0005, "loss": 3.8204050064086914, "step": 1287 }, { "epoch": 0.7829787234042553, "grad_norm": 1.347516655921936, "learning_rate": 0.0005, "loss": 3.849546432495117, "step": 1288 }, { "epoch": 0.7835866261398177, "grad_norm": 1.5418754816055298, "learning_rate": 0.0005, "loss": 4.135068893432617, "step": 1289 }, { "epoch": 0.78419452887538, "grad_norm": 1.0823962688446045, "learning_rate": 0.0005, "loss": 3.752423048019409, "step": 1290 }, { "epoch": 0.7848024316109422, "grad_norm": 1.1146916151046753, "learning_rate": 0.0005, "loss": 3.810540199279785, "step": 1291 }, { "epoch": 0.7854103343465045, "grad_norm": 1.0943037271499634, "learning_rate": 0.0005, "loss": 3.761442184448242, "step": 1292 }, { "epoch": 0.7860182370820669, "grad_norm": 1.0425827503204346, "learning_rate": 0.0005, "loss": 3.7996015548706055, "step": 1293 }, { "epoch": 0.7866261398176292, "grad_norm": 1.5982511043548584, "learning_rate": 0.0005, "loss": 3.8388147354125977, "step": 1294 }, { "epoch": 0.7872340425531915, "grad_norm": 1.4619585275650024, "learning_rate": 0.0005, "loss": 4.016120910644531, "step": 1295 }, { "epoch": 0.7878419452887538, "grad_norm": 1.3633700609207153, "learning_rate": 0.0005, "loss": 4.069618225097656, "step": 1296 }, { "epoch": 0.7884498480243161, "grad_norm": 1.009056568145752, "learning_rate": 0.0005, "loss": 4.0978264808654785, "step": 1297 }, { "epoch": 0.7890577507598784, "grad_norm": 1.1812894344329834, "learning_rate": 0.0005, "loss": 3.6178488731384277, "step": 1298 }, { "epoch": 0.7896656534954407, "grad_norm": 1.0647777318954468, "learning_rate": 0.0005, "loss": 3.910210371017456, "step": 1299 }, { "epoch": 0.790273556231003, "grad_norm": 1.4413726329803467, "learning_rate": 0.0005, "loss": 3.9200563430786133, "step": 1300 }, { "epoch": 0.7908814589665654, "grad_norm": 1.1021374464035034, "learning_rate": 0.0005, "loss": 3.680574655532837, "step": 1301 }, { "epoch": 0.7914893617021277, "grad_norm": 1.0827854871749878, "learning_rate": 0.0005, "loss": 3.842402458190918, "step": 1302 }, { "epoch": 0.79209726443769, "grad_norm": 1.2615513801574707, "learning_rate": 0.0005, "loss": 4.0547590255737305, "step": 1303 }, { "epoch": 0.7927051671732522, "grad_norm": 1.0599168539047241, "learning_rate": 0.0005, "loss": 3.8400776386260986, "step": 1304 }, { "epoch": 0.7933130699088146, "grad_norm": 1.4258071184158325, "learning_rate": 0.0005, "loss": 3.945885181427002, "step": 1305 }, { "epoch": 0.7939209726443769, "grad_norm": 1.107612133026123, "learning_rate": 0.0005, "loss": 3.6351089477539062, "step": 1306 }, { "epoch": 0.7945288753799392, "grad_norm": 0.9725725650787354, "learning_rate": 0.0005, "loss": 3.5905802249908447, "step": 1307 }, { "epoch": 0.7951367781155015, "grad_norm": 1.3178088665008545, "learning_rate": 0.0005, "loss": 4.063264846801758, "step": 1308 }, { "epoch": 0.7957446808510639, "grad_norm": 1.111405611038208, "learning_rate": 0.0005, "loss": 3.70896053314209, "step": 1309 }, { "epoch": 0.7963525835866262, "grad_norm": 1.0547385215759277, "learning_rate": 0.0005, "loss": 4.020359516143799, "step": 1310 }, { "epoch": 0.7969604863221884, "grad_norm": 1.1632133722305298, "learning_rate": 0.0005, "loss": 3.8566200733184814, "step": 1311 }, { "epoch": 0.7975683890577507, "grad_norm": 1.0662367343902588, "learning_rate": 0.0005, "loss": 3.7626304626464844, "step": 1312 }, { "epoch": 0.7981762917933131, "grad_norm": 1.0058962106704712, "learning_rate": 0.0005, "loss": 3.667207956314087, "step": 1313 }, { "epoch": 0.7987841945288754, "grad_norm": 1.21786367893219, "learning_rate": 0.0005, "loss": 3.7486650943756104, "step": 1314 }, { "epoch": 0.7993920972644377, "grad_norm": 1.576144814491272, "learning_rate": 0.0005, "loss": 3.836618185043335, "step": 1315 }, { "epoch": 0.8, "grad_norm": 1.0205941200256348, "learning_rate": 0.0005, "loss": 3.921718120574951, "step": 1316 }, { "epoch": 0.8006079027355623, "grad_norm": 1.1202620267868042, "learning_rate": 0.0005, "loss": 3.979546308517456, "step": 1317 }, { "epoch": 0.8012158054711246, "grad_norm": 1.266727089881897, "learning_rate": 0.0004999886023671629, "loss": 3.7467775344848633, "step": 1318 }, { "epoch": 0.8018237082066869, "grad_norm": 1.1622782945632935, "learning_rate": 0.0004999544105079001, "loss": 4.046473503112793, "step": 1319 }, { "epoch": 0.8024316109422492, "grad_norm": 1.1754651069641113, "learning_rate": 0.0004998974275398614, "loss": 3.6320791244506836, "step": 1320 }, { "epoch": 0.8030395136778116, "grad_norm": 0.9786376953125, "learning_rate": 0.0004998176586588145, "loss": 3.6877191066741943, "step": 1321 }, { "epoch": 0.8036474164133739, "grad_norm": 0.969366729259491, "learning_rate": 0.0004997151111381707, "loss": 3.766533374786377, "step": 1322 }, { "epoch": 0.8042553191489362, "grad_norm": 0.9558953046798706, "learning_rate": 0.0004995897943283221, "loss": 4.06315803527832, "step": 1323 }, { "epoch": 0.8048632218844984, "grad_norm": 0.8645924925804138, "learning_rate": 0.0004994417196557883, "loss": 3.838135004043579, "step": 1324 }, { "epoch": 0.8054711246200608, "grad_norm": 0.8671835064888, "learning_rate": 0.0004992709006221755, "loss": 3.883330821990967, "step": 1325 }, { "epoch": 0.8060790273556231, "grad_norm": 1.1144053936004639, "learning_rate": 0.0004990773528029446, "loss": 3.7989044189453125, "step": 1326 }, { "epoch": 0.8066869300911854, "grad_norm": 1.0151537656784058, "learning_rate": 0.0004988610938459917, "loss": 4.007248878479004, "step": 1327 }, { "epoch": 0.8072948328267477, "grad_norm": 1.2170069217681885, "learning_rate": 0.0004986221434700379, "loss": 3.8616843223571777, "step": 1328 }, { "epoch": 0.8079027355623101, "grad_norm": 0.8724591135978699, "learning_rate": 0.0004983605234628328, "loss": 4.205953598022461, "step": 1329 }, { "epoch": 0.8085106382978723, "grad_norm": 1.1466760635375977, "learning_rate": 0.0004980762576791664, "loss": 3.9655470848083496, "step": 1330 }, { "epoch": 0.8091185410334346, "grad_norm": 1.1359692811965942, "learning_rate": 0.000497769372038695, "loss": 4.22013521194458, "step": 1331 }, { "epoch": 0.8097264437689969, "grad_norm": 1.0394648313522339, "learning_rate": 0.0004974398945235776, "loss": 3.911543130874634, "step": 1332 }, { "epoch": 0.8103343465045593, "grad_norm": 1.0383487939834595, "learning_rate": 0.0004970878551759239, "loss": 3.8219704627990723, "step": 1333 }, { "epoch": 0.8109422492401216, "grad_norm": 1.0844473838806152, "learning_rate": 0.000496713286095056, "loss": 3.876410484313965, "step": 1334 }, { "epoch": 0.8115501519756839, "grad_norm": 1.2770010232925415, "learning_rate": 0.0004963162214345805, "loss": 3.8071320056915283, "step": 1335 }, { "epoch": 0.8121580547112462, "grad_norm": 1.0182770490646362, "learning_rate": 0.0004958966973992754, "loss": 3.6059393882751465, "step": 1336 }, { "epoch": 0.8127659574468085, "grad_norm": 1.02802574634552, "learning_rate": 0.0004954547522417877, "loss": 3.669658660888672, "step": 1337 }, { "epoch": 0.8133738601823708, "grad_norm": 1.1248687505722046, "learning_rate": 0.0004949904262591467, "loss": 3.9866435527801514, "step": 1338 }, { "epoch": 0.8139817629179331, "grad_norm": 1.0492587089538574, "learning_rate": 0.0004945037617890889, "loss": 3.949676036834717, "step": 1339 }, { "epoch": 0.8145896656534954, "grad_norm": 0.9690307974815369, "learning_rate": 0.000493994803206198, "loss": 3.748741626739502, "step": 1340 }, { "epoch": 0.8151975683890578, "grad_norm": 1.465824842453003, "learning_rate": 0.0004934635969178583, "loss": 3.977262020111084, "step": 1341 }, { "epoch": 0.8158054711246201, "grad_norm": 1.0349231958389282, "learning_rate": 0.0004929101913600238, "loss": 3.619255542755127, "step": 1342 }, { "epoch": 0.8164133738601824, "grad_norm": 1.0467352867126465, "learning_rate": 0.0004923346369928012, "loss": 3.9860079288482666, "step": 1343 }, { "epoch": 0.8170212765957446, "grad_norm": 1.0222679376602173, "learning_rate": 0.0004917369862958494, "loss": 3.830394744873047, "step": 1344 }, { "epoch": 0.817629179331307, "grad_norm": 1.117563247680664, "learning_rate": 0.0004911172937635942, "loss": 3.7490053176879883, "step": 1345 }, { "epoch": 0.8182370820668693, "grad_norm": 1.4361516237258911, "learning_rate": 0.000490475615900259, "loss": 3.8583335876464844, "step": 1346 }, { "epoch": 0.8188449848024316, "grad_norm": 1.2900465726852417, "learning_rate": 0.0004898120112147136, "loss": 3.906479835510254, "step": 1347 }, { "epoch": 0.819452887537994, "grad_norm": 1.1463675498962402, "learning_rate": 0.0004891265402151381, "loss": 3.9391555786132812, "step": 1348 }, { "epoch": 0.8200607902735563, "grad_norm": 1.5694284439086914, "learning_rate": 0.0004884192654035069, "loss": 3.974485397338867, "step": 1349 }, { "epoch": 0.8206686930091185, "grad_norm": 1.0200462341308594, "learning_rate": 0.000487690251269889, "loss": 3.6864073276519775, "step": 1350 }, { "epoch": 0.8212765957446808, "grad_norm": 1.089603304862976, "learning_rate": 0.0004869395642865676, "loss": 3.7212605476379395, "step": 1351 }, { "epoch": 0.8218844984802431, "grad_norm": 1.2351415157318115, "learning_rate": 0.0004861672729019797, "loss": 3.700591802597046, "step": 1352 }, { "epoch": 0.8224924012158055, "grad_norm": 0.9957062602043152, "learning_rate": 0.00048537344753447453, "loss": 3.7319130897521973, "step": 1353 }, { "epoch": 0.8231003039513678, "grad_norm": 1.056557059288025, "learning_rate": 0.0004845581605658926, "loss": 3.657074213027954, "step": 1354 }, { "epoch": 0.8237082066869301, "grad_norm": 1.0980826616287231, "learning_rate": 0.00048372148633496617, "loss": 3.770319938659668, "step": 1355 }, { "epoch": 0.8243161094224924, "grad_norm": 0.8664339780807495, "learning_rate": 0.0004828635011305407, "loss": 3.894157886505127, "step": 1356 }, { "epoch": 0.8249240121580547, "grad_norm": 1.2869031429290771, "learning_rate": 0.00048198428318461896, "loss": 3.6396484375, "step": 1357 }, { "epoch": 0.825531914893617, "grad_norm": 1.459326148033142, "learning_rate": 0.0004810839126652275, "loss": 4.004338264465332, "step": 1358 }, { "epoch": 0.8261398176291793, "grad_norm": 1.1490086317062378, "learning_rate": 0.0004801624716691072, "loss": 4.074912071228027, "step": 1359 }, { "epoch": 0.8267477203647416, "grad_norm": 1.0660607814788818, "learning_rate": 0.00047922004421422726, "loss": 3.8288257122039795, "step": 1360 }, { "epoch": 0.827355623100304, "grad_norm": 1.1202282905578613, "learning_rate": 0.00047825671623212454, "loss": 3.728804111480713, "step": 1361 }, { "epoch": 0.8279635258358663, "grad_norm": 1.07158625125885, "learning_rate": 0.0004772725755600682, "loss": 3.5751538276672363, "step": 1362 }, { "epoch": 0.8285714285714286, "grad_norm": 1.008811354637146, "learning_rate": 0.0004762677119330505, "loss": 3.8057093620300293, "step": 1363 }, { "epoch": 0.8291793313069908, "grad_norm": 1.4745137691497803, "learning_rate": 0.00047524221697560476, "loss": 3.8376100063323975, "step": 1364 }, { "epoch": 0.8297872340425532, "grad_norm": 1.2719781398773193, "learning_rate": 0.00047419618419345115, "loss": 3.747580051422119, "step": 1365 }, { "epoch": 0.8303951367781155, "grad_norm": 1.1576839685440063, "learning_rate": 0.0004731297089649703, "loss": 3.7823610305786133, "step": 1366 }, { "epoch": 0.8310030395136778, "grad_norm": 1.0849125385284424, "learning_rate": 0.0004720428885325069, "loss": 3.9312424659729004, "step": 1367 }, { "epoch": 0.8316109422492401, "grad_norm": 1.0173479318618774, "learning_rate": 0.00047093582199350285, "loss": 3.641855239868164, "step": 1368 }, { "epoch": 0.8322188449848025, "grad_norm": 0.9390632510185242, "learning_rate": 0.00046980861029146173, "loss": 4.027669906616211, "step": 1369 }, { "epoch": 0.8328267477203647, "grad_norm": 1.0367680788040161, "learning_rate": 0.0004686613562067444, "loss": 3.9053921699523926, "step": 1370 }, { "epoch": 0.833434650455927, "grad_norm": 0.9983039498329163, "learning_rate": 0.00046749416434719747, "loss": 3.6601035594940186, "step": 1371 }, { "epoch": 0.8340425531914893, "grad_norm": 1.2556368112564087, "learning_rate": 0.00046630714113861507, "loss": 3.643587350845337, "step": 1372 }, { "epoch": 0.8346504559270517, "grad_norm": 1.0456199645996094, "learning_rate": 0.00046510039481503486, "loss": 3.689802646636963, "step": 1373 }, { "epoch": 0.835258358662614, "grad_norm": 0.9730342626571655, "learning_rate": 0.00046387403540886895, "loss": 3.6004483699798584, "step": 1374 }, { "epoch": 0.8358662613981763, "grad_norm": 1.2402491569519043, "learning_rate": 0.00046262817474087127, "loss": 3.6834664344787598, "step": 1375 }, { "epoch": 0.8364741641337387, "grad_norm": 1.1597247123718262, "learning_rate": 0.00046136292640994154, "loss": 3.7525768280029297, "step": 1376 }, { "epoch": 0.8370820668693009, "grad_norm": 1.2773432731628418, "learning_rate": 0.0004600784057827671, "loss": 3.862699508666992, "step": 1377 }, { "epoch": 0.8376899696048632, "grad_norm": 1.2818998098373413, "learning_rate": 0.00045877472998330385, "loss": 4.099722385406494, "step": 1378 }, { "epoch": 0.8382978723404255, "grad_norm": 1.0724464654922485, "learning_rate": 0.0004574520178820965, "loss": 3.8608179092407227, "step": 1379 }, { "epoch": 0.8389057750759878, "grad_norm": 1.293906807899475, "learning_rate": 0.0004561103900854401, "loss": 3.723815441131592, "step": 1380 }, { "epoch": 0.8395136778115502, "grad_norm": 1.0045194625854492, "learning_rate": 0.0004547499689243829, "loss": 3.7255592346191406, "step": 1381 }, { "epoch": 0.8401215805471125, "grad_norm": 1.0186697244644165, "learning_rate": 0.0004533708784435722, "loss": 3.6717958450317383, "step": 1382 }, { "epoch": 0.8407294832826747, "grad_norm": 1.0383477210998535, "learning_rate": 0.0004519732443899435, "loss": 3.681596279144287, "step": 1383 }, { "epoch": 0.841337386018237, "grad_norm": 1.1144697666168213, "learning_rate": 0.00045055719420125504, "loss": 3.9934191703796387, "step": 1384 }, { "epoch": 0.8419452887537994, "grad_norm": 1.056483268737793, "learning_rate": 0.0004491228569944679, "loss": 4.028287887573242, "step": 1385 }, { "epoch": 0.8425531914893617, "grad_norm": 1.1046830415725708, "learning_rate": 0.0004476703635539728, "loss": 3.823612689971924, "step": 1386 }, { "epoch": 0.843161094224924, "grad_norm": 1.1697293519973755, "learning_rate": 0.00044619984631966527, "loss": 3.7220816612243652, "step": 1387 }, { "epoch": 0.8437689969604864, "grad_norm": 1.0626883506774902, "learning_rate": 0.0004447114393748694, "loss": 3.5306200981140137, "step": 1388 }, { "epoch": 0.8443768996960487, "grad_norm": 1.153074026107788, "learning_rate": 0.0004432052784341122, "loss": 3.672762393951416, "step": 1389 }, { "epoch": 0.8449848024316109, "grad_norm": 0.9894313812255859, "learning_rate": 0.0004416815008307488, "loss": 3.661726474761963, "step": 1390 }, { "epoch": 0.8455927051671732, "grad_norm": 0.9667363166809082, "learning_rate": 0.00044014024550444045, "loss": 3.788522720336914, "step": 1391 }, { "epoch": 0.8462006079027355, "grad_norm": 1.2645761966705322, "learning_rate": 0.00043858165298848556, "loss": 3.721158981323242, "step": 1392 }, { "epoch": 0.8468085106382979, "grad_norm": 1.0775492191314697, "learning_rate": 0.00043700586539700614, "loss": 3.5772523880004883, "step": 1393 }, { "epoch": 0.8474164133738602, "grad_norm": 1.0271198749542236, "learning_rate": 0.00043541302641198946, "loss": 3.820373058319092, "step": 1394 }, { "epoch": 0.8480243161094225, "grad_norm": 0.8722153902053833, "learning_rate": 0.00043380328127018663, "loss": 3.610518455505371, "step": 1395 }, { "epoch": 0.8486322188449849, "grad_norm": 1.0228782892227173, "learning_rate": 0.00043217677674987047, "loss": 3.7967772483825684, "step": 1396 }, { "epoch": 0.8492401215805471, "grad_norm": 1.0525845289230347, "learning_rate": 0.00043053366115745174, "loss": 3.623091697692871, "step": 1397 }, { "epoch": 0.8498480243161094, "grad_norm": 0.998408317565918, "learning_rate": 0.00042887408431395614, "loss": 3.685908317565918, "step": 1398 }, { "epoch": 0.8504559270516717, "grad_norm": 1.026895523071289, "learning_rate": 0.0004271981975413639, "loss": 3.5633139610290527, "step": 1399 }, { "epoch": 0.851063829787234, "grad_norm": 1.0553339719772339, "learning_rate": 0.00042550615364881196, "loss": 3.833423137664795, "step": 1400 }, { "epoch": 0.8516717325227964, "grad_norm": 1.1019606590270996, "learning_rate": 0.00042379810691866064, "loss": 3.8337411880493164, "step": 1401 }, { "epoch": 0.8522796352583587, "grad_norm": 1.7001227140426636, "learning_rate": 0.0004220742130924257, "loss": 3.495081663131714, "step": 1402 }, { "epoch": 0.8528875379939209, "grad_norm": 1.0422172546386719, "learning_rate": 0.0004203346293565784, "loss": 3.7549071311950684, "step": 1403 }, { "epoch": 0.8534954407294832, "grad_norm": 1.2587510347366333, "learning_rate": 0.0004185795143282123, "loss": 3.770139217376709, "step": 1404 }, { "epoch": 0.8541033434650456, "grad_norm": 1.1424074172973633, "learning_rate": 0.00041680902804058095, "loss": 3.779757499694824, "step": 1405 }, { "epoch": 0.8547112462006079, "grad_norm": 1.0849041938781738, "learning_rate": 0.0004150233319285055, "loss": 3.8310835361480713, "step": 1406 }, { "epoch": 0.8553191489361702, "grad_norm": 1.1193660497665405, "learning_rate": 0.00041322258881365515, "loss": 3.7291765213012695, "step": 1407 }, { "epoch": 0.8559270516717326, "grad_norm": 1.1108680963516235, "learning_rate": 0.0004114069628897006, "loss": 4.129992485046387, "step": 1408 }, { "epoch": 0.8565349544072949, "grad_norm": 1.1723637580871582, "learning_rate": 0.0004095766197073432, "loss": 3.6475980281829834, "step": 1409 }, { "epoch": 0.8571428571428571, "grad_norm": 1.0332688093185425, "learning_rate": 0.0004077317261592194, "loss": 3.8548192977905273, "step": 1410 }, { "epoch": 0.8577507598784194, "grad_norm": 1.0339442491531372, "learning_rate": 0.0004058724504646834, "loss": 3.8385329246520996, "step": 1411 }, { "epoch": 0.8583586626139817, "grad_norm": 1.0235612392425537, "learning_rate": 0.000403998962154469, "loss": 3.8993711471557617, "step": 1412 }, { "epoch": 0.8589665653495441, "grad_norm": 0.8945487141609192, "learning_rate": 0.0004021114320552311, "loss": 3.681536912918091, "step": 1413 }, { "epoch": 0.8595744680851064, "grad_norm": 0.907351016998291, "learning_rate": 0.00040021003227397014, "loss": 3.751767635345459, "step": 1414 }, { "epoch": 0.8601823708206687, "grad_norm": 0.8751946091651917, "learning_rate": 0.0003982949361823388, "loss": 3.6982154846191406, "step": 1415 }, { "epoch": 0.8607902735562311, "grad_norm": 0.9630452990531921, "learning_rate": 0.0003963663184008338, "loss": 3.7995591163635254, "step": 1416 }, { "epoch": 0.8613981762917933, "grad_norm": 1.0385856628417969, "learning_rate": 0.0003944243547828742, "loss": 3.583292007446289, "step": 1417 }, { "epoch": 0.8620060790273556, "grad_norm": 1.1438446044921875, "learning_rate": 0.000392469222398766, "loss": 3.97701096534729, "step": 1418 }, { "epoch": 0.8626139817629179, "grad_norm": 1.0620638132095337, "learning_rate": 0.00039050109951955814, "loss": 3.5980987548828125, "step": 1419 }, { "epoch": 0.8632218844984803, "grad_norm": 0.978015661239624, "learning_rate": 0.000388520165600786, "loss": 3.7316596508026123, "step": 1420 }, { "epoch": 0.8638297872340426, "grad_norm": 1.0127967596054077, "learning_rate": 0.0003865266012661095, "loss": 3.9404823780059814, "step": 1421 }, { "epoch": 0.8644376899696049, "grad_norm": 1.2003899812698364, "learning_rate": 0.0003845205882908432, "loss": 3.918931245803833, "step": 1422 }, { "epoch": 0.8650455927051671, "grad_norm": 1.1857889890670776, "learning_rate": 0.000382502309585382, "loss": 3.6090548038482666, "step": 1423 }, { "epoch": 0.8656534954407294, "grad_norm": 1.2434966564178467, "learning_rate": 0.000380471949178523, "loss": 3.467123031616211, "step": 1424 }, { "epoch": 0.8662613981762918, "grad_norm": 1.2050342559814453, "learning_rate": 0.0003784296922006859, "loss": 3.696073055267334, "step": 1425 }, { "epoch": 0.8668693009118541, "grad_norm": 1.1129356622695923, "learning_rate": 0.0003763757248670321, "loss": 3.715449810028076, "step": 1426 }, { "epoch": 0.8674772036474164, "grad_norm": 1.1708143949508667, "learning_rate": 0.00037431023446048595, "loss": 3.860975980758667, "step": 1427 }, { "epoch": 0.8680851063829788, "grad_norm": 1.1058366298675537, "learning_rate": 0.0003722334093146576, "loss": 3.7916457653045654, "step": 1428 }, { "epoch": 0.8686930091185411, "grad_norm": 1.274646520614624, "learning_rate": 0.00037014543879667093, "loss": 3.8200652599334717, "step": 1429 }, { "epoch": 0.8693009118541033, "grad_norm": 1.2253806591033936, "learning_rate": 0.00036804651328989666, "loss": 3.522810459136963, "step": 1430 }, { "epoch": 0.8699088145896656, "grad_norm": 1.1936273574829102, "learning_rate": 0.000365936824176593, "loss": 3.9892830848693848, "step": 1431 }, { "epoch": 0.870516717325228, "grad_norm": 1.1107733249664307, "learning_rate": 0.00036381656382045526, "loss": 3.6833291053771973, "step": 1432 }, { "epoch": 0.8711246200607903, "grad_norm": 1.4528982639312744, "learning_rate": 0.00036168592554907596, "loss": 3.4317424297332764, "step": 1433 }, { "epoch": 0.8717325227963526, "grad_norm": 1.539918303489685, "learning_rate": 0.0003595451036363168, "loss": 3.9146463871002197, "step": 1434 }, { "epoch": 0.8723404255319149, "grad_norm": 1.0589654445648193, "learning_rate": 0.00035739429328459493, "loss": 3.64989972114563, "step": 1435 }, { "epoch": 0.8729483282674773, "grad_norm": 0.9970619082450867, "learning_rate": 0.0003552336906070838, "loss": 3.7197318077087402, "step": 1436 }, { "epoch": 0.8735562310030395, "grad_norm": 1.1559967994689941, "learning_rate": 0.0003530634926098316, "loss": 3.835594892501831, "step": 1437 }, { "epoch": 0.8741641337386018, "grad_norm": 1.0069043636322021, "learning_rate": 0.0003508838971737981, "loss": 3.8029980659484863, "step": 1438 }, { "epoch": 0.8747720364741641, "grad_norm": 1.3581100702285767, "learning_rate": 0.0003486951030368113, "loss": 3.6824827194213867, "step": 1439 }, { "epoch": 0.8753799392097265, "grad_norm": 1.7533200979232788, "learning_rate": 0.00034649730977544664, "loss": 3.7235536575317383, "step": 1440 }, { "epoch": 0.8759878419452888, "grad_norm": 1.0940066576004028, "learning_rate": 0.0003442907177868293, "loss": 3.6482458114624023, "step": 1441 }, { "epoch": 0.8765957446808511, "grad_norm": 1.0252796411514282, "learning_rate": 0.00034207552827036176, "loss": 3.634884834289551, "step": 1442 }, { "epoch": 0.8772036474164133, "grad_norm": 1.3038619756698608, "learning_rate": 0.0003398519432093782, "loss": 3.886862277984619, "step": 1443 }, { "epoch": 0.8778115501519757, "grad_norm": 1.5358000993728638, "learning_rate": 0.00033762016535272745, "loss": 3.916736125946045, "step": 1444 }, { "epoch": 0.878419452887538, "grad_norm": 1.0540707111358643, "learning_rate": 0.00033538039819628625, "loss": 3.914485454559326, "step": 1445 }, { "epoch": 0.8790273556231003, "grad_norm": 1.0498977899551392, "learning_rate": 0.000333132845964404, "loss": 3.6423823833465576, "step": 1446 }, { "epoch": 0.8796352583586626, "grad_norm": 2.2342031002044678, "learning_rate": 0.00033087771359128175, "loss": 3.7215816974639893, "step": 1447 }, { "epoch": 0.880243161094225, "grad_norm": 1.4365023374557495, "learning_rate": 0.00032861520670228586, "loss": 3.7631328105926514, "step": 1448 }, { "epoch": 0.8808510638297873, "grad_norm": 2.098018169403076, "learning_rate": 0.00032634553159519865, "loss": 3.4372754096984863, "step": 1449 }, { "epoch": 0.8814589665653495, "grad_norm": 0.9924235939979553, "learning_rate": 0.0003240688952214085, "loss": 4.062948226928711, "step": 1450 }, { "epoch": 0.8820668693009118, "grad_norm": 1.2176319360733032, "learning_rate": 0.0003217855051670393, "loss": 3.6439735889434814, "step": 1451 }, { "epoch": 0.8826747720364742, "grad_norm": 1.2388694286346436, "learning_rate": 0.00031949556963402283, "loss": 3.8236451148986816, "step": 1452 }, { "epoch": 0.8832826747720365, "grad_norm": 0.8196237683296204, "learning_rate": 0.00031719929742111437, "loss": 3.686429977416992, "step": 1453 }, { "epoch": 0.8838905775075988, "grad_norm": 0.9667937755584717, "learning_rate": 0.00031489689790485464, "loss": 4.012905120849609, "step": 1454 }, { "epoch": 0.8844984802431611, "grad_norm": 0.9525713920593262, "learning_rate": 0.00031258858102047813, "loss": 3.484525680541992, "step": 1455 }, { "epoch": 0.8851063829787233, "grad_norm": 1.0953255891799927, "learning_rate": 0.0003102745572427716, "loss": 3.6367969512939453, "step": 1456 }, { "epoch": 0.8857142857142857, "grad_norm": 0.8041018843650818, "learning_rate": 0.0003079550375668821, "loss": 3.627480983734131, "step": 1457 }, { "epoch": 0.886322188449848, "grad_norm": 0.9474192261695862, "learning_rate": 0.0003056302334890786, "loss": 3.771761894226074, "step": 1458 }, { "epoch": 0.8869300911854103, "grad_norm": 0.9393053650856018, "learning_rate": 0.00030330035698746753, "loss": 3.4475784301757812, "step": 1459 }, { "epoch": 0.8875379939209727, "grad_norm": 1.0495154857635498, "learning_rate": 0.00030096562050266427, "loss": 3.7747950553894043, "step": 1460 }, { "epoch": 0.888145896656535, "grad_norm": 1.0793986320495605, "learning_rate": 0.0002986262369184226, "loss": 3.5836963653564453, "step": 1461 }, { "epoch": 0.8887537993920973, "grad_norm": 1.0350525379180908, "learning_rate": 0.0002962824195422238, "loss": 3.8103108406066895, "step": 1462 }, { "epoch": 0.8893617021276595, "grad_norm": 0.923565149307251, "learning_rate": 0.0002939343820858269, "loss": 3.5080511569976807, "step": 1463 }, { "epoch": 0.8899696048632219, "grad_norm": 0.9419893026351929, "learning_rate": 0.00029158233864578256, "loss": 3.5664780139923096, "step": 1464 }, { "epoch": 0.8905775075987842, "grad_norm": 1.120071530342102, "learning_rate": 0.000289226503683911, "loss": 3.5272912979125977, "step": 1465 }, { "epoch": 0.8911854103343465, "grad_norm": 0.9679391980171204, "learning_rate": 0.0002868670920077478, "loss": 3.7422268390655518, "step": 1466 }, { "epoch": 0.8917933130699088, "grad_norm": 0.8348677754402161, "learning_rate": 0.0002845043187509567, "loss": 3.677544355392456, "step": 1467 }, { "epoch": 0.8924012158054712, "grad_norm": 0.8530043959617615, "learning_rate": 0.0002821383993537144, "loss": 3.6701407432556152, "step": 1468 }, { "epoch": 0.8930091185410335, "grad_norm": 0.9390717148780823, "learning_rate": 0.00027976954954306554, "loss": 3.7104759216308594, "step": 1469 }, { "epoch": 0.8936170212765957, "grad_norm": 1.0507652759552002, "learning_rate": 0.0002773979853132534, "loss": 3.5879673957824707, "step": 1470 }, { "epoch": 0.894224924012158, "grad_norm": 0.9291044473648071, "learning_rate": 0.0002750239229060246, "loss": 3.655197858810425, "step": 1471 }, { "epoch": 0.8948328267477204, "grad_norm": 0.9448993802070618, "learning_rate": 0.0002726475787909125, "loss": 3.6126198768615723, "step": 1472 }, { "epoch": 0.8954407294832827, "grad_norm": 0.9143878221511841, "learning_rate": 0.0002702691696454986, "loss": 3.7886955738067627, "step": 1473 }, { "epoch": 0.896048632218845, "grad_norm": 0.8731086850166321, "learning_rate": 0.00026788891233565655, "loss": 3.4998018741607666, "step": 1474 }, { "epoch": 0.8966565349544073, "grad_norm": 1.0264720916748047, "learning_rate": 0.0002655070238957772, "loss": 3.7816460132598877, "step": 1475 }, { "epoch": 0.8972644376899696, "grad_norm": 0.9198083877563477, "learning_rate": 0.0002631237215089798, "loss": 3.2887322902679443, "step": 1476 } ], "logging_steps": 1, "max_steps": 1645, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 164, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8063788889999933e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }