{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2961, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00033772374197906115, "grad_norm": 50.10189437866211, "learning_rate": 0.0, "loss": 2.4076, "step": 1 }, { "epoch": 0.0006754474839581223, "grad_norm": 44.271629333496094, "learning_rate": 2.247191011235955e-07, "loss": 2.3058, "step": 2 }, { "epoch": 0.0010131712259371835, "grad_norm": 41.26779556274414, "learning_rate": 4.49438202247191e-07, "loss": 2.4165, "step": 3 }, { "epoch": 0.0013508949679162446, "grad_norm": 45.60620880126953, "learning_rate": 6.741573033707865e-07, "loss": 2.2885, "step": 4 }, { "epoch": 0.0016886187098953055, "grad_norm": 52.9456672668457, "learning_rate": 8.98876404494382e-07, "loss": 2.303, "step": 5 }, { "epoch": 0.002026342451874367, "grad_norm": 42.909305572509766, "learning_rate": 1.1235955056179777e-06, "loss": 2.3153, "step": 6 }, { "epoch": 0.002364066193853428, "grad_norm": 41.13112258911133, "learning_rate": 1.348314606741573e-06, "loss": 2.3465, "step": 7 }, { "epoch": 0.002701789935832489, "grad_norm": 38.38589096069336, "learning_rate": 1.5730337078651686e-06, "loss": 2.1576, "step": 8 }, { "epoch": 0.00303951367781155, "grad_norm": 35.56371307373047, "learning_rate": 1.797752808988764e-06, "loss": 2.0807, "step": 9 }, { "epoch": 0.003377237419790611, "grad_norm": 26.647472381591797, "learning_rate": 2.02247191011236e-06, "loss": 1.7679, "step": 10 }, { "epoch": 0.0037149611617696724, "grad_norm": 28.12204360961914, "learning_rate": 2.2471910112359554e-06, "loss": 1.7461, "step": 11 }, { "epoch": 0.004052684903748734, "grad_norm": 25.484024047851562, "learning_rate": 2.4719101123595505e-06, "loss": 1.5513, "step": 12 }, { "epoch": 0.004390408645727794, "grad_norm": 20.636688232421875, "learning_rate": 2.696629213483146e-06, "loss": 1.5211, "step": 13 }, { "epoch": 0.004728132387706856, "grad_norm": 9.782940864562988, "learning_rate": 2.9213483146067416e-06, "loss": 1.2883, "step": 14 }, { "epoch": 0.005065856129685917, "grad_norm": 9.97684097290039, "learning_rate": 3.146067415730337e-06, "loss": 1.2591, "step": 15 }, { "epoch": 0.005403579871664978, "grad_norm": 9.615913391113281, "learning_rate": 3.3707865168539327e-06, "loss": 1.2123, "step": 16 }, { "epoch": 0.005741303613644039, "grad_norm": 8.775111198425293, "learning_rate": 3.595505617977528e-06, "loss": 1.1456, "step": 17 }, { "epoch": 0.0060790273556231, "grad_norm": 8.429645538330078, "learning_rate": 3.820224719101124e-06, "loss": 1.1558, "step": 18 }, { "epoch": 0.006416751097602162, "grad_norm": 8.449453353881836, "learning_rate": 4.04494382022472e-06, "loss": 1.133, "step": 19 }, { "epoch": 0.006754474839581222, "grad_norm": 11.782390594482422, "learning_rate": 4.269662921348315e-06, "loss": 1.0918, "step": 20 }, { "epoch": 0.0070921985815602835, "grad_norm": 14.329768180847168, "learning_rate": 4.494382022471911e-06, "loss": 1.2144, "step": 21 }, { "epoch": 0.007429922323539345, "grad_norm": 10.508218765258789, "learning_rate": 4.719101123595506e-06, "loss": 1.1004, "step": 22 }, { "epoch": 0.007767646065518406, "grad_norm": 8.545432090759277, "learning_rate": 4.943820224719101e-06, "loss": 1.0638, "step": 23 }, { "epoch": 0.008105369807497468, "grad_norm": 8.0914888381958, "learning_rate": 5.168539325842698e-06, "loss": 1.0281, "step": 24 }, { "epoch": 0.008443093549476529, "grad_norm": 8.022607803344727, "learning_rate": 5.393258426966292e-06, "loss": 1.0743, "step": 25 }, { "epoch": 0.008780817291455589, "grad_norm": 7.25279426574707, "learning_rate": 5.617977528089889e-06, "loss": 0.9934, "step": 26 }, { "epoch": 0.00911854103343465, "grad_norm": 6.515768051147461, "learning_rate": 5.842696629213483e-06, "loss": 0.9689, "step": 27 }, { "epoch": 0.009456264775413711, "grad_norm": 6.414740562438965, "learning_rate": 6.06741573033708e-06, "loss": 0.9712, "step": 28 }, { "epoch": 0.009793988517392773, "grad_norm": 5.925638198852539, "learning_rate": 6.292134831460674e-06, "loss": 0.8881, "step": 29 }, { "epoch": 0.010131712259371834, "grad_norm": 5.318961143493652, "learning_rate": 6.51685393258427e-06, "loss": 0.856, "step": 30 }, { "epoch": 0.010469436001350895, "grad_norm": 5.410531044006348, "learning_rate": 6.741573033707865e-06, "loss": 0.7896, "step": 31 }, { "epoch": 0.010807159743329957, "grad_norm": 5.174140930175781, "learning_rate": 6.966292134831461e-06, "loss": 0.8966, "step": 32 }, { "epoch": 0.011144883485309016, "grad_norm": 5.5696234703063965, "learning_rate": 7.191011235955056e-06, "loss": 0.8468, "step": 33 }, { "epoch": 0.011482607227288078, "grad_norm": 5.987701892852783, "learning_rate": 7.415730337078652e-06, "loss": 0.8169, "step": 34 }, { "epoch": 0.01182033096926714, "grad_norm": 5.436456680297852, "learning_rate": 7.640449438202247e-06, "loss": 0.8133, "step": 35 }, { "epoch": 0.0121580547112462, "grad_norm": 5.516933917999268, "learning_rate": 7.865168539325843e-06, "loss": 0.9219, "step": 36 }, { "epoch": 0.012495778453225262, "grad_norm": 5.960140705108643, "learning_rate": 8.08988764044944e-06, "loss": 0.8073, "step": 37 }, { "epoch": 0.012833502195204323, "grad_norm": 5.202682971954346, "learning_rate": 8.314606741573035e-06, "loss": 0.8338, "step": 38 }, { "epoch": 0.013171225937183385, "grad_norm": 4.67331075668335, "learning_rate": 8.53932584269663e-06, "loss": 0.8111, "step": 39 }, { "epoch": 0.013508949679162444, "grad_norm": 4.596957206726074, "learning_rate": 8.764044943820226e-06, "loss": 0.7566, "step": 40 }, { "epoch": 0.013846673421141506, "grad_norm": 5.329619407653809, "learning_rate": 8.988764044943822e-06, "loss": 0.7601, "step": 41 }, { "epoch": 0.014184397163120567, "grad_norm": 4.438983917236328, "learning_rate": 9.213483146067417e-06, "loss": 0.7545, "step": 42 }, { "epoch": 0.014522120905099628, "grad_norm": 7.461650848388672, "learning_rate": 9.438202247191012e-06, "loss": 0.7572, "step": 43 }, { "epoch": 0.01485984464707869, "grad_norm": 4.689310550689697, "learning_rate": 9.662921348314608e-06, "loss": 0.699, "step": 44 }, { "epoch": 0.015197568389057751, "grad_norm": 7.0688066482543945, "learning_rate": 9.887640449438202e-06, "loss": 0.753, "step": 45 }, { "epoch": 0.015535292131036813, "grad_norm": 5.523561000823975, "learning_rate": 1.01123595505618e-05, "loss": 0.7081, "step": 46 }, { "epoch": 0.015873015873015872, "grad_norm": 5.229451656341553, "learning_rate": 1.0337078651685396e-05, "loss": 0.7223, "step": 47 }, { "epoch": 0.016210739614994935, "grad_norm": 4.734966278076172, "learning_rate": 1.0561797752808988e-05, "loss": 0.7837, "step": 48 }, { "epoch": 0.016548463356973995, "grad_norm": 4.201187610626221, "learning_rate": 1.0786516853932584e-05, "loss": 0.7775, "step": 49 }, { "epoch": 0.016886187098953058, "grad_norm": 6.756969451904297, "learning_rate": 1.101123595505618e-05, "loss": 0.7058, "step": 50 }, { "epoch": 0.017223910840932118, "grad_norm": 4.614748001098633, "learning_rate": 1.1235955056179778e-05, "loss": 0.6543, "step": 51 }, { "epoch": 0.017561634582911177, "grad_norm": 4.245177268981934, "learning_rate": 1.146067415730337e-05, "loss": 0.6674, "step": 52 }, { "epoch": 0.01789935832489024, "grad_norm": 4.3909502029418945, "learning_rate": 1.1685393258426966e-05, "loss": 0.7157, "step": 53 }, { "epoch": 0.0182370820668693, "grad_norm": 4.713353157043457, "learning_rate": 1.1910112359550562e-05, "loss": 0.7074, "step": 54 }, { "epoch": 0.018574805808848363, "grad_norm": 4.385791301727295, "learning_rate": 1.213483146067416e-05, "loss": 0.6711, "step": 55 }, { "epoch": 0.018912529550827423, "grad_norm": 4.733304977416992, "learning_rate": 1.2359550561797752e-05, "loss": 0.6978, "step": 56 }, { "epoch": 0.019250253292806486, "grad_norm": 4.483788967132568, "learning_rate": 1.2584269662921348e-05, "loss": 0.7308, "step": 57 }, { "epoch": 0.019587977034785545, "grad_norm": 7.842435359954834, "learning_rate": 1.2808988764044944e-05, "loss": 0.7204, "step": 58 }, { "epoch": 0.019925700776764605, "grad_norm": 5.329195976257324, "learning_rate": 1.303370786516854e-05, "loss": 0.7541, "step": 59 }, { "epoch": 0.020263424518743668, "grad_norm": 6.006994247436523, "learning_rate": 1.3258426966292135e-05, "loss": 0.6854, "step": 60 }, { "epoch": 0.020601148260722728, "grad_norm": 4.954163551330566, "learning_rate": 1.348314606741573e-05, "loss": 0.6772, "step": 61 }, { "epoch": 0.02093887200270179, "grad_norm": 6.115952491760254, "learning_rate": 1.3707865168539327e-05, "loss": 0.733, "step": 62 }, { "epoch": 0.02127659574468085, "grad_norm": 5.515040874481201, "learning_rate": 1.3932584269662923e-05, "loss": 0.6764, "step": 63 }, { "epoch": 0.021614319486659914, "grad_norm": 5.990635395050049, "learning_rate": 1.4157303370786517e-05, "loss": 0.6807, "step": 64 }, { "epoch": 0.021952043228638973, "grad_norm": 5.867841720581055, "learning_rate": 1.4382022471910113e-05, "loss": 0.7252, "step": 65 }, { "epoch": 0.022289766970618033, "grad_norm": 8.022838592529297, "learning_rate": 1.4606741573033709e-05, "loss": 0.6723, "step": 66 }, { "epoch": 0.022627490712597096, "grad_norm": 6.407839298248291, "learning_rate": 1.4831460674157305e-05, "loss": 0.6876, "step": 67 }, { "epoch": 0.022965214454576156, "grad_norm": 5.519984722137451, "learning_rate": 1.5056179775280899e-05, "loss": 0.6095, "step": 68 }, { "epoch": 0.02330293819655522, "grad_norm": 5.558239936828613, "learning_rate": 1.5280898876404495e-05, "loss": 0.6624, "step": 69 }, { "epoch": 0.02364066193853428, "grad_norm": 6.600611209869385, "learning_rate": 1.5505617977528093e-05, "loss": 0.68, "step": 70 }, { "epoch": 0.02397838568051334, "grad_norm": 4.17883825302124, "learning_rate": 1.5730337078651687e-05, "loss": 0.6673, "step": 71 }, { "epoch": 0.0243161094224924, "grad_norm": 4.849052429199219, "learning_rate": 1.595505617977528e-05, "loss": 0.6922, "step": 72 }, { "epoch": 0.02465383316447146, "grad_norm": 6.452691078186035, "learning_rate": 1.617977528089888e-05, "loss": 0.6857, "step": 73 }, { "epoch": 0.024991556906450524, "grad_norm": 5.199310302734375, "learning_rate": 1.6404494382022473e-05, "loss": 0.6675, "step": 74 }, { "epoch": 0.025329280648429583, "grad_norm": 4.932150363922119, "learning_rate": 1.662921348314607e-05, "loss": 0.6734, "step": 75 }, { "epoch": 0.025667004390408647, "grad_norm": 5.169692039489746, "learning_rate": 1.6853932584269665e-05, "loss": 0.6714, "step": 76 }, { "epoch": 0.026004728132387706, "grad_norm": 4.994898319244385, "learning_rate": 1.707865168539326e-05, "loss": 0.6863, "step": 77 }, { "epoch": 0.02634245187436677, "grad_norm": 7.306075096130371, "learning_rate": 1.7303370786516857e-05, "loss": 0.6854, "step": 78 }, { "epoch": 0.02668017561634583, "grad_norm": 7.095992088317871, "learning_rate": 1.752808988764045e-05, "loss": 0.6672, "step": 79 }, { "epoch": 0.02701789935832489, "grad_norm": 4.298572540283203, "learning_rate": 1.7752808988764045e-05, "loss": 0.6451, "step": 80 }, { "epoch": 0.02735562310030395, "grad_norm": 5.45023250579834, "learning_rate": 1.7977528089887643e-05, "loss": 0.6654, "step": 81 }, { "epoch": 0.02769334684228301, "grad_norm": 4.542657852172852, "learning_rate": 1.8202247191011237e-05, "loss": 0.6912, "step": 82 }, { "epoch": 0.028031070584262074, "grad_norm": 8.690850257873535, "learning_rate": 1.8426966292134835e-05, "loss": 0.6274, "step": 83 }, { "epoch": 0.028368794326241134, "grad_norm": 6.253932952880859, "learning_rate": 1.8651685393258426e-05, "loss": 0.6622, "step": 84 }, { "epoch": 0.028706518068220197, "grad_norm": 7.63929557800293, "learning_rate": 1.8876404494382024e-05, "loss": 0.5958, "step": 85 }, { "epoch": 0.029044241810199257, "grad_norm": 7.0183539390563965, "learning_rate": 1.910112359550562e-05, "loss": 0.659, "step": 86 }, { "epoch": 0.029381965552178316, "grad_norm": 6.354076385498047, "learning_rate": 1.9325842696629215e-05, "loss": 0.6239, "step": 87 }, { "epoch": 0.02971968929415738, "grad_norm": 5.443641185760498, "learning_rate": 1.955056179775281e-05, "loss": 0.6462, "step": 88 }, { "epoch": 0.03005741303613644, "grad_norm": 4.6254801750183105, "learning_rate": 1.9775280898876404e-05, "loss": 0.6225, "step": 89 }, { "epoch": 0.030395136778115502, "grad_norm": 8.222158432006836, "learning_rate": 2e-05, "loss": 0.6809, "step": 90 }, { "epoch": 0.030732860520094562, "grad_norm": 4.348745346069336, "learning_rate": 1.999999401725028e-05, "loss": 0.6277, "step": 91 }, { "epoch": 0.031070584262073625, "grad_norm": 4.659098148345947, "learning_rate": 1.999997606900827e-05, "loss": 0.7233, "step": 92 }, { "epoch": 0.031408308004052685, "grad_norm": 4.061614036560059, "learning_rate": 1.9999946155295455e-05, "loss": 0.7107, "step": 93 }, { "epoch": 0.031746031746031744, "grad_norm": 4.863790988922119, "learning_rate": 1.999990427614762e-05, "loss": 0.6328, "step": 94 }, { "epoch": 0.032083755488010804, "grad_norm": 4.110963344573975, "learning_rate": 1.999985043161488e-05, "loss": 0.599, "step": 95 }, { "epoch": 0.03242147922998987, "grad_norm": 3.7325758934020996, "learning_rate": 1.999978462176166e-05, "loss": 0.6251, "step": 96 }, { "epoch": 0.03275920297196893, "grad_norm": 3.7743582725524902, "learning_rate": 1.9999706846666708e-05, "loss": 0.6093, "step": 97 }, { "epoch": 0.03309692671394799, "grad_norm": 4.4590935707092285, "learning_rate": 1.999961710642308e-05, "loss": 0.6746, "step": 98 }, { "epoch": 0.03343465045592705, "grad_norm": 3.5626745223999023, "learning_rate": 1.9999515401138166e-05, "loss": 0.6146, "step": 99 }, { "epoch": 0.033772374197906116, "grad_norm": 3.821159601211548, "learning_rate": 1.999940173093365e-05, "loss": 0.6348, "step": 100 }, { "epoch": 0.034110097939885176, "grad_norm": 4.028972625732422, "learning_rate": 1.999927609594555e-05, "loss": 0.6226, "step": 101 }, { "epoch": 0.034447821681864235, "grad_norm": 4.01675271987915, "learning_rate": 1.999913849632419e-05, "loss": 0.6434, "step": 102 }, { "epoch": 0.034785545423843295, "grad_norm": 4.323501110076904, "learning_rate": 1.999898893223422e-05, "loss": 0.6632, "step": 103 }, { "epoch": 0.035123269165822354, "grad_norm": 4.932978630065918, "learning_rate": 1.9998827403854596e-05, "loss": 0.6262, "step": 104 }, { "epoch": 0.03546099290780142, "grad_norm": 4.380099773406982, "learning_rate": 1.99986539113786e-05, "loss": 0.6702, "step": 105 }, { "epoch": 0.03579871664978048, "grad_norm": 4.053253650665283, "learning_rate": 1.9998468455013825e-05, "loss": 0.654, "step": 106 }, { "epoch": 0.03613644039175954, "grad_norm": 3.8264594078063965, "learning_rate": 1.999827103498217e-05, "loss": 0.6231, "step": 107 }, { "epoch": 0.0364741641337386, "grad_norm": 4.316560745239258, "learning_rate": 1.9998061651519868e-05, "loss": 0.657, "step": 108 }, { "epoch": 0.03681188787571766, "grad_norm": 3.56504487991333, "learning_rate": 1.999784030487745e-05, "loss": 0.6399, "step": 109 }, { "epoch": 0.037149611617696726, "grad_norm": 3.8484597206115723, "learning_rate": 1.999760699531977e-05, "loss": 0.6179, "step": 110 }, { "epoch": 0.037487335359675786, "grad_norm": 3.4877591133117676, "learning_rate": 1.9997361723125996e-05, "loss": 0.5974, "step": 111 }, { "epoch": 0.037825059101654845, "grad_norm": 4.060441017150879, "learning_rate": 1.9997104488589607e-05, "loss": 0.6131, "step": 112 }, { "epoch": 0.038162782843633905, "grad_norm": 3.373605728149414, "learning_rate": 1.9996835292018397e-05, "loss": 0.6434, "step": 113 }, { "epoch": 0.03850050658561297, "grad_norm": 3.3210911750793457, "learning_rate": 1.9996554133734473e-05, "loss": 0.6021, "step": 114 }, { "epoch": 0.03883823032759203, "grad_norm": 3.4856150150299072, "learning_rate": 1.999626101407426e-05, "loss": 0.6275, "step": 115 }, { "epoch": 0.03917595406957109, "grad_norm": 3.481555938720703, "learning_rate": 1.999595593338848e-05, "loss": 0.6176, "step": 116 }, { "epoch": 0.03951367781155015, "grad_norm": 3.874725341796875, "learning_rate": 1.9995638892042187e-05, "loss": 0.6417, "step": 117 }, { "epoch": 0.03985140155352921, "grad_norm": 3.424097776412964, "learning_rate": 1.9995309890414735e-05, "loss": 0.6322, "step": 118 }, { "epoch": 0.04018912529550828, "grad_norm": 3.740689754486084, "learning_rate": 1.9994968928899786e-05, "loss": 0.638, "step": 119 }, { "epoch": 0.040526849037487336, "grad_norm": 3.995001792907715, "learning_rate": 1.9994616007905318e-05, "loss": 0.622, "step": 120 }, { "epoch": 0.040864572779466396, "grad_norm": 4.847942352294922, "learning_rate": 1.9994251127853625e-05, "loss": 0.6575, "step": 121 }, { "epoch": 0.041202296521445456, "grad_norm": 3.3151299953460693, "learning_rate": 1.99938742891813e-05, "loss": 0.6019, "step": 122 }, { "epoch": 0.04154002026342452, "grad_norm": 3.303891897201538, "learning_rate": 1.999348549233925e-05, "loss": 0.5822, "step": 123 }, { "epoch": 0.04187774400540358, "grad_norm": 3.598629951477051, "learning_rate": 1.9993084737792687e-05, "loss": 0.6164, "step": 124 }, { "epoch": 0.04221546774738264, "grad_norm": 3.5501952171325684, "learning_rate": 1.9992672026021136e-05, "loss": 0.6393, "step": 125 }, { "epoch": 0.0425531914893617, "grad_norm": 3.3706142902374268, "learning_rate": 1.9992247357518428e-05, "loss": 0.6231, "step": 126 }, { "epoch": 0.04289091523134076, "grad_norm": 4.808049201965332, "learning_rate": 1.99918107327927e-05, "loss": 0.6109, "step": 127 }, { "epoch": 0.04322863897331983, "grad_norm": 3.605516195297241, "learning_rate": 1.9991362152366393e-05, "loss": 0.6468, "step": 128 }, { "epoch": 0.04356636271529889, "grad_norm": 3.820783853530884, "learning_rate": 1.9990901616776258e-05, "loss": 0.6137, "step": 129 }, { "epoch": 0.04390408645727795, "grad_norm": 3.2461788654327393, "learning_rate": 1.9990429126573353e-05, "loss": 0.6473, "step": 130 }, { "epoch": 0.044241810199257006, "grad_norm": 3.5616414546966553, "learning_rate": 1.9989944682323027e-05, "loss": 0.5717, "step": 131 }, { "epoch": 0.044579533941236066, "grad_norm": 3.425881862640381, "learning_rate": 1.9989448284604947e-05, "loss": 0.6378, "step": 132 }, { "epoch": 0.04491725768321513, "grad_norm": 4.116847038269043, "learning_rate": 1.9988939934013078e-05, "loss": 0.6669, "step": 133 }, { "epoch": 0.04525498142519419, "grad_norm": 3.32547926902771, "learning_rate": 1.9988419631155686e-05, "loss": 0.6015, "step": 134 }, { "epoch": 0.04559270516717325, "grad_norm": 3.284442663192749, "learning_rate": 1.9987887376655338e-05, "loss": 0.6648, "step": 135 }, { "epoch": 0.04593042890915231, "grad_norm": 3.4459228515625, "learning_rate": 1.9987343171148904e-05, "loss": 0.6041, "step": 136 }, { "epoch": 0.04626815265113138, "grad_norm": 4.657093524932861, "learning_rate": 1.9986787015287556e-05, "loss": 0.5847, "step": 137 }, { "epoch": 0.04660587639311044, "grad_norm": 3.7303574085235596, "learning_rate": 1.9986218909736758e-05, "loss": 0.6446, "step": 138 }, { "epoch": 0.0469436001350895, "grad_norm": 3.146385669708252, "learning_rate": 1.998563885517628e-05, "loss": 0.5981, "step": 139 }, { "epoch": 0.04728132387706856, "grad_norm": 3.8482303619384766, "learning_rate": 1.9985046852300183e-05, "loss": 0.641, "step": 140 }, { "epoch": 0.047619047619047616, "grad_norm": 3.616960048675537, "learning_rate": 1.998444290181683e-05, "loss": 0.6138, "step": 141 }, { "epoch": 0.04795677136102668, "grad_norm": 3.4537503719329834, "learning_rate": 1.9983827004448875e-05, "loss": 0.6498, "step": 142 }, { "epoch": 0.04829449510300574, "grad_norm": 3.461930751800537, "learning_rate": 1.9983199160933274e-05, "loss": 0.6294, "step": 143 }, { "epoch": 0.0486322188449848, "grad_norm": 3.4274182319641113, "learning_rate": 1.9982559372021274e-05, "loss": 0.6085, "step": 144 }, { "epoch": 0.04896994258696386, "grad_norm": 3.380801200866699, "learning_rate": 1.9981907638478408e-05, "loss": 0.6178, "step": 145 }, { "epoch": 0.04930766632894292, "grad_norm": 3.2365736961364746, "learning_rate": 1.9981243961084516e-05, "loss": 0.5975, "step": 146 }, { "epoch": 0.04964539007092199, "grad_norm": 3.3174169063568115, "learning_rate": 1.9980568340633716e-05, "loss": 0.5788, "step": 147 }, { "epoch": 0.04998311381290105, "grad_norm": 3.1043734550476074, "learning_rate": 1.997988077793442e-05, "loss": 0.5527, "step": 148 }, { "epoch": 0.05032083755488011, "grad_norm": 4.641834735870361, "learning_rate": 1.9979181273809334e-05, "loss": 0.6072, "step": 149 }, { "epoch": 0.05065856129685917, "grad_norm": 3.7558517456054688, "learning_rate": 1.997846982909545e-05, "loss": 0.6238, "step": 150 }, { "epoch": 0.050996285038838234, "grad_norm": 3.7003333568573, "learning_rate": 1.997774644464405e-05, "loss": 0.6496, "step": 151 }, { "epoch": 0.05133400878081729, "grad_norm": 4.087382793426514, "learning_rate": 1.9977011121320687e-05, "loss": 0.6159, "step": 152 }, { "epoch": 0.05167173252279635, "grad_norm": 3.4757237434387207, "learning_rate": 1.9976263860005226e-05, "loss": 0.5956, "step": 153 }, { "epoch": 0.05200945626477541, "grad_norm": 3.3057713508605957, "learning_rate": 1.99755046615918e-05, "loss": 0.5683, "step": 154 }, { "epoch": 0.05234718000675447, "grad_norm": 3.4212968349456787, "learning_rate": 1.9974733526988826e-05, "loss": 0.6214, "step": 155 }, { "epoch": 0.05268490374873354, "grad_norm": 3.5444345474243164, "learning_rate": 1.9973950457119e-05, "loss": 0.6106, "step": 156 }, { "epoch": 0.0530226274907126, "grad_norm": 3.1692023277282715, "learning_rate": 1.9973155452919312e-05, "loss": 0.6096, "step": 157 }, { "epoch": 0.05336035123269166, "grad_norm": 3.1855380535125732, "learning_rate": 1.9972348515341018e-05, "loss": 0.6206, "step": 158 }, { "epoch": 0.05369807497467072, "grad_norm": 3.871363639831543, "learning_rate": 1.9971529645349662e-05, "loss": 0.5915, "step": 159 }, { "epoch": 0.05403579871664978, "grad_norm": 3.2872087955474854, "learning_rate": 1.9970698843925064e-05, "loss": 0.592, "step": 160 }, { "epoch": 0.054373522458628844, "grad_norm": 4.021425247192383, "learning_rate": 1.996985611206132e-05, "loss": 0.616, "step": 161 }, { "epoch": 0.0547112462006079, "grad_norm": 3.0844080448150635, "learning_rate": 1.9969001450766795e-05, "loss": 0.6114, "step": 162 }, { "epoch": 0.05504896994258696, "grad_norm": 2.8914966583251953, "learning_rate": 1.9968134861064142e-05, "loss": 0.6118, "step": 163 }, { "epoch": 0.05538669368456602, "grad_norm": 3.5281951427459717, "learning_rate": 1.9967256343990272e-05, "loss": 0.6101, "step": 164 }, { "epoch": 0.05572441742654509, "grad_norm": 2.8299903869628906, "learning_rate": 1.9966365900596377e-05, "loss": 0.5657, "step": 165 }, { "epoch": 0.05606214116852415, "grad_norm": 3.664534330368042, "learning_rate": 1.996546353194792e-05, "loss": 0.598, "step": 166 }, { "epoch": 0.05639986491050321, "grad_norm": 3.7430531978607178, "learning_rate": 1.9964549239124627e-05, "loss": 0.6668, "step": 167 }, { "epoch": 0.05673758865248227, "grad_norm": 2.9533119201660156, "learning_rate": 1.9963623023220493e-05, "loss": 0.5706, "step": 168 }, { "epoch": 0.05707531239446133, "grad_norm": 2.976067066192627, "learning_rate": 1.9962684885343787e-05, "loss": 0.5983, "step": 169 }, { "epoch": 0.057413036136440394, "grad_norm": 2.8440473079681396, "learning_rate": 1.9961734826617033e-05, "loss": 0.6158, "step": 170 }, { "epoch": 0.057750759878419454, "grad_norm": 3.2681050300598145, "learning_rate": 1.9960772848177025e-05, "loss": 0.6446, "step": 171 }, { "epoch": 0.058088483620398514, "grad_norm": 3.3394553661346436, "learning_rate": 1.995979895117482e-05, "loss": 0.5806, "step": 172 }, { "epoch": 0.05842620736237757, "grad_norm": 5.23557710647583, "learning_rate": 1.9958813136775734e-05, "loss": 0.5725, "step": 173 }, { "epoch": 0.05876393110435663, "grad_norm": 3.448789358139038, "learning_rate": 1.9957815406159344e-05, "loss": 0.6469, "step": 174 }, { "epoch": 0.0591016548463357, "grad_norm": 2.977308988571167, "learning_rate": 1.9956805760519477e-05, "loss": 0.5772, "step": 175 }, { "epoch": 0.05943937858831476, "grad_norm": 2.9214539527893066, "learning_rate": 1.995578420106424e-05, "loss": 0.56, "step": 176 }, { "epoch": 0.05977710233029382, "grad_norm": 3.4948432445526123, "learning_rate": 1.995475072901596e-05, "loss": 0.6316, "step": 177 }, { "epoch": 0.06011482607227288, "grad_norm": 3.1181118488311768, "learning_rate": 1.995370534561125e-05, "loss": 0.6068, "step": 178 }, { "epoch": 0.060452549814251945, "grad_norm": 4.801273822784424, "learning_rate": 1.9952648052100967e-05, "loss": 0.6354, "step": 179 }, { "epoch": 0.060790273556231005, "grad_norm": 3.918304443359375, "learning_rate": 1.995157884975021e-05, "loss": 0.6893, "step": 180 }, { "epoch": 0.061127997298210064, "grad_norm": 2.8410356044769287, "learning_rate": 1.995049773983833e-05, "loss": 0.5814, "step": 181 }, { "epoch": 0.061465721040189124, "grad_norm": 2.6466407775878906, "learning_rate": 1.994940472365893e-05, "loss": 0.599, "step": 182 }, { "epoch": 0.06180344478216818, "grad_norm": 3.145305633544922, "learning_rate": 1.9948299802519866e-05, "loss": 0.6459, "step": 183 }, { "epoch": 0.06214116852414725, "grad_norm": 4.659904956817627, "learning_rate": 1.994718297774322e-05, "loss": 0.6729, "step": 184 }, { "epoch": 0.06247889226612631, "grad_norm": 3.2808473110198975, "learning_rate": 1.994605425066534e-05, "loss": 0.5709, "step": 185 }, { "epoch": 0.06281661600810537, "grad_norm": 3.368922710418701, "learning_rate": 1.9944913622636798e-05, "loss": 0.6194, "step": 186 }, { "epoch": 0.06315433975008443, "grad_norm": 5.883234024047852, "learning_rate": 1.9943761095022413e-05, "loss": 0.6079, "step": 187 }, { "epoch": 0.06349206349206349, "grad_norm": 3.2056937217712402, "learning_rate": 1.994259666920124e-05, "loss": 0.6053, "step": 188 }, { "epoch": 0.06382978723404255, "grad_norm": 2.6577680110931396, "learning_rate": 1.9941420346566576e-05, "loss": 0.5815, "step": 189 }, { "epoch": 0.06416751097602161, "grad_norm": 2.7583727836608887, "learning_rate": 1.994023212852595e-05, "loss": 0.6343, "step": 190 }, { "epoch": 0.06450523471800068, "grad_norm": 2.9179069995880127, "learning_rate": 1.993903201650112e-05, "loss": 0.6365, "step": 191 }, { "epoch": 0.06484295845997974, "grad_norm": 3.152411699295044, "learning_rate": 1.9937820011928086e-05, "loss": 0.6078, "step": 192 }, { "epoch": 0.0651806822019588, "grad_norm": 5.051009178161621, "learning_rate": 1.9936596116257068e-05, "loss": 0.6188, "step": 193 }, { "epoch": 0.06551840594393786, "grad_norm": 3.1799986362457275, "learning_rate": 1.993536033095252e-05, "loss": 0.6328, "step": 194 }, { "epoch": 0.06585612968591692, "grad_norm": 3.275731086730957, "learning_rate": 1.9934112657493115e-05, "loss": 0.615, "step": 195 }, { "epoch": 0.06619385342789598, "grad_norm": 3.503725290298462, "learning_rate": 1.9932853097371765e-05, "loss": 0.5677, "step": 196 }, { "epoch": 0.06653157716987504, "grad_norm": 3.055980920791626, "learning_rate": 1.9931581652095594e-05, "loss": 0.5842, "step": 197 }, { "epoch": 0.0668693009118541, "grad_norm": 3.3838376998901367, "learning_rate": 1.9930298323185945e-05, "loss": 0.5952, "step": 198 }, { "epoch": 0.06720702465383316, "grad_norm": 2.974113941192627, "learning_rate": 1.9929003112178394e-05, "loss": 0.6393, "step": 199 }, { "epoch": 0.06754474839581223, "grad_norm": 3.11110258102417, "learning_rate": 1.992769602062272e-05, "loss": 0.5999, "step": 200 }, { "epoch": 0.06788247213779129, "grad_norm": 3.1224758625030518, "learning_rate": 1.992637705008292e-05, "loss": 0.6021, "step": 201 }, { "epoch": 0.06822019587977035, "grad_norm": 3.2359213829040527, "learning_rate": 1.9925046202137215e-05, "loss": 0.5723, "step": 202 }, { "epoch": 0.06855791962174941, "grad_norm": 3.722144842147827, "learning_rate": 1.992370347837803e-05, "loss": 0.5818, "step": 203 }, { "epoch": 0.06889564336372847, "grad_norm": 3.357381582260132, "learning_rate": 1.9922348880411997e-05, "loss": 0.6096, "step": 204 }, { "epoch": 0.06923336710570753, "grad_norm": 3.24117112159729, "learning_rate": 1.9920982409859963e-05, "loss": 0.5796, "step": 205 }, { "epoch": 0.06957109084768659, "grad_norm": 3.105055809020996, "learning_rate": 1.9919604068356978e-05, "loss": 0.5923, "step": 206 }, { "epoch": 0.06990881458966565, "grad_norm": 3.025294065475464, "learning_rate": 1.9918213857552294e-05, "loss": 0.5605, "step": 207 }, { "epoch": 0.07024653833164471, "grad_norm": 3.0156004428863525, "learning_rate": 1.9916811779109374e-05, "loss": 0.5779, "step": 208 }, { "epoch": 0.07058426207362378, "grad_norm": 3.100205183029175, "learning_rate": 1.9915397834705868e-05, "loss": 0.5988, "step": 209 }, { "epoch": 0.07092198581560284, "grad_norm": 2.9238057136535645, "learning_rate": 1.991397202603363e-05, "loss": 0.601, "step": 210 }, { "epoch": 0.0712597095575819, "grad_norm": 3.5001566410064697, "learning_rate": 1.9912534354798723e-05, "loss": 0.6355, "step": 211 }, { "epoch": 0.07159743329956096, "grad_norm": 3.4854068756103516, "learning_rate": 1.991108482272138e-05, "loss": 0.5638, "step": 212 }, { "epoch": 0.07193515704154002, "grad_norm": 4.965551853179932, "learning_rate": 1.9909623431536042e-05, "loss": 0.577, "step": 213 }, { "epoch": 0.07227288078351908, "grad_norm": 3.451031446456909, "learning_rate": 1.9908150182991338e-05, "loss": 0.5854, "step": 214 }, { "epoch": 0.07261060452549814, "grad_norm": 3.7485363483428955, "learning_rate": 1.9906665078850085e-05, "loss": 0.62, "step": 215 }, { "epoch": 0.0729483282674772, "grad_norm": 4.418776035308838, "learning_rate": 1.990516812088928e-05, "loss": 0.6067, "step": 216 }, { "epoch": 0.07328605200945626, "grad_norm": 2.960486650466919, "learning_rate": 1.990365931090011e-05, "loss": 0.548, "step": 217 }, { "epoch": 0.07362377575143532, "grad_norm": 3.9442331790924072, "learning_rate": 1.9902138650687943e-05, "loss": 0.5775, "step": 218 }, { "epoch": 0.07396149949341439, "grad_norm": 2.9763643741607666, "learning_rate": 1.9900606142072325e-05, "loss": 0.6412, "step": 219 }, { "epoch": 0.07429922323539345, "grad_norm": 3.437159538269043, "learning_rate": 1.9899061786886978e-05, "loss": 0.6017, "step": 220 }, { "epoch": 0.07463694697737251, "grad_norm": 3.4802114963531494, "learning_rate": 1.9897505586979796e-05, "loss": 0.5635, "step": 221 }, { "epoch": 0.07497467071935157, "grad_norm": 3.2704412937164307, "learning_rate": 1.9895937544212856e-05, "loss": 0.596, "step": 222 }, { "epoch": 0.07531239446133063, "grad_norm": 2.8840575218200684, "learning_rate": 1.9894357660462397e-05, "loss": 0.6227, "step": 223 }, { "epoch": 0.07565011820330969, "grad_norm": 3.3949217796325684, "learning_rate": 1.9892765937618826e-05, "loss": 0.6142, "step": 224 }, { "epoch": 0.07598784194528875, "grad_norm": 3.4878833293914795, "learning_rate": 1.989116237758673e-05, "loss": 0.571, "step": 225 }, { "epoch": 0.07632556568726781, "grad_norm": 2.875044584274292, "learning_rate": 1.9889546982284833e-05, "loss": 0.5826, "step": 226 }, { "epoch": 0.07666328942924687, "grad_norm": 3.3268752098083496, "learning_rate": 1.988791975364605e-05, "loss": 0.5424, "step": 227 }, { "epoch": 0.07700101317122594, "grad_norm": 3.3672444820404053, "learning_rate": 1.988628069361743e-05, "loss": 0.5736, "step": 228 }, { "epoch": 0.077338736913205, "grad_norm": 3.1246511936187744, "learning_rate": 1.9884629804160197e-05, "loss": 0.6555, "step": 229 }, { "epoch": 0.07767646065518406, "grad_norm": 2.9528470039367676, "learning_rate": 1.988296708724972e-05, "loss": 0.612, "step": 230 }, { "epoch": 0.07801418439716312, "grad_norm": 2.964240312576294, "learning_rate": 1.9881292544875523e-05, "loss": 0.6255, "step": 231 }, { "epoch": 0.07835190813914218, "grad_norm": 3.5759999752044678, "learning_rate": 1.9879606179041283e-05, "loss": 0.6257, "step": 232 }, { "epoch": 0.07868963188112124, "grad_norm": 3.10675048828125, "learning_rate": 1.9877907991764817e-05, "loss": 0.6077, "step": 233 }, { "epoch": 0.0790273556231003, "grad_norm": 2.9079225063323975, "learning_rate": 1.987619798507809e-05, "loss": 0.5814, "step": 234 }, { "epoch": 0.07936507936507936, "grad_norm": 3.0466339588165283, "learning_rate": 1.987447616102722e-05, "loss": 0.5934, "step": 235 }, { "epoch": 0.07970280310705842, "grad_norm": 3.1226165294647217, "learning_rate": 1.987274252167244e-05, "loss": 0.5844, "step": 236 }, { "epoch": 0.0800405268490375, "grad_norm": 2.8907363414764404, "learning_rate": 1.987099706908815e-05, "loss": 0.5551, "step": 237 }, { "epoch": 0.08037825059101655, "grad_norm": 3.6690948009490967, "learning_rate": 1.986923980536286e-05, "loss": 0.6402, "step": 238 }, { "epoch": 0.08071597433299561, "grad_norm": 3.2151811122894287, "learning_rate": 1.9867470732599234e-05, "loss": 0.5917, "step": 239 }, { "epoch": 0.08105369807497467, "grad_norm": 3.2275493144989014, "learning_rate": 1.9865689852914048e-05, "loss": 0.5802, "step": 240 }, { "epoch": 0.08139142181695373, "grad_norm": 4.168654441833496, "learning_rate": 1.9863897168438217e-05, "loss": 0.6316, "step": 241 }, { "epoch": 0.08172914555893279, "grad_norm": 2.9070661067962646, "learning_rate": 1.9862092681316774e-05, "loss": 0.6186, "step": 242 }, { "epoch": 0.08206686930091185, "grad_norm": 3.5710504055023193, "learning_rate": 1.9860276393708887e-05, "loss": 0.6306, "step": 243 }, { "epoch": 0.08240459304289091, "grad_norm": 2.4950265884399414, "learning_rate": 1.9858448307787827e-05, "loss": 0.5781, "step": 244 }, { "epoch": 0.08274231678486997, "grad_norm": 2.7562596797943115, "learning_rate": 1.9856608425740993e-05, "loss": 0.5198, "step": 245 }, { "epoch": 0.08308004052684904, "grad_norm": 2.6843502521514893, "learning_rate": 1.9854756749769893e-05, "loss": 0.5807, "step": 246 }, { "epoch": 0.0834177642688281, "grad_norm": 2.9774436950683594, "learning_rate": 1.9852893282090152e-05, "loss": 0.6196, "step": 247 }, { "epoch": 0.08375548801080716, "grad_norm": 3.6631059646606445, "learning_rate": 1.98510180249315e-05, "loss": 0.6211, "step": 248 }, { "epoch": 0.08409321175278622, "grad_norm": 2.7743091583251953, "learning_rate": 1.984913098053778e-05, "loss": 0.593, "step": 249 }, { "epoch": 0.08443093549476528, "grad_norm": 3.1476755142211914, "learning_rate": 1.984723215116693e-05, "loss": 0.6045, "step": 250 }, { "epoch": 0.08476865923674434, "grad_norm": 2.6371498107910156, "learning_rate": 1.9845321539091e-05, "loss": 0.5797, "step": 251 }, { "epoch": 0.0851063829787234, "grad_norm": 2.5414681434631348, "learning_rate": 1.9843399146596125e-05, "loss": 0.6053, "step": 252 }, { "epoch": 0.08544410672070246, "grad_norm": 3.655090570449829, "learning_rate": 1.984146497598255e-05, "loss": 0.584, "step": 253 }, { "epoch": 0.08578183046268152, "grad_norm": 2.789790630340576, "learning_rate": 1.9839519029564608e-05, "loss": 0.5678, "step": 254 }, { "epoch": 0.08611955420466058, "grad_norm": 3.1145052909851074, "learning_rate": 1.9837561309670713e-05, "loss": 0.6432, "step": 255 }, { "epoch": 0.08645727794663965, "grad_norm": 2.691843032836914, "learning_rate": 1.983559181864338e-05, "loss": 0.5885, "step": 256 }, { "epoch": 0.08679500168861871, "grad_norm": 3.0318992137908936, "learning_rate": 1.9833610558839206e-05, "loss": 0.5795, "step": 257 }, { "epoch": 0.08713272543059777, "grad_norm": 2.8858354091644287, "learning_rate": 1.983161753262886e-05, "loss": 0.5948, "step": 258 }, { "epoch": 0.08747044917257683, "grad_norm": 2.9149672985076904, "learning_rate": 1.9829612742397107e-05, "loss": 0.5965, "step": 259 }, { "epoch": 0.0878081729145559, "grad_norm": 3.191701650619507, "learning_rate": 1.982759619054277e-05, "loss": 0.6011, "step": 260 }, { "epoch": 0.08814589665653495, "grad_norm": 2.988373041152954, "learning_rate": 1.9825567879478756e-05, "loss": 0.569, "step": 261 }, { "epoch": 0.08848362039851401, "grad_norm": 3.340489625930786, "learning_rate": 1.982352781163204e-05, "loss": 0.5639, "step": 262 }, { "epoch": 0.08882134414049307, "grad_norm": 2.7856662273406982, "learning_rate": 1.9821475989443673e-05, "loss": 0.5851, "step": 263 }, { "epoch": 0.08915906788247213, "grad_norm": 3.060499906539917, "learning_rate": 1.9819412415368753e-05, "loss": 0.6278, "step": 264 }, { "epoch": 0.0894967916244512, "grad_norm": 3.121128797531128, "learning_rate": 1.9817337091876453e-05, "loss": 0.6197, "step": 265 }, { "epoch": 0.08983451536643026, "grad_norm": 6.424557685852051, "learning_rate": 1.9815250021449998e-05, "loss": 0.6089, "step": 266 }, { "epoch": 0.09017223910840932, "grad_norm": 2.838230848312378, "learning_rate": 1.9813151206586682e-05, "loss": 0.5757, "step": 267 }, { "epoch": 0.09050996285038838, "grad_norm": 3.2983860969543457, "learning_rate": 1.981104064979783e-05, "loss": 0.5988, "step": 268 }, { "epoch": 0.09084768659236744, "grad_norm": 3.535027503967285, "learning_rate": 1.9808918353608834e-05, "loss": 0.6461, "step": 269 }, { "epoch": 0.0911854103343465, "grad_norm": 2.7944703102111816, "learning_rate": 1.980678432055913e-05, "loss": 0.5967, "step": 270 }, { "epoch": 0.09152313407632556, "grad_norm": 2.656114339828491, "learning_rate": 1.9804638553202186e-05, "loss": 0.5724, "step": 271 }, { "epoch": 0.09186085781830462, "grad_norm": 2.6317219734191895, "learning_rate": 1.9802481054105527e-05, "loss": 0.5363, "step": 272 }, { "epoch": 0.09219858156028368, "grad_norm": 4.506369590759277, "learning_rate": 1.9800311825850714e-05, "loss": 0.5599, "step": 273 }, { "epoch": 0.09253630530226276, "grad_norm": 3.5644328594207764, "learning_rate": 1.9798130871033322e-05, "loss": 0.5801, "step": 274 }, { "epoch": 0.09287402904424182, "grad_norm": 3.1363251209259033, "learning_rate": 1.9795938192262986e-05, "loss": 0.6359, "step": 275 }, { "epoch": 0.09321175278622088, "grad_norm": 7.618990898132324, "learning_rate": 1.979373379216335e-05, "loss": 0.5889, "step": 276 }, { "epoch": 0.09354947652819993, "grad_norm": 3.381350040435791, "learning_rate": 1.9791517673372086e-05, "loss": 0.6225, "step": 277 }, { "epoch": 0.093887200270179, "grad_norm": 2.629829168319702, "learning_rate": 1.9789289838540897e-05, "loss": 0.579, "step": 278 }, { "epoch": 0.09422492401215805, "grad_norm": 2.7541933059692383, "learning_rate": 1.9787050290335493e-05, "loss": 0.626, "step": 279 }, { "epoch": 0.09456264775413711, "grad_norm": 2.524603843688965, "learning_rate": 1.978479903143561e-05, "loss": 0.5372, "step": 280 }, { "epoch": 0.09490037149611617, "grad_norm": 2.8628947734832764, "learning_rate": 1.9782536064534987e-05, "loss": 0.5726, "step": 281 }, { "epoch": 0.09523809523809523, "grad_norm": 3.053196430206299, "learning_rate": 1.9780261392341383e-05, "loss": 0.5939, "step": 282 }, { "epoch": 0.09557581898007429, "grad_norm": 3.816166639328003, "learning_rate": 1.9777975017576548e-05, "loss": 0.5823, "step": 283 }, { "epoch": 0.09591354272205337, "grad_norm": 2.83060359954834, "learning_rate": 1.9775676942976253e-05, "loss": 0.5815, "step": 284 }, { "epoch": 0.09625126646403243, "grad_norm": 2.892929792404175, "learning_rate": 1.9773367171290258e-05, "loss": 0.5802, "step": 285 }, { "epoch": 0.09658899020601149, "grad_norm": 2.719924211502075, "learning_rate": 1.9771045705282313e-05, "loss": 0.6152, "step": 286 }, { "epoch": 0.09692671394799054, "grad_norm": 2.421276092529297, "learning_rate": 1.976871254773017e-05, "loss": 0.5715, "step": 287 }, { "epoch": 0.0972644376899696, "grad_norm": 2.6999661922454834, "learning_rate": 1.9766367701425575e-05, "loss": 0.6287, "step": 288 }, { "epoch": 0.09760216143194866, "grad_norm": 2.7989113330841064, "learning_rate": 1.9764011169174248e-05, "loss": 0.594, "step": 289 }, { "epoch": 0.09793988517392772, "grad_norm": 3.1024513244628906, "learning_rate": 1.9761642953795896e-05, "loss": 0.5958, "step": 290 }, { "epoch": 0.09827760891590678, "grad_norm": 4.804039478302002, "learning_rate": 1.9759263058124215e-05, "loss": 0.593, "step": 291 }, { "epoch": 0.09861533265788584, "grad_norm": 2.7476513385772705, "learning_rate": 1.9756871485006862e-05, "loss": 0.611, "step": 292 }, { "epoch": 0.09895305639986492, "grad_norm": 2.6721928119659424, "learning_rate": 1.9754468237305473e-05, "loss": 0.6089, "step": 293 }, { "epoch": 0.09929078014184398, "grad_norm": 2.7416863441467285, "learning_rate": 1.975205331789566e-05, "loss": 0.5757, "step": 294 }, { "epoch": 0.09962850388382304, "grad_norm": 2.6923232078552246, "learning_rate": 1.974962672966699e-05, "loss": 0.6041, "step": 295 }, { "epoch": 0.0999662276258021, "grad_norm": 2.68060564994812, "learning_rate": 1.9747188475523e-05, "loss": 0.544, "step": 296 }, { "epoch": 0.10030395136778116, "grad_norm": 2.6104190349578857, "learning_rate": 1.9744738558381176e-05, "loss": 0.5577, "step": 297 }, { "epoch": 0.10064167510976021, "grad_norm": 2.6624643802642822, "learning_rate": 1.9742276981172978e-05, "loss": 0.6158, "step": 298 }, { "epoch": 0.10097939885173927, "grad_norm": 2.5539746284484863, "learning_rate": 1.973980374684379e-05, "loss": 0.5403, "step": 299 }, { "epoch": 0.10131712259371833, "grad_norm": 3.026780605316162, "learning_rate": 1.973731885835298e-05, "loss": 0.6, "step": 300 }, { "epoch": 0.1016548463356974, "grad_norm": 2.672865629196167, "learning_rate": 1.9734822318673825e-05, "loss": 0.5887, "step": 301 }, { "epoch": 0.10199257007767647, "grad_norm": 2.3883142471313477, "learning_rate": 1.973231413079357e-05, "loss": 0.6069, "step": 302 }, { "epoch": 0.10233029381965553, "grad_norm": 2.9542043209075928, "learning_rate": 1.9729794297713377e-05, "loss": 0.623, "step": 303 }, { "epoch": 0.10266801756163459, "grad_norm": 2.7562685012817383, "learning_rate": 1.9727262822448364e-05, "loss": 0.5923, "step": 304 }, { "epoch": 0.10300574130361365, "grad_norm": 3.1382594108581543, "learning_rate": 1.9724719708027556e-05, "loss": 0.6308, "step": 305 }, { "epoch": 0.1033434650455927, "grad_norm": 3.226628065109253, "learning_rate": 1.9722164957493925e-05, "loss": 0.6461, "step": 306 }, { "epoch": 0.10368118878757177, "grad_norm": 2.725010871887207, "learning_rate": 1.971959857390435e-05, "loss": 0.5944, "step": 307 }, { "epoch": 0.10401891252955082, "grad_norm": 2.9489567279815674, "learning_rate": 1.9717020560329644e-05, "loss": 0.5873, "step": 308 }, { "epoch": 0.10435663627152988, "grad_norm": 3.1229867935180664, "learning_rate": 1.9714430919854525e-05, "loss": 0.6314, "step": 309 }, { "epoch": 0.10469436001350894, "grad_norm": 2.7725753784179688, "learning_rate": 1.971182965557763e-05, "loss": 0.6037, "step": 310 }, { "epoch": 0.10503208375548802, "grad_norm": 2.6510868072509766, "learning_rate": 1.97092167706115e-05, "loss": 0.5747, "step": 311 }, { "epoch": 0.10536980749746708, "grad_norm": 2.493316650390625, "learning_rate": 1.9706592268082584e-05, "loss": 0.6069, "step": 312 }, { "epoch": 0.10570753123944614, "grad_norm": 2.444570541381836, "learning_rate": 1.9703956151131226e-05, "loss": 0.5834, "step": 313 }, { "epoch": 0.1060452549814252, "grad_norm": 2.7361977100372314, "learning_rate": 1.9701308422911674e-05, "loss": 0.6008, "step": 314 }, { "epoch": 0.10638297872340426, "grad_norm": 2.8232803344726562, "learning_rate": 1.969864908659207e-05, "loss": 0.6029, "step": 315 }, { "epoch": 0.10672070246538332, "grad_norm": 2.6349120140075684, "learning_rate": 1.969597814535444e-05, "loss": 0.5583, "step": 316 }, { "epoch": 0.10705842620736238, "grad_norm": 3.0811498165130615, "learning_rate": 1.9693295602394697e-05, "loss": 0.58, "step": 317 }, { "epoch": 0.10739614994934144, "grad_norm": 2.811725378036499, "learning_rate": 1.969060146092264e-05, "loss": 0.5883, "step": 318 }, { "epoch": 0.1077338736913205, "grad_norm": 2.581575393676758, "learning_rate": 1.9687895724161946e-05, "loss": 0.5841, "step": 319 }, { "epoch": 0.10807159743329955, "grad_norm": 2.298649311065674, "learning_rate": 1.9685178395350157e-05, "loss": 0.5561, "step": 320 }, { "epoch": 0.10840932117527863, "grad_norm": 3.4873464107513428, "learning_rate": 1.9682449477738702e-05, "loss": 0.6193, "step": 321 }, { "epoch": 0.10874704491725769, "grad_norm": 2.832573652267456, "learning_rate": 1.967970897459286e-05, "loss": 0.555, "step": 322 }, { "epoch": 0.10908476865923675, "grad_norm": 2.581007242202759, "learning_rate": 1.967695688919178e-05, "loss": 0.6176, "step": 323 }, { "epoch": 0.1094224924012158, "grad_norm": 3.2140369415283203, "learning_rate": 1.9674193224828477e-05, "loss": 0.573, "step": 324 }, { "epoch": 0.10976021614319487, "grad_norm": 2.657564878463745, "learning_rate": 1.9671417984809804e-05, "loss": 0.5495, "step": 325 }, { "epoch": 0.11009793988517393, "grad_norm": 2.8436944484710693, "learning_rate": 1.966863117245648e-05, "loss": 0.6015, "step": 326 }, { "epoch": 0.11043566362715299, "grad_norm": 2.7170894145965576, "learning_rate": 1.9665832791103062e-05, "loss": 0.5897, "step": 327 }, { "epoch": 0.11077338736913205, "grad_norm": 2.9719128608703613, "learning_rate": 1.9663022844097956e-05, "loss": 0.5813, "step": 328 }, { "epoch": 0.1111111111111111, "grad_norm": 3.0696187019348145, "learning_rate": 1.96602013348034e-05, "loss": 0.5985, "step": 329 }, { "epoch": 0.11144883485309018, "grad_norm": 2.710968255996704, "learning_rate": 1.9657368266595477e-05, "loss": 0.6237, "step": 330 }, { "epoch": 0.11178655859506924, "grad_norm": 2.792539119720459, "learning_rate": 1.965452364286409e-05, "loss": 0.5694, "step": 331 }, { "epoch": 0.1121242823370483, "grad_norm": 2.8281753063201904, "learning_rate": 1.9651667467012977e-05, "loss": 0.5736, "step": 332 }, { "epoch": 0.11246200607902736, "grad_norm": 2.4222681522369385, "learning_rate": 1.964879974245969e-05, "loss": 0.5686, "step": 333 }, { "epoch": 0.11279972982100642, "grad_norm": 3.873903512954712, "learning_rate": 1.964592047263561e-05, "loss": 0.5674, "step": 334 }, { "epoch": 0.11313745356298548, "grad_norm": 2.782627820968628, "learning_rate": 1.964302966098592e-05, "loss": 0.5792, "step": 335 }, { "epoch": 0.11347517730496454, "grad_norm": 2.836864948272705, "learning_rate": 1.9640127310969626e-05, "loss": 0.5592, "step": 336 }, { "epoch": 0.1138129010469436, "grad_norm": 3.136997938156128, "learning_rate": 1.9637213426059535e-05, "loss": 0.6252, "step": 337 }, { "epoch": 0.11415062478892266, "grad_norm": 2.844254970550537, "learning_rate": 1.9634288009742254e-05, "loss": 0.6002, "step": 338 }, { "epoch": 0.11448834853090173, "grad_norm": 2.9198968410491943, "learning_rate": 1.963135106551819e-05, "loss": 0.591, "step": 339 }, { "epoch": 0.11482607227288079, "grad_norm": 2.4962453842163086, "learning_rate": 1.9628402596901545e-05, "loss": 0.552, "step": 340 }, { "epoch": 0.11516379601485985, "grad_norm": 2.815769672393799, "learning_rate": 1.9625442607420305e-05, "loss": 0.547, "step": 341 }, { "epoch": 0.11550151975683891, "grad_norm": 2.4507596492767334, "learning_rate": 1.9622471100616253e-05, "loss": 0.601, "step": 342 }, { "epoch": 0.11583924349881797, "grad_norm": 2.9236645698547363, "learning_rate": 1.9619488080044933e-05, "loss": 0.565, "step": 343 }, { "epoch": 0.11617696724079703, "grad_norm": 2.458740711212158, "learning_rate": 1.961649354927569e-05, "loss": 0.5513, "step": 344 }, { "epoch": 0.11651469098277609, "grad_norm": 2.7567594051361084, "learning_rate": 1.961348751189162e-05, "loss": 0.5476, "step": 345 }, { "epoch": 0.11685241472475515, "grad_norm": 2.655757427215576, "learning_rate": 1.961046997148961e-05, "loss": 0.631, "step": 346 }, { "epoch": 0.1171901384667342, "grad_norm": 2.630479574203491, "learning_rate": 1.960744093168028e-05, "loss": 0.5735, "step": 347 }, { "epoch": 0.11752786220871327, "grad_norm": 2.670705795288086, "learning_rate": 1.9604400396088047e-05, "loss": 0.5914, "step": 348 }, { "epoch": 0.11786558595069234, "grad_norm": 3.0580050945281982, "learning_rate": 1.9601348368351047e-05, "loss": 0.5729, "step": 349 }, { "epoch": 0.1182033096926714, "grad_norm": 3.1121106147766113, "learning_rate": 1.959828485212119e-05, "loss": 0.5787, "step": 350 }, { "epoch": 0.11854103343465046, "grad_norm": 2.7031359672546387, "learning_rate": 1.959520985106413e-05, "loss": 0.5724, "step": 351 }, { "epoch": 0.11887875717662952, "grad_norm": 2.435828685760498, "learning_rate": 1.959212336885925e-05, "loss": 0.5422, "step": 352 }, { "epoch": 0.11921648091860858, "grad_norm": 2.5263631343841553, "learning_rate": 1.9589025409199692e-05, "loss": 0.606, "step": 353 }, { "epoch": 0.11955420466058764, "grad_norm": 2.6189124584198, "learning_rate": 1.958591597579231e-05, "loss": 0.5987, "step": 354 }, { "epoch": 0.1198919284025667, "grad_norm": 2.343425989151001, "learning_rate": 1.95827950723577e-05, "loss": 0.5525, "step": 355 }, { "epoch": 0.12022965214454576, "grad_norm": 2.439429759979248, "learning_rate": 1.957966270263018e-05, "loss": 0.563, "step": 356 }, { "epoch": 0.12056737588652482, "grad_norm": 2.777621030807495, "learning_rate": 1.9576518870357782e-05, "loss": 0.5762, "step": 357 }, { "epoch": 0.12090509962850389, "grad_norm": 3.022714376449585, "learning_rate": 1.957336357930227e-05, "loss": 0.5677, "step": 358 }, { "epoch": 0.12124282337048295, "grad_norm": 2.770277500152588, "learning_rate": 1.957019683323909e-05, "loss": 0.6085, "step": 359 }, { "epoch": 0.12158054711246201, "grad_norm": 2.6869983673095703, "learning_rate": 1.9567018635957426e-05, "loss": 0.5752, "step": 360 }, { "epoch": 0.12191827085444107, "grad_norm": 2.3711676597595215, "learning_rate": 1.9563828991260138e-05, "loss": 0.5689, "step": 361 }, { "epoch": 0.12225599459642013, "grad_norm": 2.2947347164154053, "learning_rate": 1.9560627902963808e-05, "loss": 0.5784, "step": 362 }, { "epoch": 0.12259371833839919, "grad_norm": 2.524428367614746, "learning_rate": 1.9557415374898688e-05, "loss": 0.5597, "step": 363 }, { "epoch": 0.12293144208037825, "grad_norm": 2.8072845935821533, "learning_rate": 1.955419141090874e-05, "loss": 0.6176, "step": 364 }, { "epoch": 0.12326916582235731, "grad_norm": 2.3293845653533936, "learning_rate": 1.955095601485158e-05, "loss": 0.5694, "step": 365 }, { "epoch": 0.12360688956433637, "grad_norm": 2.6178319454193115, "learning_rate": 1.9547709190598538e-05, "loss": 0.5834, "step": 366 }, { "epoch": 0.12394461330631544, "grad_norm": 2.6552836894989014, "learning_rate": 1.9544450942034593e-05, "loss": 0.5978, "step": 367 }, { "epoch": 0.1242823370482945, "grad_norm": 3.4174654483795166, "learning_rate": 1.95411812730584e-05, "loss": 0.5983, "step": 368 }, { "epoch": 0.12462006079027356, "grad_norm": 2.625958204269409, "learning_rate": 1.9537900187582285e-05, "loss": 0.5821, "step": 369 }, { "epoch": 0.12495778453225262, "grad_norm": 2.748671531677246, "learning_rate": 1.9534607689532236e-05, "loss": 0.5886, "step": 370 }, { "epoch": 0.12529550827423167, "grad_norm": 2.4606218338012695, "learning_rate": 1.9531303782847883e-05, "loss": 0.5654, "step": 371 }, { "epoch": 0.12563323201621074, "grad_norm": 2.6622838973999023, "learning_rate": 1.9527988471482517e-05, "loss": 0.5417, "step": 372 }, { "epoch": 0.1259709557581898, "grad_norm": 3.4520180225372314, "learning_rate": 1.9524661759403075e-05, "loss": 0.5912, "step": 373 }, { "epoch": 0.12630867950016886, "grad_norm": 2.7404496669769287, "learning_rate": 1.9521323650590135e-05, "loss": 0.5662, "step": 374 }, { "epoch": 0.12664640324214793, "grad_norm": 2.7900335788726807, "learning_rate": 1.9517974149037908e-05, "loss": 0.6137, "step": 375 }, { "epoch": 0.12698412698412698, "grad_norm": 3.299988269805908, "learning_rate": 1.9514613258754244e-05, "loss": 0.5799, "step": 376 }, { "epoch": 0.12732185072610605, "grad_norm": 2.9523630142211914, "learning_rate": 1.9511240983760612e-05, "loss": 0.5871, "step": 377 }, { "epoch": 0.1276595744680851, "grad_norm": 3.4329755306243896, "learning_rate": 1.950785732809211e-05, "loss": 0.5676, "step": 378 }, { "epoch": 0.12799729821006417, "grad_norm": 2.6734511852264404, "learning_rate": 1.950446229579745e-05, "loss": 0.5677, "step": 379 }, { "epoch": 0.12833502195204322, "grad_norm": 2.7518250942230225, "learning_rate": 1.9501055890938957e-05, "loss": 0.6042, "step": 380 }, { "epoch": 0.1286727456940223, "grad_norm": 7.657167911529541, "learning_rate": 1.949763811759257e-05, "loss": 0.5789, "step": 381 }, { "epoch": 0.12901046943600136, "grad_norm": 2.8522746562957764, "learning_rate": 1.9494208979847814e-05, "loss": 0.6059, "step": 382 }, { "epoch": 0.1293481931779804, "grad_norm": 2.572242259979248, "learning_rate": 1.9490768481807833e-05, "loss": 0.5635, "step": 383 }, { "epoch": 0.12968591691995948, "grad_norm": 2.743250608444214, "learning_rate": 1.9487316627589353e-05, "loss": 0.5952, "step": 384 }, { "epoch": 0.13002364066193853, "grad_norm": 2.671328067779541, "learning_rate": 1.9483853421322695e-05, "loss": 0.5867, "step": 385 }, { "epoch": 0.1303613644039176, "grad_norm": 2.6871345043182373, "learning_rate": 1.9480378867151746e-05, "loss": 0.5678, "step": 386 }, { "epoch": 0.13069908814589665, "grad_norm": 2.5629220008850098, "learning_rate": 1.947689296923399e-05, "loss": 0.5521, "step": 387 }, { "epoch": 0.13103681188787572, "grad_norm": 2.4559404850006104, "learning_rate": 1.9473395731740483e-05, "loss": 0.6011, "step": 388 }, { "epoch": 0.13137453562985477, "grad_norm": 2.73789119720459, "learning_rate": 1.9469887158855834e-05, "loss": 0.6027, "step": 389 }, { "epoch": 0.13171225937183384, "grad_norm": 3.2954423427581787, "learning_rate": 1.9466367254778234e-05, "loss": 0.6064, "step": 390 }, { "epoch": 0.1320499831138129, "grad_norm": 2.5178065299987793, "learning_rate": 1.946283602371942e-05, "loss": 0.6079, "step": 391 }, { "epoch": 0.13238770685579196, "grad_norm": 3.0925915241241455, "learning_rate": 1.945929346990469e-05, "loss": 0.5525, "step": 392 }, { "epoch": 0.13272543059777103, "grad_norm": 2.7994487285614014, "learning_rate": 1.9455739597572882e-05, "loss": 0.6458, "step": 393 }, { "epoch": 0.13306315433975008, "grad_norm": 2.6234540939331055, "learning_rate": 1.9452174410976383e-05, "loss": 0.613, "step": 394 }, { "epoch": 0.13340087808172915, "grad_norm": 2.780223846435547, "learning_rate": 1.9448597914381117e-05, "loss": 0.5462, "step": 395 }, { "epoch": 0.1337386018237082, "grad_norm": 2.723886728286743, "learning_rate": 1.9445010112066543e-05, "loss": 0.5726, "step": 396 }, { "epoch": 0.13407632556568727, "grad_norm": 2.5422916412353516, "learning_rate": 1.944141100832565e-05, "loss": 0.587, "step": 397 }, { "epoch": 0.13441404930766632, "grad_norm": 3.1061668395996094, "learning_rate": 1.943780060746493e-05, "loss": 0.5635, "step": 398 }, { "epoch": 0.1347517730496454, "grad_norm": 2.7489137649536133, "learning_rate": 1.9434178913804423e-05, "loss": 0.6225, "step": 399 }, { "epoch": 0.13508949679162446, "grad_norm": 2.580404281616211, "learning_rate": 1.9430545931677657e-05, "loss": 0.5782, "step": 400 }, { "epoch": 0.1354272205336035, "grad_norm": 2.5410408973693848, "learning_rate": 1.9426901665431683e-05, "loss": 0.5794, "step": 401 }, { "epoch": 0.13576494427558258, "grad_norm": 2.3532910346984863, "learning_rate": 1.9423246119427044e-05, "loss": 0.5607, "step": 402 }, { "epoch": 0.13610266801756163, "grad_norm": 2.8815269470214844, "learning_rate": 1.9419579298037784e-05, "loss": 0.6249, "step": 403 }, { "epoch": 0.1364403917595407, "grad_norm": 2.820530414581299, "learning_rate": 1.941590120565144e-05, "loss": 0.5862, "step": 404 }, { "epoch": 0.13677811550151975, "grad_norm": 2.1003010272979736, "learning_rate": 1.9412211846669032e-05, "loss": 0.5428, "step": 405 }, { "epoch": 0.13711583924349882, "grad_norm": 2.2717537879943848, "learning_rate": 1.940851122550506e-05, "loss": 0.531, "step": 406 }, { "epoch": 0.13745356298547787, "grad_norm": 2.321383237838745, "learning_rate": 1.9404799346587503e-05, "loss": 0.5231, "step": 407 }, { "epoch": 0.13779128672745694, "grad_norm": 2.1788482666015625, "learning_rate": 1.940107621435781e-05, "loss": 0.5547, "step": 408 }, { "epoch": 0.13812901046943601, "grad_norm": 2.432164192199707, "learning_rate": 1.9397341833270898e-05, "loss": 0.5265, "step": 409 }, { "epoch": 0.13846673421141506, "grad_norm": 2.721555709838867, "learning_rate": 1.9393596207795135e-05, "loss": 0.5674, "step": 410 }, { "epoch": 0.13880445795339413, "grad_norm": 2.286525249481201, "learning_rate": 1.9389839342412353e-05, "loss": 0.5805, "step": 411 }, { "epoch": 0.13914218169537318, "grad_norm": 2.6038618087768555, "learning_rate": 1.9386071241617827e-05, "loss": 0.5994, "step": 412 }, { "epoch": 0.13947990543735225, "grad_norm": 2.797234535217285, "learning_rate": 1.938229190992028e-05, "loss": 0.6013, "step": 413 }, { "epoch": 0.1398176291793313, "grad_norm": 2.328542709350586, "learning_rate": 1.9378501351841864e-05, "loss": 0.5936, "step": 414 }, { "epoch": 0.14015535292131037, "grad_norm": 2.407930374145508, "learning_rate": 1.937469957191818e-05, "loss": 0.512, "step": 415 }, { "epoch": 0.14049307666328942, "grad_norm": 2.3427681922912598, "learning_rate": 1.9370886574698244e-05, "loss": 0.5857, "step": 416 }, { "epoch": 0.1408308004052685, "grad_norm": 2.1489741802215576, "learning_rate": 1.93670623647445e-05, "loss": 0.5516, "step": 417 }, { "epoch": 0.14116852414724756, "grad_norm": 2.6927857398986816, "learning_rate": 1.93632269466328e-05, "loss": 0.5708, "step": 418 }, { "epoch": 0.1415062478892266, "grad_norm": 2.4767730236053467, "learning_rate": 1.9359380324952418e-05, "loss": 0.6088, "step": 419 }, { "epoch": 0.14184397163120568, "grad_norm": 2.3425328731536865, "learning_rate": 1.9355522504306027e-05, "loss": 0.5664, "step": 420 }, { "epoch": 0.14218169537318473, "grad_norm": 2.1100456714630127, "learning_rate": 1.9351653489309705e-05, "loss": 0.5154, "step": 421 }, { "epoch": 0.1425194191151638, "grad_norm": 2.267889976501465, "learning_rate": 1.934777328459292e-05, "loss": 0.5508, "step": 422 }, { "epoch": 0.14285714285714285, "grad_norm": 2.7828733921051025, "learning_rate": 1.9343881894798532e-05, "loss": 0.5591, "step": 423 }, { "epoch": 0.14319486659912192, "grad_norm": 2.164461374282837, "learning_rate": 1.9339979324582782e-05, "loss": 0.5515, "step": 424 }, { "epoch": 0.14353259034110097, "grad_norm": 2.6530041694641113, "learning_rate": 1.9336065578615288e-05, "loss": 0.5567, "step": 425 }, { "epoch": 0.14387031408308004, "grad_norm": 2.4862446784973145, "learning_rate": 1.933214066157904e-05, "loss": 0.5521, "step": 426 }, { "epoch": 0.14420803782505912, "grad_norm": 2.286716938018799, "learning_rate": 1.932820457817041e-05, "loss": 0.5988, "step": 427 }, { "epoch": 0.14454576156703816, "grad_norm": 2.6005289554595947, "learning_rate": 1.9324257333099104e-05, "loss": 0.5587, "step": 428 }, { "epoch": 0.14488348530901723, "grad_norm": 2.853919267654419, "learning_rate": 1.9320298931088207e-05, "loss": 0.5888, "step": 429 }, { "epoch": 0.14522120905099628, "grad_norm": 2.5118322372436523, "learning_rate": 1.9316329376874146e-05, "loss": 0.5482, "step": 430 }, { "epoch": 0.14555893279297535, "grad_norm": 2.5333688259124756, "learning_rate": 1.931234867520668e-05, "loss": 0.5824, "step": 431 }, { "epoch": 0.1458966565349544, "grad_norm": 3.2575395107269287, "learning_rate": 1.9308356830848925e-05, "loss": 0.599, "step": 432 }, { "epoch": 0.14623438027693347, "grad_norm": 3.145564556121826, "learning_rate": 1.9304353848577328e-05, "loss": 0.5439, "step": 433 }, { "epoch": 0.14657210401891252, "grad_norm": 2.417504072189331, "learning_rate": 1.930033973318164e-05, "loss": 0.5639, "step": 434 }, { "epoch": 0.1469098277608916, "grad_norm": 3.017799139022827, "learning_rate": 1.9296314489464967e-05, "loss": 0.5624, "step": 435 }, { "epoch": 0.14724755150287064, "grad_norm": 2.4369075298309326, "learning_rate": 1.9292278122243705e-05, "loss": 0.607, "step": 436 }, { "epoch": 0.1475852752448497, "grad_norm": 2.415853500366211, "learning_rate": 1.9288230636347577e-05, "loss": 0.5302, "step": 437 }, { "epoch": 0.14792299898682879, "grad_norm": 2.6000194549560547, "learning_rate": 1.9284172036619597e-05, "loss": 0.5692, "step": 438 }, { "epoch": 0.14826072272880783, "grad_norm": 3.3238589763641357, "learning_rate": 1.9280102327916074e-05, "loss": 0.5621, "step": 439 }, { "epoch": 0.1485984464707869, "grad_norm": 2.7454633712768555, "learning_rate": 1.9276021515106635e-05, "loss": 0.6124, "step": 440 }, { "epoch": 0.14893617021276595, "grad_norm": 2.2638399600982666, "learning_rate": 1.9271929603074164e-05, "loss": 0.5695, "step": 441 }, { "epoch": 0.14927389395474502, "grad_norm": 2.4969418048858643, "learning_rate": 1.926782659671484e-05, "loss": 0.5676, "step": 442 }, { "epoch": 0.14961161769672407, "grad_norm": 2.8061816692352295, "learning_rate": 1.926371250093812e-05, "loss": 0.5592, "step": 443 }, { "epoch": 0.14994934143870314, "grad_norm": 2.850543260574341, "learning_rate": 1.925958732066672e-05, "loss": 0.5474, "step": 444 }, { "epoch": 0.1502870651806822, "grad_norm": 2.1626734733581543, "learning_rate": 1.9255451060836625e-05, "loss": 0.5723, "step": 445 }, { "epoch": 0.15062478892266126, "grad_norm": 2.6483047008514404, "learning_rate": 1.9251303726397076e-05, "loss": 0.5681, "step": 446 }, { "epoch": 0.15096251266464034, "grad_norm": 2.5221269130706787, "learning_rate": 1.924714532231057e-05, "loss": 0.5445, "step": 447 }, { "epoch": 0.15130023640661938, "grad_norm": 2.7257351875305176, "learning_rate": 1.924297585355284e-05, "loss": 0.6065, "step": 448 }, { "epoch": 0.15163796014859846, "grad_norm": 3.461411237716675, "learning_rate": 1.9238795325112867e-05, "loss": 0.583, "step": 449 }, { "epoch": 0.1519756838905775, "grad_norm": 3.607086420059204, "learning_rate": 1.9234603741992864e-05, "loss": 0.5784, "step": 450 }, { "epoch": 0.15231340763255657, "grad_norm": 2.8359286785125732, "learning_rate": 1.9230401109208264e-05, "loss": 0.58, "step": 451 }, { "epoch": 0.15265113137453562, "grad_norm": 3.526444435119629, "learning_rate": 1.9226187431787727e-05, "loss": 0.5922, "step": 452 }, { "epoch": 0.1529888551165147, "grad_norm": 2.7372679710388184, "learning_rate": 1.9221962714773136e-05, "loss": 0.5783, "step": 453 }, { "epoch": 0.15332657885849374, "grad_norm": 2.9736764430999756, "learning_rate": 1.9217726963219567e-05, "loss": 0.5774, "step": 454 }, { "epoch": 0.1536643026004728, "grad_norm": 2.6413867473602295, "learning_rate": 1.9213480182195313e-05, "loss": 0.5706, "step": 455 }, { "epoch": 0.1540020263424519, "grad_norm": 2.818441152572632, "learning_rate": 1.9209222376781864e-05, "loss": 0.6238, "step": 456 }, { "epoch": 0.15433975008443093, "grad_norm": 3.0630102157592773, "learning_rate": 1.9204953552073884e-05, "loss": 0.6711, "step": 457 }, { "epoch": 0.15467747382641, "grad_norm": 2.4029343128204346, "learning_rate": 1.9200673713179245e-05, "loss": 0.5595, "step": 458 }, { "epoch": 0.15501519756838905, "grad_norm": 3.095815658569336, "learning_rate": 1.9196382865218987e-05, "loss": 0.5915, "step": 459 }, { "epoch": 0.15535292131036813, "grad_norm": 2.5386464595794678, "learning_rate": 1.9192081013327325e-05, "loss": 0.6238, "step": 460 }, { "epoch": 0.15569064505234717, "grad_norm": 2.6473188400268555, "learning_rate": 1.9187768162651632e-05, "loss": 0.6099, "step": 461 }, { "epoch": 0.15602836879432624, "grad_norm": 2.2626852989196777, "learning_rate": 1.9183444318352458e-05, "loss": 0.5845, "step": 462 }, { "epoch": 0.1563660925363053, "grad_norm": 2.1272428035736084, "learning_rate": 1.917910948560349e-05, "loss": 0.5718, "step": 463 }, { "epoch": 0.15670381627828436, "grad_norm": 2.2861721515655518, "learning_rate": 1.9174763669591583e-05, "loss": 0.5421, "step": 464 }, { "epoch": 0.15704154002026344, "grad_norm": 2.459182024002075, "learning_rate": 1.9170406875516715e-05, "loss": 0.5837, "step": 465 }, { "epoch": 0.15737926376224248, "grad_norm": 2.0588929653167725, "learning_rate": 1.9166039108592008e-05, "loss": 0.5698, "step": 466 }, { "epoch": 0.15771698750422156, "grad_norm": 2.23376202583313, "learning_rate": 1.9161660374043716e-05, "loss": 0.5807, "step": 467 }, { "epoch": 0.1580547112462006, "grad_norm": 2.2739686965942383, "learning_rate": 1.9157270677111214e-05, "loss": 0.569, "step": 468 }, { "epoch": 0.15839243498817968, "grad_norm": 2.3197171688079834, "learning_rate": 1.915287002304699e-05, "loss": 0.5791, "step": 469 }, { "epoch": 0.15873015873015872, "grad_norm": 2.165407419204712, "learning_rate": 1.9148458417116645e-05, "loss": 0.5997, "step": 470 }, { "epoch": 0.1590678824721378, "grad_norm": 2.5910756587982178, "learning_rate": 1.9144035864598893e-05, "loss": 0.6225, "step": 471 }, { "epoch": 0.15940560621411684, "grad_norm": 2.2780349254608154, "learning_rate": 1.9139602370785536e-05, "loss": 0.5108, "step": 472 }, { "epoch": 0.15974332995609591, "grad_norm": 3.107177495956421, "learning_rate": 1.913515794098147e-05, "loss": 0.6011, "step": 473 }, { "epoch": 0.160081053698075, "grad_norm": 2.802154064178467, "learning_rate": 1.9130702580504678e-05, "loss": 0.5764, "step": 474 }, { "epoch": 0.16041877744005403, "grad_norm": 2.6769421100616455, "learning_rate": 1.9126236294686217e-05, "loss": 0.5793, "step": 475 }, { "epoch": 0.1607565011820331, "grad_norm": 2.266939401626587, "learning_rate": 1.9121759088870228e-05, "loss": 0.5245, "step": 476 }, { "epoch": 0.16109422492401215, "grad_norm": 2.168534755706787, "learning_rate": 1.9117270968413906e-05, "loss": 0.5959, "step": 477 }, { "epoch": 0.16143194866599123, "grad_norm": 3.64794659614563, "learning_rate": 1.911277193868751e-05, "loss": 0.5835, "step": 478 }, { "epoch": 0.16176967240797027, "grad_norm": 2.6535894870758057, "learning_rate": 1.910826200507436e-05, "loss": 0.596, "step": 479 }, { "epoch": 0.16210739614994935, "grad_norm": 2.1636133193969727, "learning_rate": 1.9103741172970816e-05, "loss": 0.5919, "step": 480 }, { "epoch": 0.1624451198919284, "grad_norm": 3.627110242843628, "learning_rate": 1.9099209447786275e-05, "loss": 0.5692, "step": 481 }, { "epoch": 0.16278284363390746, "grad_norm": 2.485244035720825, "learning_rate": 1.9094666834943177e-05, "loss": 0.5598, "step": 482 }, { "epoch": 0.16312056737588654, "grad_norm": 2.5140697956085205, "learning_rate": 1.9090113339876982e-05, "loss": 0.5359, "step": 483 }, { "epoch": 0.16345829111786558, "grad_norm": 2.206836462020874, "learning_rate": 1.9085548968036174e-05, "loss": 0.5734, "step": 484 }, { "epoch": 0.16379601485984466, "grad_norm": 2.4765357971191406, "learning_rate": 1.9080973724882253e-05, "loss": 0.5442, "step": 485 }, { "epoch": 0.1641337386018237, "grad_norm": 2.4856369495391846, "learning_rate": 1.9076387615889728e-05, "loss": 0.5664, "step": 486 }, { "epoch": 0.16447146234380278, "grad_norm": 2.568537712097168, "learning_rate": 1.9071790646546105e-05, "loss": 0.573, "step": 487 }, { "epoch": 0.16480918608578182, "grad_norm": 3.457841634750366, "learning_rate": 1.9067182822351884e-05, "loss": 0.5581, "step": 488 }, { "epoch": 0.1651469098277609, "grad_norm": 2.5403289794921875, "learning_rate": 1.9062564148820563e-05, "loss": 0.5485, "step": 489 }, { "epoch": 0.16548463356973994, "grad_norm": 2.3736140727996826, "learning_rate": 1.9057934631478616e-05, "loss": 0.5252, "step": 490 }, { "epoch": 0.16582235731171902, "grad_norm": 2.876591205596924, "learning_rate": 1.9053294275865485e-05, "loss": 0.5597, "step": 491 }, { "epoch": 0.1661600810536981, "grad_norm": 2.413252592086792, "learning_rate": 1.9048643087533594e-05, "loss": 0.566, "step": 492 }, { "epoch": 0.16649780479567713, "grad_norm": 2.4686074256896973, "learning_rate": 1.9043981072048318e-05, "loss": 0.6219, "step": 493 }, { "epoch": 0.1668355285376562, "grad_norm": 2.109065294265747, "learning_rate": 1.903930823498799e-05, "loss": 0.4973, "step": 494 }, { "epoch": 0.16717325227963525, "grad_norm": 2.468660831451416, "learning_rate": 1.90346245819439e-05, "loss": 0.6053, "step": 495 }, { "epoch": 0.16751097602161433, "grad_norm": 4.271061420440674, "learning_rate": 1.9029930118520266e-05, "loss": 0.5529, "step": 496 }, { "epoch": 0.16784869976359337, "grad_norm": 2.2877535820007324, "learning_rate": 1.9025224850334252e-05, "loss": 0.5688, "step": 497 }, { "epoch": 0.16818642350557245, "grad_norm": 3.8016862869262695, "learning_rate": 1.9020508783015942e-05, "loss": 0.5596, "step": 498 }, { "epoch": 0.1685241472475515, "grad_norm": 2.315105438232422, "learning_rate": 1.901578192220835e-05, "loss": 0.5255, "step": 499 }, { "epoch": 0.16886187098953057, "grad_norm": 2.7399275302886963, "learning_rate": 1.9011044273567405e-05, "loss": 0.5879, "step": 500 }, { "epoch": 0.1691995947315096, "grad_norm": 3.137909173965454, "learning_rate": 1.9006295842761927e-05, "loss": 0.5382, "step": 501 }, { "epoch": 0.16953731847348869, "grad_norm": 2.57723331451416, "learning_rate": 1.9001536635473664e-05, "loss": 0.5875, "step": 502 }, { "epoch": 0.16987504221546776, "grad_norm": 2.949665069580078, "learning_rate": 1.899676665739724e-05, "loss": 0.6015, "step": 503 }, { "epoch": 0.1702127659574468, "grad_norm": 2.678975820541382, "learning_rate": 1.8991985914240166e-05, "loss": 0.5706, "step": 504 }, { "epoch": 0.17055048969942588, "grad_norm": 2.477928876876831, "learning_rate": 1.8987194411722855e-05, "loss": 0.6157, "step": 505 }, { "epoch": 0.17088821344140492, "grad_norm": 2.6126325130462646, "learning_rate": 1.898239215557856e-05, "loss": 0.5572, "step": 506 }, { "epoch": 0.171225937183384, "grad_norm": 2.9918887615203857, "learning_rate": 1.8977579151553437e-05, "loss": 0.6099, "step": 507 }, { "epoch": 0.17156366092536304, "grad_norm": 2.4580743312835693, "learning_rate": 1.8972755405406475e-05, "loss": 0.5696, "step": 508 }, { "epoch": 0.17190138466734212, "grad_norm": 2.825040102005005, "learning_rate": 1.8967920922909535e-05, "loss": 0.5848, "step": 509 }, { "epoch": 0.17223910840932116, "grad_norm": 2.378786087036133, "learning_rate": 1.8963075709847308e-05, "loss": 0.5532, "step": 510 }, { "epoch": 0.17257683215130024, "grad_norm": 3.1526527404785156, "learning_rate": 1.8958219772017344e-05, "loss": 0.555, "step": 511 }, { "epoch": 0.1729145558932793, "grad_norm": 2.280085802078247, "learning_rate": 1.8953353115230005e-05, "loss": 0.5902, "step": 512 }, { "epoch": 0.17325227963525835, "grad_norm": 4.195369720458984, "learning_rate": 1.8948475745308497e-05, "loss": 0.551, "step": 513 }, { "epoch": 0.17359000337723743, "grad_norm": 2.6032066345214844, "learning_rate": 1.894358766808883e-05, "loss": 0.5452, "step": 514 }, { "epoch": 0.17392772711921647, "grad_norm": 3.6678786277770996, "learning_rate": 1.8938688889419837e-05, "loss": 0.6103, "step": 515 }, { "epoch": 0.17426545086119555, "grad_norm": 2.447453260421753, "learning_rate": 1.893377941516315e-05, "loss": 0.5937, "step": 516 }, { "epoch": 0.1746031746031746, "grad_norm": 2.4996769428253174, "learning_rate": 1.89288592511932e-05, "loss": 0.6035, "step": 517 }, { "epoch": 0.17494089834515367, "grad_norm": 2.2942795753479004, "learning_rate": 1.892392840339721e-05, "loss": 0.5486, "step": 518 }, { "epoch": 0.1752786220871327, "grad_norm": 1.8694463968276978, "learning_rate": 1.8918986877675182e-05, "loss": 0.5314, "step": 519 }, { "epoch": 0.1756163458291118, "grad_norm": 4.262805938720703, "learning_rate": 1.8914034679939905e-05, "loss": 0.5577, "step": 520 }, { "epoch": 0.17595406957109086, "grad_norm": 2.3569746017456055, "learning_rate": 1.8909071816116925e-05, "loss": 0.5855, "step": 521 }, { "epoch": 0.1762917933130699, "grad_norm": 2.303800106048584, "learning_rate": 1.8904098292144556e-05, "loss": 0.5738, "step": 522 }, { "epoch": 0.17662951705504898, "grad_norm": 2.204500675201416, "learning_rate": 1.8899114113973872e-05, "loss": 0.5338, "step": 523 }, { "epoch": 0.17696724079702802, "grad_norm": 1.901328682899475, "learning_rate": 1.889411928756869e-05, "loss": 0.5936, "step": 524 }, { "epoch": 0.1773049645390071, "grad_norm": 2.6285901069641113, "learning_rate": 1.888911381890557e-05, "loss": 0.5651, "step": 525 }, { "epoch": 0.17764268828098614, "grad_norm": 2.0434720516204834, "learning_rate": 1.8884097713973798e-05, "loss": 0.5526, "step": 526 }, { "epoch": 0.17798041202296522, "grad_norm": 2.5209391117095947, "learning_rate": 1.8879070978775405e-05, "loss": 0.5806, "step": 527 }, { "epoch": 0.17831813576494426, "grad_norm": 2.2196450233459473, "learning_rate": 1.8874033619325124e-05, "loss": 0.595, "step": 528 }, { "epoch": 0.17865585950692334, "grad_norm": 2.3618783950805664, "learning_rate": 1.886898564165041e-05, "loss": 0.5924, "step": 529 }, { "epoch": 0.1789935832489024, "grad_norm": 2.19270920753479, "learning_rate": 1.8863927051791418e-05, "loss": 0.5812, "step": 530 }, { "epoch": 0.17933130699088146, "grad_norm": 2.3661458492279053, "learning_rate": 1.8858857855801004e-05, "loss": 0.592, "step": 531 }, { "epoch": 0.17966903073286053, "grad_norm": 2.106689214706421, "learning_rate": 1.8853778059744716e-05, "loss": 0.5843, "step": 532 }, { "epoch": 0.18000675447483958, "grad_norm": 2.538896083831787, "learning_rate": 1.884868766970078e-05, "loss": 0.5902, "step": 533 }, { "epoch": 0.18034447821681865, "grad_norm": 2.418215751647949, "learning_rate": 1.884358669176011e-05, "loss": 0.5875, "step": 534 }, { "epoch": 0.1806822019587977, "grad_norm": 2.1817941665649414, "learning_rate": 1.883847513202627e-05, "loss": 0.6138, "step": 535 }, { "epoch": 0.18101992570077677, "grad_norm": 2.1061015129089355, "learning_rate": 1.8833352996615507e-05, "loss": 0.561, "step": 536 }, { "epoch": 0.18135764944275581, "grad_norm": 2.779547929763794, "learning_rate": 1.8828220291656702e-05, "loss": 0.5689, "step": 537 }, { "epoch": 0.1816953731847349, "grad_norm": 1.9308968782424927, "learning_rate": 1.88230770232914e-05, "loss": 0.5487, "step": 538 }, { "epoch": 0.18203309692671396, "grad_norm": 2.304126501083374, "learning_rate": 1.881792319767377e-05, "loss": 0.6069, "step": 539 }, { "epoch": 0.182370820668693, "grad_norm": 2.7512316703796387, "learning_rate": 1.8812758820970637e-05, "loss": 0.658, "step": 540 }, { "epoch": 0.18270854441067208, "grad_norm": 2.8649168014526367, "learning_rate": 1.880758389936142e-05, "loss": 0.5808, "step": 541 }, { "epoch": 0.18304626815265113, "grad_norm": 2.274082899093628, "learning_rate": 1.8802398439038175e-05, "loss": 0.5437, "step": 542 }, { "epoch": 0.1833839918946302, "grad_norm": 2.6033263206481934, "learning_rate": 1.8797202446205575e-05, "loss": 0.5797, "step": 543 }, { "epoch": 0.18372171563660925, "grad_norm": 2.697338581085205, "learning_rate": 1.879199592708087e-05, "loss": 0.5705, "step": 544 }, { "epoch": 0.18405943937858832, "grad_norm": 3.673042058944702, "learning_rate": 1.878677888789393e-05, "loss": 0.5564, "step": 545 }, { "epoch": 0.18439716312056736, "grad_norm": 2.352073907852173, "learning_rate": 1.8781551334887204e-05, "loss": 0.6394, "step": 546 }, { "epoch": 0.18473488686254644, "grad_norm": 2.670875310897827, "learning_rate": 1.877631327431571e-05, "loss": 0.5584, "step": 547 }, { "epoch": 0.1850726106045255, "grad_norm": 2.556748390197754, "learning_rate": 1.8771064712447054e-05, "loss": 0.6097, "step": 548 }, { "epoch": 0.18541033434650456, "grad_norm": 2.6242175102233887, "learning_rate": 1.876580565556141e-05, "loss": 0.5609, "step": 549 }, { "epoch": 0.18574805808848363, "grad_norm": 5.8099870681762695, "learning_rate": 1.876053610995149e-05, "loss": 0.5525, "step": 550 }, { "epoch": 0.18608578183046268, "grad_norm": 2.45652174949646, "learning_rate": 1.8755256081922576e-05, "loss": 0.5894, "step": 551 }, { "epoch": 0.18642350557244175, "grad_norm": 2.715425491333008, "learning_rate": 1.8749965577792482e-05, "loss": 0.609, "step": 552 }, { "epoch": 0.1867612293144208, "grad_norm": 2.2454471588134766, "learning_rate": 1.8744664603891566e-05, "loss": 0.5388, "step": 553 }, { "epoch": 0.18709895305639987, "grad_norm": 2.49367094039917, "learning_rate": 1.87393531665627e-05, "loss": 0.5698, "step": 554 }, { "epoch": 0.18743667679837892, "grad_norm": 2.245166301727295, "learning_rate": 1.8734031272161293e-05, "loss": 0.5715, "step": 555 }, { "epoch": 0.187774400540358, "grad_norm": 2.0615689754486084, "learning_rate": 1.872869892705525e-05, "loss": 0.5834, "step": 556 }, { "epoch": 0.18811212428233706, "grad_norm": 2.5240225791931152, "learning_rate": 1.872335613762499e-05, "loss": 0.5906, "step": 557 }, { "epoch": 0.1884498480243161, "grad_norm": 2.4320287704467773, "learning_rate": 1.8718002910263426e-05, "loss": 0.5227, "step": 558 }, { "epoch": 0.18878757176629518, "grad_norm": 2.3207664489746094, "learning_rate": 1.8712639251375968e-05, "loss": 0.5858, "step": 559 }, { "epoch": 0.18912529550827423, "grad_norm": 2.947436571121216, "learning_rate": 1.8707265167380497e-05, "loss": 0.5738, "step": 560 }, { "epoch": 0.1894630192502533, "grad_norm": 2.4595272541046143, "learning_rate": 1.8701880664707374e-05, "loss": 0.5523, "step": 561 }, { "epoch": 0.18980074299223235, "grad_norm": 2.4411473274230957, "learning_rate": 1.869648574979942e-05, "loss": 0.5667, "step": 562 }, { "epoch": 0.19013846673421142, "grad_norm": 2.61208176612854, "learning_rate": 1.8691080429111933e-05, "loss": 0.5541, "step": 563 }, { "epoch": 0.19047619047619047, "grad_norm": 3.2387595176696777, "learning_rate": 1.8685664709112637e-05, "loss": 0.5988, "step": 564 }, { "epoch": 0.19081391421816954, "grad_norm": 2.3639848232269287, "learning_rate": 1.8680238596281713e-05, "loss": 0.5507, "step": 565 }, { "epoch": 0.19115163796014858, "grad_norm": 2.118597984313965, "learning_rate": 1.8674802097111784e-05, "loss": 0.525, "step": 566 }, { "epoch": 0.19148936170212766, "grad_norm": 2.4331037998199463, "learning_rate": 1.866935521810789e-05, "loss": 0.5976, "step": 567 }, { "epoch": 0.19182708544410673, "grad_norm": 2.3532347679138184, "learning_rate": 1.8663897965787483e-05, "loss": 0.5875, "step": 568 }, { "epoch": 0.19216480918608578, "grad_norm": 5.882742881774902, "learning_rate": 1.865843034668045e-05, "loss": 0.5519, "step": 569 }, { "epoch": 0.19250253292806485, "grad_norm": 2.006202220916748, "learning_rate": 1.865295236732907e-05, "loss": 0.5222, "step": 570 }, { "epoch": 0.1928402566700439, "grad_norm": 2.1628787517547607, "learning_rate": 1.864746403428801e-05, "loss": 0.5796, "step": 571 }, { "epoch": 0.19317798041202297, "grad_norm": 2.338165760040283, "learning_rate": 1.8641965354124346e-05, "loss": 0.5812, "step": 572 }, { "epoch": 0.19351570415400202, "grad_norm": 1.9040286540985107, "learning_rate": 1.8636456333417514e-05, "loss": 0.5635, "step": 573 }, { "epoch": 0.1938534278959811, "grad_norm": 2.0422518253326416, "learning_rate": 1.8630936978759337e-05, "loss": 0.5863, "step": 574 }, { "epoch": 0.19419115163796014, "grad_norm": 2.227210521697998, "learning_rate": 1.8625407296754e-05, "loss": 0.5378, "step": 575 }, { "epoch": 0.1945288753799392, "grad_norm": 2.518249034881592, "learning_rate": 1.8619867294018035e-05, "loss": 0.6083, "step": 576 }, { "epoch": 0.19486659912191828, "grad_norm": 2.3911690711975098, "learning_rate": 1.861431697718034e-05, "loss": 0.6241, "step": 577 }, { "epoch": 0.19520432286389733, "grad_norm": 2.399261474609375, "learning_rate": 1.8608756352882152e-05, "loss": 0.5354, "step": 578 }, { "epoch": 0.1955420466058764, "grad_norm": 2.103030204772949, "learning_rate": 1.860318542777702e-05, "loss": 0.5809, "step": 579 }, { "epoch": 0.19587977034785545, "grad_norm": 2.2702789306640625, "learning_rate": 1.8597604208530845e-05, "loss": 0.5045, "step": 580 }, { "epoch": 0.19621749408983452, "grad_norm": 2.410989761352539, "learning_rate": 1.859201270182183e-05, "loss": 0.5536, "step": 581 }, { "epoch": 0.19655521783181357, "grad_norm": 2.225159168243408, "learning_rate": 1.85864109143405e-05, "loss": 0.5677, "step": 582 }, { "epoch": 0.19689294157379264, "grad_norm": 2.2405340671539307, "learning_rate": 1.8580798852789662e-05, "loss": 0.5741, "step": 583 }, { "epoch": 0.19723066531577169, "grad_norm": 2.186213731765747, "learning_rate": 1.8575176523884432e-05, "loss": 0.5745, "step": 584 }, { "epoch": 0.19756838905775076, "grad_norm": 2.847445487976074, "learning_rate": 1.8569543934352212e-05, "loss": 0.6121, "step": 585 }, { "epoch": 0.19790611279972983, "grad_norm": 2.3178629875183105, "learning_rate": 1.8563901090932673e-05, "loss": 0.6085, "step": 586 }, { "epoch": 0.19824383654170888, "grad_norm": 2.4105918407440186, "learning_rate": 1.855824800037776e-05, "loss": 0.5485, "step": 587 }, { "epoch": 0.19858156028368795, "grad_norm": 2.1383843421936035, "learning_rate": 1.8552584669451675e-05, "loss": 0.5438, "step": 588 }, { "epoch": 0.198919284025667, "grad_norm": 2.352964162826538, "learning_rate": 1.8546911104930882e-05, "loss": 0.5694, "step": 589 }, { "epoch": 0.19925700776764607, "grad_norm": 2.124833345413208, "learning_rate": 1.854122731360408e-05, "loss": 0.6121, "step": 590 }, { "epoch": 0.19959473150962512, "grad_norm": 2.142090320587158, "learning_rate": 1.853553330227221e-05, "loss": 0.5021, "step": 591 }, { "epoch": 0.1999324552516042, "grad_norm": 1.991716742515564, "learning_rate": 1.8529829077748442e-05, "loss": 0.5156, "step": 592 }, { "epoch": 0.20027017899358324, "grad_norm": 2.6672635078430176, "learning_rate": 1.8524114646858167e-05, "loss": 0.6312, "step": 593 }, { "epoch": 0.2006079027355623, "grad_norm": 2.158097505569458, "learning_rate": 1.851839001643898e-05, "loss": 0.573, "step": 594 }, { "epoch": 0.20094562647754138, "grad_norm": 3.555957794189453, "learning_rate": 1.8512655193340695e-05, "loss": 0.5619, "step": 595 }, { "epoch": 0.20128335021952043, "grad_norm": 2.193157434463501, "learning_rate": 1.850691018442531e-05, "loss": 0.5559, "step": 596 }, { "epoch": 0.2016210739614995, "grad_norm": 2.359311580657959, "learning_rate": 1.8501154996567018e-05, "loss": 0.519, "step": 597 }, { "epoch": 0.20195879770347855, "grad_norm": 2.0833637714385986, "learning_rate": 1.8495389636652185e-05, "loss": 0.5442, "step": 598 }, { "epoch": 0.20229652144545762, "grad_norm": 2.3817551136016846, "learning_rate": 1.8489614111579357e-05, "loss": 0.5416, "step": 599 }, { "epoch": 0.20263424518743667, "grad_norm": 2.3326256275177, "learning_rate": 1.8483828428259235e-05, "loss": 0.5496, "step": 600 }, { "epoch": 0.20297196892941574, "grad_norm": 2.061349630355835, "learning_rate": 1.8478032593614675e-05, "loss": 0.5736, "step": 601 }, { "epoch": 0.2033096926713948, "grad_norm": 2.2970833778381348, "learning_rate": 1.847222661458069e-05, "loss": 0.5307, "step": 602 }, { "epoch": 0.20364741641337386, "grad_norm": 2.0947864055633545, "learning_rate": 1.846641049810442e-05, "loss": 0.5758, "step": 603 }, { "epoch": 0.20398514015535293, "grad_norm": 2.4482085704803467, "learning_rate": 1.8460584251145137e-05, "loss": 0.5443, "step": 604 }, { "epoch": 0.20432286389733198, "grad_norm": 2.298417806625366, "learning_rate": 1.8454747880674236e-05, "loss": 0.5685, "step": 605 }, { "epoch": 0.20466058763931105, "grad_norm": 2.2356009483337402, "learning_rate": 1.8448901393675233e-05, "loss": 0.5261, "step": 606 }, { "epoch": 0.2049983113812901, "grad_norm": 3.0484750270843506, "learning_rate": 1.8443044797143733e-05, "loss": 0.5548, "step": 607 }, { "epoch": 0.20533603512326917, "grad_norm": 2.408050298690796, "learning_rate": 1.8437178098087452e-05, "loss": 0.589, "step": 608 }, { "epoch": 0.20567375886524822, "grad_norm": 3.2041168212890625, "learning_rate": 1.8431301303526186e-05, "loss": 0.6063, "step": 609 }, { "epoch": 0.2060114826072273, "grad_norm": 3.834425449371338, "learning_rate": 1.8425414420491817e-05, "loss": 0.5687, "step": 610 }, { "epoch": 0.20634920634920634, "grad_norm": 2.2821505069732666, "learning_rate": 1.8419517456028283e-05, "loss": 0.5702, "step": 611 }, { "epoch": 0.2066869300911854, "grad_norm": 4.354745388031006, "learning_rate": 1.841361041719161e-05, "loss": 0.5936, "step": 612 }, { "epoch": 0.20702465383316448, "grad_norm": 2.1499412059783936, "learning_rate": 1.8407693311049862e-05, "loss": 0.5746, "step": 613 }, { "epoch": 0.20736237757514353, "grad_norm": 3.716365337371826, "learning_rate": 1.8401766144683145e-05, "loss": 0.5451, "step": 614 }, { "epoch": 0.2077001013171226, "grad_norm": 2.4600579738616943, "learning_rate": 1.8395828925183616e-05, "loss": 0.5485, "step": 615 }, { "epoch": 0.20803782505910165, "grad_norm": 3.1863479614257812, "learning_rate": 1.8389881659655456e-05, "loss": 0.5569, "step": 616 }, { "epoch": 0.20837554880108072, "grad_norm": 3.4445457458496094, "learning_rate": 1.8383924355214858e-05, "loss": 0.6213, "step": 617 }, { "epoch": 0.20871327254305977, "grad_norm": 2.3219456672668457, "learning_rate": 1.8377957018990043e-05, "loss": 0.5769, "step": 618 }, { "epoch": 0.20905099628503884, "grad_norm": 2.051604747772217, "learning_rate": 1.8371979658121217e-05, "loss": 0.564, "step": 619 }, { "epoch": 0.2093887200270179, "grad_norm": 2.107227325439453, "learning_rate": 1.83659922797606e-05, "loss": 0.572, "step": 620 }, { "epoch": 0.20972644376899696, "grad_norm": 2.6015241146087646, "learning_rate": 1.8359994891072383e-05, "loss": 0.5886, "step": 621 }, { "epoch": 0.21006416751097604, "grad_norm": 2.35957407951355, "learning_rate": 1.8353987499232747e-05, "loss": 0.5523, "step": 622 }, { "epoch": 0.21040189125295508, "grad_norm": 2.682255744934082, "learning_rate": 1.834797011142983e-05, "loss": 0.5656, "step": 623 }, { "epoch": 0.21073961499493415, "grad_norm": 2.6817049980163574, "learning_rate": 1.834194273486374e-05, "loss": 0.5582, "step": 624 }, { "epoch": 0.2110773387369132, "grad_norm": 2.915517568588257, "learning_rate": 1.8335905376746535e-05, "loss": 0.5572, "step": 625 }, { "epoch": 0.21141506247889227, "grad_norm": 2.4118781089782715, "learning_rate": 1.8329858044302212e-05, "loss": 0.5777, "step": 626 }, { "epoch": 0.21175278622087132, "grad_norm": 2.29581880569458, "learning_rate": 1.8323800744766715e-05, "loss": 0.5589, "step": 627 }, { "epoch": 0.2120905099628504, "grad_norm": 2.13769793510437, "learning_rate": 1.8317733485387893e-05, "loss": 0.5661, "step": 628 }, { "epoch": 0.21242823370482944, "grad_norm": 2.8407037258148193, "learning_rate": 1.8311656273425535e-05, "loss": 0.5577, "step": 629 }, { "epoch": 0.2127659574468085, "grad_norm": 2.035285472869873, "learning_rate": 1.830556911615132e-05, "loss": 0.5096, "step": 630 }, { "epoch": 0.21310368118878756, "grad_norm": 2.2275800704956055, "learning_rate": 1.8299472020848845e-05, "loss": 0.5784, "step": 631 }, { "epoch": 0.21344140493076663, "grad_norm": 2.2813491821289062, "learning_rate": 1.8293364994813584e-05, "loss": 0.5055, "step": 632 }, { "epoch": 0.2137791286727457, "grad_norm": 1.9074902534484863, "learning_rate": 1.82872480453529e-05, "loss": 0.5665, "step": 633 }, { "epoch": 0.21411685241472475, "grad_norm": 2.0177953243255615, "learning_rate": 1.8281121179786024e-05, "loss": 0.575, "step": 634 }, { "epoch": 0.21445457615670382, "grad_norm": 2.3978168964385986, "learning_rate": 1.8274984405444063e-05, "loss": 0.5663, "step": 635 }, { "epoch": 0.21479229989868287, "grad_norm": 2.886589288711548, "learning_rate": 1.826883772966997e-05, "loss": 0.5386, "step": 636 }, { "epoch": 0.21513002364066194, "grad_norm": 2.004395008087158, "learning_rate": 1.826268115981855e-05, "loss": 0.5391, "step": 637 }, { "epoch": 0.215467747382641, "grad_norm": 2.081407070159912, "learning_rate": 1.825651470325645e-05, "loss": 0.544, "step": 638 }, { "epoch": 0.21580547112462006, "grad_norm": 2.2525200843811035, "learning_rate": 1.825033836736214e-05, "loss": 0.5471, "step": 639 }, { "epoch": 0.2161431948665991, "grad_norm": 2.003926992416382, "learning_rate": 1.8244152159525916e-05, "loss": 0.5397, "step": 640 }, { "epoch": 0.21648091860857818, "grad_norm": 2.0800864696502686, "learning_rate": 1.823795608714988e-05, "loss": 0.5602, "step": 641 }, { "epoch": 0.21681864235055726, "grad_norm": 1.9890743494033813, "learning_rate": 1.823175015764795e-05, "loss": 0.5442, "step": 642 }, { "epoch": 0.2171563660925363, "grad_norm": 2.1888182163238525, "learning_rate": 1.8225534378445823e-05, "loss": 0.5354, "step": 643 }, { "epoch": 0.21749408983451538, "grad_norm": 1.9506672620773315, "learning_rate": 1.821930875698099e-05, "loss": 0.5322, "step": 644 }, { "epoch": 0.21783181357649442, "grad_norm": 1.9487754106521606, "learning_rate": 1.8213073300702727e-05, "loss": 0.6035, "step": 645 }, { "epoch": 0.2181695373184735, "grad_norm": 1.7257194519042969, "learning_rate": 1.8206828017072057e-05, "loss": 0.5525, "step": 646 }, { "epoch": 0.21850726106045254, "grad_norm": 1.9587911367416382, "learning_rate": 1.820057291356178e-05, "loss": 0.5626, "step": 647 }, { "epoch": 0.2188449848024316, "grad_norm": 2.299060583114624, "learning_rate": 1.819430799765644e-05, "loss": 0.5336, "step": 648 }, { "epoch": 0.21918270854441066, "grad_norm": 2.3395512104034424, "learning_rate": 1.8188033276852322e-05, "loss": 0.5736, "step": 649 }, { "epoch": 0.21952043228638973, "grad_norm": 1.9211443662643433, "learning_rate": 1.818174875865744e-05, "loss": 0.5327, "step": 650 }, { "epoch": 0.2198581560283688, "grad_norm": 2.1377081871032715, "learning_rate": 1.8175454450591536e-05, "loss": 0.5721, "step": 651 }, { "epoch": 0.22019587977034785, "grad_norm": 2.335589647293091, "learning_rate": 1.8169150360186062e-05, "loss": 0.5596, "step": 652 }, { "epoch": 0.22053360351232693, "grad_norm": 1.8264658451080322, "learning_rate": 1.8162836494984177e-05, "loss": 0.5535, "step": 653 }, { "epoch": 0.22087132725430597, "grad_norm": 1.7625173330307007, "learning_rate": 1.815651286254074e-05, "loss": 0.5044, "step": 654 }, { "epoch": 0.22120905099628504, "grad_norm": 1.9844608306884766, "learning_rate": 1.815017947042229e-05, "loss": 0.5431, "step": 655 }, { "epoch": 0.2215467747382641, "grad_norm": 2.3415050506591797, "learning_rate": 1.8143836326207048e-05, "loss": 0.5956, "step": 656 }, { "epoch": 0.22188449848024316, "grad_norm": 2.188693046569824, "learning_rate": 1.81374834374849e-05, "loss": 0.5467, "step": 657 }, { "epoch": 0.2222222222222222, "grad_norm": 2.057687759399414, "learning_rate": 1.8131120811857398e-05, "loss": 0.5442, "step": 658 }, { "epoch": 0.22255994596420128, "grad_norm": 2.1752912998199463, "learning_rate": 1.812474845693774e-05, "loss": 0.5696, "step": 659 }, { "epoch": 0.22289766970618036, "grad_norm": 1.9735984802246094, "learning_rate": 1.8118366380350773e-05, "loss": 0.559, "step": 660 }, { "epoch": 0.2232353934481594, "grad_norm": 2.111361265182495, "learning_rate": 1.811197458973296e-05, "loss": 0.6015, "step": 661 }, { "epoch": 0.22357311719013848, "grad_norm": 2.0953822135925293, "learning_rate": 1.81055730927324e-05, "loss": 0.57, "step": 662 }, { "epoch": 0.22391084093211752, "grad_norm": 2.493661403656006, "learning_rate": 1.8099161897008807e-05, "loss": 0.5099, "step": 663 }, { "epoch": 0.2242485646740966, "grad_norm": 1.9923776388168335, "learning_rate": 1.8092741010233496e-05, "loss": 0.5716, "step": 664 }, { "epoch": 0.22458628841607564, "grad_norm": 3.465059757232666, "learning_rate": 1.8086310440089382e-05, "loss": 0.5995, "step": 665 }, { "epoch": 0.22492401215805471, "grad_norm": 2.1434879302978516, "learning_rate": 1.8079870194270958e-05, "loss": 0.5358, "step": 666 }, { "epoch": 0.22526173590003376, "grad_norm": 2.5465946197509766, "learning_rate": 1.80734202804843e-05, "loss": 0.5962, "step": 667 }, { "epoch": 0.22559945964201283, "grad_norm": 2.0590672492980957, "learning_rate": 1.806696070644706e-05, "loss": 0.5692, "step": 668 }, { "epoch": 0.2259371833839919, "grad_norm": 1.8930697441101074, "learning_rate": 1.806049147988843e-05, "loss": 0.5973, "step": 669 }, { "epoch": 0.22627490712597095, "grad_norm": 2.0130293369293213, "learning_rate": 1.8054012608549167e-05, "loss": 0.5947, "step": 670 }, { "epoch": 0.22661263086795003, "grad_norm": 1.9120235443115234, "learning_rate": 1.8047524100181567e-05, "loss": 0.5517, "step": 671 }, { "epoch": 0.22695035460992907, "grad_norm": 1.9243590831756592, "learning_rate": 1.804102596254945e-05, "loss": 0.5545, "step": 672 }, { "epoch": 0.22728807835190815, "grad_norm": 2.1282033920288086, "learning_rate": 1.8034518203428167e-05, "loss": 0.5468, "step": 673 }, { "epoch": 0.2276258020938872, "grad_norm": 1.7529751062393188, "learning_rate": 1.802800083060457e-05, "loss": 0.5608, "step": 674 }, { "epoch": 0.22796352583586627, "grad_norm": 1.8741788864135742, "learning_rate": 1.802147385187703e-05, "loss": 0.5812, "step": 675 }, { "epoch": 0.2283012495778453, "grad_norm": 1.9114259481430054, "learning_rate": 1.8014937275055393e-05, "loss": 0.5697, "step": 676 }, { "epoch": 0.22863897331982438, "grad_norm": 2.032668113708496, "learning_rate": 1.8008391107961007e-05, "loss": 0.5468, "step": 677 }, { "epoch": 0.22897669706180346, "grad_norm": 1.9070888757705688, "learning_rate": 1.8001835358426688e-05, "loss": 0.5363, "step": 678 }, { "epoch": 0.2293144208037825, "grad_norm": 2.210920572280884, "learning_rate": 1.799527003429671e-05, "loss": 0.5643, "step": 679 }, { "epoch": 0.22965214454576158, "grad_norm": 2.1543776988983154, "learning_rate": 1.798869514342682e-05, "loss": 0.5972, "step": 680 }, { "epoch": 0.22998986828774062, "grad_norm": 2.416109085083008, "learning_rate": 1.7982110693684204e-05, "loss": 0.5525, "step": 681 }, { "epoch": 0.2303275920297197, "grad_norm": 1.9231826066970825, "learning_rate": 1.7975516692947478e-05, "loss": 0.562, "step": 682 }, { "epoch": 0.23066531577169874, "grad_norm": 3.610804319381714, "learning_rate": 1.796891314910669e-05, "loss": 0.5832, "step": 683 }, { "epoch": 0.23100303951367782, "grad_norm": 2.2154481410980225, "learning_rate": 1.7962300070063325e-05, "loss": 0.5624, "step": 684 }, { "epoch": 0.23134076325565686, "grad_norm": 2.2703583240509033, "learning_rate": 1.795567746373025e-05, "loss": 0.5594, "step": 685 }, { "epoch": 0.23167848699763594, "grad_norm": 1.983102560043335, "learning_rate": 1.7949045338031744e-05, "loss": 0.5494, "step": 686 }, { "epoch": 0.232016210739615, "grad_norm": 1.9816350936889648, "learning_rate": 1.7942403700903485e-05, "loss": 0.5845, "step": 687 }, { "epoch": 0.23235393448159405, "grad_norm": 2.0253210067749023, "learning_rate": 1.793575256029252e-05, "loss": 0.5374, "step": 688 }, { "epoch": 0.23269165822357313, "grad_norm": 1.9443249702453613, "learning_rate": 1.792909192415727e-05, "loss": 0.5884, "step": 689 }, { "epoch": 0.23302938196555217, "grad_norm": 2.062180280685425, "learning_rate": 1.7922421800467515e-05, "loss": 0.5462, "step": 690 }, { "epoch": 0.23336710570753125, "grad_norm": 2.0391664505004883, "learning_rate": 1.7915742197204395e-05, "loss": 0.5344, "step": 691 }, { "epoch": 0.2337048294495103, "grad_norm": 2.072640895843506, "learning_rate": 1.790905312236039e-05, "loss": 0.5329, "step": 692 }, { "epoch": 0.23404255319148937, "grad_norm": 1.8229525089263916, "learning_rate": 1.790235458393931e-05, "loss": 0.5426, "step": 693 }, { "epoch": 0.2343802769334684, "grad_norm": 1.812156319618225, "learning_rate": 1.7895646589956294e-05, "loss": 0.5461, "step": 694 }, { "epoch": 0.23471800067544749, "grad_norm": 1.7226794958114624, "learning_rate": 1.7888929148437788e-05, "loss": 0.5629, "step": 695 }, { "epoch": 0.23505572441742653, "grad_norm": 2.147902727127075, "learning_rate": 1.7882202267421544e-05, "loss": 0.5604, "step": 696 }, { "epoch": 0.2353934481594056, "grad_norm": 1.990212321281433, "learning_rate": 1.787546595495662e-05, "loss": 0.5349, "step": 697 }, { "epoch": 0.23573117190138468, "grad_norm": 2.7894861698150635, "learning_rate": 1.7868720219103343e-05, "loss": 0.5584, "step": 698 }, { "epoch": 0.23606889564336372, "grad_norm": 2.32299542427063, "learning_rate": 1.7861965067933323e-05, "loss": 0.5546, "step": 699 }, { "epoch": 0.2364066193853428, "grad_norm": 2.0415711402893066, "learning_rate": 1.7855200509529442e-05, "loss": 0.5387, "step": 700 }, { "epoch": 0.23674434312732184, "grad_norm": 1.9649502038955688, "learning_rate": 1.7848426551985823e-05, "loss": 0.5247, "step": 701 }, { "epoch": 0.23708206686930092, "grad_norm": 1.900039792060852, "learning_rate": 1.7841643203407854e-05, "loss": 0.547, "step": 702 }, { "epoch": 0.23741979061127996, "grad_norm": 2.437091112136841, "learning_rate": 1.7834850471912142e-05, "loss": 0.5548, "step": 703 }, { "epoch": 0.23775751435325904, "grad_norm": 2.214914321899414, "learning_rate": 1.7828048365626536e-05, "loss": 0.563, "step": 704 }, { "epoch": 0.23809523809523808, "grad_norm": 1.8053598403930664, "learning_rate": 1.782123689269009e-05, "loss": 0.5538, "step": 705 }, { "epoch": 0.23843296183721716, "grad_norm": 1.962486743927002, "learning_rate": 1.7814416061253076e-05, "loss": 0.5417, "step": 706 }, { "epoch": 0.23877068557919623, "grad_norm": 2.358191728591919, "learning_rate": 1.780758587947696e-05, "loss": 0.5914, "step": 707 }, { "epoch": 0.23910840932117527, "grad_norm": 2.6066484451293945, "learning_rate": 1.780074635553439e-05, "loss": 0.538, "step": 708 }, { "epoch": 0.23944613306315435, "grad_norm": 1.9983174800872803, "learning_rate": 1.7793897497609204e-05, "loss": 0.5656, "step": 709 }, { "epoch": 0.2397838568051334, "grad_norm": 1.9660698175430298, "learning_rate": 1.77870393138964e-05, "loss": 0.533, "step": 710 }, { "epoch": 0.24012158054711247, "grad_norm": 2.108734369277954, "learning_rate": 1.778017181260214e-05, "loss": 0.5568, "step": 711 }, { "epoch": 0.2404593042890915, "grad_norm": 1.9091601371765137, "learning_rate": 1.7773295001943725e-05, "loss": 0.5498, "step": 712 }, { "epoch": 0.2407970280310706, "grad_norm": 2.802318811416626, "learning_rate": 1.776640889014961e-05, "loss": 0.5706, "step": 713 }, { "epoch": 0.24113475177304963, "grad_norm": 2.362542152404785, "learning_rate": 1.7759513485459367e-05, "loss": 0.617, "step": 714 }, { "epoch": 0.2414724755150287, "grad_norm": 2.430633544921875, "learning_rate": 1.7752608796123698e-05, "loss": 0.5508, "step": 715 }, { "epoch": 0.24181019925700778, "grad_norm": 3.053344488143921, "learning_rate": 1.77456948304044e-05, "loss": 0.605, "step": 716 }, { "epoch": 0.24214792299898683, "grad_norm": 1.9895578622817993, "learning_rate": 1.7738771596574386e-05, "loss": 0.5187, "step": 717 }, { "epoch": 0.2424856467409659, "grad_norm": 2.1944234371185303, "learning_rate": 1.7731839102917646e-05, "loss": 0.5379, "step": 718 }, { "epoch": 0.24282337048294494, "grad_norm": 4.510195732116699, "learning_rate": 1.772489735772926e-05, "loss": 0.5636, "step": 719 }, { "epoch": 0.24316109422492402, "grad_norm": 2.3052597045898438, "learning_rate": 1.7717946369315365e-05, "loss": 0.5362, "step": 720 }, { "epoch": 0.24349881796690306, "grad_norm": 2.0964791774749756, "learning_rate": 1.7710986145993174e-05, "loss": 0.5715, "step": 721 }, { "epoch": 0.24383654170888214, "grad_norm": 2.229092836380005, "learning_rate": 1.7704016696090936e-05, "loss": 0.5531, "step": 722 }, { "epoch": 0.24417426545086118, "grad_norm": 2.4715325832366943, "learning_rate": 1.769703802794795e-05, "loss": 0.6111, "step": 723 }, { "epoch": 0.24451198919284026, "grad_norm": 1.9419331550598145, "learning_rate": 1.769005014991454e-05, "loss": 0.5908, "step": 724 }, { "epoch": 0.24484971293481933, "grad_norm": 2.075488805770874, "learning_rate": 1.768305307035205e-05, "loss": 0.5705, "step": 725 }, { "epoch": 0.24518743667679838, "grad_norm": 1.9620312452316284, "learning_rate": 1.7676046797632834e-05, "loss": 0.5205, "step": 726 }, { "epoch": 0.24552516041877745, "grad_norm": 2.77907657623291, "learning_rate": 1.766903134014025e-05, "loss": 0.5656, "step": 727 }, { "epoch": 0.2458628841607565, "grad_norm": 3.5929653644561768, "learning_rate": 1.7662006706268642e-05, "loss": 0.5574, "step": 728 }, { "epoch": 0.24620060790273557, "grad_norm": 1.9514364004135132, "learning_rate": 1.765497290442334e-05, "loss": 0.5425, "step": 729 }, { "epoch": 0.24653833164471461, "grad_norm": 2.860889434814453, "learning_rate": 1.7647929943020625e-05, "loss": 0.5596, "step": 730 }, { "epoch": 0.2468760553866937, "grad_norm": 2.3512980937957764, "learning_rate": 1.7640877830487768e-05, "loss": 0.5426, "step": 731 }, { "epoch": 0.24721377912867273, "grad_norm": 2.175079822540283, "learning_rate": 1.7633816575262966e-05, "loss": 0.5349, "step": 732 }, { "epoch": 0.2475515028706518, "grad_norm": 2.5084407329559326, "learning_rate": 1.7626746185795363e-05, "loss": 0.6462, "step": 733 }, { "epoch": 0.24788922661263088, "grad_norm": 2.634982109069824, "learning_rate": 1.7619666670545034e-05, "loss": 0.5603, "step": 734 }, { "epoch": 0.24822695035460993, "grad_norm": 2.092783212661743, "learning_rate": 1.7612578037982978e-05, "loss": 0.5496, "step": 735 }, { "epoch": 0.248564674096589, "grad_norm": 2.0839903354644775, "learning_rate": 1.7605480296591092e-05, "loss": 0.524, "step": 736 }, { "epoch": 0.24890239783856805, "grad_norm": 2.1607890129089355, "learning_rate": 1.759837345486218e-05, "loss": 0.5542, "step": 737 }, { "epoch": 0.24924012158054712, "grad_norm": 1.9397990703582764, "learning_rate": 1.759125752129993e-05, "loss": 0.5565, "step": 738 }, { "epoch": 0.24957784532252617, "grad_norm": 2.0072615146636963, "learning_rate": 1.758413250441892e-05, "loss": 0.5601, "step": 739 }, { "epoch": 0.24991556906450524, "grad_norm": 2.1390388011932373, "learning_rate": 1.757699841274458e-05, "loss": 0.6155, "step": 740 }, { "epoch": 0.2502532928064843, "grad_norm": 2.973132610321045, "learning_rate": 1.756985525481321e-05, "loss": 0.5661, "step": 741 }, { "epoch": 0.25059101654846333, "grad_norm": 2.003105401992798, "learning_rate": 1.7562703039171955e-05, "loss": 0.5746, "step": 742 }, { "epoch": 0.25092874029044243, "grad_norm": 1.9357932806015015, "learning_rate": 1.7555541774378798e-05, "loss": 0.5408, "step": 743 }, { "epoch": 0.2512664640324215, "grad_norm": 2.2828307151794434, "learning_rate": 1.7548371469002553e-05, "loss": 0.5569, "step": 744 }, { "epoch": 0.2516041877744005, "grad_norm": 7.927975654602051, "learning_rate": 1.7541192131622844e-05, "loss": 0.5629, "step": 745 }, { "epoch": 0.2519419115163796, "grad_norm": 2.640950918197632, "learning_rate": 1.753400377083011e-05, "loss": 0.5512, "step": 746 }, { "epoch": 0.25227963525835867, "grad_norm": 2.1376330852508545, "learning_rate": 1.7526806395225583e-05, "loss": 0.5541, "step": 747 }, { "epoch": 0.2526173590003377, "grad_norm": 2.1680831909179688, "learning_rate": 1.7519600013421282e-05, "loss": 0.5256, "step": 748 }, { "epoch": 0.25295508274231676, "grad_norm": 1.9220272302627563, "learning_rate": 1.7512384634040002e-05, "loss": 0.537, "step": 749 }, { "epoch": 0.25329280648429586, "grad_norm": 2.273475170135498, "learning_rate": 1.7505160265715303e-05, "loss": 0.5936, "step": 750 }, { "epoch": 0.2536305302262749, "grad_norm": 2.3782095909118652, "learning_rate": 1.7497926917091508e-05, "loss": 0.5602, "step": 751 }, { "epoch": 0.25396825396825395, "grad_norm": 2.68558406829834, "learning_rate": 1.7490684596823678e-05, "loss": 0.5612, "step": 752 }, { "epoch": 0.25430597771023306, "grad_norm": 2.0827229022979736, "learning_rate": 1.7483433313577607e-05, "loss": 0.5634, "step": 753 }, { "epoch": 0.2546437014522121, "grad_norm": 2.259140729904175, "learning_rate": 1.747617307602982e-05, "loss": 0.5688, "step": 754 }, { "epoch": 0.25498142519419115, "grad_norm": 1.9068379402160645, "learning_rate": 1.7468903892867556e-05, "loss": 0.5416, "step": 755 }, { "epoch": 0.2553191489361702, "grad_norm": 1.9865871667861938, "learning_rate": 1.7461625772788755e-05, "loss": 0.572, "step": 756 }, { "epoch": 0.2556568726781493, "grad_norm": 2.187798261642456, "learning_rate": 1.745433872450205e-05, "loss": 0.5429, "step": 757 }, { "epoch": 0.25599459642012834, "grad_norm": 2.1291627883911133, "learning_rate": 1.7447042756726756e-05, "loss": 0.5415, "step": 758 }, { "epoch": 0.2563323201621074, "grad_norm": 1.705628514289856, "learning_rate": 1.7439737878192863e-05, "loss": 0.5704, "step": 759 }, { "epoch": 0.25667004390408643, "grad_norm": 1.8846163749694824, "learning_rate": 1.743242409764103e-05, "loss": 0.5715, "step": 760 }, { "epoch": 0.25700776764606553, "grad_norm": 2.149690866470337, "learning_rate": 1.7425101423822554e-05, "loss": 0.5833, "step": 761 }, { "epoch": 0.2573454913880446, "grad_norm": 1.620351791381836, "learning_rate": 1.741776986549938e-05, "loss": 0.5498, "step": 762 }, { "epoch": 0.2576832151300236, "grad_norm": 2.3813579082489014, "learning_rate": 1.741042943144409e-05, "loss": 0.5637, "step": 763 }, { "epoch": 0.2580209388720027, "grad_norm": 2.0085997581481934, "learning_rate": 1.7403080130439874e-05, "loss": 0.5409, "step": 764 }, { "epoch": 0.25835866261398177, "grad_norm": 2.028350830078125, "learning_rate": 1.739572197128054e-05, "loss": 0.5479, "step": 765 }, { "epoch": 0.2586963863559608, "grad_norm": 4.051863193511963, "learning_rate": 1.7388354962770488e-05, "loss": 0.5686, "step": 766 }, { "epoch": 0.25903411009793986, "grad_norm": 2.1607415676116943, "learning_rate": 1.7380979113724715e-05, "loss": 0.5731, "step": 767 }, { "epoch": 0.25937183383991896, "grad_norm": 2.103065013885498, "learning_rate": 1.7373594432968798e-05, "loss": 0.5643, "step": 768 }, { "epoch": 0.259709557581898, "grad_norm": 1.9516042470932007, "learning_rate": 1.736620092933887e-05, "loss": 0.5287, "step": 769 }, { "epoch": 0.26004728132387706, "grad_norm": 2.311922788619995, "learning_rate": 1.735879861168163e-05, "loss": 0.5246, "step": 770 }, { "epoch": 0.26038500506585616, "grad_norm": 1.6315172910690308, "learning_rate": 1.735138748885432e-05, "loss": 0.5493, "step": 771 }, { "epoch": 0.2607227288078352, "grad_norm": 1.9576717615127563, "learning_rate": 1.7343967569724716e-05, "loss": 0.5298, "step": 772 }, { "epoch": 0.26106045254981425, "grad_norm": 2.0451595783233643, "learning_rate": 1.733653886317113e-05, "loss": 0.568, "step": 773 }, { "epoch": 0.2613981762917933, "grad_norm": 1.8073296546936035, "learning_rate": 1.7329101378082374e-05, "loss": 0.5668, "step": 774 }, { "epoch": 0.2617359000337724, "grad_norm": 2.024836540222168, "learning_rate": 1.732165512335777e-05, "loss": 0.5324, "step": 775 }, { "epoch": 0.26207362377575144, "grad_norm": 1.864313006401062, "learning_rate": 1.731420010790713e-05, "loss": 0.5769, "step": 776 }, { "epoch": 0.2624113475177305, "grad_norm": 2.310439348220825, "learning_rate": 1.730673634065076e-05, "loss": 0.5398, "step": 777 }, { "epoch": 0.26274907125970953, "grad_norm": 2.466888904571533, "learning_rate": 1.729926383051943e-05, "loss": 0.5891, "step": 778 }, { "epoch": 0.26308679500168863, "grad_norm": 2.077031135559082, "learning_rate": 1.7291782586454367e-05, "loss": 0.543, "step": 779 }, { "epoch": 0.2634245187436677, "grad_norm": 1.9751691818237305, "learning_rate": 1.7284292617407253e-05, "loss": 0.5603, "step": 780 }, { "epoch": 0.2637622424856467, "grad_norm": 1.9815374612808228, "learning_rate": 1.727679393234022e-05, "loss": 0.5237, "step": 781 }, { "epoch": 0.2640999662276258, "grad_norm": 1.8622639179229736, "learning_rate": 1.7269286540225805e-05, "loss": 0.527, "step": 782 }, { "epoch": 0.26443768996960487, "grad_norm": 2.0941240787506104, "learning_rate": 1.7261770450046985e-05, "loss": 0.5771, "step": 783 }, { "epoch": 0.2647754137115839, "grad_norm": 1.9551018476486206, "learning_rate": 1.725424567079714e-05, "loss": 0.5666, "step": 784 }, { "epoch": 0.26511313745356296, "grad_norm": 1.983276128768921, "learning_rate": 1.7246712211480036e-05, "loss": 0.5802, "step": 785 }, { "epoch": 0.26545086119554206, "grad_norm": 2.1090171337127686, "learning_rate": 1.723917008110984e-05, "loss": 0.56, "step": 786 }, { "epoch": 0.2657885849375211, "grad_norm": 2.139714479446411, "learning_rate": 1.7231619288711083e-05, "loss": 0.5641, "step": 787 }, { "epoch": 0.26612630867950016, "grad_norm": 1.9303045272827148, "learning_rate": 1.722405984331867e-05, "loss": 0.534, "step": 788 }, { "epoch": 0.26646403242147926, "grad_norm": 2.0145158767700195, "learning_rate": 1.7216491753977856e-05, "loss": 0.5652, "step": 789 }, { "epoch": 0.2668017561634583, "grad_norm": 2.0305330753326416, "learning_rate": 1.720891502974423e-05, "loss": 0.5571, "step": 790 }, { "epoch": 0.26713947990543735, "grad_norm": 2.3499131202697754, "learning_rate": 1.7201329679683726e-05, "loss": 0.6018, "step": 791 }, { "epoch": 0.2674772036474164, "grad_norm": 2.0410754680633545, "learning_rate": 1.7193735712872598e-05, "loss": 0.6102, "step": 792 }, { "epoch": 0.2678149273893955, "grad_norm": 2.3621578216552734, "learning_rate": 1.71861331383974e-05, "loss": 0.6012, "step": 793 }, { "epoch": 0.26815265113137454, "grad_norm": 1.875656008720398, "learning_rate": 1.7178521965354992e-05, "loss": 0.549, "step": 794 }, { "epoch": 0.2684903748733536, "grad_norm": 2.0474579334259033, "learning_rate": 1.7170902202852526e-05, "loss": 0.5537, "step": 795 }, { "epoch": 0.26882809861533263, "grad_norm": 1.8746976852416992, "learning_rate": 1.7163273860007434e-05, "loss": 0.619, "step": 796 }, { "epoch": 0.26916582235731173, "grad_norm": 1.870257019996643, "learning_rate": 1.7155636945947396e-05, "loss": 0.5364, "step": 797 }, { "epoch": 0.2695035460992908, "grad_norm": 1.8943461179733276, "learning_rate": 1.714799146981037e-05, "loss": 0.5429, "step": 798 }, { "epoch": 0.2698412698412698, "grad_norm": 2.491217613220215, "learning_rate": 1.7140337440744545e-05, "loss": 0.5748, "step": 799 }, { "epoch": 0.2701789935832489, "grad_norm": 1.9062799215316772, "learning_rate": 1.7132674867908354e-05, "loss": 0.524, "step": 800 }, { "epoch": 0.270516717325228, "grad_norm": 2.1320526599884033, "learning_rate": 1.7125003760470447e-05, "loss": 0.578, "step": 801 }, { "epoch": 0.270854441067207, "grad_norm": 2.1500496864318848, "learning_rate": 1.7117324127609686e-05, "loss": 0.5281, "step": 802 }, { "epoch": 0.27119216480918606, "grad_norm": 2.2589471340179443, "learning_rate": 1.7109635978515133e-05, "loss": 0.5682, "step": 803 }, { "epoch": 0.27152988855116517, "grad_norm": 2.184072256088257, "learning_rate": 1.710193932238605e-05, "loss": 0.5641, "step": 804 }, { "epoch": 0.2718676122931442, "grad_norm": 2.561239719390869, "learning_rate": 1.709423416843186e-05, "loss": 0.5349, "step": 805 }, { "epoch": 0.27220533603512326, "grad_norm": 2.0771517753601074, "learning_rate": 1.7086520525872173e-05, "loss": 0.5402, "step": 806 }, { "epoch": 0.2725430597771023, "grad_norm": 1.8809479475021362, "learning_rate": 1.7078798403936742e-05, "loss": 0.5401, "step": 807 }, { "epoch": 0.2728807835190814, "grad_norm": 2.149000644683838, "learning_rate": 1.7071067811865477e-05, "loss": 0.5438, "step": 808 }, { "epoch": 0.27321850726106045, "grad_norm": 2.1849277019500732, "learning_rate": 1.706332875890841e-05, "loss": 0.5651, "step": 809 }, { "epoch": 0.2735562310030395, "grad_norm": 2.6675844192504883, "learning_rate": 1.7055581254325716e-05, "loss": 0.5498, "step": 810 }, { "epoch": 0.2738939547450186, "grad_norm": 2.075859308242798, "learning_rate": 1.7047825307387658e-05, "loss": 0.5355, "step": 811 }, { "epoch": 0.27423167848699764, "grad_norm": 2.0117979049682617, "learning_rate": 1.7040060927374626e-05, "loss": 0.5172, "step": 812 }, { "epoch": 0.2745694022289767, "grad_norm": 2.0425467491149902, "learning_rate": 1.7032288123577076e-05, "loss": 0.5366, "step": 813 }, { "epoch": 0.27490712597095573, "grad_norm": 1.7846869230270386, "learning_rate": 1.7024506905295566e-05, "loss": 0.5147, "step": 814 }, { "epoch": 0.27524484971293484, "grad_norm": 1.8894165754318237, "learning_rate": 1.7016717281840705e-05, "loss": 0.5327, "step": 815 }, { "epoch": 0.2755825734549139, "grad_norm": 2.217681407928467, "learning_rate": 1.7008919262533174e-05, "loss": 0.5408, "step": 816 }, { "epoch": 0.2759202971968929, "grad_norm": 1.9166340827941895, "learning_rate": 1.700111285670369e-05, "loss": 0.5335, "step": 817 }, { "epoch": 0.27625802093887203, "grad_norm": 1.8104338645935059, "learning_rate": 1.6993298073693005e-05, "loss": 0.5065, "step": 818 }, { "epoch": 0.2765957446808511, "grad_norm": 2.4260857105255127, "learning_rate": 1.6985474922851897e-05, "loss": 0.5455, "step": 819 }, { "epoch": 0.2769334684228301, "grad_norm": 2.1530063152313232, "learning_rate": 1.6977643413541156e-05, "loss": 0.5236, "step": 820 }, { "epoch": 0.27727119216480917, "grad_norm": 2.1060426235198975, "learning_rate": 1.696980355513158e-05, "loss": 0.5239, "step": 821 }, { "epoch": 0.27760891590678827, "grad_norm": 2.3658926486968994, "learning_rate": 1.6961955357003948e-05, "loss": 0.566, "step": 822 }, { "epoch": 0.2779466396487673, "grad_norm": 2.234330892562866, "learning_rate": 1.6954098828549013e-05, "loss": 0.5353, "step": 823 }, { "epoch": 0.27828436339074636, "grad_norm": 1.9442986249923706, "learning_rate": 1.6946233979167516e-05, "loss": 0.552, "step": 824 }, { "epoch": 0.2786220871327254, "grad_norm": 1.682684302330017, "learning_rate": 1.693836081827014e-05, "loss": 0.523, "step": 825 }, { "epoch": 0.2789598108747045, "grad_norm": 2.038027763366699, "learning_rate": 1.693047935527751e-05, "loss": 0.5411, "step": 826 }, { "epoch": 0.27929753461668355, "grad_norm": 3.1663737297058105, "learning_rate": 1.6922589599620193e-05, "loss": 0.5544, "step": 827 }, { "epoch": 0.2796352583586626, "grad_norm": 1.9813878536224365, "learning_rate": 1.6914691560738675e-05, "loss": 0.5802, "step": 828 }, { "epoch": 0.2799729821006417, "grad_norm": 2.2006947994232178, "learning_rate": 1.6906785248083354e-05, "loss": 0.6031, "step": 829 }, { "epoch": 0.28031070584262074, "grad_norm": 2.1339051723480225, "learning_rate": 1.6898870671114527e-05, "loss": 0.5785, "step": 830 }, { "epoch": 0.2806484295845998, "grad_norm": 1.9553354978561401, "learning_rate": 1.6890947839302382e-05, "loss": 0.5952, "step": 831 }, { "epoch": 0.28098615332657884, "grad_norm": 2.1120100021362305, "learning_rate": 1.6883016762126986e-05, "loss": 0.5384, "step": 832 }, { "epoch": 0.28132387706855794, "grad_norm": 1.8826038837432861, "learning_rate": 1.6875077449078262e-05, "loss": 0.5375, "step": 833 }, { "epoch": 0.281661600810537, "grad_norm": 1.7627432346343994, "learning_rate": 1.6867129909656e-05, "loss": 0.5583, "step": 834 }, { "epoch": 0.28199932455251603, "grad_norm": 1.9535682201385498, "learning_rate": 1.685917415336982e-05, "loss": 0.5892, "step": 835 }, { "epoch": 0.28233704829449513, "grad_norm": 2.0625839233398438, "learning_rate": 1.6851210189739195e-05, "loss": 0.5467, "step": 836 }, { "epoch": 0.2826747720364742, "grad_norm": 2.045077085494995, "learning_rate": 1.6843238028293396e-05, "loss": 0.5461, "step": 837 }, { "epoch": 0.2830124957784532, "grad_norm": 1.893406867980957, "learning_rate": 1.6835257678571515e-05, "loss": 0.5578, "step": 838 }, { "epoch": 0.28335021952043227, "grad_norm": 1.8902945518493652, "learning_rate": 1.6827269150122437e-05, "loss": 0.5794, "step": 839 }, { "epoch": 0.28368794326241137, "grad_norm": 1.8044627904891968, "learning_rate": 1.681927245250484e-05, "loss": 0.558, "step": 840 }, { "epoch": 0.2840256670043904, "grad_norm": 1.856358528137207, "learning_rate": 1.6811267595287165e-05, "loss": 0.5371, "step": 841 }, { "epoch": 0.28436339074636946, "grad_norm": 1.9735748767852783, "learning_rate": 1.680325458804763e-05, "loss": 0.5621, "step": 842 }, { "epoch": 0.2847011144883485, "grad_norm": 2.697091579437256, "learning_rate": 1.6795233440374193e-05, "loss": 0.5823, "step": 843 }, { "epoch": 0.2850388382303276, "grad_norm": 2.007364511489868, "learning_rate": 1.6787204161864562e-05, "loss": 0.5667, "step": 844 }, { "epoch": 0.28537656197230665, "grad_norm": 2.0994889736175537, "learning_rate": 1.677916676212617e-05, "loss": 0.5034, "step": 845 }, { "epoch": 0.2857142857142857, "grad_norm": 1.878923773765564, "learning_rate": 1.6771121250776163e-05, "loss": 0.5525, "step": 846 }, { "epoch": 0.2860520094562648, "grad_norm": 1.6664432287216187, "learning_rate": 1.67630676374414e-05, "loss": 0.5263, "step": 847 }, { "epoch": 0.28638973319824385, "grad_norm": 1.8380390405654907, "learning_rate": 1.675500593175843e-05, "loss": 0.5497, "step": 848 }, { "epoch": 0.2867274569402229, "grad_norm": 2.0949866771698, "learning_rate": 1.6746936143373488e-05, "loss": 0.5746, "step": 849 }, { "epoch": 0.28706518068220194, "grad_norm": 1.9844456911087036, "learning_rate": 1.6738858281942477e-05, "loss": 0.5681, "step": 850 }, { "epoch": 0.28740290442418104, "grad_norm": 1.8695605993270874, "learning_rate": 1.673077235713097e-05, "loss": 0.5786, "step": 851 }, { "epoch": 0.2877406281661601, "grad_norm": 3.7167856693267822, "learning_rate": 1.6722678378614164e-05, "loss": 0.551, "step": 852 }, { "epoch": 0.28807835190813913, "grad_norm": 1.9023866653442383, "learning_rate": 1.6714576356076925e-05, "loss": 0.5507, "step": 853 }, { "epoch": 0.28841607565011823, "grad_norm": 1.9262011051177979, "learning_rate": 1.6706466299213718e-05, "loss": 0.5345, "step": 854 }, { "epoch": 0.2887537993920973, "grad_norm": 1.9931910037994385, "learning_rate": 1.669834821772863e-05, "loss": 0.555, "step": 855 }, { "epoch": 0.2890915231340763, "grad_norm": 1.8690621852874756, "learning_rate": 1.6690222121335357e-05, "loss": 0.5392, "step": 856 }, { "epoch": 0.28942924687605537, "grad_norm": 1.6896865367889404, "learning_rate": 1.6682088019757174e-05, "loss": 0.5252, "step": 857 }, { "epoch": 0.28976697061803447, "grad_norm": 1.8050403594970703, "learning_rate": 1.6673945922726945e-05, "loss": 0.5436, "step": 858 }, { "epoch": 0.2901046943600135, "grad_norm": 1.7588163614273071, "learning_rate": 1.6665795839987092e-05, "loss": 0.4989, "step": 859 }, { "epoch": 0.29044241810199256, "grad_norm": 2.146669387817383, "learning_rate": 1.6657637781289596e-05, "loss": 0.5976, "step": 860 }, { "epoch": 0.2907801418439716, "grad_norm": 1.7566490173339844, "learning_rate": 1.6649471756395985e-05, "loss": 0.5575, "step": 861 }, { "epoch": 0.2911178655859507, "grad_norm": 8.24329948425293, "learning_rate": 1.6641297775077313e-05, "loss": 0.5205, "step": 862 }, { "epoch": 0.29145558932792975, "grad_norm": 2.20756196975708, "learning_rate": 1.6633115847114157e-05, "loss": 0.5805, "step": 863 }, { "epoch": 0.2917933130699088, "grad_norm": 2.0174927711486816, "learning_rate": 1.66249259822966e-05, "loss": 0.5557, "step": 864 }, { "epoch": 0.2921310368118879, "grad_norm": 1.8469337224960327, "learning_rate": 1.6616728190424228e-05, "loss": 0.5229, "step": 865 }, { "epoch": 0.29246876055386695, "grad_norm": 7.932276725769043, "learning_rate": 1.660852248130611e-05, "loss": 0.5524, "step": 866 }, { "epoch": 0.292806484295846, "grad_norm": 2.0979068279266357, "learning_rate": 1.660030886476078e-05, "loss": 0.5772, "step": 867 }, { "epoch": 0.29314420803782504, "grad_norm": 2.3202691078186035, "learning_rate": 1.6592087350616245e-05, "loss": 0.5708, "step": 868 }, { "epoch": 0.29348193177980414, "grad_norm": 23.089902877807617, "learning_rate": 1.6583857948709957e-05, "loss": 0.535, "step": 869 }, { "epoch": 0.2938196555217832, "grad_norm": 1.8671787977218628, "learning_rate": 1.6575620668888812e-05, "loss": 0.5518, "step": 870 }, { "epoch": 0.29415737926376223, "grad_norm": 1.6629315614700317, "learning_rate": 1.6567375521009114e-05, "loss": 0.5132, "step": 871 }, { "epoch": 0.2944951030057413, "grad_norm": 2.541193962097168, "learning_rate": 1.6559122514936606e-05, "loss": 0.5657, "step": 872 }, { "epoch": 0.2948328267477204, "grad_norm": 2.1817448139190674, "learning_rate": 1.6550861660546417e-05, "loss": 0.5271, "step": 873 }, { "epoch": 0.2951705504896994, "grad_norm": 2.1123411655426025, "learning_rate": 1.6542592967723065e-05, "loss": 0.5267, "step": 874 }, { "epoch": 0.29550827423167847, "grad_norm": 2.5140247344970703, "learning_rate": 1.6534316446360467e-05, "loss": 0.6375, "step": 875 }, { "epoch": 0.29584599797365757, "grad_norm": 1.9025087356567383, "learning_rate": 1.6526032106361888e-05, "loss": 0.5638, "step": 876 }, { "epoch": 0.2961837217156366, "grad_norm": 2.834843635559082, "learning_rate": 1.6517739957639952e-05, "loss": 0.5511, "step": 877 }, { "epoch": 0.29652144545761566, "grad_norm": 2.1340105533599854, "learning_rate": 1.6509440010116634e-05, "loss": 0.5478, "step": 878 }, { "epoch": 0.2968591691995947, "grad_norm": 2.2608306407928467, "learning_rate": 1.650113227372323e-05, "loss": 0.5273, "step": 879 }, { "epoch": 0.2971968929415738, "grad_norm": 2.3321566581726074, "learning_rate": 1.649281675840037e-05, "loss": 0.527, "step": 880 }, { "epoch": 0.29753461668355285, "grad_norm": 2.173710823059082, "learning_rate": 1.6484493474097976e-05, "loss": 0.5391, "step": 881 }, { "epoch": 0.2978723404255319, "grad_norm": 2.2197515964508057, "learning_rate": 1.6476162430775278e-05, "loss": 0.5737, "step": 882 }, { "epoch": 0.298210064167511, "grad_norm": 1.9948723316192627, "learning_rate": 1.6467823638400782e-05, "loss": 0.5689, "step": 883 }, { "epoch": 0.29854778790949005, "grad_norm": 1.8222283124923706, "learning_rate": 1.645947710695227e-05, "loss": 0.5622, "step": 884 }, { "epoch": 0.2988855116514691, "grad_norm": 2.2053563594818115, "learning_rate": 1.6451122846416784e-05, "loss": 0.5581, "step": 885 }, { "epoch": 0.29922323539344814, "grad_norm": 2.4677324295043945, "learning_rate": 1.6442760866790616e-05, "loss": 0.5937, "step": 886 }, { "epoch": 0.29956095913542724, "grad_norm": 2.1852009296417236, "learning_rate": 1.643439117807929e-05, "loss": 0.5532, "step": 887 }, { "epoch": 0.2998986828774063, "grad_norm": 1.8570302724838257, "learning_rate": 1.6426013790297556e-05, "loss": 0.5324, "step": 888 }, { "epoch": 0.30023640661938533, "grad_norm": 1.907383918762207, "learning_rate": 1.641762871346938e-05, "loss": 0.5578, "step": 889 }, { "epoch": 0.3005741303613644, "grad_norm": 2.0898120403289795, "learning_rate": 1.6409235957627926e-05, "loss": 0.5745, "step": 890 }, { "epoch": 0.3009118541033435, "grad_norm": 1.9215843677520752, "learning_rate": 1.6400835532815538e-05, "loss": 0.5206, "step": 891 }, { "epoch": 0.3012495778453225, "grad_norm": 2.296461582183838, "learning_rate": 1.639242744908375e-05, "loss": 0.5736, "step": 892 }, { "epoch": 0.30158730158730157, "grad_norm": 2.035682201385498, "learning_rate": 1.6384011716493255e-05, "loss": 0.5437, "step": 893 }, { "epoch": 0.30192502532928067, "grad_norm": 1.8587098121643066, "learning_rate": 1.6375588345113895e-05, "loss": 0.553, "step": 894 }, { "epoch": 0.3022627490712597, "grad_norm": 2.000211000442505, "learning_rate": 1.6367157345024655e-05, "loss": 0.5778, "step": 895 }, { "epoch": 0.30260047281323876, "grad_norm": 2.4121766090393066, "learning_rate": 1.6358718726313645e-05, "loss": 0.5339, "step": 896 }, { "epoch": 0.3029381965552178, "grad_norm": 2.024103879928589, "learning_rate": 1.6350272499078096e-05, "loss": 0.5781, "step": 897 }, { "epoch": 0.3032759202971969, "grad_norm": 1.9172406196594238, "learning_rate": 1.6341818673424342e-05, "loss": 0.5321, "step": 898 }, { "epoch": 0.30361364403917596, "grad_norm": 1.7518633604049683, "learning_rate": 1.633335725946781e-05, "loss": 0.5836, "step": 899 }, { "epoch": 0.303951367781155, "grad_norm": 2.3801729679107666, "learning_rate": 1.6324888267332998e-05, "loss": 0.548, "step": 900 }, { "epoch": 0.3042890915231341, "grad_norm": 1.9989526271820068, "learning_rate": 1.6316411707153478e-05, "loss": 0.5479, "step": 901 }, { "epoch": 0.30462681526511315, "grad_norm": 1.9508386850357056, "learning_rate": 1.630792758907189e-05, "loss": 0.5618, "step": 902 }, { "epoch": 0.3049645390070922, "grad_norm": 2.0251471996307373, "learning_rate": 1.6299435923239886e-05, "loss": 0.5582, "step": 903 }, { "epoch": 0.30530226274907124, "grad_norm": 2.0017852783203125, "learning_rate": 1.6290936719818182e-05, "loss": 0.5301, "step": 904 }, { "epoch": 0.30563998649105034, "grad_norm": 2.105952501296997, "learning_rate": 1.6282429988976497e-05, "loss": 0.5623, "step": 905 }, { "epoch": 0.3059777102330294, "grad_norm": 1.912850022315979, "learning_rate": 1.6273915740893557e-05, "loss": 0.5654, "step": 906 }, { "epoch": 0.30631543397500843, "grad_norm": 1.734071135520935, "learning_rate": 1.6265393985757082e-05, "loss": 0.5331, "step": 907 }, { "epoch": 0.3066531577169875, "grad_norm": 3.3579728603363037, "learning_rate": 1.6256864733763787e-05, "loss": 0.5586, "step": 908 }, { "epoch": 0.3069908814589666, "grad_norm": 2.7184321880340576, "learning_rate": 1.6248327995119337e-05, "loss": 0.5512, "step": 909 }, { "epoch": 0.3073286052009456, "grad_norm": 1.9222571849822998, "learning_rate": 1.6239783780038374e-05, "loss": 0.558, "step": 910 }, { "epoch": 0.30766632894292467, "grad_norm": 2.039170503616333, "learning_rate": 1.6231232098744477e-05, "loss": 0.5714, "step": 911 }, { "epoch": 0.3080040526849038, "grad_norm": 2.0233869552612305, "learning_rate": 1.6222672961470158e-05, "loss": 0.5231, "step": 912 }, { "epoch": 0.3083417764268828, "grad_norm": 1.9124637842178345, "learning_rate": 1.621410637845685e-05, "loss": 0.5222, "step": 913 }, { "epoch": 0.30867950016886186, "grad_norm": 2.7945308685302734, "learning_rate": 1.6205532359954905e-05, "loss": 0.5499, "step": 914 }, { "epoch": 0.3090172239108409, "grad_norm": 2.471774101257324, "learning_rate": 1.6196950916223553e-05, "loss": 0.5742, "step": 915 }, { "epoch": 0.30935494765282, "grad_norm": 2.5388102531433105, "learning_rate": 1.618836205753093e-05, "loss": 0.5114, "step": 916 }, { "epoch": 0.30969267139479906, "grad_norm": 1.9021186828613281, "learning_rate": 1.6179765794154034e-05, "loss": 0.5657, "step": 917 }, { "epoch": 0.3100303951367781, "grad_norm": 1.7403775453567505, "learning_rate": 1.6171162136378716e-05, "loss": 0.5918, "step": 918 }, { "epoch": 0.3103681188787572, "grad_norm": 2.489072561264038, "learning_rate": 1.6162551094499684e-05, "loss": 0.6043, "step": 919 }, { "epoch": 0.31070584262073625, "grad_norm": 1.9061717987060547, "learning_rate": 1.6153932678820487e-05, "loss": 0.5406, "step": 920 }, { "epoch": 0.3110435663627153, "grad_norm": 2.8631253242492676, "learning_rate": 1.6145306899653482e-05, "loss": 0.5509, "step": 921 }, { "epoch": 0.31138129010469434, "grad_norm": 2.252854585647583, "learning_rate": 1.6136673767319853e-05, "loss": 0.58, "step": 922 }, { "epoch": 0.31171901384667344, "grad_norm": 2.095470905303955, "learning_rate": 1.6128033292149564e-05, "loss": 0.5228, "step": 923 }, { "epoch": 0.3120567375886525, "grad_norm": 2.227548599243164, "learning_rate": 1.611938548448138e-05, "loss": 0.5885, "step": 924 }, { "epoch": 0.31239446133063153, "grad_norm": 1.765621542930603, "learning_rate": 1.6110730354662836e-05, "loss": 0.576, "step": 925 }, { "epoch": 0.3127321850726106, "grad_norm": 1.888337254524231, "learning_rate": 1.6102067913050227e-05, "loss": 0.5729, "step": 926 }, { "epoch": 0.3130699088145897, "grad_norm": 2.0745630264282227, "learning_rate": 1.609339817000859e-05, "loss": 0.5688, "step": 927 }, { "epoch": 0.3134076325565687, "grad_norm": 1.7210081815719604, "learning_rate": 1.6084721135911715e-05, "loss": 0.5198, "step": 928 }, { "epoch": 0.3137453562985478, "grad_norm": 2.0701916217803955, "learning_rate": 1.6076036821142106e-05, "loss": 0.5671, "step": 929 }, { "epoch": 0.3140830800405269, "grad_norm": 1.6981650590896606, "learning_rate": 1.606734523609097e-05, "loss": 0.5499, "step": 930 }, { "epoch": 0.3144208037825059, "grad_norm": 1.989798903465271, "learning_rate": 1.605864639115823e-05, "loss": 0.5218, "step": 931 }, { "epoch": 0.31475852752448497, "grad_norm": 2.05900502204895, "learning_rate": 1.604994029675249e-05, "loss": 0.5268, "step": 932 }, { "epoch": 0.315096251266464, "grad_norm": 2.205880880355835, "learning_rate": 1.6041226963291024e-05, "loss": 0.5345, "step": 933 }, { "epoch": 0.3154339750084431, "grad_norm": 2.0170722007751465, "learning_rate": 1.603250640119977e-05, "loss": 0.5371, "step": 934 }, { "epoch": 0.31577169875042216, "grad_norm": 1.995833396911621, "learning_rate": 1.6023778620913315e-05, "loss": 0.5757, "step": 935 }, { "epoch": 0.3161094224924012, "grad_norm": 1.8636432886123657, "learning_rate": 1.601504363287489e-05, "loss": 0.558, "step": 936 }, { "epoch": 0.31644714623438025, "grad_norm": 2.2682371139526367, "learning_rate": 1.6006301447536338e-05, "loss": 0.5609, "step": 937 }, { "epoch": 0.31678486997635935, "grad_norm": 2.110774278640747, "learning_rate": 1.5997552075358122e-05, "loss": 0.5365, "step": 938 }, { "epoch": 0.3171225937183384, "grad_norm": 2.0555100440979004, "learning_rate": 1.59887955268093e-05, "loss": 0.5399, "step": 939 }, { "epoch": 0.31746031746031744, "grad_norm": 2.401752471923828, "learning_rate": 1.598003181236753e-05, "loss": 0.5261, "step": 940 }, { "epoch": 0.31779804120229654, "grad_norm": 2.1245505809783936, "learning_rate": 1.5971260942519024e-05, "loss": 0.5821, "step": 941 }, { "epoch": 0.3181357649442756, "grad_norm": 2.3574609756469727, "learning_rate": 1.5962482927758568e-05, "loss": 0.5572, "step": 942 }, { "epoch": 0.31847348868625464, "grad_norm": 1.7971746921539307, "learning_rate": 1.5953697778589495e-05, "loss": 0.5545, "step": 943 }, { "epoch": 0.3188112124282337, "grad_norm": 1.8986175060272217, "learning_rate": 1.5944905505523677e-05, "loss": 0.565, "step": 944 }, { "epoch": 0.3191489361702128, "grad_norm": 2.102036714553833, "learning_rate": 1.5936106119081507e-05, "loss": 0.5691, "step": 945 }, { "epoch": 0.31948665991219183, "grad_norm": 2.181473970413208, "learning_rate": 1.592729962979189e-05, "loss": 0.5739, "step": 946 }, { "epoch": 0.3198243836541709, "grad_norm": 2.2344422340393066, "learning_rate": 1.591848604819223e-05, "loss": 0.5388, "step": 947 }, { "epoch": 0.32016210739615, "grad_norm": 2.011967658996582, "learning_rate": 1.590966538482842e-05, "loss": 0.5647, "step": 948 }, { "epoch": 0.320499831138129, "grad_norm": 2.657277822494507, "learning_rate": 1.590083765025482e-05, "loss": 0.5602, "step": 949 }, { "epoch": 0.32083755488010807, "grad_norm": 2.1366896629333496, "learning_rate": 1.589200285503426e-05, "loss": 0.5441, "step": 950 }, { "epoch": 0.3211752786220871, "grad_norm": 2.0592899322509766, "learning_rate": 1.5883161009738003e-05, "loss": 0.5459, "step": 951 }, { "epoch": 0.3215130023640662, "grad_norm": 1.9516855478286743, "learning_rate": 1.5874312124945773e-05, "loss": 0.5765, "step": 952 }, { "epoch": 0.32185072610604526, "grad_norm": 2.0138096809387207, "learning_rate": 1.58654562112457e-05, "loss": 0.5583, "step": 953 }, { "epoch": 0.3221884498480243, "grad_norm": 2.1654369831085205, "learning_rate": 1.585659327923432e-05, "loss": 0.5551, "step": 954 }, { "epoch": 0.32252617359000335, "grad_norm": 2.0132269859313965, "learning_rate": 1.5847723339516572e-05, "loss": 0.5191, "step": 955 }, { "epoch": 0.32286389733198245, "grad_norm": 3.0496742725372314, "learning_rate": 1.5838846402705793e-05, "loss": 0.5698, "step": 956 }, { "epoch": 0.3232016210739615, "grad_norm": 2.1517014503479004, "learning_rate": 1.5829962479423672e-05, "loss": 0.5519, "step": 957 }, { "epoch": 0.32353934481594054, "grad_norm": 1.9310775995254517, "learning_rate": 1.582107158030027e-05, "loss": 0.5436, "step": 958 }, { "epoch": 0.32387706855791965, "grad_norm": 2.1348989009857178, "learning_rate": 1.5812173715973996e-05, "loss": 0.528, "step": 959 }, { "epoch": 0.3242147922998987, "grad_norm": 1.9971834421157837, "learning_rate": 1.5803268897091582e-05, "loss": 0.5545, "step": 960 }, { "epoch": 0.32455251604187774, "grad_norm": 2.0893332958221436, "learning_rate": 1.579435713430809e-05, "loss": 0.5848, "step": 961 }, { "epoch": 0.3248902397838568, "grad_norm": 1.9694589376449585, "learning_rate": 1.5785438438286892e-05, "loss": 0.5456, "step": 962 }, { "epoch": 0.3252279635258359, "grad_norm": 2.1733498573303223, "learning_rate": 1.5776512819699655e-05, "loss": 0.5756, "step": 963 }, { "epoch": 0.32556568726781493, "grad_norm": 1.9733469486236572, "learning_rate": 1.576758028922632e-05, "loss": 0.5707, "step": 964 }, { "epoch": 0.325903411009794, "grad_norm": 2.1752002239227295, "learning_rate": 1.5758640857555114e-05, "loss": 0.5606, "step": 965 }, { "epoch": 0.3262411347517731, "grad_norm": 1.9969631433486938, "learning_rate": 1.574969453538251e-05, "loss": 0.547, "step": 966 }, { "epoch": 0.3265788584937521, "grad_norm": 1.982004165649414, "learning_rate": 1.5740741333413227e-05, "loss": 0.5722, "step": 967 }, { "epoch": 0.32691658223573117, "grad_norm": 1.9773539304733276, "learning_rate": 1.573178126236022e-05, "loss": 0.4881, "step": 968 }, { "epoch": 0.3272543059777102, "grad_norm": 1.9100091457366943, "learning_rate": 1.5722814332944663e-05, "loss": 0.5411, "step": 969 }, { "epoch": 0.3275920297196893, "grad_norm": 1.8979759216308594, "learning_rate": 1.5713840555895937e-05, "loss": 0.5411, "step": 970 }, { "epoch": 0.32792975346166836, "grad_norm": 1.975938081741333, "learning_rate": 1.5704859941951606e-05, "loss": 0.5394, "step": 971 }, { "epoch": 0.3282674772036474, "grad_norm": 1.8891639709472656, "learning_rate": 1.569587250185743e-05, "loss": 0.5645, "step": 972 }, { "epoch": 0.32860520094562645, "grad_norm": 2.0327651500701904, "learning_rate": 1.568687824636733e-05, "loss": 0.5413, "step": 973 }, { "epoch": 0.32894292468760555, "grad_norm": 2.0728039741516113, "learning_rate": 1.567787718624338e-05, "loss": 0.5521, "step": 974 }, { "epoch": 0.3292806484295846, "grad_norm": 1.8695573806762695, "learning_rate": 1.5668869332255796e-05, "loss": 0.5172, "step": 975 }, { "epoch": 0.32961837217156364, "grad_norm": 1.8486765623092651, "learning_rate": 1.5659854695182928e-05, "loss": 0.563, "step": 976 }, { "epoch": 0.32995609591354275, "grad_norm": 1.7631970643997192, "learning_rate": 1.565083328581124e-05, "loss": 0.5069, "step": 977 }, { "epoch": 0.3302938196555218, "grad_norm": 1.7457785606384277, "learning_rate": 1.5641805114935297e-05, "loss": 0.5387, "step": 978 }, { "epoch": 0.33063154339750084, "grad_norm": 2.243251085281372, "learning_rate": 1.563277019335776e-05, "loss": 0.5278, "step": 979 }, { "epoch": 0.3309692671394799, "grad_norm": 1.8760446310043335, "learning_rate": 1.5623728531889356e-05, "loss": 0.5581, "step": 980 }, { "epoch": 0.331306990881459, "grad_norm": 1.9527519941329956, "learning_rate": 1.5614680141348894e-05, "loss": 0.5589, "step": 981 }, { "epoch": 0.33164471462343803, "grad_norm": 1.8897161483764648, "learning_rate": 1.560562503256322e-05, "loss": 0.5741, "step": 982 }, { "epoch": 0.3319824383654171, "grad_norm": 2.006704807281494, "learning_rate": 1.5596563216367222e-05, "loss": 0.559, "step": 983 }, { "epoch": 0.3323201621073962, "grad_norm": 2.1624255180358887, "learning_rate": 1.558749470360382e-05, "loss": 0.5369, "step": 984 }, { "epoch": 0.3326578858493752, "grad_norm": 1.8806288242340088, "learning_rate": 1.557841950512394e-05, "loss": 0.554, "step": 985 }, { "epoch": 0.33299560959135427, "grad_norm": 1.740493655204773, "learning_rate": 1.556933763178651e-05, "loss": 0.5255, "step": 986 }, { "epoch": 0.3333333333333333, "grad_norm": 1.8992735147476196, "learning_rate": 1.556024909445845e-05, "loss": 0.5361, "step": 987 }, { "epoch": 0.3336710570753124, "grad_norm": 2.092799186706543, "learning_rate": 1.5551153904014645e-05, "loss": 0.577, "step": 988 }, { "epoch": 0.33400878081729146, "grad_norm": 2.0387206077575684, "learning_rate": 1.5542052071337942e-05, "loss": 0.5665, "step": 989 }, { "epoch": 0.3343465045592705, "grad_norm": 1.9928076267242432, "learning_rate": 1.5532943607319143e-05, "loss": 0.5657, "step": 990 }, { "epoch": 0.33468422830124955, "grad_norm": 1.9512214660644531, "learning_rate": 1.5523828522856974e-05, "loss": 0.5671, "step": 991 }, { "epoch": 0.33502195204322865, "grad_norm": 1.924033284187317, "learning_rate": 1.5514706828858096e-05, "loss": 0.5277, "step": 992 }, { "epoch": 0.3353596757852077, "grad_norm": 1.9554240703582764, "learning_rate": 1.5505578536237066e-05, "loss": 0.5958, "step": 993 }, { "epoch": 0.33569739952718675, "grad_norm": 1.994682788848877, "learning_rate": 1.5496443655916348e-05, "loss": 0.607, "step": 994 }, { "epoch": 0.33603512326916585, "grad_norm": 1.7074952125549316, "learning_rate": 1.5487302198826275e-05, "loss": 0.5563, "step": 995 }, { "epoch": 0.3363728470111449, "grad_norm": 1.9343167543411255, "learning_rate": 1.547815417590506e-05, "loss": 0.5619, "step": 996 }, { "epoch": 0.33671057075312394, "grad_norm": 2.0293781757354736, "learning_rate": 1.5468999598098778e-05, "loss": 0.5456, "step": 997 }, { "epoch": 0.337048294495103, "grad_norm": 1.866337537765503, "learning_rate": 1.5459838476361326e-05, "loss": 0.5328, "step": 998 }, { "epoch": 0.3373860182370821, "grad_norm": 1.7083992958068848, "learning_rate": 1.5450670821654447e-05, "loss": 0.5181, "step": 999 }, { "epoch": 0.33772374197906113, "grad_norm": 2.045197010040283, "learning_rate": 1.54414966449477e-05, "loss": 0.5551, "step": 1000 }, { "epoch": 0.3380614657210402, "grad_norm": 1.839285135269165, "learning_rate": 1.5432315957218445e-05, "loss": 0.4727, "step": 1001 }, { "epoch": 0.3383991894630192, "grad_norm": 2.0535049438476562, "learning_rate": 1.5423128769451832e-05, "loss": 0.5915, "step": 1002 }, { "epoch": 0.3387369132049983, "grad_norm": 1.9686163663864136, "learning_rate": 1.5413935092640794e-05, "loss": 0.5477, "step": 1003 }, { "epoch": 0.33907463694697737, "grad_norm": 1.9144943952560425, "learning_rate": 1.5404734937786017e-05, "loss": 0.5339, "step": 1004 }, { "epoch": 0.3394123606889564, "grad_norm": 1.921003818511963, "learning_rate": 1.5395528315895956e-05, "loss": 0.5575, "step": 1005 }, { "epoch": 0.3397500844309355, "grad_norm": 1.7055584192276, "learning_rate": 1.5386315237986785e-05, "loss": 0.5575, "step": 1006 }, { "epoch": 0.34008780817291456, "grad_norm": 2.023861885070801, "learning_rate": 1.5377095715082414e-05, "loss": 0.579, "step": 1007 }, { "epoch": 0.3404255319148936, "grad_norm": 1.9970475435256958, "learning_rate": 1.5367869758214466e-05, "loss": 0.5325, "step": 1008 }, { "epoch": 0.34076325565687265, "grad_norm": 1.8769994974136353, "learning_rate": 1.5358637378422256e-05, "loss": 0.5454, "step": 1009 }, { "epoch": 0.34110097939885176, "grad_norm": 1.790443778038025, "learning_rate": 1.5349398586752794e-05, "loss": 0.5279, "step": 1010 }, { "epoch": 0.3414387031408308, "grad_norm": 2.0113980770111084, "learning_rate": 1.5340153394260742e-05, "loss": 0.5407, "step": 1011 }, { "epoch": 0.34177642688280985, "grad_norm": 1.903141975402832, "learning_rate": 1.533090181200845e-05, "loss": 0.5516, "step": 1012 }, { "epoch": 0.34211415062478895, "grad_norm": 1.8822559118270874, "learning_rate": 1.5321643851065885e-05, "loss": 0.5777, "step": 1013 }, { "epoch": 0.342451874366768, "grad_norm": 2.061370372772217, "learning_rate": 1.5312379522510666e-05, "loss": 0.536, "step": 1014 }, { "epoch": 0.34278959810874704, "grad_norm": 1.6813714504241943, "learning_rate": 1.530310883742803e-05, "loss": 0.5237, "step": 1015 }, { "epoch": 0.3431273218507261, "grad_norm": 2.2160351276397705, "learning_rate": 1.5293831806910803e-05, "loss": 0.5213, "step": 1016 }, { "epoch": 0.3434650455927052, "grad_norm": 2.0099563598632812, "learning_rate": 1.5284548442059425e-05, "loss": 0.5604, "step": 1017 }, { "epoch": 0.34380276933468423, "grad_norm": 2.049576997756958, "learning_rate": 1.52752587539819e-05, "loss": 0.5036, "step": 1018 }, { "epoch": 0.3441404930766633, "grad_norm": 2.3557307720184326, "learning_rate": 1.526596275379381e-05, "loss": 0.5481, "step": 1019 }, { "epoch": 0.3444782168186423, "grad_norm": 2.1892623901367188, "learning_rate": 1.5256660452618276e-05, "loss": 0.5629, "step": 1020 }, { "epoch": 0.3448159405606214, "grad_norm": 1.9985511302947998, "learning_rate": 1.5247351861585968e-05, "loss": 0.5329, "step": 1021 }, { "epoch": 0.34515366430260047, "grad_norm": 2.2024972438812256, "learning_rate": 1.5238036991835085e-05, "loss": 0.5676, "step": 1022 }, { "epoch": 0.3454913880445795, "grad_norm": 2.093689441680908, "learning_rate": 1.5228715854511331e-05, "loss": 0.5174, "step": 1023 }, { "epoch": 0.3458291117865586, "grad_norm": 2.3484714031219482, "learning_rate": 1.521938846076791e-05, "loss": 0.5588, "step": 1024 }, { "epoch": 0.34616683552853766, "grad_norm": 2.2830355167388916, "learning_rate": 1.5210054821765513e-05, "loss": 0.5658, "step": 1025 }, { "epoch": 0.3465045592705167, "grad_norm": 3.99013614654541, "learning_rate": 1.5200714948672313e-05, "loss": 0.5425, "step": 1026 }, { "epoch": 0.34684228301249576, "grad_norm": 1.8407189846038818, "learning_rate": 1.519136885266393e-05, "loss": 0.5127, "step": 1027 }, { "epoch": 0.34718000675447486, "grad_norm": 6.108625888824463, "learning_rate": 1.5182016544923432e-05, "loss": 0.5074, "step": 1028 }, { "epoch": 0.3475177304964539, "grad_norm": 1.9158786535263062, "learning_rate": 1.5172658036641327e-05, "loss": 0.5311, "step": 1029 }, { "epoch": 0.34785545423843295, "grad_norm": 4.371150493621826, "learning_rate": 1.5163293339015535e-05, "loss": 0.5048, "step": 1030 }, { "epoch": 0.34819317798041205, "grad_norm": 1.940597653388977, "learning_rate": 1.5153922463251386e-05, "loss": 0.5309, "step": 1031 }, { "epoch": 0.3485309017223911, "grad_norm": 2.6149301528930664, "learning_rate": 1.5144545420561598e-05, "loss": 0.5059, "step": 1032 }, { "epoch": 0.34886862546437014, "grad_norm": 1.9176528453826904, "learning_rate": 1.5135162222166276e-05, "loss": 0.5282, "step": 1033 }, { "epoch": 0.3492063492063492, "grad_norm": 2.199713706970215, "learning_rate": 1.512577287929288e-05, "loss": 0.5351, "step": 1034 }, { "epoch": 0.3495440729483283, "grad_norm": 2.318742513656616, "learning_rate": 1.5116377403176233e-05, "loss": 0.5754, "step": 1035 }, { "epoch": 0.34988179669030733, "grad_norm": 2.0815298557281494, "learning_rate": 1.5106975805058483e-05, "loss": 0.5435, "step": 1036 }, { "epoch": 0.3502195204322864, "grad_norm": 2.0122759342193604, "learning_rate": 1.5097568096189119e-05, "loss": 0.5282, "step": 1037 }, { "epoch": 0.3505572441742654, "grad_norm": 2.0409493446350098, "learning_rate": 1.5088154287824934e-05, "loss": 0.5707, "step": 1038 }, { "epoch": 0.3508949679162445, "grad_norm": 2.154935121536255, "learning_rate": 1.5078734391230019e-05, "loss": 0.4876, "step": 1039 }, { "epoch": 0.3512326916582236, "grad_norm": 2.374852418899536, "learning_rate": 1.506930841767575e-05, "loss": 0.5655, "step": 1040 }, { "epoch": 0.3515704154002026, "grad_norm": 2.139035224914551, "learning_rate": 1.5059876378440774e-05, "loss": 0.5193, "step": 1041 }, { "epoch": 0.3519081391421817, "grad_norm": 1.8664394617080688, "learning_rate": 1.5050438284811001e-05, "loss": 0.5476, "step": 1042 }, { "epoch": 0.35224586288416077, "grad_norm": 2.279726505279541, "learning_rate": 1.5040994148079578e-05, "loss": 0.5804, "step": 1043 }, { "epoch": 0.3525835866261398, "grad_norm": 2.271094560623169, "learning_rate": 1.5031543979546887e-05, "loss": 0.5789, "step": 1044 }, { "epoch": 0.35292131036811886, "grad_norm": 2.0998013019561768, "learning_rate": 1.5022087790520526e-05, "loss": 0.5397, "step": 1045 }, { "epoch": 0.35325903411009796, "grad_norm": 2.0789730548858643, "learning_rate": 1.5012625592315298e-05, "loss": 0.5608, "step": 1046 }, { "epoch": 0.353596757852077, "grad_norm": 2.006349802017212, "learning_rate": 1.5003157396253198e-05, "loss": 0.5204, "step": 1047 }, { "epoch": 0.35393448159405605, "grad_norm": 2.4171555042266846, "learning_rate": 1.499368321366339e-05, "loss": 0.5702, "step": 1048 }, { "epoch": 0.35427220533603515, "grad_norm": 2.1138155460357666, "learning_rate": 1.4984203055882214e-05, "loss": 0.5806, "step": 1049 }, { "epoch": 0.3546099290780142, "grad_norm": 1.862313151359558, "learning_rate": 1.4974716934253146e-05, "loss": 0.541, "step": 1050 }, { "epoch": 0.35494765281999324, "grad_norm": 2.0836734771728516, "learning_rate": 1.4965224860126809e-05, "loss": 0.5675, "step": 1051 }, { "epoch": 0.3552853765619723, "grad_norm": 1.8699607849121094, "learning_rate": 1.4955726844860939e-05, "loss": 0.5702, "step": 1052 }, { "epoch": 0.3556231003039514, "grad_norm": 1.754858374595642, "learning_rate": 1.4946222899820389e-05, "loss": 0.5274, "step": 1053 }, { "epoch": 0.35596082404593044, "grad_norm": 2.6155617237091064, "learning_rate": 1.4936713036377102e-05, "loss": 0.5327, "step": 1054 }, { "epoch": 0.3562985477879095, "grad_norm": 2.2810544967651367, "learning_rate": 1.4927197265910107e-05, "loss": 0.5954, "step": 1055 }, { "epoch": 0.3566362715298885, "grad_norm": 1.826612114906311, "learning_rate": 1.4917675599805497e-05, "loss": 0.5672, "step": 1056 }, { "epoch": 0.35697399527186763, "grad_norm": 1.8813865184783936, "learning_rate": 1.490814804945642e-05, "loss": 0.553, "step": 1057 }, { "epoch": 0.3573117190138467, "grad_norm": 2.2047958374023438, "learning_rate": 1.4898614626263066e-05, "loss": 0.5011, "step": 1058 }, { "epoch": 0.3576494427558257, "grad_norm": 1.949554204940796, "learning_rate": 1.4889075341632655e-05, "loss": 0.5131, "step": 1059 }, { "epoch": 0.3579871664978048, "grad_norm": 2.702042579650879, "learning_rate": 1.4879530206979418e-05, "loss": 0.53, "step": 1060 }, { "epoch": 0.35832489023978387, "grad_norm": 2.136049509048462, "learning_rate": 1.4869979233724579e-05, "loss": 0.5497, "step": 1061 }, { "epoch": 0.3586626139817629, "grad_norm": 2.1227266788482666, "learning_rate": 1.4860422433296363e-05, "loss": 0.5169, "step": 1062 }, { "epoch": 0.35900033772374196, "grad_norm": 1.8207359313964844, "learning_rate": 1.485085981712995e-05, "loss": 0.5574, "step": 1063 }, { "epoch": 0.35933806146572106, "grad_norm": 2.3003885746002197, "learning_rate": 1.4841291396667494e-05, "loss": 0.534, "step": 1064 }, { "epoch": 0.3596757852077001, "grad_norm": 1.8896540403366089, "learning_rate": 1.4831717183358085e-05, "loss": 0.5461, "step": 1065 }, { "epoch": 0.36001350894967915, "grad_norm": 1.8416929244995117, "learning_rate": 1.4822137188657752e-05, "loss": 0.555, "step": 1066 }, { "epoch": 0.3603512326916582, "grad_norm": 2.1131277084350586, "learning_rate": 1.4812551424029429e-05, "loss": 0.5455, "step": 1067 }, { "epoch": 0.3606889564336373, "grad_norm": 2.149731397628784, "learning_rate": 1.4802959900942967e-05, "loss": 0.5619, "step": 1068 }, { "epoch": 0.36102668017561634, "grad_norm": 2.350322961807251, "learning_rate": 1.4793362630875105e-05, "loss": 0.5865, "step": 1069 }, { "epoch": 0.3613644039175954, "grad_norm": 2.1667263507843018, "learning_rate": 1.4783759625309454e-05, "loss": 0.5444, "step": 1070 }, { "epoch": 0.3617021276595745, "grad_norm": 1.9500027894973755, "learning_rate": 1.4774150895736488e-05, "loss": 0.5842, "step": 1071 }, { "epoch": 0.36203985140155354, "grad_norm": 2.2990498542785645, "learning_rate": 1.4764536453653536e-05, "loss": 0.5734, "step": 1072 }, { "epoch": 0.3623775751435326, "grad_norm": 2.195291757583618, "learning_rate": 1.475491631056475e-05, "loss": 0.5221, "step": 1073 }, { "epoch": 0.36271529888551163, "grad_norm": 2.6234989166259766, "learning_rate": 1.474529047798112e-05, "loss": 0.5398, "step": 1074 }, { "epoch": 0.36305302262749073, "grad_norm": 1.8076050281524658, "learning_rate": 1.4735658967420433e-05, "loss": 0.5281, "step": 1075 }, { "epoch": 0.3633907463694698, "grad_norm": 2.2039835453033447, "learning_rate": 1.4726021790407268e-05, "loss": 0.5339, "step": 1076 }, { "epoch": 0.3637284701114488, "grad_norm": 2.388828992843628, "learning_rate": 1.4716378958472995e-05, "loss": 0.5579, "step": 1077 }, { "epoch": 0.3640661938534279, "grad_norm": 1.9640778303146362, "learning_rate": 1.4706730483155738e-05, "loss": 0.5744, "step": 1078 }, { "epoch": 0.36440391759540697, "grad_norm": 1.8825005292892456, "learning_rate": 1.4697076376000388e-05, "loss": 0.5325, "step": 1079 }, { "epoch": 0.364741641337386, "grad_norm": 1.6883267164230347, "learning_rate": 1.4687416648558555e-05, "loss": 0.553, "step": 1080 }, { "epoch": 0.36507936507936506, "grad_norm": 1.8855961561203003, "learning_rate": 1.4677751312388597e-05, "loss": 0.51, "step": 1081 }, { "epoch": 0.36541708882134416, "grad_norm": 1.8335736989974976, "learning_rate": 1.4668080379055563e-05, "loss": 0.5471, "step": 1082 }, { "epoch": 0.3657548125633232, "grad_norm": 1.7968331575393677, "learning_rate": 1.4658403860131212e-05, "loss": 0.5517, "step": 1083 }, { "epoch": 0.36609253630530225, "grad_norm": 1.8566981554031372, "learning_rate": 1.4648721767193981e-05, "loss": 0.5335, "step": 1084 }, { "epoch": 0.3664302600472813, "grad_norm": 1.933724284172058, "learning_rate": 1.4639034111828977e-05, "loss": 0.5117, "step": 1085 }, { "epoch": 0.3667679837892604, "grad_norm": 1.8391430377960205, "learning_rate": 1.4629340905627964e-05, "loss": 0.5223, "step": 1086 }, { "epoch": 0.36710570753123944, "grad_norm": 2.183021068572998, "learning_rate": 1.461964216018935e-05, "loss": 0.5347, "step": 1087 }, { "epoch": 0.3674434312732185, "grad_norm": 1.8729252815246582, "learning_rate": 1.4609937887118165e-05, "loss": 0.5367, "step": 1088 }, { "epoch": 0.3677811550151976, "grad_norm": 2.1476709842681885, "learning_rate": 1.4600228098026055e-05, "loss": 0.5571, "step": 1089 }, { "epoch": 0.36811887875717664, "grad_norm": 1.7557318210601807, "learning_rate": 1.4590512804531272e-05, "loss": 0.5282, "step": 1090 }, { "epoch": 0.3684566024991557, "grad_norm": 2.255570411682129, "learning_rate": 1.4580792018258645e-05, "loss": 0.5352, "step": 1091 }, { "epoch": 0.36879432624113473, "grad_norm": 2.224292039871216, "learning_rate": 1.4571065750839586e-05, "loss": 0.5504, "step": 1092 }, { "epoch": 0.36913204998311383, "grad_norm": 2.182058572769165, "learning_rate": 1.4561334013912056e-05, "loss": 0.5139, "step": 1093 }, { "epoch": 0.3694697737250929, "grad_norm": 1.998540997505188, "learning_rate": 1.4551596819120564e-05, "loss": 0.5383, "step": 1094 }, { "epoch": 0.3698074974670719, "grad_norm": 1.693687915802002, "learning_rate": 1.4541854178116153e-05, "loss": 0.6016, "step": 1095 }, { "epoch": 0.370145221209051, "grad_norm": 2.218364953994751, "learning_rate": 1.4532106102556377e-05, "loss": 0.5727, "step": 1096 }, { "epoch": 0.37048294495103007, "grad_norm": 2.0627779960632324, "learning_rate": 1.4522352604105293e-05, "loss": 0.5497, "step": 1097 }, { "epoch": 0.3708206686930091, "grad_norm": 1.8225555419921875, "learning_rate": 1.4512593694433455e-05, "loss": 0.505, "step": 1098 }, { "epoch": 0.37115839243498816, "grad_norm": 2.036851167678833, "learning_rate": 1.4502829385217881e-05, "loss": 0.51, "step": 1099 }, { "epoch": 0.37149611617696726, "grad_norm": 1.911070704460144, "learning_rate": 1.4493059688142055e-05, "loss": 0.5296, "step": 1100 }, { "epoch": 0.3718338399189463, "grad_norm": 2.1363730430603027, "learning_rate": 1.4483284614895911e-05, "loss": 0.55, "step": 1101 }, { "epoch": 0.37217156366092535, "grad_norm": 2.01567006111145, "learning_rate": 1.447350417717581e-05, "loss": 0.5933, "step": 1102 }, { "epoch": 0.3725092874029044, "grad_norm": 1.7512503862380981, "learning_rate": 1.4463718386684536e-05, "loss": 0.5779, "step": 1103 }, { "epoch": 0.3728470111448835, "grad_norm": 1.993340015411377, "learning_rate": 1.445392725513127e-05, "loss": 0.591, "step": 1104 }, { "epoch": 0.37318473488686255, "grad_norm": 1.7904561758041382, "learning_rate": 1.4444130794231597e-05, "loss": 0.5189, "step": 1105 }, { "epoch": 0.3735224586288416, "grad_norm": 1.7751133441925049, "learning_rate": 1.4434329015707468e-05, "loss": 0.537, "step": 1106 }, { "epoch": 0.3738601823708207, "grad_norm": 2.1797189712524414, "learning_rate": 1.4424521931287206e-05, "loss": 0.533, "step": 1107 }, { "epoch": 0.37419790611279974, "grad_norm": 1.9019125699996948, "learning_rate": 1.4414709552705465e-05, "loss": 0.5204, "step": 1108 }, { "epoch": 0.3745356298547788, "grad_norm": 1.7774845361709595, "learning_rate": 1.440489189170326e-05, "loss": 0.5326, "step": 1109 }, { "epoch": 0.37487335359675783, "grad_norm": 1.7692289352416992, "learning_rate": 1.4395068960027903e-05, "loss": 0.5634, "step": 1110 }, { "epoch": 0.37521107733873693, "grad_norm": 1.7737778425216675, "learning_rate": 1.4385240769433029e-05, "loss": 0.5154, "step": 1111 }, { "epoch": 0.375548801080716, "grad_norm": 2.456174850463867, "learning_rate": 1.4375407331678553e-05, "loss": 0.5712, "step": 1112 }, { "epoch": 0.375886524822695, "grad_norm": 1.7358667850494385, "learning_rate": 1.436556865853068e-05, "loss": 0.5309, "step": 1113 }, { "epoch": 0.3762242485646741, "grad_norm": 1.9311598539352417, "learning_rate": 1.435572476176187e-05, "loss": 0.5799, "step": 1114 }, { "epoch": 0.37656197230665317, "grad_norm": 1.850720763206482, "learning_rate": 1.434587565315084e-05, "loss": 0.5363, "step": 1115 }, { "epoch": 0.3768996960486322, "grad_norm": 1.9302071332931519, "learning_rate": 1.4336021344482539e-05, "loss": 0.5171, "step": 1116 }, { "epoch": 0.37723741979061126, "grad_norm": 1.6593377590179443, "learning_rate": 1.4326161847548137e-05, "loss": 0.5082, "step": 1117 }, { "epoch": 0.37757514353259036, "grad_norm": 1.8848977088928223, "learning_rate": 1.4316297174145018e-05, "loss": 0.5109, "step": 1118 }, { "epoch": 0.3779128672745694, "grad_norm": 1.7957432270050049, "learning_rate": 1.4306427336076754e-05, "loss": 0.511, "step": 1119 }, { "epoch": 0.37825059101654845, "grad_norm": 1.8390629291534424, "learning_rate": 1.4296552345153099e-05, "loss": 0.5738, "step": 1120 }, { "epoch": 0.3785883147585275, "grad_norm": 1.9456121921539307, "learning_rate": 1.4286672213189973e-05, "loss": 0.4931, "step": 1121 }, { "epoch": 0.3789260385005066, "grad_norm": 1.831970453262329, "learning_rate": 1.427678695200945e-05, "loss": 0.5026, "step": 1122 }, { "epoch": 0.37926376224248565, "grad_norm": 1.954371452331543, "learning_rate": 1.426689657343974e-05, "loss": 0.571, "step": 1123 }, { "epoch": 0.3796014859844647, "grad_norm": 1.5779763460159302, "learning_rate": 1.4257001089315173e-05, "loss": 0.5065, "step": 1124 }, { "epoch": 0.3799392097264438, "grad_norm": 1.8063756227493286, "learning_rate": 1.4247100511476184e-05, "loss": 0.5296, "step": 1125 }, { "epoch": 0.38027693346842284, "grad_norm": 1.8522915840148926, "learning_rate": 1.4237194851769318e-05, "loss": 0.5044, "step": 1126 }, { "epoch": 0.3806146572104019, "grad_norm": 1.791467547416687, "learning_rate": 1.4227284122047183e-05, "loss": 0.5458, "step": 1127 }, { "epoch": 0.38095238095238093, "grad_norm": 2.0502398014068604, "learning_rate": 1.4217368334168472e-05, "loss": 0.5648, "step": 1128 }, { "epoch": 0.38129010469436003, "grad_norm": 1.6355087757110596, "learning_rate": 1.420744749999791e-05, "loss": 0.5199, "step": 1129 }, { "epoch": 0.3816278284363391, "grad_norm": 2.2087788581848145, "learning_rate": 1.4197521631406279e-05, "loss": 0.5873, "step": 1130 }, { "epoch": 0.3819655521783181, "grad_norm": 1.6688944101333618, "learning_rate": 1.4187590740270372e-05, "loss": 0.548, "step": 1131 }, { "epoch": 0.38230327592029717, "grad_norm": 1.6586618423461914, "learning_rate": 1.4177654838472996e-05, "loss": 0.514, "step": 1132 }, { "epoch": 0.38264099966227627, "grad_norm": 1.987221121788025, "learning_rate": 1.4167713937902957e-05, "loss": 0.57, "step": 1133 }, { "epoch": 0.3829787234042553, "grad_norm": 1.6457571983337402, "learning_rate": 1.4157768050455038e-05, "loss": 0.5231, "step": 1134 }, { "epoch": 0.38331644714623436, "grad_norm": 1.7036746740341187, "learning_rate": 1.4147817188029988e-05, "loss": 0.5019, "step": 1135 }, { "epoch": 0.38365417088821346, "grad_norm": 9.115896224975586, "learning_rate": 1.4137861362534513e-05, "loss": 0.5332, "step": 1136 }, { "epoch": 0.3839918946301925, "grad_norm": 1.8613120317459106, "learning_rate": 1.4127900585881254e-05, "loss": 0.4999, "step": 1137 }, { "epoch": 0.38432961837217156, "grad_norm": 2.1418750286102295, "learning_rate": 1.4117934869988776e-05, "loss": 0.5904, "step": 1138 }, { "epoch": 0.3846673421141506, "grad_norm": 1.9365202188491821, "learning_rate": 1.4107964226781561e-05, "loss": 0.5487, "step": 1139 }, { "epoch": 0.3850050658561297, "grad_norm": 1.5196839570999146, "learning_rate": 1.4097988668189977e-05, "loss": 0.4919, "step": 1140 }, { "epoch": 0.38534278959810875, "grad_norm": 2.1820569038391113, "learning_rate": 1.4088008206150279e-05, "loss": 0.5199, "step": 1141 }, { "epoch": 0.3856805133400878, "grad_norm": 2.2219276428222656, "learning_rate": 1.4078022852604591e-05, "loss": 0.5552, "step": 1142 }, { "epoch": 0.3860182370820669, "grad_norm": 1.8701210021972656, "learning_rate": 1.4068032619500884e-05, "loss": 0.5342, "step": 1143 }, { "epoch": 0.38635596082404594, "grad_norm": 1.9380638599395752, "learning_rate": 1.4058037518792975e-05, "loss": 0.5309, "step": 1144 }, { "epoch": 0.386693684566025, "grad_norm": 1.8619962930679321, "learning_rate": 1.4048037562440495e-05, "loss": 0.5171, "step": 1145 }, { "epoch": 0.38703140830800403, "grad_norm": 1.9780521392822266, "learning_rate": 1.4038032762408897e-05, "loss": 0.5526, "step": 1146 }, { "epoch": 0.38736913204998313, "grad_norm": 2.7345893383026123, "learning_rate": 1.402802313066942e-05, "loss": 0.5402, "step": 1147 }, { "epoch": 0.3877068557919622, "grad_norm": 1.7501345872879028, "learning_rate": 1.4018008679199092e-05, "loss": 0.5422, "step": 1148 }, { "epoch": 0.3880445795339412, "grad_norm": 1.6321029663085938, "learning_rate": 1.4007989419980696e-05, "loss": 0.5278, "step": 1149 }, { "epoch": 0.38838230327592027, "grad_norm": 1.8122402429580688, "learning_rate": 1.3997965365002789e-05, "loss": 0.5597, "step": 1150 }, { "epoch": 0.38872002701789937, "grad_norm": 2.1499698162078857, "learning_rate": 1.3987936526259643e-05, "loss": 0.5072, "step": 1151 }, { "epoch": 0.3890577507598784, "grad_norm": 2.0365943908691406, "learning_rate": 1.3977902915751268e-05, "loss": 0.5585, "step": 1152 }, { "epoch": 0.38939547450185746, "grad_norm": 1.8299415111541748, "learning_rate": 1.3967864545483382e-05, "loss": 0.5247, "step": 1153 }, { "epoch": 0.38973319824383656, "grad_norm": 2.056867837905884, "learning_rate": 1.3957821427467392e-05, "loss": 0.5748, "step": 1154 }, { "epoch": 0.3900709219858156, "grad_norm": 2.38212513923645, "learning_rate": 1.3947773573720396e-05, "loss": 0.544, "step": 1155 }, { "epoch": 0.39040864572779466, "grad_norm": 1.9713776111602783, "learning_rate": 1.3937720996265147e-05, "loss": 0.5476, "step": 1156 }, { "epoch": 0.3907463694697737, "grad_norm": 2.30199933052063, "learning_rate": 1.3927663707130061e-05, "loss": 0.5151, "step": 1157 }, { "epoch": 0.3910840932117528, "grad_norm": 1.770244836807251, "learning_rate": 1.3917601718349183e-05, "loss": 0.5498, "step": 1158 }, { "epoch": 0.39142181695373185, "grad_norm": 1.6282994747161865, "learning_rate": 1.3907535041962188e-05, "loss": 0.5309, "step": 1159 }, { "epoch": 0.3917595406957109, "grad_norm": 1.9224019050598145, "learning_rate": 1.3897463690014353e-05, "loss": 0.4946, "step": 1160 }, { "epoch": 0.39209726443769, "grad_norm": 1.6513456106185913, "learning_rate": 1.3887387674556558e-05, "loss": 0.553, "step": 1161 }, { "epoch": 0.39243498817966904, "grad_norm": 1.858555793762207, "learning_rate": 1.3877307007645256e-05, "loss": 0.559, "step": 1162 }, { "epoch": 0.3927727119216481, "grad_norm": 2.1466143131256104, "learning_rate": 1.386722170134247e-05, "loss": 0.5559, "step": 1163 }, { "epoch": 0.39311043566362713, "grad_norm": 1.8304537534713745, "learning_rate": 1.385713176771577e-05, "loss": 0.5384, "step": 1164 }, { "epoch": 0.39344815940560623, "grad_norm": 1.9499304294586182, "learning_rate": 1.3847037218838271e-05, "loss": 0.5327, "step": 1165 }, { "epoch": 0.3937858831475853, "grad_norm": 1.882509469985962, "learning_rate": 1.3836938066788599e-05, "loss": 0.5392, "step": 1166 }, { "epoch": 0.3941236068895643, "grad_norm": 1.8361124992370605, "learning_rate": 1.3826834323650899e-05, "loss": 0.5221, "step": 1167 }, { "epoch": 0.39446133063154337, "grad_norm": 2.2549922466278076, "learning_rate": 1.3816726001514802e-05, "loss": 0.5661, "step": 1168 }, { "epoch": 0.3947990543735225, "grad_norm": 1.6666157245635986, "learning_rate": 1.3806613112475416e-05, "loss": 0.5503, "step": 1169 }, { "epoch": 0.3951367781155015, "grad_norm": 2.055157423019409, "learning_rate": 1.3796495668633325e-05, "loss": 0.5331, "step": 1170 }, { "epoch": 0.39547450185748056, "grad_norm": 2.2004382610321045, "learning_rate": 1.3786373682094552e-05, "loss": 0.5536, "step": 1171 }, { "epoch": 0.39581222559945967, "grad_norm": 1.98672616481781, "learning_rate": 1.377624716497056e-05, "loss": 0.5322, "step": 1172 }, { "epoch": 0.3961499493414387, "grad_norm": 1.843321442604065, "learning_rate": 1.3766116129378229e-05, "loss": 0.5449, "step": 1173 }, { "epoch": 0.39648767308341776, "grad_norm": 1.7470521926879883, "learning_rate": 1.3755980587439857e-05, "loss": 0.5378, "step": 1174 }, { "epoch": 0.3968253968253968, "grad_norm": 2.0761799812316895, "learning_rate": 1.3745840551283119e-05, "loss": 0.5389, "step": 1175 }, { "epoch": 0.3971631205673759, "grad_norm": 1.8842417001724243, "learning_rate": 1.3735696033041079e-05, "loss": 0.5306, "step": 1176 }, { "epoch": 0.39750084430935495, "grad_norm": 2.0597147941589355, "learning_rate": 1.372554704485216e-05, "loss": 0.5383, "step": 1177 }, { "epoch": 0.397838568051334, "grad_norm": 2.097405433654785, "learning_rate": 1.3715393598860129e-05, "loss": 0.5442, "step": 1178 }, { "epoch": 0.3981762917933131, "grad_norm": 4.694592475891113, "learning_rate": 1.3705235707214092e-05, "loss": 0.5346, "step": 1179 }, { "epoch": 0.39851401553529214, "grad_norm": 1.9732106924057007, "learning_rate": 1.369507338206848e-05, "loss": 0.5297, "step": 1180 }, { "epoch": 0.3988517392772712, "grad_norm": 1.8007521629333496, "learning_rate": 1.3684906635583015e-05, "loss": 0.5204, "step": 1181 }, { "epoch": 0.39918946301925023, "grad_norm": 1.8197932243347168, "learning_rate": 1.367473547992272e-05, "loss": 0.5184, "step": 1182 }, { "epoch": 0.39952718676122934, "grad_norm": 3.1382312774658203, "learning_rate": 1.3664559927257891e-05, "loss": 0.5171, "step": 1183 }, { "epoch": 0.3998649105032084, "grad_norm": 1.870146632194519, "learning_rate": 1.3654379989764084e-05, "loss": 0.5495, "step": 1184 }, { "epoch": 0.4002026342451874, "grad_norm": 2.0994367599487305, "learning_rate": 1.3644195679622105e-05, "loss": 0.5644, "step": 1185 }, { "epoch": 0.4005403579871665, "grad_norm": 1.6776835918426514, "learning_rate": 1.3634007009017986e-05, "loss": 0.5219, "step": 1186 }, { "epoch": 0.4008780817291456, "grad_norm": 1.762833833694458, "learning_rate": 1.3623813990142986e-05, "loss": 0.585, "step": 1187 }, { "epoch": 0.4012158054711246, "grad_norm": 1.7979590892791748, "learning_rate": 1.3613616635193551e-05, "loss": 0.5175, "step": 1188 }, { "epoch": 0.40155352921310367, "grad_norm": 1.9943379163742065, "learning_rate": 1.3603414956371339e-05, "loss": 0.5478, "step": 1189 }, { "epoch": 0.40189125295508277, "grad_norm": 1.6275759935379028, "learning_rate": 1.3593208965883156e-05, "loss": 0.5573, "step": 1190 }, { "epoch": 0.4022289766970618, "grad_norm": 1.7482893466949463, "learning_rate": 1.3582998675940989e-05, "loss": 0.5482, "step": 1191 }, { "epoch": 0.40256670043904086, "grad_norm": 1.7057636976242065, "learning_rate": 1.357278409876195e-05, "loss": 0.5707, "step": 1192 }, { "epoch": 0.4029044241810199, "grad_norm": 1.9313801527023315, "learning_rate": 1.35625652465683e-05, "loss": 0.5377, "step": 1193 }, { "epoch": 0.403242147922999, "grad_norm": 1.531730055809021, "learning_rate": 1.3552342131587399e-05, "loss": 0.5259, "step": 1194 }, { "epoch": 0.40357987166497805, "grad_norm": 1.915149450302124, "learning_rate": 1.3542114766051719e-05, "loss": 0.545, "step": 1195 }, { "epoch": 0.4039175954069571, "grad_norm": 2.0444648265838623, "learning_rate": 1.3531883162198815e-05, "loss": 0.5318, "step": 1196 }, { "epoch": 0.40425531914893614, "grad_norm": 1.6755424737930298, "learning_rate": 1.3521647332271306e-05, "loss": 0.5566, "step": 1197 }, { "epoch": 0.40459304289091524, "grad_norm": 2.0106492042541504, "learning_rate": 1.351140728851688e-05, "loss": 0.5545, "step": 1198 }, { "epoch": 0.4049307666328943, "grad_norm": 1.8752272129058838, "learning_rate": 1.3501163043188256e-05, "loss": 0.5521, "step": 1199 }, { "epoch": 0.40526849037487334, "grad_norm": 1.988815188407898, "learning_rate": 1.3490914608543189e-05, "loss": 0.5368, "step": 1200 }, { "epoch": 0.40560621411685244, "grad_norm": 1.9583625793457031, "learning_rate": 1.348066199684444e-05, "loss": 0.5277, "step": 1201 }, { "epoch": 0.4059439378588315, "grad_norm": 1.636851191520691, "learning_rate": 1.3470405220359773e-05, "loss": 0.5257, "step": 1202 }, { "epoch": 0.40628166160081053, "grad_norm": 1.779767632484436, "learning_rate": 1.3460144291361931e-05, "loss": 0.502, "step": 1203 }, { "epoch": 0.4066193853427896, "grad_norm": 1.9183188676834106, "learning_rate": 1.3449879222128628e-05, "loss": 0.5526, "step": 1204 }, { "epoch": 0.4069571090847687, "grad_norm": 1.9939206838607788, "learning_rate": 1.3439610024942535e-05, "loss": 0.525, "step": 1205 }, { "epoch": 0.4072948328267477, "grad_norm": 1.8882708549499512, "learning_rate": 1.3429336712091258e-05, "loss": 0.5767, "step": 1206 }, { "epoch": 0.40763255656872677, "grad_norm": 1.8462036848068237, "learning_rate": 1.3419059295867332e-05, "loss": 0.4983, "step": 1207 }, { "epoch": 0.40797028031070587, "grad_norm": 2.3116695880889893, "learning_rate": 1.340877778856819e-05, "loss": 0.497, "step": 1208 }, { "epoch": 0.4083080040526849, "grad_norm": 2.473775625228882, "learning_rate": 1.3398492202496178e-05, "loss": 0.523, "step": 1209 }, { "epoch": 0.40864572779466396, "grad_norm": 1.9204163551330566, "learning_rate": 1.3388202549958507e-05, "loss": 0.5356, "step": 1210 }, { "epoch": 0.408983451536643, "grad_norm": 1.8044918775558472, "learning_rate": 1.3377908843267267e-05, "loss": 0.5936, "step": 1211 }, { "epoch": 0.4093211752786221, "grad_norm": 1.797738790512085, "learning_rate": 1.3367611094739384e-05, "loss": 0.5162, "step": 1212 }, { "epoch": 0.40965889902060115, "grad_norm": 2.108750104904175, "learning_rate": 1.3357309316696637e-05, "loss": 0.5098, "step": 1213 }, { "epoch": 0.4099966227625802, "grad_norm": 2.054626226425171, "learning_rate": 1.334700352146561e-05, "loss": 0.5539, "step": 1214 }, { "epoch": 0.41033434650455924, "grad_norm": 1.950878381729126, "learning_rate": 1.3336693721377705e-05, "loss": 0.5073, "step": 1215 }, { "epoch": 0.41067207024653835, "grad_norm": 1.9857349395751953, "learning_rate": 1.3326379928769114e-05, "loss": 0.5693, "step": 1216 }, { "epoch": 0.4110097939885174, "grad_norm": 1.8818366527557373, "learning_rate": 1.3316062155980804e-05, "loss": 0.5127, "step": 1217 }, { "epoch": 0.41134751773049644, "grad_norm": 2.0489344596862793, "learning_rate": 1.3305740415358506e-05, "loss": 0.5274, "step": 1218 }, { "epoch": 0.41168524147247554, "grad_norm": 1.9836921691894531, "learning_rate": 1.3295414719252699e-05, "loss": 0.5301, "step": 1219 }, { "epoch": 0.4120229652144546, "grad_norm": 4.103817462921143, "learning_rate": 1.3285085080018589e-05, "loss": 0.5322, "step": 1220 }, { "epoch": 0.41236068895643363, "grad_norm": 1.7054163217544556, "learning_rate": 1.3274751510016113e-05, "loss": 0.5163, "step": 1221 }, { "epoch": 0.4126984126984127, "grad_norm": 2.5940639972686768, "learning_rate": 1.3264414021609899e-05, "loss": 0.5298, "step": 1222 }, { "epoch": 0.4130361364403918, "grad_norm": 1.8955531120300293, "learning_rate": 1.3254072627169268e-05, "loss": 0.5288, "step": 1223 }, { "epoch": 0.4133738601823708, "grad_norm": 2.4254300594329834, "learning_rate": 1.3243727339068216e-05, "loss": 0.5664, "step": 1224 }, { "epoch": 0.41371158392434987, "grad_norm": 4.841183185577393, "learning_rate": 1.3233378169685396e-05, "loss": 0.5339, "step": 1225 }, { "epoch": 0.41404930766632897, "grad_norm": 4.662055015563965, "learning_rate": 1.3223025131404106e-05, "loss": 0.5524, "step": 1226 }, { "epoch": 0.414387031408308, "grad_norm": 1.9691284894943237, "learning_rate": 1.3212668236612274e-05, "loss": 0.533, "step": 1227 }, { "epoch": 0.41472475515028706, "grad_norm": 2.3513758182525635, "learning_rate": 1.3202307497702443e-05, "loss": 0.5271, "step": 1228 }, { "epoch": 0.4150624788922661, "grad_norm": 2.2213001251220703, "learning_rate": 1.3191942927071753e-05, "loss": 0.4991, "step": 1229 }, { "epoch": 0.4154002026342452, "grad_norm": 1.922692894935608, "learning_rate": 1.3181574537121933e-05, "loss": 0.5429, "step": 1230 }, { "epoch": 0.41573792637622425, "grad_norm": 1.7806744575500488, "learning_rate": 1.3171202340259275e-05, "loss": 0.5129, "step": 1231 }, { "epoch": 0.4160756501182033, "grad_norm": 1.744636058807373, "learning_rate": 1.3160826348894635e-05, "loss": 0.5122, "step": 1232 }, { "epoch": 0.41641337386018235, "grad_norm": 2.2100446224212646, "learning_rate": 1.31504465754434e-05, "loss": 0.5262, "step": 1233 }, { "epoch": 0.41675109760216145, "grad_norm": 2.0496482849121094, "learning_rate": 1.3140063032325491e-05, "loss": 0.5594, "step": 1234 }, { "epoch": 0.4170888213441405, "grad_norm": 1.8243377208709717, "learning_rate": 1.3129675731965336e-05, "loss": 0.5168, "step": 1235 }, { "epoch": 0.41742654508611954, "grad_norm": 2.2854344844818115, "learning_rate": 1.3119284686791859e-05, "loss": 0.524, "step": 1236 }, { "epoch": 0.41776426882809864, "grad_norm": 2.1877365112304688, "learning_rate": 1.310888990923846e-05, "loss": 0.5835, "step": 1237 }, { "epoch": 0.4181019925700777, "grad_norm": 2.5775158405303955, "learning_rate": 1.3098491411743014e-05, "loss": 0.5734, "step": 1238 }, { "epoch": 0.41843971631205673, "grad_norm": 2.2126893997192383, "learning_rate": 1.3088089206747845e-05, "loss": 0.5047, "step": 1239 }, { "epoch": 0.4187774400540358, "grad_norm": 2.18711519241333, "learning_rate": 1.3077683306699702e-05, "loss": 0.546, "step": 1240 }, { "epoch": 0.4191151637960149, "grad_norm": 2.096095561981201, "learning_rate": 1.3067273724049774e-05, "loss": 0.5697, "step": 1241 }, { "epoch": 0.4194528875379939, "grad_norm": 1.8555736541748047, "learning_rate": 1.3056860471253639e-05, "loss": 0.5281, "step": 1242 }, { "epoch": 0.41979061127997297, "grad_norm": 1.9800597429275513, "learning_rate": 1.3046443560771278e-05, "loss": 0.508, "step": 1243 }, { "epoch": 0.42012833502195207, "grad_norm": 1.6757116317749023, "learning_rate": 1.3036023005067042e-05, "loss": 0.5282, "step": 1244 }, { "epoch": 0.4204660587639311, "grad_norm": 1.7738958597183228, "learning_rate": 1.302559881660965e-05, "loss": 0.5474, "step": 1245 }, { "epoch": 0.42080378250591016, "grad_norm": 1.9713691473007202, "learning_rate": 1.3015171007872161e-05, "loss": 0.5349, "step": 1246 }, { "epoch": 0.4211415062478892, "grad_norm": 1.8992007970809937, "learning_rate": 1.3004739591331967e-05, "loss": 0.5629, "step": 1247 }, { "epoch": 0.4214792299898683, "grad_norm": 2.140488624572754, "learning_rate": 1.2994304579470787e-05, "loss": 0.5473, "step": 1248 }, { "epoch": 0.42181695373184735, "grad_norm": 1.7885081768035889, "learning_rate": 1.2983865984774627e-05, "loss": 0.5628, "step": 1249 }, { "epoch": 0.4221546774738264, "grad_norm": 2.5642693042755127, "learning_rate": 1.297342381973379e-05, "loss": 0.5675, "step": 1250 }, { "epoch": 0.42249240121580545, "grad_norm": 1.8276475667953491, "learning_rate": 1.2962978096842845e-05, "loss": 0.5547, "step": 1251 }, { "epoch": 0.42283012495778455, "grad_norm": 1.8977456092834473, "learning_rate": 1.2952528828600623e-05, "loss": 0.5451, "step": 1252 }, { "epoch": 0.4231678486997636, "grad_norm": 1.9125882387161255, "learning_rate": 1.2942076027510196e-05, "loss": 0.5385, "step": 1253 }, { "epoch": 0.42350557244174264, "grad_norm": 1.9338675737380981, "learning_rate": 1.2931619706078862e-05, "loss": 0.5355, "step": 1254 }, { "epoch": 0.42384329618372174, "grad_norm": 1.8571183681488037, "learning_rate": 1.2921159876818128e-05, "loss": 0.5153, "step": 1255 }, { "epoch": 0.4241810199257008, "grad_norm": 1.9408491849899292, "learning_rate": 1.2910696552243708e-05, "loss": 0.5263, "step": 1256 }, { "epoch": 0.42451874366767983, "grad_norm": 1.8911876678466797, "learning_rate": 1.290022974487549e-05, "loss": 0.5363, "step": 1257 }, { "epoch": 0.4248564674096589, "grad_norm": 2.033824920654297, "learning_rate": 1.2889759467237532e-05, "loss": 0.5162, "step": 1258 }, { "epoch": 0.425194191151638, "grad_norm": 2.138684034347534, "learning_rate": 1.2879285731858047e-05, "loss": 0.5114, "step": 1259 }, { "epoch": 0.425531914893617, "grad_norm": 2.0081512928009033, "learning_rate": 1.2868808551269374e-05, "loss": 0.5687, "step": 1260 }, { "epoch": 0.42586963863559607, "grad_norm": 1.9758532047271729, "learning_rate": 1.2858327938007993e-05, "loss": 0.5443, "step": 1261 }, { "epoch": 0.4262073623775751, "grad_norm": 1.928558111190796, "learning_rate": 1.2847843904614474e-05, "loss": 0.5506, "step": 1262 }, { "epoch": 0.4265450861195542, "grad_norm": 1.8456144332885742, "learning_rate": 1.283735646363349e-05, "loss": 0.5355, "step": 1263 }, { "epoch": 0.42688280986153326, "grad_norm": 1.781744360923767, "learning_rate": 1.2826865627613785e-05, "loss": 0.4928, "step": 1264 }, { "epoch": 0.4272205336035123, "grad_norm": 1.778971552848816, "learning_rate": 1.2816371409108174e-05, "loss": 0.5206, "step": 1265 }, { "epoch": 0.4275582573454914, "grad_norm": 2.4956207275390625, "learning_rate": 1.2805873820673509e-05, "loss": 0.5373, "step": 1266 }, { "epoch": 0.42789598108747046, "grad_norm": 2.685852527618408, "learning_rate": 1.279537287487068e-05, "loss": 0.5846, "step": 1267 }, { "epoch": 0.4282337048294495, "grad_norm": 3.4037110805511475, "learning_rate": 1.2784868584264587e-05, "loss": 0.5517, "step": 1268 }, { "epoch": 0.42857142857142855, "grad_norm": 2.901615619659424, "learning_rate": 1.277436096142415e-05, "loss": 0.5892, "step": 1269 }, { "epoch": 0.42890915231340765, "grad_norm": 2.122274160385132, "learning_rate": 1.2763850018922257e-05, "loss": 0.5475, "step": 1270 }, { "epoch": 0.4292468760553867, "grad_norm": 1.8014112710952759, "learning_rate": 1.275333576933578e-05, "loss": 0.5366, "step": 1271 }, { "epoch": 0.42958459979736574, "grad_norm": 2.112748861312866, "learning_rate": 1.2742818225245538e-05, "loss": 0.5172, "step": 1272 }, { "epoch": 0.42992232353934484, "grad_norm": 1.65496826171875, "learning_rate": 1.2732297399236301e-05, "loss": 0.5679, "step": 1273 }, { "epoch": 0.4302600472813239, "grad_norm": 2.266004800796509, "learning_rate": 1.2721773303896765e-05, "loss": 0.472, "step": 1274 }, { "epoch": 0.43059777102330293, "grad_norm": 3.4923911094665527, "learning_rate": 1.2711245951819533e-05, "loss": 0.5113, "step": 1275 }, { "epoch": 0.430935494765282, "grad_norm": 1.774741768836975, "learning_rate": 1.2700715355601107e-05, "loss": 0.5029, "step": 1276 }, { "epoch": 0.4312732185072611, "grad_norm": 1.7432043552398682, "learning_rate": 1.2690181527841873e-05, "loss": 0.5277, "step": 1277 }, { "epoch": 0.4316109422492401, "grad_norm": 1.747560977935791, "learning_rate": 1.2679644481146081e-05, "loss": 0.4893, "step": 1278 }, { "epoch": 0.43194866599121917, "grad_norm": 2.6088345050811768, "learning_rate": 1.2669104228121834e-05, "loss": 0.5171, "step": 1279 }, { "epoch": 0.4322863897331982, "grad_norm": 1.8391082286834717, "learning_rate": 1.265856078138107e-05, "loss": 0.5327, "step": 1280 }, { "epoch": 0.4326241134751773, "grad_norm": 1.9491345882415771, "learning_rate": 1.2648014153539551e-05, "loss": 0.5465, "step": 1281 }, { "epoch": 0.43296183721715636, "grad_norm": 1.9003881216049194, "learning_rate": 1.2637464357216847e-05, "loss": 0.5352, "step": 1282 }, { "epoch": 0.4332995609591354, "grad_norm": 1.842348337173462, "learning_rate": 1.2626911405036308e-05, "loss": 0.56, "step": 1283 }, { "epoch": 0.4336372847011145, "grad_norm": 2.9545814990997314, "learning_rate": 1.2616355309625076e-05, "loss": 0.5348, "step": 1284 }, { "epoch": 0.43397500844309356, "grad_norm": 1.8577566146850586, "learning_rate": 1.2605796083614044e-05, "loss": 0.4956, "step": 1285 }, { "epoch": 0.4343127321850726, "grad_norm": 1.9321303367614746, "learning_rate": 1.2595233739637851e-05, "loss": 0.5128, "step": 1286 }, { "epoch": 0.43465045592705165, "grad_norm": 2.070756435394287, "learning_rate": 1.2584668290334872e-05, "loss": 0.5937, "step": 1287 }, { "epoch": 0.43498817966903075, "grad_norm": 2.119610071182251, "learning_rate": 1.2574099748347195e-05, "loss": 0.5518, "step": 1288 }, { "epoch": 0.4353259034110098, "grad_norm": 2.3219246864318848, "learning_rate": 1.2563528126320606e-05, "loss": 0.5533, "step": 1289 }, { "epoch": 0.43566362715298884, "grad_norm": 1.9065794944763184, "learning_rate": 1.2552953436904578e-05, "loss": 0.5359, "step": 1290 }, { "epoch": 0.43600135089496794, "grad_norm": 2.114666700363159, "learning_rate": 1.254237569275226e-05, "loss": 0.5602, "step": 1291 }, { "epoch": 0.436339074636947, "grad_norm": 2.27213454246521, "learning_rate": 1.2531794906520447e-05, "loss": 0.5524, "step": 1292 }, { "epoch": 0.43667679837892603, "grad_norm": 1.8863598108291626, "learning_rate": 1.252121109086958e-05, "loss": 0.5356, "step": 1293 }, { "epoch": 0.4370145221209051, "grad_norm": 2.0272343158721924, "learning_rate": 1.2510624258463719e-05, "loss": 0.4877, "step": 1294 }, { "epoch": 0.4373522458628842, "grad_norm": 2.1431069374084473, "learning_rate": 1.2500034421970546e-05, "loss": 0.501, "step": 1295 }, { "epoch": 0.4376899696048632, "grad_norm": 1.8638639450073242, "learning_rate": 1.248944159406132e-05, "loss": 0.5358, "step": 1296 }, { "epoch": 0.4380276933468423, "grad_norm": 8.569225311279297, "learning_rate": 1.2478845787410896e-05, "loss": 0.5364, "step": 1297 }, { "epoch": 0.4383654170888213, "grad_norm": 2.6420843601226807, "learning_rate": 1.246824701469768e-05, "loss": 0.5344, "step": 1298 }, { "epoch": 0.4387031408308004, "grad_norm": 2.0753695964813232, "learning_rate": 1.2457645288603636e-05, "loss": 0.5482, "step": 1299 }, { "epoch": 0.43904086457277947, "grad_norm": 2.102945566177368, "learning_rate": 1.2447040621814262e-05, "loss": 0.5083, "step": 1300 }, { "epoch": 0.4393785883147585, "grad_norm": 2.0223333835601807, "learning_rate": 1.2436433027018564e-05, "loss": 0.5639, "step": 1301 }, { "epoch": 0.4397163120567376, "grad_norm": 5.6972246170043945, "learning_rate": 1.2425822516909065e-05, "loss": 0.5388, "step": 1302 }, { "epoch": 0.44005403579871666, "grad_norm": 1.6948012113571167, "learning_rate": 1.2415209104181767e-05, "loss": 0.5486, "step": 1303 }, { "epoch": 0.4403917595406957, "grad_norm": 2.041184186935425, "learning_rate": 1.2404592801536151e-05, "loss": 0.5566, "step": 1304 }, { "epoch": 0.44072948328267475, "grad_norm": 2.510096311569214, "learning_rate": 1.239397362167515e-05, "loss": 0.5285, "step": 1305 }, { "epoch": 0.44106720702465385, "grad_norm": 2.5810413360595703, "learning_rate": 1.2383351577305148e-05, "loss": 0.5594, "step": 1306 }, { "epoch": 0.4414049307666329, "grad_norm": 1.7567894458770752, "learning_rate": 1.2372726681135944e-05, "loss": 0.5353, "step": 1307 }, { "epoch": 0.44174265450861194, "grad_norm": 1.7908960580825806, "learning_rate": 1.2362098945880765e-05, "loss": 0.5463, "step": 1308 }, { "epoch": 0.44208037825059104, "grad_norm": 1.6354738473892212, "learning_rate": 1.2351468384256221e-05, "loss": 0.5155, "step": 1309 }, { "epoch": 0.4424181019925701, "grad_norm": 2.0084316730499268, "learning_rate": 1.2340835008982315e-05, "loss": 0.5094, "step": 1310 }, { "epoch": 0.44275582573454914, "grad_norm": 1.7620741128921509, "learning_rate": 1.2330198832782404e-05, "loss": 0.4869, "step": 1311 }, { "epoch": 0.4430935494765282, "grad_norm": 1.812522292137146, "learning_rate": 1.2319559868383215e-05, "loss": 0.5528, "step": 1312 }, { "epoch": 0.4434312732185073, "grad_norm": 1.7308216094970703, "learning_rate": 1.2308918128514796e-05, "loss": 0.5055, "step": 1313 }, { "epoch": 0.44376899696048633, "grad_norm": 1.7774397134780884, "learning_rate": 1.2298273625910512e-05, "loss": 0.5313, "step": 1314 }, { "epoch": 0.4441067207024654, "grad_norm": 2.3825790882110596, "learning_rate": 1.2287626373307053e-05, "loss": 0.4871, "step": 1315 }, { "epoch": 0.4444444444444444, "grad_norm": 2.205587148666382, "learning_rate": 1.2276976383444384e-05, "loss": 0.5626, "step": 1316 }, { "epoch": 0.4447821681864235, "grad_norm": 1.7676585912704468, "learning_rate": 1.226632366906575e-05, "loss": 0.5612, "step": 1317 }, { "epoch": 0.44511989192840257, "grad_norm": 1.9778674840927124, "learning_rate": 1.2255668242917651e-05, "loss": 0.5679, "step": 1318 }, { "epoch": 0.4454576156703816, "grad_norm": 2.0181491374969482, "learning_rate": 1.2245010117749848e-05, "loss": 0.5417, "step": 1319 }, { "epoch": 0.4457953394123607, "grad_norm": 1.85775887966156, "learning_rate": 1.2234349306315308e-05, "loss": 0.5194, "step": 1320 }, { "epoch": 0.44613306315433976, "grad_norm": 2.1167778968811035, "learning_rate": 1.2223685821370231e-05, "loss": 0.5076, "step": 1321 }, { "epoch": 0.4464707868963188, "grad_norm": 1.8716440200805664, "learning_rate": 1.2213019675674008e-05, "loss": 0.5347, "step": 1322 }, { "epoch": 0.44680851063829785, "grad_norm": 1.862638235092163, "learning_rate": 1.2202350881989216e-05, "loss": 0.5433, "step": 1323 }, { "epoch": 0.44714623438027695, "grad_norm": 3.808594226837158, "learning_rate": 1.2191679453081598e-05, "loss": 0.5574, "step": 1324 }, { "epoch": 0.447483958122256, "grad_norm": 1.8400542736053467, "learning_rate": 1.2181005401720052e-05, "loss": 0.5213, "step": 1325 }, { "epoch": 0.44782168186423504, "grad_norm": 2.541862726211548, "learning_rate": 1.2170328740676613e-05, "loss": 0.4936, "step": 1326 }, { "epoch": 0.4481594056062141, "grad_norm": 1.7653688192367554, "learning_rate": 1.2159649482726442e-05, "loss": 0.5554, "step": 1327 }, { "epoch": 0.4484971293481932, "grad_norm": 1.9455537796020508, "learning_rate": 1.2148967640647801e-05, "loss": 0.5255, "step": 1328 }, { "epoch": 0.44883485309017224, "grad_norm": 1.8578307628631592, "learning_rate": 1.213828322722205e-05, "loss": 0.5301, "step": 1329 }, { "epoch": 0.4491725768321513, "grad_norm": 2.7487428188323975, "learning_rate": 1.2127596255233622e-05, "loss": 0.5635, "step": 1330 }, { "epoch": 0.4495103005741304, "grad_norm": 4.468762397766113, "learning_rate": 1.2116906737470009e-05, "loss": 0.5096, "step": 1331 }, { "epoch": 0.44984802431610943, "grad_norm": 2.0481085777282715, "learning_rate": 1.2106214686721763e-05, "loss": 0.5215, "step": 1332 }, { "epoch": 0.4501857480580885, "grad_norm": 2.8743607997894287, "learning_rate": 1.2095520115782446e-05, "loss": 0.5802, "step": 1333 }, { "epoch": 0.4505234718000675, "grad_norm": 2.0881946086883545, "learning_rate": 1.2084823037448654e-05, "loss": 0.558, "step": 1334 }, { "epoch": 0.4508611955420466, "grad_norm": 1.837125301361084, "learning_rate": 1.2074123464519971e-05, "loss": 0.5166, "step": 1335 }, { "epoch": 0.45119891928402567, "grad_norm": 2.0617237091064453, "learning_rate": 1.2063421409798974e-05, "loss": 0.5478, "step": 1336 }, { "epoch": 0.4515366430260047, "grad_norm": 1.90732741355896, "learning_rate": 1.20527168860912e-05, "loss": 0.5119, "step": 1337 }, { "epoch": 0.4518743667679838, "grad_norm": 1.8153945207595825, "learning_rate": 1.2042009906205152e-05, "loss": 0.5385, "step": 1338 }, { "epoch": 0.45221209050996286, "grad_norm": 2.0735466480255127, "learning_rate": 1.2031300482952263e-05, "loss": 0.5324, "step": 1339 }, { "epoch": 0.4525498142519419, "grad_norm": 1.978614091873169, "learning_rate": 1.2020588629146897e-05, "loss": 0.5443, "step": 1340 }, { "epoch": 0.45288753799392095, "grad_norm": 2.4758059978485107, "learning_rate": 1.2009874357606316e-05, "loss": 0.5504, "step": 1341 }, { "epoch": 0.45322526173590005, "grad_norm": 2.0282976627349854, "learning_rate": 1.1999157681150683e-05, "loss": 0.5043, "step": 1342 }, { "epoch": 0.4535629854778791, "grad_norm": 2.0400731563568115, "learning_rate": 1.1988438612603041e-05, "loss": 0.5536, "step": 1343 }, { "epoch": 0.45390070921985815, "grad_norm": 2.152313470840454, "learning_rate": 1.1977717164789286e-05, "loss": 0.5676, "step": 1344 }, { "epoch": 0.4542384329618372, "grad_norm": 2.587888240814209, "learning_rate": 1.196699335053817e-05, "loss": 0.5755, "step": 1345 }, { "epoch": 0.4545761567038163, "grad_norm": 2.1858348846435547, "learning_rate": 1.1956267182681265e-05, "loss": 0.5736, "step": 1346 }, { "epoch": 0.45491388044579534, "grad_norm": 2.0561635494232178, "learning_rate": 1.1945538674052975e-05, "loss": 0.5438, "step": 1347 }, { "epoch": 0.4552516041877744, "grad_norm": 1.7773709297180176, "learning_rate": 1.193480783749049e-05, "loss": 0.5197, "step": 1348 }, { "epoch": 0.4555893279297535, "grad_norm": 2.285090446472168, "learning_rate": 1.1924074685833797e-05, "loss": 0.5575, "step": 1349 }, { "epoch": 0.45592705167173253, "grad_norm": 2.0313315391540527, "learning_rate": 1.1913339231925642e-05, "loss": 0.4918, "step": 1350 }, { "epoch": 0.4562647754137116, "grad_norm": 6.8018798828125, "learning_rate": 1.190260148861154e-05, "loss": 0.5444, "step": 1351 }, { "epoch": 0.4566024991556906, "grad_norm": 1.8594437837600708, "learning_rate": 1.1891861468739729e-05, "loss": 0.5465, "step": 1352 }, { "epoch": 0.4569402228976697, "grad_norm": 1.8235805034637451, "learning_rate": 1.1881119185161183e-05, "loss": 0.4862, "step": 1353 }, { "epoch": 0.45727794663964877, "grad_norm": 2.2161338329315186, "learning_rate": 1.1870374650729582e-05, "loss": 0.5444, "step": 1354 }, { "epoch": 0.4576156703816278, "grad_norm": 1.6504031419754028, "learning_rate": 1.1859627878301293e-05, "loss": 0.5117, "step": 1355 }, { "epoch": 0.4579533941236069, "grad_norm": 1.9400750398635864, "learning_rate": 1.1848878880735374e-05, "loss": 0.5369, "step": 1356 }, { "epoch": 0.45829111786558596, "grad_norm": 5.983567237854004, "learning_rate": 1.1838127670893532e-05, "loss": 0.5143, "step": 1357 }, { "epoch": 0.458628841607565, "grad_norm": 2.2216289043426514, "learning_rate": 1.1827374261640128e-05, "loss": 0.5065, "step": 1358 }, { "epoch": 0.45896656534954405, "grad_norm": 4.507613658905029, "learning_rate": 1.181661866584215e-05, "loss": 0.5449, "step": 1359 }, { "epoch": 0.45930428909152315, "grad_norm": 1.9921975135803223, "learning_rate": 1.1805860896369212e-05, "loss": 0.5684, "step": 1360 }, { "epoch": 0.4596420128335022, "grad_norm": 2.42618465423584, "learning_rate": 1.1795100966093516e-05, "loss": 0.5048, "step": 1361 }, { "epoch": 0.45997973657548125, "grad_norm": 5.405630588531494, "learning_rate": 1.1784338887889858e-05, "loss": 0.5296, "step": 1362 }, { "epoch": 0.4603174603174603, "grad_norm": 2.1250035762786865, "learning_rate": 1.1773574674635604e-05, "loss": 0.5825, "step": 1363 }, { "epoch": 0.4606551840594394, "grad_norm": 2.010023355484009, "learning_rate": 1.1762808339210672e-05, "loss": 0.5272, "step": 1364 }, { "epoch": 0.46099290780141844, "grad_norm": 3.388935089111328, "learning_rate": 1.1752039894497518e-05, "loss": 0.5201, "step": 1365 }, { "epoch": 0.4613306315433975, "grad_norm": 1.9226844310760498, "learning_rate": 1.1741269353381128e-05, "loss": 0.5508, "step": 1366 }, { "epoch": 0.4616683552853766, "grad_norm": 1.8691760301589966, "learning_rate": 1.173049672874899e-05, "loss": 0.5198, "step": 1367 }, { "epoch": 0.46200607902735563, "grad_norm": 2.1674916744232178, "learning_rate": 1.1719722033491086e-05, "loss": 0.5579, "step": 1368 }, { "epoch": 0.4623438027693347, "grad_norm": 2.432072162628174, "learning_rate": 1.1708945280499879e-05, "loss": 0.5379, "step": 1369 }, { "epoch": 0.4626815265113137, "grad_norm": 1.820574402809143, "learning_rate": 1.1698166482670293e-05, "loss": 0.5339, "step": 1370 }, { "epoch": 0.4630192502532928, "grad_norm": 2.3101956844329834, "learning_rate": 1.1687385652899694e-05, "loss": 0.506, "step": 1371 }, { "epoch": 0.46335697399527187, "grad_norm": 1.8386046886444092, "learning_rate": 1.1676602804087887e-05, "loss": 0.5503, "step": 1372 }, { "epoch": 0.4636946977372509, "grad_norm": 2.204070806503296, "learning_rate": 1.1665817949137086e-05, "loss": 0.5593, "step": 1373 }, { "epoch": 0.46403242147923, "grad_norm": 2.1889779567718506, "learning_rate": 1.165503110095191e-05, "loss": 0.517, "step": 1374 }, { "epoch": 0.46437014522120906, "grad_norm": 1.863933801651001, "learning_rate": 1.1644242272439365e-05, "loss": 0.5613, "step": 1375 }, { "epoch": 0.4647078689631881, "grad_norm": 2.391925811767578, "learning_rate": 1.1633451476508819e-05, "loss": 0.5737, "step": 1376 }, { "epoch": 0.46504559270516715, "grad_norm": 1.7387486696243286, "learning_rate": 1.1622658726072e-05, "loss": 0.5264, "step": 1377 }, { "epoch": 0.46538331644714626, "grad_norm": 2.3276591300964355, "learning_rate": 1.1611864034042972e-05, "loss": 0.5171, "step": 1378 }, { "epoch": 0.4657210401891253, "grad_norm": 1.742932677268982, "learning_rate": 1.1601067413338125e-05, "loss": 0.5447, "step": 1379 }, { "epoch": 0.46605876393110435, "grad_norm": 1.749743103981018, "learning_rate": 1.1590268876876151e-05, "loss": 0.5058, "step": 1380 }, { "epoch": 0.4663964876730834, "grad_norm": 1.9074084758758545, "learning_rate": 1.1579468437578045e-05, "loss": 0.5418, "step": 1381 }, { "epoch": 0.4667342114150625, "grad_norm": 2.0347349643707275, "learning_rate": 1.1568666108367066e-05, "loss": 0.5186, "step": 1382 }, { "epoch": 0.46707193515704154, "grad_norm": 1.8635331392288208, "learning_rate": 1.1557861902168739e-05, "loss": 0.5369, "step": 1383 }, { "epoch": 0.4674096588990206, "grad_norm": 1.8963871002197266, "learning_rate": 1.1547055831910841e-05, "loss": 0.5125, "step": 1384 }, { "epoch": 0.4677473826409997, "grad_norm": 2.6138548851013184, "learning_rate": 1.1536247910523372e-05, "loss": 0.5216, "step": 1385 }, { "epoch": 0.46808510638297873, "grad_norm": 1.663755178451538, "learning_rate": 1.1525438150938554e-05, "loss": 0.5291, "step": 1386 }, { "epoch": 0.4684228301249578, "grad_norm": 1.7594465017318726, "learning_rate": 1.15146265660908e-05, "loss": 0.5105, "step": 1387 }, { "epoch": 0.4687605538669368, "grad_norm": 1.856480360031128, "learning_rate": 1.1503813168916715e-05, "loss": 0.5294, "step": 1388 }, { "epoch": 0.4690982776089159, "grad_norm": 1.990504264831543, "learning_rate": 1.1492997972355062e-05, "loss": 0.5537, "step": 1389 }, { "epoch": 0.46943600135089497, "grad_norm": 1.6265802383422852, "learning_rate": 1.1482180989346771e-05, "loss": 0.4917, "step": 1390 }, { "epoch": 0.469773725092874, "grad_norm": 2.977954387664795, "learning_rate": 1.1471362232834898e-05, "loss": 0.5221, "step": 1391 }, { "epoch": 0.47011144883485306, "grad_norm": 2.3611598014831543, "learning_rate": 1.1460541715764628e-05, "loss": 0.5538, "step": 1392 }, { "epoch": 0.47044917257683216, "grad_norm": 1.9047822952270508, "learning_rate": 1.1449719451083247e-05, "loss": 0.5502, "step": 1393 }, { "epoch": 0.4707868963188112, "grad_norm": 1.8995492458343506, "learning_rate": 1.1438895451740141e-05, "loss": 0.5224, "step": 1394 }, { "epoch": 0.47112462006079026, "grad_norm": 2.1381924152374268, "learning_rate": 1.142806973068676e-05, "loss": 0.5501, "step": 1395 }, { "epoch": 0.47146234380276936, "grad_norm": 2.7795028686523438, "learning_rate": 1.1417242300876621e-05, "loss": 0.4859, "step": 1396 }, { "epoch": 0.4718000675447484, "grad_norm": 1.9494094848632812, "learning_rate": 1.1406413175265288e-05, "loss": 0.5342, "step": 1397 }, { "epoch": 0.47213779128672745, "grad_norm": 7.282893657684326, "learning_rate": 1.1395582366810348e-05, "loss": 0.5592, "step": 1398 }, { "epoch": 0.4724755150287065, "grad_norm": 5.1556925773620605, "learning_rate": 1.1384749888471402e-05, "loss": 0.5515, "step": 1399 }, { "epoch": 0.4728132387706856, "grad_norm": 2.098949670791626, "learning_rate": 1.1373915753210056e-05, "loss": 0.5026, "step": 1400 }, { "epoch": 0.47315096251266464, "grad_norm": 1.8055815696716309, "learning_rate": 1.1363079973989892e-05, "loss": 0.5292, "step": 1401 }, { "epoch": 0.4734886862546437, "grad_norm": 1.9953815937042236, "learning_rate": 1.135224256377646e-05, "loss": 0.5603, "step": 1402 }, { "epoch": 0.4738264099966228, "grad_norm": 2.116457223892212, "learning_rate": 1.1341403535537265e-05, "loss": 0.5161, "step": 1403 }, { "epoch": 0.47416413373860183, "grad_norm": 3.0207958221435547, "learning_rate": 1.1330562902241742e-05, "loss": 0.5127, "step": 1404 }, { "epoch": 0.4745018574805809, "grad_norm": 2.3466291427612305, "learning_rate": 1.1319720676861252e-05, "loss": 0.4893, "step": 1405 }, { "epoch": 0.4748395812225599, "grad_norm": 2.0848605632781982, "learning_rate": 1.1308876872369062e-05, "loss": 0.5068, "step": 1406 }, { "epoch": 0.475177304964539, "grad_norm": 2.260826349258423, "learning_rate": 1.1298031501740321e-05, "loss": 0.5398, "step": 1407 }, { "epoch": 0.4755150287065181, "grad_norm": 2.2539381980895996, "learning_rate": 1.128718457795206e-05, "loss": 0.5345, "step": 1408 }, { "epoch": 0.4758527524484971, "grad_norm": 2.615246295928955, "learning_rate": 1.1276336113983161e-05, "loss": 0.5517, "step": 1409 }, { "epoch": 0.47619047619047616, "grad_norm": 2.175102472305298, "learning_rate": 1.1265486122814359e-05, "loss": 0.5565, "step": 1410 }, { "epoch": 0.47652819993245527, "grad_norm": 1.964256763458252, "learning_rate": 1.1254634617428206e-05, "loss": 0.5211, "step": 1411 }, { "epoch": 0.4768659236744343, "grad_norm": 2.0367772579193115, "learning_rate": 1.124378161080907e-05, "loss": 0.5718, "step": 1412 }, { "epoch": 0.47720364741641336, "grad_norm": 2.357672691345215, "learning_rate": 1.1232927115943118e-05, "loss": 0.5015, "step": 1413 }, { "epoch": 0.47754137115839246, "grad_norm": 2.0370583534240723, "learning_rate": 1.1222071145818293e-05, "loss": 0.5036, "step": 1414 }, { "epoch": 0.4778790949003715, "grad_norm": 1.8354780673980713, "learning_rate": 1.1211213713424307e-05, "loss": 0.5428, "step": 1415 }, { "epoch": 0.47821681864235055, "grad_norm": 2.677562952041626, "learning_rate": 1.120035483175262e-05, "loss": 0.5272, "step": 1416 }, { "epoch": 0.4785545423843296, "grad_norm": 2.1220898628234863, "learning_rate": 1.1189494513796424e-05, "loss": 0.5774, "step": 1417 }, { "epoch": 0.4788922661263087, "grad_norm": 2.526350736618042, "learning_rate": 1.1178632772550636e-05, "loss": 0.5589, "step": 1418 }, { "epoch": 0.47922998986828774, "grad_norm": 2.6926231384277344, "learning_rate": 1.1167769621011868e-05, "loss": 0.5339, "step": 1419 }, { "epoch": 0.4795677136102668, "grad_norm": 2.8983354568481445, "learning_rate": 1.1156905072178425e-05, "loss": 0.5077, "step": 1420 }, { "epoch": 0.4799054373522459, "grad_norm": 2.288973808288574, "learning_rate": 1.1146039139050284e-05, "loss": 0.5432, "step": 1421 }, { "epoch": 0.48024316109422494, "grad_norm": 1.9885094165802002, "learning_rate": 1.113517183462907e-05, "loss": 0.5373, "step": 1422 }, { "epoch": 0.480580884836204, "grad_norm": 2.1156842708587646, "learning_rate": 1.1124303171918067e-05, "loss": 0.5378, "step": 1423 }, { "epoch": 0.480918608578183, "grad_norm": 4.254412651062012, "learning_rate": 1.1113433163922161e-05, "loss": 0.5739, "step": 1424 }, { "epoch": 0.48125633232016213, "grad_norm": 1.8133471012115479, "learning_rate": 1.1102561823647866e-05, "loss": 0.514, "step": 1425 }, { "epoch": 0.4815940560621412, "grad_norm": 1.7346327304840088, "learning_rate": 1.1091689164103281e-05, "loss": 0.5145, "step": 1426 }, { "epoch": 0.4819317798041202, "grad_norm": 1.8896650075912476, "learning_rate": 1.1080815198298089e-05, "loss": 0.5077, "step": 1427 }, { "epoch": 0.48226950354609927, "grad_norm": 1.8676141500473022, "learning_rate": 1.1069939939243531e-05, "loss": 0.5347, "step": 1428 }, { "epoch": 0.48260722728807837, "grad_norm": 1.9662107229232788, "learning_rate": 1.10590633999524e-05, "loss": 0.5161, "step": 1429 }, { "epoch": 0.4829449510300574, "grad_norm": 1.6622718572616577, "learning_rate": 1.1048185593439014e-05, "loss": 0.513, "step": 1430 }, { "epoch": 0.48328267477203646, "grad_norm": 1.900046706199646, "learning_rate": 1.1037306532719214e-05, "loss": 0.5234, "step": 1431 }, { "epoch": 0.48362039851401556, "grad_norm": 1.691713571548462, "learning_rate": 1.1026426230810342e-05, "loss": 0.5428, "step": 1432 }, { "epoch": 0.4839581222559946, "grad_norm": 2.074925184249878, "learning_rate": 1.101554470073122e-05, "loss": 0.5677, "step": 1433 }, { "epoch": 0.48429584599797365, "grad_norm": 2.1954033374786377, "learning_rate": 1.1004661955502143e-05, "loss": 0.5262, "step": 1434 }, { "epoch": 0.4846335697399527, "grad_norm": 2.346193552017212, "learning_rate": 1.0993778008144858e-05, "loss": 0.5043, "step": 1435 }, { "epoch": 0.4849712934819318, "grad_norm": 1.9933663606643677, "learning_rate": 1.0982892871682556e-05, "loss": 0.5422, "step": 1436 }, { "epoch": 0.48530901722391084, "grad_norm": 1.9431201219558716, "learning_rate": 1.0972006559139836e-05, "loss": 0.5505, "step": 1437 }, { "epoch": 0.4856467409658899, "grad_norm": 1.99745774269104, "learning_rate": 1.0961119083542727e-05, "loss": 0.5802, "step": 1438 }, { "epoch": 0.485984464707869, "grad_norm": 2.2026174068450928, "learning_rate": 1.095023045791863e-05, "loss": 0.5309, "step": 1439 }, { "epoch": 0.48632218844984804, "grad_norm": 1.8316936492919922, "learning_rate": 1.0939340695296332e-05, "loss": 0.5394, "step": 1440 }, { "epoch": 0.4866599121918271, "grad_norm": 1.716264009475708, "learning_rate": 1.0928449808705976e-05, "loss": 0.5016, "step": 1441 }, { "epoch": 0.48699763593380613, "grad_norm": 1.8066456317901611, "learning_rate": 1.0917557811179057e-05, "loss": 0.5116, "step": 1442 }, { "epoch": 0.48733535967578523, "grad_norm": 2.245302677154541, "learning_rate": 1.0906664715748385e-05, "loss": 0.568, "step": 1443 }, { "epoch": 0.4876730834177643, "grad_norm": 2.199521541595459, "learning_rate": 1.0895770535448103e-05, "loss": 0.5655, "step": 1444 }, { "epoch": 0.4880108071597433, "grad_norm": 2.20798397064209, "learning_rate": 1.0884875283313632e-05, "loss": 0.5055, "step": 1445 }, { "epoch": 0.48834853090172237, "grad_norm": 1.973873496055603, "learning_rate": 1.0873978972381692e-05, "loss": 0.5696, "step": 1446 }, { "epoch": 0.48868625464370147, "grad_norm": 2.3492164611816406, "learning_rate": 1.0863081615690259e-05, "loss": 0.5864, "step": 1447 }, { "epoch": 0.4890239783856805, "grad_norm": 1.991562843322754, "learning_rate": 1.0852183226278568e-05, "loss": 0.5569, "step": 1448 }, { "epoch": 0.48936170212765956, "grad_norm": 2.045905590057373, "learning_rate": 1.0841283817187085e-05, "loss": 0.5613, "step": 1449 }, { "epoch": 0.48969942586963866, "grad_norm": 1.679823637008667, "learning_rate": 1.0830383401457499e-05, "loss": 0.5046, "step": 1450 }, { "epoch": 0.4900371496116177, "grad_norm": 2.105825662612915, "learning_rate": 1.0819481992132698e-05, "loss": 0.5214, "step": 1451 }, { "epoch": 0.49037487335359675, "grad_norm": 1.657583236694336, "learning_rate": 1.0808579602256766e-05, "loss": 0.5247, "step": 1452 }, { "epoch": 0.4907125970955758, "grad_norm": 1.7162933349609375, "learning_rate": 1.0797676244874954e-05, "loss": 0.5107, "step": 1453 }, { "epoch": 0.4910503208375549, "grad_norm": 1.8648332357406616, "learning_rate": 1.0786771933033677e-05, "loss": 0.5566, "step": 1454 }, { "epoch": 0.49138804457953394, "grad_norm": 1.5353182554244995, "learning_rate": 1.077586667978049e-05, "loss": 0.5179, "step": 1455 }, { "epoch": 0.491725768321513, "grad_norm": 1.8796603679656982, "learning_rate": 1.0764960498164066e-05, "loss": 0.5304, "step": 1456 }, { "epoch": 0.49206349206349204, "grad_norm": 1.7509042024612427, "learning_rate": 1.0754053401234202e-05, "loss": 0.5279, "step": 1457 }, { "epoch": 0.49240121580547114, "grad_norm": 2.2565176486968994, "learning_rate": 1.0743145402041781e-05, "loss": 0.5685, "step": 1458 }, { "epoch": 0.4927389395474502, "grad_norm": 1.963194489479065, "learning_rate": 1.0732236513638772e-05, "loss": 0.5072, "step": 1459 }, { "epoch": 0.49307666328942923, "grad_norm": 2.0603153705596924, "learning_rate": 1.0721326749078205e-05, "loss": 0.5555, "step": 1460 }, { "epoch": 0.49341438703140833, "grad_norm": 1.8885610103607178, "learning_rate": 1.0710416121414154e-05, "loss": 0.5286, "step": 1461 }, { "epoch": 0.4937521107733874, "grad_norm": 1.919648289680481, "learning_rate": 1.0699504643701732e-05, "loss": 0.5452, "step": 1462 }, { "epoch": 0.4940898345153664, "grad_norm": 1.7565088272094727, "learning_rate": 1.068859232899707e-05, "loss": 0.5372, "step": 1463 }, { "epoch": 0.49442755825734547, "grad_norm": 1.9612773656845093, "learning_rate": 1.0677679190357292e-05, "loss": 0.5376, "step": 1464 }, { "epoch": 0.49476528199932457, "grad_norm": 2.4354312419891357, "learning_rate": 1.066676524084052e-05, "loss": 0.5261, "step": 1465 }, { "epoch": 0.4951030057413036, "grad_norm": 1.7740994691848755, "learning_rate": 1.0655850493505834e-05, "loss": 0.553, "step": 1466 }, { "epoch": 0.49544072948328266, "grad_norm": 2.0729455947875977, "learning_rate": 1.0644934961413276e-05, "loss": 0.5286, "step": 1467 }, { "epoch": 0.49577845322526176, "grad_norm": 1.7798141241073608, "learning_rate": 1.0634018657623827e-05, "loss": 0.4842, "step": 1468 }, { "epoch": 0.4961161769672408, "grad_norm": 2.007589817047119, "learning_rate": 1.0623101595199386e-05, "loss": 0.5401, "step": 1469 }, { "epoch": 0.49645390070921985, "grad_norm": 2.0371532440185547, "learning_rate": 1.0612183787202768e-05, "loss": 0.5246, "step": 1470 }, { "epoch": 0.4967916244511989, "grad_norm": 2.131265163421631, "learning_rate": 1.0601265246697672e-05, "loss": 0.4774, "step": 1471 }, { "epoch": 0.497129348193178, "grad_norm": 1.8842239379882812, "learning_rate": 1.059034598674868e-05, "loss": 0.5478, "step": 1472 }, { "epoch": 0.49746707193515705, "grad_norm": 2.1403729915618896, "learning_rate": 1.0579426020421231e-05, "loss": 0.5319, "step": 1473 }, { "epoch": 0.4978047956771361, "grad_norm": 1.6586905717849731, "learning_rate": 1.0568505360781606e-05, "loss": 0.4938, "step": 1474 }, { "epoch": 0.49814251941911514, "grad_norm": 1.9929665327072144, "learning_rate": 1.055758402089693e-05, "loss": 0.5254, "step": 1475 }, { "epoch": 0.49848024316109424, "grad_norm": 2.6981818675994873, "learning_rate": 1.0546662013835119e-05, "loss": 0.5582, "step": 1476 }, { "epoch": 0.4988179669030733, "grad_norm": 1.9506086111068726, "learning_rate": 1.0535739352664907e-05, "loss": 0.5126, "step": 1477 }, { "epoch": 0.49915569064505233, "grad_norm": 2.6548616886138916, "learning_rate": 1.0524816050455801e-05, "loss": 0.5201, "step": 1478 }, { "epoch": 0.49949341438703143, "grad_norm": 2.252875804901123, "learning_rate": 1.051389212027808e-05, "loss": 0.5622, "step": 1479 }, { "epoch": 0.4998311381290105, "grad_norm": 1.7515404224395752, "learning_rate": 1.0502967575202769e-05, "loss": 0.5035, "step": 1480 }, { "epoch": 0.5001688618709895, "grad_norm": 1.911763310432434, "learning_rate": 1.049204242830164e-05, "loss": 0.4665, "step": 1481 }, { "epoch": 0.5005065856129686, "grad_norm": 1.9344418048858643, "learning_rate": 1.0481116692647165e-05, "loss": 0.5119, "step": 1482 }, { "epoch": 0.5008443093549476, "grad_norm": 1.9650321006774902, "learning_rate": 1.047019038131254e-05, "loss": 0.5498, "step": 1483 }, { "epoch": 0.5011820330969267, "grad_norm": 1.8808964490890503, "learning_rate": 1.045926350737164e-05, "loss": 0.5215, "step": 1484 }, { "epoch": 0.5015197568389058, "grad_norm": 2.03971004486084, "learning_rate": 1.0448336083899018e-05, "loss": 0.5176, "step": 1485 }, { "epoch": 0.5018574805808849, "grad_norm": 1.872604250907898, "learning_rate": 1.0437408123969877e-05, "loss": 0.5397, "step": 1486 }, { "epoch": 0.5021952043228639, "grad_norm": 1.920541524887085, "learning_rate": 1.042647964066007e-05, "loss": 0.5347, "step": 1487 }, { "epoch": 0.502532928064843, "grad_norm": 1.6658520698547363, "learning_rate": 1.0415550647046074e-05, "loss": 0.4939, "step": 1488 }, { "epoch": 0.502870651806822, "grad_norm": 1.8827826976776123, "learning_rate": 1.0404621156204972e-05, "loss": 0.5286, "step": 1489 }, { "epoch": 0.503208375548801, "grad_norm": 1.709823489189148, "learning_rate": 1.039369118121445e-05, "loss": 0.5192, "step": 1490 }, { "epoch": 0.5035460992907801, "grad_norm": 1.7468341588974, "learning_rate": 1.0382760735152765e-05, "loss": 0.5024, "step": 1491 }, { "epoch": 0.5038838230327592, "grad_norm": 1.7445966005325317, "learning_rate": 1.0371829831098747e-05, "loss": 0.488, "step": 1492 }, { "epoch": 0.5042215467747383, "grad_norm": 1.738606333732605, "learning_rate": 1.0360898482131762e-05, "loss": 0.5171, "step": 1493 }, { "epoch": 0.5045592705167173, "grad_norm": 3.38631272315979, "learning_rate": 1.0349966701331721e-05, "loss": 0.5349, "step": 1494 }, { "epoch": 0.5048969942586964, "grad_norm": 1.9066959619522095, "learning_rate": 1.033903450177904e-05, "loss": 0.5422, "step": 1495 }, { "epoch": 0.5052347180006754, "grad_norm": 1.5557810068130493, "learning_rate": 1.0328101896554647e-05, "loss": 0.5097, "step": 1496 }, { "epoch": 0.5055724417426545, "grad_norm": 2.258446216583252, "learning_rate": 1.0317168898739947e-05, "loss": 0.5672, "step": 1497 }, { "epoch": 0.5059101654846335, "grad_norm": 1.5088642835617065, "learning_rate": 1.0306235521416822e-05, "loss": 0.4987, "step": 1498 }, { "epoch": 0.5062478892266127, "grad_norm": 1.808045744895935, "learning_rate": 1.0295301777667595e-05, "loss": 0.5459, "step": 1499 }, { "epoch": 0.5065856129685917, "grad_norm": 1.6580668687820435, "learning_rate": 1.0284367680575045e-05, "loss": 0.5232, "step": 1500 }, { "epoch": 0.5069233367105708, "grad_norm": 2.273622989654541, "learning_rate": 1.0273433243222364e-05, "loss": 0.5396, "step": 1501 }, { "epoch": 0.5072610604525498, "grad_norm": 1.7503571510314941, "learning_rate": 1.0262498478693148e-05, "loss": 0.5082, "step": 1502 }, { "epoch": 0.5075987841945289, "grad_norm": 1.8804943561553955, "learning_rate": 1.0251563400071395e-05, "loss": 0.5495, "step": 1503 }, { "epoch": 0.5079365079365079, "grad_norm": 1.6725060939788818, "learning_rate": 1.0240628020441468e-05, "loss": 0.5344, "step": 1504 }, { "epoch": 0.508274231678487, "grad_norm": 1.7230769395828247, "learning_rate": 1.02296923528881e-05, "loss": 0.4996, "step": 1505 }, { "epoch": 0.5086119554204661, "grad_norm": 1.7503342628479004, "learning_rate": 1.0218756410496353e-05, "loss": 0.5261, "step": 1506 }, { "epoch": 0.5089496791624452, "grad_norm": 1.6665900945663452, "learning_rate": 1.020782020635164e-05, "loss": 0.5286, "step": 1507 }, { "epoch": 0.5092874029044242, "grad_norm": 1.9903472661972046, "learning_rate": 1.019688375353967e-05, "loss": 0.544, "step": 1508 }, { "epoch": 0.5096251266464032, "grad_norm": 1.6918261051177979, "learning_rate": 1.0185947065146455e-05, "loss": 0.5102, "step": 1509 }, { "epoch": 0.5099628503883823, "grad_norm": 1.739233374595642, "learning_rate": 1.0175010154258288e-05, "loss": 0.5427, "step": 1510 }, { "epoch": 0.5103005741303613, "grad_norm": 1.4563263654708862, "learning_rate": 1.0164073033961733e-05, "loss": 0.5174, "step": 1511 }, { "epoch": 0.5106382978723404, "grad_norm": 2.173114061355591, "learning_rate": 1.0153135717343599e-05, "loss": 0.5266, "step": 1512 }, { "epoch": 0.5109760216143195, "grad_norm": 1.8180910348892212, "learning_rate": 1.0142198217490929e-05, "loss": 0.5184, "step": 1513 }, { "epoch": 0.5113137453562986, "grad_norm": 2.618863344192505, "learning_rate": 1.013126054749099e-05, "loss": 0.5183, "step": 1514 }, { "epoch": 0.5116514690982776, "grad_norm": 1.9329898357391357, "learning_rate": 1.012032272043125e-05, "loss": 0.5494, "step": 1515 }, { "epoch": 0.5119891928402567, "grad_norm": 1.5973321199417114, "learning_rate": 1.0109384749399369e-05, "loss": 0.5103, "step": 1516 }, { "epoch": 0.5123269165822357, "grad_norm": 2.090709924697876, "learning_rate": 1.0098446647483166e-05, "loss": 0.5252, "step": 1517 }, { "epoch": 0.5126646403242148, "grad_norm": 1.8020439147949219, "learning_rate": 1.0087508427770639e-05, "loss": 0.5284, "step": 1518 }, { "epoch": 0.5130023640661938, "grad_norm": 1.9082454442977905, "learning_rate": 1.0076570103349902e-05, "loss": 0.4898, "step": 1519 }, { "epoch": 0.5133400878081729, "grad_norm": 1.7775554656982422, "learning_rate": 1.0065631687309217e-05, "loss": 0.5068, "step": 1520 }, { "epoch": 0.513677811550152, "grad_norm": 2.0219433307647705, "learning_rate": 1.0054693192736936e-05, "loss": 0.5323, "step": 1521 }, { "epoch": 0.5140155352921311, "grad_norm": 1.930516004562378, "learning_rate": 1.0043754632721519e-05, "loss": 0.5326, "step": 1522 }, { "epoch": 0.5143532590341101, "grad_norm": 1.929200530052185, "learning_rate": 1.0032816020351498e-05, "loss": 0.5687, "step": 1523 }, { "epoch": 0.5146909827760892, "grad_norm": 2.040872097015381, "learning_rate": 1.0021877368715473e-05, "loss": 0.5875, "step": 1524 }, { "epoch": 0.5150287065180682, "grad_norm": 1.9415181875228882, "learning_rate": 1.0010938690902081e-05, "loss": 0.5419, "step": 1525 }, { "epoch": 0.5153664302600472, "grad_norm": 1.88204026222229, "learning_rate": 1e-05, "loss": 0.503, "step": 1526 }, { "epoch": 0.5157041540020263, "grad_norm": 1.740562915802002, "learning_rate": 9.989061309097922e-06, "loss": 0.495, "step": 1527 }, { "epoch": 0.5160418777440055, "grad_norm": 2.4091262817382812, "learning_rate": 9.978122631284528e-06, "loss": 0.4964, "step": 1528 }, { "epoch": 0.5163796014859845, "grad_norm": 2.142996311187744, "learning_rate": 9.967183979648503e-06, "loss": 0.5217, "step": 1529 }, { "epoch": 0.5167173252279635, "grad_norm": 1.9230830669403076, "learning_rate": 9.956245367278483e-06, "loss": 0.5432, "step": 1530 }, { "epoch": 0.5170550489699426, "grad_norm": 1.8235795497894287, "learning_rate": 9.945306807263069e-06, "loss": 0.5229, "step": 1531 }, { "epoch": 0.5173927727119216, "grad_norm": 1.8464144468307495, "learning_rate": 9.934368312690788e-06, "loss": 0.5443, "step": 1532 }, { "epoch": 0.5177304964539007, "grad_norm": 3.6297049522399902, "learning_rate": 9.923429896650103e-06, "loss": 0.4909, "step": 1533 }, { "epoch": 0.5180682201958797, "grad_norm": 2.0108883380889893, "learning_rate": 9.912491572229366e-06, "loss": 0.5578, "step": 1534 }, { "epoch": 0.5184059439378589, "grad_norm": 2.716996669769287, "learning_rate": 9.901553352516837e-06, "loss": 0.522, "step": 1535 }, { "epoch": 0.5187436676798379, "grad_norm": 2.258030414581299, "learning_rate": 9.890615250600638e-06, "loss": 0.5402, "step": 1536 }, { "epoch": 0.519081391421817, "grad_norm": 2.0340538024902344, "learning_rate": 9.87967727956875e-06, "loss": 0.5407, "step": 1537 }, { "epoch": 0.519419115163796, "grad_norm": 2.6202571392059326, "learning_rate": 9.868739452509011e-06, "loss": 0.5416, "step": 1538 }, { "epoch": 0.5197568389057751, "grad_norm": 2.002957820892334, "learning_rate": 9.857801782509073e-06, "loss": 0.5824, "step": 1539 }, { "epoch": 0.5200945626477541, "grad_norm": 1.7350502014160156, "learning_rate": 9.846864282656404e-06, "loss": 0.5044, "step": 1540 }, { "epoch": 0.5204322863897332, "grad_norm": 1.864804983139038, "learning_rate": 9.835926966038267e-06, "loss": 0.5532, "step": 1541 }, { "epoch": 0.5207700101317123, "grad_norm": 1.7072511911392212, "learning_rate": 9.824989845741713e-06, "loss": 0.4849, "step": 1542 }, { "epoch": 0.5211077338736914, "grad_norm": 1.6465752124786377, "learning_rate": 9.814052934853547e-06, "loss": 0.4762, "step": 1543 }, { "epoch": 0.5214454576156704, "grad_norm": 2.249641180038452, "learning_rate": 9.803116246460333e-06, "loss": 0.555, "step": 1544 }, { "epoch": 0.5217831813576495, "grad_norm": 1.7904754877090454, "learning_rate": 9.792179793648362e-06, "loss": 0.5206, "step": 1545 }, { "epoch": 0.5221209050996285, "grad_norm": 2.1210522651672363, "learning_rate": 9.78124358950365e-06, "loss": 0.5462, "step": 1546 }, { "epoch": 0.5224586288416075, "grad_norm": 1.7868205308914185, "learning_rate": 9.770307647111906e-06, "loss": 0.4881, "step": 1547 }, { "epoch": 0.5227963525835866, "grad_norm": 1.8930609226226807, "learning_rate": 9.759371979558535e-06, "loss": 0.5318, "step": 1548 }, { "epoch": 0.5231340763255656, "grad_norm": 2.095503091812134, "learning_rate": 9.748436599928608e-06, "loss": 0.5534, "step": 1549 }, { "epoch": 0.5234718000675448, "grad_norm": 2.2088258266448975, "learning_rate": 9.737501521306855e-06, "loss": 0.5225, "step": 1550 }, { "epoch": 0.5238095238095238, "grad_norm": 2.03084397315979, "learning_rate": 9.72656675677764e-06, "loss": 0.5524, "step": 1551 }, { "epoch": 0.5241472475515029, "grad_norm": 6.673501968383789, "learning_rate": 9.715632319424959e-06, "loss": 0.5101, "step": 1552 }, { "epoch": 0.5244849712934819, "grad_norm": 2.6978635787963867, "learning_rate": 9.704698222332408e-06, "loss": 0.5385, "step": 1553 }, { "epoch": 0.524822695035461, "grad_norm": 1.809374213218689, "learning_rate": 9.693764478583185e-06, "loss": 0.5599, "step": 1554 }, { "epoch": 0.52516041877744, "grad_norm": 3.5883986949920654, "learning_rate": 9.682831101260056e-06, "loss": 0.5134, "step": 1555 }, { "epoch": 0.5254981425194191, "grad_norm": 1.8489106893539429, "learning_rate": 9.671898103445358e-06, "loss": 0.5211, "step": 1556 }, { "epoch": 0.5258358662613982, "grad_norm": 2.1008834838867188, "learning_rate": 9.660965498220962e-06, "loss": 0.5291, "step": 1557 }, { "epoch": 0.5261735900033773, "grad_norm": 1.7562167644500732, "learning_rate": 9.65003329866828e-06, "loss": 0.4787, "step": 1558 }, { "epoch": 0.5265113137453563, "grad_norm": 1.6364998817443848, "learning_rate": 9.63910151786824e-06, "loss": 0.5144, "step": 1559 }, { "epoch": 0.5268490374873354, "grad_norm": 2.909820556640625, "learning_rate": 9.628170168901255e-06, "loss": 0.5513, "step": 1560 }, { "epoch": 0.5271867612293144, "grad_norm": 1.7136751413345337, "learning_rate": 9.617239264847236e-06, "loss": 0.5273, "step": 1561 }, { "epoch": 0.5275244849712935, "grad_norm": 1.7352958917617798, "learning_rate": 9.606308818785552e-06, "loss": 0.5036, "step": 1562 }, { "epoch": 0.5278622087132725, "grad_norm": 1.874314308166504, "learning_rate": 9.59537884379503e-06, "loss": 0.5394, "step": 1563 }, { "epoch": 0.5281999324552517, "grad_norm": 2.1518850326538086, "learning_rate": 9.584449352953931e-06, "loss": 0.5086, "step": 1564 }, { "epoch": 0.5285376561972307, "grad_norm": 1.7159767150878906, "learning_rate": 9.573520359339934e-06, "loss": 0.5195, "step": 1565 }, { "epoch": 0.5288753799392097, "grad_norm": 1.6112059354782104, "learning_rate": 9.562591876030127e-06, "loss": 0.5414, "step": 1566 }, { "epoch": 0.5292131036811888, "grad_norm": 1.775585412979126, "learning_rate": 9.551663916100985e-06, "loss": 0.5149, "step": 1567 }, { "epoch": 0.5295508274231678, "grad_norm": 1.8109186887741089, "learning_rate": 9.540736492628364e-06, "loss": 0.5392, "step": 1568 }, { "epoch": 0.5298885511651469, "grad_norm": 4.941011905670166, "learning_rate": 9.529809618687462e-06, "loss": 0.4897, "step": 1569 }, { "epoch": 0.5302262749071259, "grad_norm": 1.6611965894699097, "learning_rate": 9.518883307352839e-06, "loss": 0.5469, "step": 1570 }, { "epoch": 0.5305639986491051, "grad_norm": 1.971314549446106, "learning_rate": 9.507957571698366e-06, "loss": 0.4862, "step": 1571 }, { "epoch": 0.5309017223910841, "grad_norm": 1.687429666519165, "learning_rate": 9.497032424797233e-06, "loss": 0.5484, "step": 1572 }, { "epoch": 0.5312394461330632, "grad_norm": 2.4173943996429443, "learning_rate": 9.486107879721924e-06, "loss": 0.5581, "step": 1573 }, { "epoch": 0.5315771698750422, "grad_norm": 2.043538808822632, "learning_rate": 9.475183949544204e-06, "loss": 0.5145, "step": 1574 }, { "epoch": 0.5319148936170213, "grad_norm": 2.5612406730651855, "learning_rate": 9.464260647335098e-06, "loss": 0.4995, "step": 1575 }, { "epoch": 0.5322526173590003, "grad_norm": 1.9762159585952759, "learning_rate": 9.453337986164888e-06, "loss": 0.5002, "step": 1576 }, { "epoch": 0.5325903411009794, "grad_norm": 1.9823102951049805, "learning_rate": 9.442415979103077e-06, "loss": 0.5445, "step": 1577 }, { "epoch": 0.5329280648429585, "grad_norm": 1.7082359790802002, "learning_rate": 9.431494639218397e-06, "loss": 0.5433, "step": 1578 }, { "epoch": 0.5332657885849376, "grad_norm": 2.0554702281951904, "learning_rate": 9.42057397957877e-06, "loss": 0.5367, "step": 1579 }, { "epoch": 0.5336035123269166, "grad_norm": 1.7927786111831665, "learning_rate": 9.40965401325132e-06, "loss": 0.5052, "step": 1580 }, { "epoch": 0.5339412360688957, "grad_norm": 2.1841979026794434, "learning_rate": 9.39873475330233e-06, "loss": 0.5775, "step": 1581 }, { "epoch": 0.5342789598108747, "grad_norm": 1.9746659994125366, "learning_rate": 9.387816212797233e-06, "loss": 0.5422, "step": 1582 }, { "epoch": 0.5346166835528537, "grad_norm": 1.8738089799880981, "learning_rate": 9.376898404800616e-06, "loss": 0.4961, "step": 1583 }, { "epoch": 0.5349544072948328, "grad_norm": 1.7445091009140015, "learning_rate": 9.365981342376175e-06, "loss": 0.5, "step": 1584 }, { "epoch": 0.5352921310368118, "grad_norm": 1.8311409950256348, "learning_rate": 9.355065038586727e-06, "loss": 0.4844, "step": 1585 }, { "epoch": 0.535629854778791, "grad_norm": 2.0527617931365967, "learning_rate": 9.344149506494169e-06, "loss": 0.517, "step": 1586 }, { "epoch": 0.53596757852077, "grad_norm": 1.8283382654190063, "learning_rate": 9.333234759159485e-06, "loss": 0.5355, "step": 1587 }, { "epoch": 0.5363053022627491, "grad_norm": 1.9923325777053833, "learning_rate": 9.32232080964271e-06, "loss": 0.5309, "step": 1588 }, { "epoch": 0.5366430260047281, "grad_norm": 1.6713335514068604, "learning_rate": 9.311407671002934e-06, "loss": 0.5278, "step": 1589 }, { "epoch": 0.5369807497467072, "grad_norm": 1.8075021505355835, "learning_rate": 9.30049535629827e-06, "loss": 0.5387, "step": 1590 }, { "epoch": 0.5373184734886862, "grad_norm": 1.739136815071106, "learning_rate": 9.28958387858585e-06, "loss": 0.4894, "step": 1591 }, { "epoch": 0.5376561972306653, "grad_norm": 1.7701231241226196, "learning_rate": 9.278673250921799e-06, "loss": 0.5172, "step": 1592 }, { "epoch": 0.5379939209726444, "grad_norm": 1.7097407579421997, "learning_rate": 9.267763486361231e-06, "loss": 0.559, "step": 1593 }, { "epoch": 0.5383316447146235, "grad_norm": 2.0252482891082764, "learning_rate": 9.256854597958222e-06, "loss": 0.5286, "step": 1594 }, { "epoch": 0.5386693684566025, "grad_norm": 1.779090166091919, "learning_rate": 9.245946598765803e-06, "loss": 0.4921, "step": 1595 }, { "epoch": 0.5390070921985816, "grad_norm": 1.9797908067703247, "learning_rate": 9.235039501835937e-06, "loss": 0.4919, "step": 1596 }, { "epoch": 0.5393448159405606, "grad_norm": 1.892739176750183, "learning_rate": 9.224133320219517e-06, "loss": 0.5628, "step": 1597 }, { "epoch": 0.5396825396825397, "grad_norm": 1.7846553325653076, "learning_rate": 9.213228066966328e-06, "loss": 0.5359, "step": 1598 }, { "epoch": 0.5400202634245187, "grad_norm": 1.9714604616165161, "learning_rate": 9.20232375512505e-06, "loss": 0.5006, "step": 1599 }, { "epoch": 0.5403579871664979, "grad_norm": 1.6296238899230957, "learning_rate": 9.191420397743236e-06, "loss": 0.5077, "step": 1600 }, { "epoch": 0.5406957109084769, "grad_norm": 1.6234601736068726, "learning_rate": 9.180518007867304e-06, "loss": 0.5117, "step": 1601 }, { "epoch": 0.541033434650456, "grad_norm": 1.615780234336853, "learning_rate": 9.169616598542503e-06, "loss": 0.5006, "step": 1602 }, { "epoch": 0.541371158392435, "grad_norm": 2.1265811920166016, "learning_rate": 9.158716182812917e-06, "loss": 0.5042, "step": 1603 }, { "epoch": 0.541708882134414, "grad_norm": 2.097017526626587, "learning_rate": 9.147816773721434e-06, "loss": 0.5223, "step": 1604 }, { "epoch": 0.5420466058763931, "grad_norm": 1.7654004096984863, "learning_rate": 9.136918384309742e-06, "loss": 0.5324, "step": 1605 }, { "epoch": 0.5423843296183721, "grad_norm": 2.4025394916534424, "learning_rate": 9.126021027618312e-06, "loss": 0.5502, "step": 1606 }, { "epoch": 0.5427220533603513, "grad_norm": 1.8416392803192139, "learning_rate": 9.115124716686372e-06, "loss": 0.5427, "step": 1607 }, { "epoch": 0.5430597771023303, "grad_norm": 1.8634899854660034, "learning_rate": 9.1042294645519e-06, "loss": 0.5094, "step": 1608 }, { "epoch": 0.5433975008443094, "grad_norm": 4.097016334533691, "learning_rate": 9.093335284251618e-06, "loss": 0.5125, "step": 1609 }, { "epoch": 0.5437352245862884, "grad_norm": 2.222107172012329, "learning_rate": 9.082442188820947e-06, "loss": 0.5657, "step": 1610 }, { "epoch": 0.5440729483282675, "grad_norm": 1.8485863208770752, "learning_rate": 9.071550191294025e-06, "loss": 0.5267, "step": 1611 }, { "epoch": 0.5444106720702465, "grad_norm": 1.920896053314209, "learning_rate": 9.060659304703671e-06, "loss": 0.5123, "step": 1612 }, { "epoch": 0.5447483958122256, "grad_norm": 1.9553619623184204, "learning_rate": 9.049769542081374e-06, "loss": 0.4891, "step": 1613 }, { "epoch": 0.5450861195542046, "grad_norm": 1.8318523168563843, "learning_rate": 9.038880916457276e-06, "loss": 0.5301, "step": 1614 }, { "epoch": 0.5454238432961838, "grad_norm": 2.148956537246704, "learning_rate": 9.027993440860168e-06, "loss": 0.5515, "step": 1615 }, { "epoch": 0.5457615670381628, "grad_norm": 1.850074291229248, "learning_rate": 9.017107128317451e-06, "loss": 0.5359, "step": 1616 }, { "epoch": 0.5460992907801419, "grad_norm": 1.7465953826904297, "learning_rate": 9.006221991855147e-06, "loss": 0.5079, "step": 1617 }, { "epoch": 0.5464370145221209, "grad_norm": 1.8963632583618164, "learning_rate": 8.995338044497862e-06, "loss": 0.5459, "step": 1618 }, { "epoch": 0.5467747382641, "grad_norm": 2.0997002124786377, "learning_rate": 8.984455299268785e-06, "loss": 0.5046, "step": 1619 }, { "epoch": 0.547112462006079, "grad_norm": 2.0972416400909424, "learning_rate": 8.973573769189662e-06, "loss": 0.4931, "step": 1620 }, { "epoch": 0.547450185748058, "grad_norm": 2.009843587875366, "learning_rate": 8.96269346728079e-06, "loss": 0.5293, "step": 1621 }, { "epoch": 0.5477879094900372, "grad_norm": 3.5727500915527344, "learning_rate": 8.951814406560988e-06, "loss": 0.512, "step": 1622 }, { "epoch": 0.5481256332320162, "grad_norm": 1.6909104585647583, "learning_rate": 8.940936600047602e-06, "loss": 0.5054, "step": 1623 }, { "epoch": 0.5484633569739953, "grad_norm": 2.0125372409820557, "learning_rate": 8.93006006075647e-06, "loss": 0.5253, "step": 1624 }, { "epoch": 0.5488010807159743, "grad_norm": 1.688283920288086, "learning_rate": 8.919184801701911e-06, "loss": 0.5256, "step": 1625 }, { "epoch": 0.5491388044579534, "grad_norm": 1.7490724325180054, "learning_rate": 8.90831083589672e-06, "loss": 0.4985, "step": 1626 }, { "epoch": 0.5494765281999324, "grad_norm": 1.6981830596923828, "learning_rate": 8.897438176352135e-06, "loss": 0.507, "step": 1627 }, { "epoch": 0.5498142519419115, "grad_norm": 1.9245517253875732, "learning_rate": 8.886566836077842e-06, "loss": 0.5135, "step": 1628 }, { "epoch": 0.5501519756838906, "grad_norm": 1.6885648965835571, "learning_rate": 8.875696828081936e-06, "loss": 0.5091, "step": 1629 }, { "epoch": 0.5504896994258697, "grad_norm": 1.552577257156372, "learning_rate": 8.864828165370932e-06, "loss": 0.5111, "step": 1630 }, { "epoch": 0.5508274231678487, "grad_norm": 1.9419220685958862, "learning_rate": 8.85396086094972e-06, "loss": 0.5313, "step": 1631 }, { "epoch": 0.5511651469098278, "grad_norm": 1.8913154602050781, "learning_rate": 8.843094927821578e-06, "loss": 0.5212, "step": 1632 }, { "epoch": 0.5515028706518068, "grad_norm": 1.6829161643981934, "learning_rate": 8.832230378988133e-06, "loss": 0.5312, "step": 1633 }, { "epoch": 0.5518405943937859, "grad_norm": 2.0374350547790527, "learning_rate": 8.821367227449368e-06, "loss": 0.5119, "step": 1634 }, { "epoch": 0.5521783181357649, "grad_norm": 1.7659964561462402, "learning_rate": 8.81050548620358e-06, "loss": 0.4778, "step": 1635 }, { "epoch": 0.5525160418777441, "grad_norm": 2.025928258895874, "learning_rate": 8.799645168247384e-06, "loss": 0.5411, "step": 1636 }, { "epoch": 0.5528537656197231, "grad_norm": 2.071823835372925, "learning_rate": 8.788786286575696e-06, "loss": 0.494, "step": 1637 }, { "epoch": 0.5531914893617021, "grad_norm": 1.9638333320617676, "learning_rate": 8.77792885418171e-06, "loss": 0.5394, "step": 1638 }, { "epoch": 0.5535292131036812, "grad_norm": 1.9916033744812012, "learning_rate": 8.767072884056886e-06, "loss": 0.5058, "step": 1639 }, { "epoch": 0.5538669368456602, "grad_norm": 1.9690932035446167, "learning_rate": 8.756218389190933e-06, "loss": 0.5353, "step": 1640 }, { "epoch": 0.5542046605876393, "grad_norm": 1.7852071523666382, "learning_rate": 8.745365382571799e-06, "loss": 0.5133, "step": 1641 }, { "epoch": 0.5545423843296183, "grad_norm": 1.4376115798950195, "learning_rate": 8.734513877185644e-06, "loss": 0.5268, "step": 1642 }, { "epoch": 0.5548801080715975, "grad_norm": 2.1602067947387695, "learning_rate": 8.723663886016839e-06, "loss": 0.5588, "step": 1643 }, { "epoch": 0.5552178318135765, "grad_norm": 2.2356278896331787, "learning_rate": 8.712815422047944e-06, "loss": 0.4992, "step": 1644 }, { "epoch": 0.5555555555555556, "grad_norm": 2.0913782119750977, "learning_rate": 8.70196849825968e-06, "loss": 0.4668, "step": 1645 }, { "epoch": 0.5558932792975346, "grad_norm": 1.9730517864227295, "learning_rate": 8.691123127630942e-06, "loss": 0.5267, "step": 1646 }, { "epoch": 0.5562310030395137, "grad_norm": 1.8405466079711914, "learning_rate": 8.680279323138748e-06, "loss": 0.5261, "step": 1647 }, { "epoch": 0.5565687267814927, "grad_norm": 1.9740381240844727, "learning_rate": 8.669437097758261e-06, "loss": 0.5439, "step": 1648 }, { "epoch": 0.5569064505234718, "grad_norm": 1.8386714458465576, "learning_rate": 8.658596464462737e-06, "loss": 0.5204, "step": 1649 }, { "epoch": 0.5572441742654508, "grad_norm": 1.8446617126464844, "learning_rate": 8.647757436223543e-06, "loss": 0.5287, "step": 1650 }, { "epoch": 0.55758189800743, "grad_norm": 1.7824242115020752, "learning_rate": 8.63692002601011e-06, "loss": 0.5155, "step": 1651 }, { "epoch": 0.557919621749409, "grad_norm": 2.7127113342285156, "learning_rate": 8.626084246789946e-06, "loss": 0.5529, "step": 1652 }, { "epoch": 0.5582573454913881, "grad_norm": 1.7103229761123657, "learning_rate": 8.6152501115286e-06, "loss": 0.5306, "step": 1653 }, { "epoch": 0.5585950692333671, "grad_norm": 2.137053966522217, "learning_rate": 8.604417633189658e-06, "loss": 0.548, "step": 1654 }, { "epoch": 0.5589327929753461, "grad_norm": 1.6931251287460327, "learning_rate": 8.593586824734714e-06, "loss": 0.5219, "step": 1655 }, { "epoch": 0.5592705167173252, "grad_norm": 1.7442986965179443, "learning_rate": 8.582757699123382e-06, "loss": 0.5193, "step": 1656 }, { "epoch": 0.5596082404593042, "grad_norm": 1.7256258726119995, "learning_rate": 8.571930269313242e-06, "loss": 0.5406, "step": 1657 }, { "epoch": 0.5599459642012834, "grad_norm": 1.661208987236023, "learning_rate": 8.561104548259864e-06, "loss": 0.5086, "step": 1658 }, { "epoch": 0.5602836879432624, "grad_norm": 1.6064434051513672, "learning_rate": 8.550280548916754e-06, "loss": 0.4815, "step": 1659 }, { "epoch": 0.5606214116852415, "grad_norm": 1.7884057760238647, "learning_rate": 8.539458284235375e-06, "loss": 0.5474, "step": 1660 }, { "epoch": 0.5609591354272205, "grad_norm": 1.958184003829956, "learning_rate": 8.528637767165104e-06, "loss": 0.5146, "step": 1661 }, { "epoch": 0.5612968591691996, "grad_norm": 1.6790597438812256, "learning_rate": 8.517819010653234e-06, "loss": 0.5038, "step": 1662 }, { "epoch": 0.5616345829111786, "grad_norm": 2.1039931774139404, "learning_rate": 8.50700202764494e-06, "loss": 0.5619, "step": 1663 }, { "epoch": 0.5619723066531577, "grad_norm": 1.7159714698791504, "learning_rate": 8.496186831083286e-06, "loss": 0.5103, "step": 1664 }, { "epoch": 0.5623100303951368, "grad_norm": 2.2547850608825684, "learning_rate": 8.485373433909202e-06, "loss": 0.4942, "step": 1665 }, { "epoch": 0.5626477541371159, "grad_norm": 1.79918372631073, "learning_rate": 8.474561849061446e-06, "loss": 0.4915, "step": 1666 }, { "epoch": 0.5629854778790949, "grad_norm": 1.98334538936615, "learning_rate": 8.463752089476627e-06, "loss": 0.5503, "step": 1667 }, { "epoch": 0.563323201621074, "grad_norm": 1.5230368375778198, "learning_rate": 8.452944168089159e-06, "loss": 0.4776, "step": 1668 }, { "epoch": 0.563660925363053, "grad_norm": 1.5479648113250732, "learning_rate": 8.442138097831263e-06, "loss": 0.5077, "step": 1669 }, { "epoch": 0.5639986491050321, "grad_norm": 1.6129554510116577, "learning_rate": 8.431333891632937e-06, "loss": 0.5218, "step": 1670 }, { "epoch": 0.5643363728470111, "grad_norm": 1.8073893785476685, "learning_rate": 8.420531562421959e-06, "loss": 0.5243, "step": 1671 }, { "epoch": 0.5646740965889903, "grad_norm": 2.0291173458099365, "learning_rate": 8.40973112312385e-06, "loss": 0.5251, "step": 1672 }, { "epoch": 0.5650118203309693, "grad_norm": 1.6789470911026, "learning_rate": 8.398932586661878e-06, "loss": 0.4728, "step": 1673 }, { "epoch": 0.5653495440729484, "grad_norm": 1.7683087587356567, "learning_rate": 8.388135965957031e-06, "loss": 0.5142, "step": 1674 }, { "epoch": 0.5656872678149274, "grad_norm": 2.021653652191162, "learning_rate": 8.377341273928001e-06, "loss": 0.5751, "step": 1675 }, { "epoch": 0.5660249915569064, "grad_norm": 1.632583498954773, "learning_rate": 8.366548523491184e-06, "loss": 0.5061, "step": 1676 }, { "epoch": 0.5663627152988855, "grad_norm": 1.7549958229064941, "learning_rate": 8.355757727560637e-06, "loss": 0.515, "step": 1677 }, { "epoch": 0.5667004390408645, "grad_norm": 1.7326964139938354, "learning_rate": 8.344968899048093e-06, "loss": 0.5193, "step": 1678 }, { "epoch": 0.5670381627828436, "grad_norm": 1.6446523666381836, "learning_rate": 8.334182050862919e-06, "loss": 0.5448, "step": 1679 }, { "epoch": 0.5673758865248227, "grad_norm": 1.5911864042282104, "learning_rate": 8.32339719591212e-06, "loss": 0.4974, "step": 1680 }, { "epoch": 0.5677136102668018, "grad_norm": 1.7952638864517212, "learning_rate": 8.312614347100311e-06, "loss": 0.5433, "step": 1681 }, { "epoch": 0.5680513340087808, "grad_norm": 1.5623303651809692, "learning_rate": 8.301833517329714e-06, "loss": 0.4942, "step": 1682 }, { "epoch": 0.5683890577507599, "grad_norm": 1.7778902053833008, "learning_rate": 8.291054719500125e-06, "loss": 0.5345, "step": 1683 }, { "epoch": 0.5687267814927389, "grad_norm": 2.0717406272888184, "learning_rate": 8.280277966508918e-06, "loss": 0.5013, "step": 1684 }, { "epoch": 0.569064505234718, "grad_norm": 1.599245309829712, "learning_rate": 8.269503271251011e-06, "loss": 0.5159, "step": 1685 }, { "epoch": 0.569402228976697, "grad_norm": 1.7907711267471313, "learning_rate": 8.258730646618872e-06, "loss": 0.4938, "step": 1686 }, { "epoch": 0.5697399527186762, "grad_norm": 1.6812031269073486, "learning_rate": 8.247960105502482e-06, "loss": 0.4978, "step": 1687 }, { "epoch": 0.5700776764606552, "grad_norm": 1.7818291187286377, "learning_rate": 8.237191660789328e-06, "loss": 0.5251, "step": 1688 }, { "epoch": 0.5704154002026343, "grad_norm": 21.7037410736084, "learning_rate": 8.226425325364398e-06, "loss": 0.5074, "step": 1689 }, { "epoch": 0.5707531239446133, "grad_norm": 1.8835177421569824, "learning_rate": 8.215661112110143e-06, "loss": 0.5206, "step": 1690 }, { "epoch": 0.5710908476865924, "grad_norm": 1.8361026048660278, "learning_rate": 8.204899033906487e-06, "loss": 0.493, "step": 1691 }, { "epoch": 0.5714285714285714, "grad_norm": 1.8381787538528442, "learning_rate": 8.194139103630791e-06, "loss": 0.5469, "step": 1692 }, { "epoch": 0.5717662951705504, "grad_norm": 1.9428197145462036, "learning_rate": 8.183381334157852e-06, "loss": 0.5264, "step": 1693 }, { "epoch": 0.5721040189125296, "grad_norm": 1.7664002180099487, "learning_rate": 8.172625738359876e-06, "loss": 0.5295, "step": 1694 }, { "epoch": 0.5724417426545086, "grad_norm": 1.5937261581420898, "learning_rate": 8.16187232910647e-06, "loss": 0.5499, "step": 1695 }, { "epoch": 0.5727794663964877, "grad_norm": 1.7142542600631714, "learning_rate": 8.151121119264627e-06, "loss": 0.5302, "step": 1696 }, { "epoch": 0.5731171901384667, "grad_norm": 2.597435235977173, "learning_rate": 8.140372121698708e-06, "loss": 0.5409, "step": 1697 }, { "epoch": 0.5734549138804458, "grad_norm": 1.7513682842254639, "learning_rate": 8.12962534927042e-06, "loss": 0.523, "step": 1698 }, { "epoch": 0.5737926376224248, "grad_norm": 1.6808027029037476, "learning_rate": 8.11888081483882e-06, "loss": 0.5233, "step": 1699 }, { "epoch": 0.5741303613644039, "grad_norm": 1.5530805587768555, "learning_rate": 8.108138531260274e-06, "loss": 0.5152, "step": 1700 }, { "epoch": 0.574468085106383, "grad_norm": 1.8691776990890503, "learning_rate": 8.097398511388465e-06, "loss": 0.5058, "step": 1701 }, { "epoch": 0.5748058088483621, "grad_norm": 1.6531059741973877, "learning_rate": 8.08666076807436e-06, "loss": 0.5017, "step": 1702 }, { "epoch": 0.5751435325903411, "grad_norm": 1.5119786262512207, "learning_rate": 8.07592531416621e-06, "loss": 0.4819, "step": 1703 }, { "epoch": 0.5754812563323202, "grad_norm": 1.8579702377319336, "learning_rate": 8.065192162509515e-06, "loss": 0.4927, "step": 1704 }, { "epoch": 0.5758189800742992, "grad_norm": 2.085266351699829, "learning_rate": 8.05446132594703e-06, "loss": 0.4841, "step": 1705 }, { "epoch": 0.5761567038162783, "grad_norm": 1.958851933479309, "learning_rate": 8.043732817318736e-06, "loss": 0.5224, "step": 1706 }, { "epoch": 0.5764944275582573, "grad_norm": 1.7687489986419678, "learning_rate": 8.033006649461832e-06, "loss": 0.5112, "step": 1707 }, { "epoch": 0.5768321513002365, "grad_norm": 1.718447208404541, "learning_rate": 8.022282835210715e-06, "loss": 0.5067, "step": 1708 }, { "epoch": 0.5771698750422155, "grad_norm": 1.6209428310394287, "learning_rate": 8.01156138739696e-06, "loss": 0.5037, "step": 1709 }, { "epoch": 0.5775075987841946, "grad_norm": 2.2305126190185547, "learning_rate": 8.000842318849317e-06, "loss": 0.5253, "step": 1710 }, { "epoch": 0.5778453225261736, "grad_norm": 1.6277213096618652, "learning_rate": 7.990125642393687e-06, "loss": 0.5501, "step": 1711 }, { "epoch": 0.5781830462681526, "grad_norm": 1.6092228889465332, "learning_rate": 7.979411370853107e-06, "loss": 0.4891, "step": 1712 }, { "epoch": 0.5785207700101317, "grad_norm": 1.7830153703689575, "learning_rate": 7.968699517047738e-06, "loss": 0.5371, "step": 1713 }, { "epoch": 0.5788584937521107, "grad_norm": 1.8587408065795898, "learning_rate": 7.95799009379485e-06, "loss": 0.5084, "step": 1714 }, { "epoch": 0.5791962174940898, "grad_norm": 1.9413813352584839, "learning_rate": 7.947283113908802e-06, "loss": 0.5484, "step": 1715 }, { "epoch": 0.5795339412360689, "grad_norm": 1.8708240985870361, "learning_rate": 7.936578590201029e-06, "loss": 0.5053, "step": 1716 }, { "epoch": 0.579871664978048, "grad_norm": 1.6497007608413696, "learning_rate": 7.925876535480032e-06, "loss": 0.5229, "step": 1717 }, { "epoch": 0.580209388720027, "grad_norm": 1.6714518070220947, "learning_rate": 7.915176962551347e-06, "loss": 0.5029, "step": 1718 }, { "epoch": 0.5805471124620061, "grad_norm": 1.9394537210464478, "learning_rate": 7.904479884217556e-06, "loss": 0.5074, "step": 1719 }, { "epoch": 0.5808848362039851, "grad_norm": 1.6061900854110718, "learning_rate": 7.89378531327824e-06, "loss": 0.5058, "step": 1720 }, { "epoch": 0.5812225599459642, "grad_norm": 1.7901924848556519, "learning_rate": 7.883093262529993e-06, "loss": 0.528, "step": 1721 }, { "epoch": 0.5815602836879432, "grad_norm": 1.6284945011138916, "learning_rate": 7.872403744766383e-06, "loss": 0.4844, "step": 1722 }, { "epoch": 0.5818980074299224, "grad_norm": 1.6774873733520508, "learning_rate": 7.861716772777955e-06, "loss": 0.523, "step": 1723 }, { "epoch": 0.5822357311719014, "grad_norm": 2.0317070484161377, "learning_rate": 7.851032359352202e-06, "loss": 0.5118, "step": 1724 }, { "epoch": 0.5825734549138805, "grad_norm": 2.05784010887146, "learning_rate": 7.840350517273563e-06, "loss": 0.5487, "step": 1725 }, { "epoch": 0.5829111786558595, "grad_norm": 1.9822123050689697, "learning_rate": 7.82967125932339e-06, "loss": 0.5417, "step": 1726 }, { "epoch": 0.5832489023978386, "grad_norm": 2.3593215942382812, "learning_rate": 7.818994598279948e-06, "loss": 0.5534, "step": 1727 }, { "epoch": 0.5835866261398176, "grad_norm": 1.6976035833358765, "learning_rate": 7.808320546918404e-06, "loss": 0.5063, "step": 1728 }, { "epoch": 0.5839243498817966, "grad_norm": 1.8201258182525635, "learning_rate": 7.797649118010785e-06, "loss": 0.4774, "step": 1729 }, { "epoch": 0.5842620736237758, "grad_norm": 1.728067398071289, "learning_rate": 7.786980324325994e-06, "loss": 0.4992, "step": 1730 }, { "epoch": 0.5845997973657548, "grad_norm": 1.7115360498428345, "learning_rate": 7.77631417862977e-06, "loss": 0.5312, "step": 1731 }, { "epoch": 0.5849375211077339, "grad_norm": 1.995017170906067, "learning_rate": 7.765650693684695e-06, "loss": 0.507, "step": 1732 }, { "epoch": 0.5852752448497129, "grad_norm": 1.9406956434249878, "learning_rate": 7.754989882250156e-06, "loss": 0.5178, "step": 1733 }, { "epoch": 0.585612968591692, "grad_norm": 2.0803964138031006, "learning_rate": 7.74433175708235e-06, "loss": 0.526, "step": 1734 }, { "epoch": 0.585950692333671, "grad_norm": 1.9075216054916382, "learning_rate": 7.733676330934254e-06, "loss": 0.49, "step": 1735 }, { "epoch": 0.5862884160756501, "grad_norm": 1.8509905338287354, "learning_rate": 7.723023616555621e-06, "loss": 0.5225, "step": 1736 }, { "epoch": 0.5866261398176292, "grad_norm": 1.6421899795532227, "learning_rate": 7.712373626692949e-06, "loss": 0.4932, "step": 1737 }, { "epoch": 0.5869638635596083, "grad_norm": 2.1000378131866455, "learning_rate": 7.70172637408949e-06, "loss": 0.517, "step": 1738 }, { "epoch": 0.5873015873015873, "grad_norm": 1.6390101909637451, "learning_rate": 7.691081871485208e-06, "loss": 0.5373, "step": 1739 }, { "epoch": 0.5876393110435664, "grad_norm": 1.725976586341858, "learning_rate": 7.680440131616787e-06, "loss": 0.5151, "step": 1740 }, { "epoch": 0.5879770347855454, "grad_norm": 1.9770760536193848, "learning_rate": 7.669801167217597e-06, "loss": 0.5551, "step": 1741 }, { "epoch": 0.5883147585275245, "grad_norm": 1.945398211479187, "learning_rate": 7.659164991017689e-06, "loss": 0.5599, "step": 1742 }, { "epoch": 0.5886524822695035, "grad_norm": 1.7543689012527466, "learning_rate": 7.648531615743784e-06, "loss": 0.5085, "step": 1743 }, { "epoch": 0.5889902060114826, "grad_norm": 1.8086129426956177, "learning_rate": 7.637901054119238e-06, "loss": 0.5042, "step": 1744 }, { "epoch": 0.5893279297534617, "grad_norm": 1.629448413848877, "learning_rate": 7.6272733188640615e-06, "loss": 0.503, "step": 1745 }, { "epoch": 0.5896656534954408, "grad_norm": 2.08607816696167, "learning_rate": 7.616648422694858e-06, "loss": 0.5084, "step": 1746 }, { "epoch": 0.5900033772374198, "grad_norm": 1.7091796398162842, "learning_rate": 7.606026378324855e-06, "loss": 0.5343, "step": 1747 }, { "epoch": 0.5903411009793988, "grad_norm": 1.6369661092758179, "learning_rate": 7.595407198463852e-06, "loss": 0.4947, "step": 1748 }, { "epoch": 0.5906788247213779, "grad_norm": 1.4496893882751465, "learning_rate": 7.584790895818235e-06, "loss": 0.4955, "step": 1749 }, { "epoch": 0.5910165484633569, "grad_norm": 1.7512620687484741, "learning_rate": 7.5741774830909375e-06, "loss": 0.5231, "step": 1750 }, { "epoch": 0.591354272205336, "grad_norm": 1.7099575996398926, "learning_rate": 7.5635669729814375e-06, "loss": 0.5146, "step": 1751 }, { "epoch": 0.5916919959473151, "grad_norm": 1.6172142028808594, "learning_rate": 7.552959378185743e-06, "loss": 0.515, "step": 1752 }, { "epoch": 0.5920297196892942, "grad_norm": 1.6722681522369385, "learning_rate": 7.5423547113963645e-06, "loss": 0.5728, "step": 1753 }, { "epoch": 0.5923674434312732, "grad_norm": 1.992315649986267, "learning_rate": 7.531752985302323e-06, "loss": 0.5126, "step": 1754 }, { "epoch": 0.5927051671732523, "grad_norm": 1.7726420164108276, "learning_rate": 7.521154212589107e-06, "loss": 0.5124, "step": 1755 }, { "epoch": 0.5930428909152313, "grad_norm": 1.8035203218460083, "learning_rate": 7.510558405938683e-06, "loss": 0.5529, "step": 1756 }, { "epoch": 0.5933806146572104, "grad_norm": 1.9771755933761597, "learning_rate": 7.499965578029458e-06, "loss": 0.541, "step": 1757 }, { "epoch": 0.5937183383991894, "grad_norm": 1.7197126150131226, "learning_rate": 7.489375741536283e-06, "loss": 0.4893, "step": 1758 }, { "epoch": 0.5940560621411686, "grad_norm": 1.7861533164978027, "learning_rate": 7.478788909130423e-06, "loss": 0.4928, "step": 1759 }, { "epoch": 0.5943937858831476, "grad_norm": 2.35174560546875, "learning_rate": 7.468205093479557e-06, "loss": 0.5264, "step": 1760 }, { "epoch": 0.5947315096251267, "grad_norm": 1.5166665315628052, "learning_rate": 7.4576243072477425e-06, "loss": 0.4916, "step": 1761 }, { "epoch": 0.5950692333671057, "grad_norm": 6.826719284057617, "learning_rate": 7.447046563095425e-06, "loss": 0.5278, "step": 1762 }, { "epoch": 0.5954069571090848, "grad_norm": 1.8073714971542358, "learning_rate": 7.436471873679397e-06, "loss": 0.5129, "step": 1763 }, { "epoch": 0.5957446808510638, "grad_norm": 1.9197099208831787, "learning_rate": 7.425900251652809e-06, "loss": 0.5352, "step": 1764 }, { "epoch": 0.5960824045930428, "grad_norm": 1.844250202178955, "learning_rate": 7.41533170966513e-06, "loss": 0.5298, "step": 1765 }, { "epoch": 0.596420128335022, "grad_norm": 1.886328101158142, "learning_rate": 7.404766260362153e-06, "loss": 0.5227, "step": 1766 }, { "epoch": 0.596757852077001, "grad_norm": 1.5694997310638428, "learning_rate": 7.39420391638596e-06, "loss": 0.4989, "step": 1767 }, { "epoch": 0.5970955758189801, "grad_norm": 2.115183115005493, "learning_rate": 7.383644690374928e-06, "loss": 0.5671, "step": 1768 }, { "epoch": 0.5974332995609591, "grad_norm": 1.7803376913070679, "learning_rate": 7.373088594963694e-06, "loss": 0.5517, "step": 1769 }, { "epoch": 0.5977710233029382, "grad_norm": 3.2472879886627197, "learning_rate": 7.362535642783155e-06, "loss": 0.5243, "step": 1770 }, { "epoch": 0.5981087470449172, "grad_norm": 1.7118970155715942, "learning_rate": 7.35198584646045e-06, "loss": 0.5231, "step": 1771 }, { "epoch": 0.5984464707868963, "grad_norm": 1.8987621068954468, "learning_rate": 7.341439218618931e-06, "loss": 0.5192, "step": 1772 }, { "epoch": 0.5987841945288754, "grad_norm": 1.6433345079421997, "learning_rate": 7.330895771878168e-06, "loss": 0.5239, "step": 1773 }, { "epoch": 0.5991219182708545, "grad_norm": 3.3019137382507324, "learning_rate": 7.320355518853921e-06, "loss": 0.5033, "step": 1774 }, { "epoch": 0.5994596420128335, "grad_norm": 1.8919968605041504, "learning_rate": 7.30981847215813e-06, "loss": 0.5338, "step": 1775 }, { "epoch": 0.5997973657548126, "grad_norm": 1.6281535625457764, "learning_rate": 7.299284644398894e-06, "loss": 0.5183, "step": 1776 }, { "epoch": 0.6001350894967916, "grad_norm": 1.5345827341079712, "learning_rate": 7.2887540481804694e-06, "loss": 0.5187, "step": 1777 }, { "epoch": 0.6004728132387707, "grad_norm": 1.952297568321228, "learning_rate": 7.278226696103239e-06, "loss": 0.513, "step": 1778 }, { "epoch": 0.6008105369807497, "grad_norm": 1.744014024734497, "learning_rate": 7.267702600763699e-06, "loss": 0.5326, "step": 1779 }, { "epoch": 0.6011482607227288, "grad_norm": 1.8974837064743042, "learning_rate": 7.257181774754465e-06, "loss": 0.4994, "step": 1780 }, { "epoch": 0.6014859844647079, "grad_norm": 1.6592051982879639, "learning_rate": 7.246664230664224e-06, "loss": 0.5221, "step": 1781 }, { "epoch": 0.601823708206687, "grad_norm": 1.6222385168075562, "learning_rate": 7.236149981077746e-06, "loss": 0.4909, "step": 1782 }, { "epoch": 0.602161431948666, "grad_norm": 1.633861780166626, "learning_rate": 7.225639038575852e-06, "loss": 0.4886, "step": 1783 }, { "epoch": 0.602499155690645, "grad_norm": 1.884315848350525, "learning_rate": 7.215131415735416e-06, "loss": 0.5123, "step": 1784 }, { "epoch": 0.6028368794326241, "grad_norm": 2.0046441555023193, "learning_rate": 7.204627125129326e-06, "loss": 0.5044, "step": 1785 }, { "epoch": 0.6031746031746031, "grad_norm": 1.9159705638885498, "learning_rate": 7.194126179326497e-06, "loss": 0.5793, "step": 1786 }, { "epoch": 0.6035123269165822, "grad_norm": 1.6520031690597534, "learning_rate": 7.18362859089183e-06, "loss": 0.4929, "step": 1787 }, { "epoch": 0.6038500506585613, "grad_norm": 1.851183533668518, "learning_rate": 7.173134372386219e-06, "loss": 0.5034, "step": 1788 }, { "epoch": 0.6041877744005404, "grad_norm": 1.9079052209854126, "learning_rate": 7.162643536366515e-06, "loss": 0.543, "step": 1789 }, { "epoch": 0.6045254981425194, "grad_norm": 2.150238037109375, "learning_rate": 7.1521560953855274e-06, "loss": 0.5303, "step": 1790 }, { "epoch": 0.6048632218844985, "grad_norm": 1.957723617553711, "learning_rate": 7.141672061992011e-06, "loss": 0.4624, "step": 1791 }, { "epoch": 0.6052009456264775, "grad_norm": 1.7426623106002808, "learning_rate": 7.131191448730627e-06, "loss": 0.4997, "step": 1792 }, { "epoch": 0.6055386693684566, "grad_norm": 1.8989813327789307, "learning_rate": 7.120714268141958e-06, "loss": 0.5177, "step": 1793 }, { "epoch": 0.6058763931104356, "grad_norm": 2.016313314437866, "learning_rate": 7.110240532762469e-06, "loss": 0.4908, "step": 1794 }, { "epoch": 0.6062141168524148, "grad_norm": 2.0246846675872803, "learning_rate": 7.099770255124512e-06, "loss": 0.5098, "step": 1795 }, { "epoch": 0.6065518405943938, "grad_norm": 1.533897876739502, "learning_rate": 7.089303447756292e-06, "loss": 0.4912, "step": 1796 }, { "epoch": 0.6068895643363729, "grad_norm": 4.215752601623535, "learning_rate": 7.078840123181875e-06, "loss": 0.4921, "step": 1797 }, { "epoch": 0.6072272880783519, "grad_norm": 1.9828835725784302, "learning_rate": 7.068380293921142e-06, "loss": 0.5097, "step": 1798 }, { "epoch": 0.607565011820331, "grad_norm": 2.0375986099243164, "learning_rate": 7.057923972489809e-06, "loss": 0.5339, "step": 1799 }, { "epoch": 0.60790273556231, "grad_norm": 2.4214487075805664, "learning_rate": 7.047471171399381e-06, "loss": 0.5399, "step": 1800 }, { "epoch": 0.608240459304289, "grad_norm": 1.8295180797576904, "learning_rate": 7.037021903157159e-06, "loss": 0.4739, "step": 1801 }, { "epoch": 0.6085781830462682, "grad_norm": 1.9381102323532104, "learning_rate": 7.026576180266213e-06, "loss": 0.4936, "step": 1802 }, { "epoch": 0.6089159067882473, "grad_norm": 1.7747598886489868, "learning_rate": 7.016134015225375e-06, "loss": 0.5301, "step": 1803 }, { "epoch": 0.6092536305302263, "grad_norm": 3.0653014183044434, "learning_rate": 7.005695420529215e-06, "loss": 0.4871, "step": 1804 }, { "epoch": 0.6095913542722053, "grad_norm": 2.536710262298584, "learning_rate": 6.995260408668034e-06, "loss": 0.5223, "step": 1805 }, { "epoch": 0.6099290780141844, "grad_norm": 1.793545126914978, "learning_rate": 6.984828992127842e-06, "loss": 0.5125, "step": 1806 }, { "epoch": 0.6102668017561634, "grad_norm": 1.9523617029190063, "learning_rate": 6.9744011833903545e-06, "loss": 0.5082, "step": 1807 }, { "epoch": 0.6106045254981425, "grad_norm": 1.6350622177124023, "learning_rate": 6.963976994932962e-06, "loss": 0.4856, "step": 1808 }, { "epoch": 0.6109422492401215, "grad_norm": 19.300281524658203, "learning_rate": 6.953556439228728e-06, "loss": 0.4953, "step": 1809 }, { "epoch": 0.6112799729821007, "grad_norm": 1.9111565351486206, "learning_rate": 6.9431395287463655e-06, "loss": 0.5363, "step": 1810 }, { "epoch": 0.6116176967240797, "grad_norm": 2.6166939735412598, "learning_rate": 6.932726275950232e-06, "loss": 0.5385, "step": 1811 }, { "epoch": 0.6119554204660588, "grad_norm": 2.574491500854492, "learning_rate": 6.922316693300299e-06, "loss": 0.5291, "step": 1812 }, { "epoch": 0.6122931442080378, "grad_norm": 2.089628219604492, "learning_rate": 6.911910793252157e-06, "loss": 0.541, "step": 1813 }, { "epoch": 0.6126308679500169, "grad_norm": 1.963372826576233, "learning_rate": 6.9015085882569866e-06, "loss": 0.5296, "step": 1814 }, { "epoch": 0.6129685916919959, "grad_norm": 1.965610384941101, "learning_rate": 6.891110090761541e-06, "loss": 0.5194, "step": 1815 }, { "epoch": 0.613306315433975, "grad_norm": 2.0347254276275635, "learning_rate": 6.880715313208144e-06, "loss": 0.5367, "step": 1816 }, { "epoch": 0.6136440391759541, "grad_norm": 2.004020929336548, "learning_rate": 6.870324268034667e-06, "loss": 0.5204, "step": 1817 }, { "epoch": 0.6139817629179332, "grad_norm": 2.3809475898742676, "learning_rate": 6.859936967674509e-06, "loss": 0.5425, "step": 1818 }, { "epoch": 0.6143194866599122, "grad_norm": 2.143130302429199, "learning_rate": 6.849553424556603e-06, "loss": 0.5408, "step": 1819 }, { "epoch": 0.6146572104018913, "grad_norm": 2.3788681030273438, "learning_rate": 6.839173651105368e-06, "loss": 0.543, "step": 1820 }, { "epoch": 0.6149949341438703, "grad_norm": 1.7984365224838257, "learning_rate": 6.828797659740729e-06, "loss": 0.4956, "step": 1821 }, { "epoch": 0.6153326578858493, "grad_norm": 2.047539234161377, "learning_rate": 6.818425462878071e-06, "loss": 0.5198, "step": 1822 }, { "epoch": 0.6156703816278284, "grad_norm": 2.127707004547119, "learning_rate": 6.808057072928249e-06, "loss": 0.5044, "step": 1823 }, { "epoch": 0.6160081053698075, "grad_norm": 1.8218709230422974, "learning_rate": 6.7976925022975596e-06, "loss": 0.5623, "step": 1824 }, { "epoch": 0.6163458291117866, "grad_norm": 2.1076595783233643, "learning_rate": 6.78733176338773e-06, "loss": 0.504, "step": 1825 }, { "epoch": 0.6166835528537656, "grad_norm": 2.691159248352051, "learning_rate": 6.776974868595898e-06, "loss": 0.5397, "step": 1826 }, { "epoch": 0.6170212765957447, "grad_norm": 1.8562434911727905, "learning_rate": 6.76662183031461e-06, "loss": 0.4828, "step": 1827 }, { "epoch": 0.6173590003377237, "grad_norm": 1.859641432762146, "learning_rate": 6.756272660931788e-06, "loss": 0.5312, "step": 1828 }, { "epoch": 0.6176967240797028, "grad_norm": 1.8268191814422607, "learning_rate": 6.745927372830738e-06, "loss": 0.5127, "step": 1829 }, { "epoch": 0.6180344478216818, "grad_norm": 2.015805959701538, "learning_rate": 6.735585978390105e-06, "loss": 0.5102, "step": 1830 }, { "epoch": 0.618372171563661, "grad_norm": 1.6368883848190308, "learning_rate": 6.725248489983891e-06, "loss": 0.5117, "step": 1831 }, { "epoch": 0.61870989530564, "grad_norm": 2.0104892253875732, "learning_rate": 6.714914919981413e-06, "loss": 0.4899, "step": 1832 }, { "epoch": 0.6190476190476191, "grad_norm": 1.5184214115142822, "learning_rate": 6.7045852807473026e-06, "loss": 0.5, "step": 1833 }, { "epoch": 0.6193853427895981, "grad_norm": 2.497741460800171, "learning_rate": 6.694259584641496e-06, "loss": 0.5424, "step": 1834 }, { "epoch": 0.6197230665315772, "grad_norm": 2.163015842437744, "learning_rate": 6.683937844019197e-06, "loss": 0.5217, "step": 1835 }, { "epoch": 0.6200607902735562, "grad_norm": 1.8689396381378174, "learning_rate": 6.673620071230889e-06, "loss": 0.5132, "step": 1836 }, { "epoch": 0.6203985140155353, "grad_norm": 1.9100151062011719, "learning_rate": 6.663306278622297e-06, "loss": 0.5287, "step": 1837 }, { "epoch": 0.6207362377575144, "grad_norm": 1.971382737159729, "learning_rate": 6.652996478534395e-06, "loss": 0.5139, "step": 1838 }, { "epoch": 0.6210739614994935, "grad_norm": 1.776136040687561, "learning_rate": 6.6426906833033675e-06, "loss": 0.5078, "step": 1839 }, { "epoch": 0.6214116852414725, "grad_norm": 2.001795530319214, "learning_rate": 6.632388905260619e-06, "loss": 0.4992, "step": 1840 }, { "epoch": 0.6217494089834515, "grad_norm": 1.80959153175354, "learning_rate": 6.622091156732737e-06, "loss": 0.4928, "step": 1841 }, { "epoch": 0.6220871327254306, "grad_norm": 1.7976983785629272, "learning_rate": 6.611797450041495e-06, "loss": 0.5178, "step": 1842 }, { "epoch": 0.6224248564674096, "grad_norm": 1.7159576416015625, "learning_rate": 6.601507797503826e-06, "loss": 0.4727, "step": 1843 }, { "epoch": 0.6227625802093887, "grad_norm": 2.1529831886291504, "learning_rate": 6.591222211431814e-06, "loss": 0.5221, "step": 1844 }, { "epoch": 0.6231003039513677, "grad_norm": 1.749328374862671, "learning_rate": 6.580940704132673e-06, "loss": 0.5604, "step": 1845 }, { "epoch": 0.6234380276933469, "grad_norm": 1.7435510158538818, "learning_rate": 6.570663287908744e-06, "loss": 0.4996, "step": 1846 }, { "epoch": 0.6237757514353259, "grad_norm": 2.2633767127990723, "learning_rate": 6.560389975057467e-06, "loss": 0.5868, "step": 1847 }, { "epoch": 0.624113475177305, "grad_norm": 1.773872971534729, "learning_rate": 6.550120777871374e-06, "loss": 0.5364, "step": 1848 }, { "epoch": 0.624451198919284, "grad_norm": 1.644236445426941, "learning_rate": 6.539855708638075e-06, "loss": 0.5037, "step": 1849 }, { "epoch": 0.6247889226612631, "grad_norm": 1.7883882522583008, "learning_rate": 6.5295947796402315e-06, "loss": 0.4805, "step": 1850 }, { "epoch": 0.6251266464032421, "grad_norm": 1.6121529340744019, "learning_rate": 6.5193380031555655e-06, "loss": 0.514, "step": 1851 }, { "epoch": 0.6254643701452212, "grad_norm": 1.6191400289535522, "learning_rate": 6.509085391456815e-06, "loss": 0.5107, "step": 1852 }, { "epoch": 0.6258020938872003, "grad_norm": 4.463631629943848, "learning_rate": 6.498836956811748e-06, "loss": 0.5207, "step": 1853 }, { "epoch": 0.6261398176291794, "grad_norm": 1.7141677141189575, "learning_rate": 6.488592711483122e-06, "loss": 0.4505, "step": 1854 }, { "epoch": 0.6264775413711584, "grad_norm": 2.6629459857940674, "learning_rate": 6.478352667728693e-06, "loss": 0.5092, "step": 1855 }, { "epoch": 0.6268152651131375, "grad_norm": 1.7746893167495728, "learning_rate": 6.468116837801188e-06, "loss": 0.5279, "step": 1856 }, { "epoch": 0.6271529888551165, "grad_norm": 1.8443984985351562, "learning_rate": 6.45788523394828e-06, "loss": 0.52, "step": 1857 }, { "epoch": 0.6274907125970955, "grad_norm": 2.00224232673645, "learning_rate": 6.447657868412603e-06, "loss": 0.5457, "step": 1858 }, { "epoch": 0.6278284363390746, "grad_norm": 1.8209927082061768, "learning_rate": 6.437434753431702e-06, "loss": 0.512, "step": 1859 }, { "epoch": 0.6281661600810537, "grad_norm": 2.635629892349243, "learning_rate": 6.427215901238052e-06, "loss": 0.5166, "step": 1860 }, { "epoch": 0.6285038838230328, "grad_norm": 1.706510066986084, "learning_rate": 6.417001324059016e-06, "loss": 0.5016, "step": 1861 }, { "epoch": 0.6288416075650118, "grad_norm": 2.0307042598724365, "learning_rate": 6.406791034116846e-06, "loss": 0.5379, "step": 1862 }, { "epoch": 0.6291793313069909, "grad_norm": 1.5806844234466553, "learning_rate": 6.396585043628664e-06, "loss": 0.5073, "step": 1863 }, { "epoch": 0.6295170550489699, "grad_norm": 1.815000295639038, "learning_rate": 6.38638336480645e-06, "loss": 0.5149, "step": 1864 }, { "epoch": 0.629854778790949, "grad_norm": 1.9761162996292114, "learning_rate": 6.376186009857017e-06, "loss": 0.4935, "step": 1865 }, { "epoch": 0.630192502532928, "grad_norm": 1.7649118900299072, "learning_rate": 6.365992990982015e-06, "loss": 0.4654, "step": 1866 }, { "epoch": 0.6305302262749072, "grad_norm": 1.5499165058135986, "learning_rate": 6.355804320377896e-06, "loss": 0.5144, "step": 1867 }, { "epoch": 0.6308679500168862, "grad_norm": 2.0343143939971924, "learning_rate": 6.3456200102359175e-06, "loss": 0.5089, "step": 1868 }, { "epoch": 0.6312056737588653, "grad_norm": 1.8814688920974731, "learning_rate": 6.335440072742112e-06, "loss": 0.5279, "step": 1869 }, { "epoch": 0.6315433975008443, "grad_norm": 1.6328102350234985, "learning_rate": 6.3252645200772836e-06, "loss": 0.5093, "step": 1870 }, { "epoch": 0.6318811212428234, "grad_norm": 1.8640872240066528, "learning_rate": 6.3150933644169875e-06, "loss": 0.4909, "step": 1871 }, { "epoch": 0.6322188449848024, "grad_norm": 2.2519478797912598, "learning_rate": 6.3049266179315235e-06, "loss": 0.5578, "step": 1872 }, { "epoch": 0.6325565687267815, "grad_norm": 1.9731653928756714, "learning_rate": 6.2947642927859084e-06, "loss": 0.4885, "step": 1873 }, { "epoch": 0.6328942924687605, "grad_norm": 1.62202787399292, "learning_rate": 6.284606401139875e-06, "loss": 0.4891, "step": 1874 }, { "epoch": 0.6332320162107397, "grad_norm": 1.820271611213684, "learning_rate": 6.274452955147843e-06, "loss": 0.4889, "step": 1875 }, { "epoch": 0.6335697399527187, "grad_norm": 1.7516776323318481, "learning_rate": 6.26430396695892e-06, "loss": 0.519, "step": 1876 }, { "epoch": 0.6339074636946977, "grad_norm": 1.677078366279602, "learning_rate": 6.254159448716881e-06, "loss": 0.5198, "step": 1877 }, { "epoch": 0.6342451874366768, "grad_norm": 1.8859362602233887, "learning_rate": 6.244019412560144e-06, "loss": 0.5011, "step": 1878 }, { "epoch": 0.6345829111786558, "grad_norm": 2.1253881454467773, "learning_rate": 6.233883870621771e-06, "loss": 0.5036, "step": 1879 }, { "epoch": 0.6349206349206349, "grad_norm": 1.845361351966858, "learning_rate": 6.223752835029443e-06, "loss": 0.4815, "step": 1880 }, { "epoch": 0.6352583586626139, "grad_norm": 1.636048674583435, "learning_rate": 6.213626317905451e-06, "loss": 0.5029, "step": 1881 }, { "epoch": 0.6355960824045931, "grad_norm": 2.2495481967926025, "learning_rate": 6.203504331366677e-06, "loss": 0.5173, "step": 1882 }, { "epoch": 0.6359338061465721, "grad_norm": 1.7279146909713745, "learning_rate": 6.193386887524586e-06, "loss": 0.492, "step": 1883 }, { "epoch": 0.6362715298885512, "grad_norm": 1.9601259231567383, "learning_rate": 6.183273998485202e-06, "loss": 0.5124, "step": 1884 }, { "epoch": 0.6366092536305302, "grad_norm": 1.8850942850112915, "learning_rate": 6.173165676349103e-06, "loss": 0.5075, "step": 1885 }, { "epoch": 0.6369469773725093, "grad_norm": 1.908862829208374, "learning_rate": 6.163061933211403e-06, "loss": 0.5532, "step": 1886 }, { "epoch": 0.6372847011144883, "grad_norm": 1.799822211265564, "learning_rate": 6.1529627811617305e-06, "loss": 0.5463, "step": 1887 }, { "epoch": 0.6376224248564674, "grad_norm": 1.669973611831665, "learning_rate": 6.142868232284233e-06, "loss": 0.4778, "step": 1888 }, { "epoch": 0.6379601485984465, "grad_norm": 1.7877248525619507, "learning_rate": 6.132778298657534e-06, "loss": 0.4933, "step": 1889 }, { "epoch": 0.6382978723404256, "grad_norm": 2.0086023807525635, "learning_rate": 6.122692992354748e-06, "loss": 0.5322, "step": 1890 }, { "epoch": 0.6386355960824046, "grad_norm": 1.5583242177963257, "learning_rate": 6.112612325443446e-06, "loss": 0.5062, "step": 1891 }, { "epoch": 0.6389733198243837, "grad_norm": 1.6862893104553223, "learning_rate": 6.1025363099856515e-06, "loss": 0.5244, "step": 1892 }, { "epoch": 0.6393110435663627, "grad_norm": 1.9438791275024414, "learning_rate": 6.092464958037818e-06, "loss": 0.5377, "step": 1893 }, { "epoch": 0.6396487673083417, "grad_norm": 1.6971378326416016, "learning_rate": 6.082398281650823e-06, "loss": 0.5023, "step": 1894 }, { "epoch": 0.6399864910503208, "grad_norm": 1.7883998155593872, "learning_rate": 6.072336292869944e-06, "loss": 0.5432, "step": 1895 }, { "epoch": 0.6403242147923, "grad_norm": 1.6111795902252197, "learning_rate": 6.062279003734853e-06, "loss": 0.5065, "step": 1896 }, { "epoch": 0.640661938534279, "grad_norm": 2.6119439601898193, "learning_rate": 6.052226426279606e-06, "loss": 0.541, "step": 1897 }, { "epoch": 0.640999662276258, "grad_norm": 2.0721325874328613, "learning_rate": 6.0421785725326085e-06, "loss": 0.5205, "step": 1898 }, { "epoch": 0.6413373860182371, "grad_norm": 1.6638076305389404, "learning_rate": 6.032135454516621e-06, "loss": 0.5144, "step": 1899 }, { "epoch": 0.6416751097602161, "grad_norm": 1.6754400730133057, "learning_rate": 6.022097084248732e-06, "loss": 0.507, "step": 1900 }, { "epoch": 0.6420128335021952, "grad_norm": 1.8500871658325195, "learning_rate": 6.01206347374036e-06, "loss": 0.5258, "step": 1901 }, { "epoch": 0.6423505572441742, "grad_norm": 1.558168649673462, "learning_rate": 6.002034634997214e-06, "loss": 0.5052, "step": 1902 }, { "epoch": 0.6426882809861534, "grad_norm": 1.7751902341842651, "learning_rate": 5.992010580019305e-06, "loss": 0.4969, "step": 1903 }, { "epoch": 0.6430260047281324, "grad_norm": 2.620830774307251, "learning_rate": 5.981991320800913e-06, "loss": 0.5197, "step": 1904 }, { "epoch": 0.6433637284701115, "grad_norm": 1.7375465631484985, "learning_rate": 5.971976869330584e-06, "loss": 0.4969, "step": 1905 }, { "epoch": 0.6437014522120905, "grad_norm": 1.812056303024292, "learning_rate": 5.9619672375911065e-06, "loss": 0.5042, "step": 1906 }, { "epoch": 0.6440391759540696, "grad_norm": 2.2340002059936523, "learning_rate": 5.9519624375595076e-06, "loss": 0.4779, "step": 1907 }, { "epoch": 0.6443768996960486, "grad_norm": 1.8956190347671509, "learning_rate": 5.941962481207029e-06, "loss": 0.5235, "step": 1908 }, { "epoch": 0.6447146234380277, "grad_norm": 1.8199141025543213, "learning_rate": 5.9319673804991175e-06, "loss": 0.4904, "step": 1909 }, { "epoch": 0.6450523471800067, "grad_norm": 1.7568516731262207, "learning_rate": 5.92197714739541e-06, "loss": 0.5134, "step": 1910 }, { "epoch": 0.6453900709219859, "grad_norm": 2.0943987369537354, "learning_rate": 5.911991793849723e-06, "loss": 0.5298, "step": 1911 }, { "epoch": 0.6457277946639649, "grad_norm": 1.7022619247436523, "learning_rate": 5.902011331810026e-06, "loss": 0.5276, "step": 1912 }, { "epoch": 0.646065518405944, "grad_norm": 1.840043544769287, "learning_rate": 5.892035773218443e-06, "loss": 0.5088, "step": 1913 }, { "epoch": 0.646403242147923, "grad_norm": 1.53622567653656, "learning_rate": 5.882065130011226e-06, "loss": 0.5062, "step": 1914 }, { "epoch": 0.646740965889902, "grad_norm": 1.7366501092910767, "learning_rate": 5.872099414118751e-06, "loss": 0.4831, "step": 1915 }, { "epoch": 0.6470786896318811, "grad_norm": 1.7956757545471191, "learning_rate": 5.862138637465491e-06, "loss": 0.5556, "step": 1916 }, { "epoch": 0.6474164133738601, "grad_norm": 1.6927851438522339, "learning_rate": 5.852182811970013e-06, "loss": 0.4875, "step": 1917 }, { "epoch": 0.6477541371158393, "grad_norm": 1.9016704559326172, "learning_rate": 5.842231949544963e-06, "loss": 0.5064, "step": 1918 }, { "epoch": 0.6480918608578183, "grad_norm": 1.9392393827438354, "learning_rate": 5.8322860620970425e-06, "loss": 0.5547, "step": 1919 }, { "epoch": 0.6484295845997974, "grad_norm": 3.7548627853393555, "learning_rate": 5.822345161527004e-06, "loss": 0.5063, "step": 1920 }, { "epoch": 0.6487673083417764, "grad_norm": 1.6481525897979736, "learning_rate": 5.81240925972963e-06, "loss": 0.5057, "step": 1921 }, { "epoch": 0.6491050320837555, "grad_norm": 1.629094123840332, "learning_rate": 5.80247836859372e-06, "loss": 0.4728, "step": 1922 }, { "epoch": 0.6494427558257345, "grad_norm": 2.273602247238159, "learning_rate": 5.79255250000209e-06, "loss": 0.4728, "step": 1923 }, { "epoch": 0.6497804795677136, "grad_norm": 1.8964933156967163, "learning_rate": 5.782631665831532e-06, "loss": 0.5146, "step": 1924 }, { "epoch": 0.6501182033096927, "grad_norm": 2.0079851150512695, "learning_rate": 5.7727158779528194e-06, "loss": 0.5373, "step": 1925 }, { "epoch": 0.6504559270516718, "grad_norm": 1.8720099925994873, "learning_rate": 5.762805148230688e-06, "loss": 0.5279, "step": 1926 }, { "epoch": 0.6507936507936508, "grad_norm": 1.818017601966858, "learning_rate": 5.752899488523817e-06, "loss": 0.4644, "step": 1927 }, { "epoch": 0.6511313745356299, "grad_norm": 1.6053848266601562, "learning_rate": 5.742998910684832e-06, "loss": 0.5052, "step": 1928 }, { "epoch": 0.6514690982776089, "grad_norm": 1.746849536895752, "learning_rate": 5.733103426560263e-06, "loss": 0.4944, "step": 1929 }, { "epoch": 0.651806822019588, "grad_norm": 1.828536868095398, "learning_rate": 5.723213047990553e-06, "loss": 0.5052, "step": 1930 }, { "epoch": 0.652144545761567, "grad_norm": 1.6535857915878296, "learning_rate": 5.7133277868100276e-06, "loss": 0.5323, "step": 1931 }, { "epoch": 0.6524822695035462, "grad_norm": 1.7725305557250977, "learning_rate": 5.703447654846906e-06, "loss": 0.529, "step": 1932 }, { "epoch": 0.6528199932455252, "grad_norm": 1.701028823852539, "learning_rate": 5.693572663923253e-06, "loss": 0.5503, "step": 1933 }, { "epoch": 0.6531577169875042, "grad_norm": 1.585087776184082, "learning_rate": 5.68370282585499e-06, "loss": 0.468, "step": 1934 }, { "epoch": 0.6534954407294833, "grad_norm": 1.9020905494689941, "learning_rate": 5.673838152451867e-06, "loss": 0.4992, "step": 1935 }, { "epoch": 0.6538331644714623, "grad_norm": 1.9629559516906738, "learning_rate": 5.663978655517466e-06, "loss": 0.4831, "step": 1936 }, { "epoch": 0.6541708882134414, "grad_norm": 1.9085646867752075, "learning_rate": 5.654124346849165e-06, "loss": 0.5129, "step": 1937 }, { "epoch": 0.6545086119554204, "grad_norm": 1.8695898056030273, "learning_rate": 5.64427523823813e-06, "loss": 0.5173, "step": 1938 }, { "epoch": 0.6548463356973995, "grad_norm": 2.018824577331543, "learning_rate": 5.634431341469322e-06, "loss": 0.5183, "step": 1939 }, { "epoch": 0.6551840594393786, "grad_norm": 1.529212474822998, "learning_rate": 5.624592668321444e-06, "loss": 0.4891, "step": 1940 }, { "epoch": 0.6555217831813577, "grad_norm": 1.6243791580200195, "learning_rate": 5.614759230566973e-06, "loss": 0.5022, "step": 1941 }, { "epoch": 0.6558595069233367, "grad_norm": 1.7111704349517822, "learning_rate": 5.604931039972099e-06, "loss": 0.5288, "step": 1942 }, { "epoch": 0.6561972306653158, "grad_norm": 1.6000840663909912, "learning_rate": 5.595108108296744e-06, "loss": 0.5049, "step": 1943 }, { "epoch": 0.6565349544072948, "grad_norm": 1.5325617790222168, "learning_rate": 5.585290447294535e-06, "loss": 0.5044, "step": 1944 }, { "epoch": 0.6568726781492739, "grad_norm": 1.6330925226211548, "learning_rate": 5.5754780687127984e-06, "loss": 0.5132, "step": 1945 }, { "epoch": 0.6572104018912529, "grad_norm": 1.8440184593200684, "learning_rate": 5.5656709842925335e-06, "loss": 0.5086, "step": 1946 }, { "epoch": 0.6575481256332321, "grad_norm": 1.7609094381332397, "learning_rate": 5.5558692057684076e-06, "loss": 0.5008, "step": 1947 }, { "epoch": 0.6578858493752111, "grad_norm": 1.9684057235717773, "learning_rate": 5.546072744868731e-06, "loss": 0.4926, "step": 1948 }, { "epoch": 0.6582235731171902, "grad_norm": 1.5622491836547852, "learning_rate": 5.536281613315468e-06, "loss": 0.4996, "step": 1949 }, { "epoch": 0.6585612968591692, "grad_norm": 1.6622955799102783, "learning_rate": 5.5264958228241925e-06, "loss": 0.5029, "step": 1950 }, { "epoch": 0.6588990206011482, "grad_norm": 3.1179022789001465, "learning_rate": 5.516715385104092e-06, "loss": 0.5447, "step": 1951 }, { "epoch": 0.6592367443431273, "grad_norm": 1.9833824634552002, "learning_rate": 5.506940311857944e-06, "loss": 0.5396, "step": 1952 }, { "epoch": 0.6595744680851063, "grad_norm": 1.7038743495941162, "learning_rate": 5.497170614782121e-06, "loss": 0.5103, "step": 1953 }, { "epoch": 0.6599121918270855, "grad_norm": 1.5758050680160522, "learning_rate": 5.4874063055665495e-06, "loss": 0.4616, "step": 1954 }, { "epoch": 0.6602499155690645, "grad_norm": 2.0915815830230713, "learning_rate": 5.4776473958947115e-06, "loss": 0.5107, "step": 1955 }, { "epoch": 0.6605876393110436, "grad_norm": 1.8026622533798218, "learning_rate": 5.46789389744363e-06, "loss": 0.4835, "step": 1956 }, { "epoch": 0.6609253630530226, "grad_norm": 1.6235053539276123, "learning_rate": 5.458145821883851e-06, "loss": 0.4703, "step": 1957 }, { "epoch": 0.6612630867950017, "grad_norm": 1.7853825092315674, "learning_rate": 5.44840318087944e-06, "loss": 0.5493, "step": 1958 }, { "epoch": 0.6616008105369807, "grad_norm": 1.638479471206665, "learning_rate": 5.438665986087945e-06, "loss": 0.5028, "step": 1959 }, { "epoch": 0.6619385342789598, "grad_norm": 1.5766017436981201, "learning_rate": 5.428934249160416e-06, "loss": 0.4963, "step": 1960 }, { "epoch": 0.6622762580209389, "grad_norm": 1.689021348953247, "learning_rate": 5.419207981741358e-06, "loss": 0.5161, "step": 1961 }, { "epoch": 0.662613981762918, "grad_norm": 1.9634238481521606, "learning_rate": 5.40948719546873e-06, "loss": 0.5337, "step": 1962 }, { "epoch": 0.662951705504897, "grad_norm": 1.7832280397415161, "learning_rate": 5.399771901973947e-06, "loss": 0.5008, "step": 1963 }, { "epoch": 0.6632894292468761, "grad_norm": 1.8264548778533936, "learning_rate": 5.39006211288184e-06, "loss": 0.5213, "step": 1964 }, { "epoch": 0.6636271529888551, "grad_norm": 2.0316779613494873, "learning_rate": 5.380357839810655e-06, "loss": 0.5178, "step": 1965 }, { "epoch": 0.6639648767308342, "grad_norm": 1.7136971950531006, "learning_rate": 5.370659094372036e-06, "loss": 0.5253, "step": 1966 }, { "epoch": 0.6643026004728132, "grad_norm": 1.799641728401184, "learning_rate": 5.360965888171024e-06, "loss": 0.5239, "step": 1967 }, { "epoch": 0.6646403242147924, "grad_norm": 2.0749683380126953, "learning_rate": 5.351278232806022e-06, "loss": 0.5145, "step": 1968 }, { "epoch": 0.6649780479567714, "grad_norm": 1.9289696216583252, "learning_rate": 5.341596139868792e-06, "loss": 0.4856, "step": 1969 }, { "epoch": 0.6653157716987504, "grad_norm": 1.7460216283798218, "learning_rate": 5.331919620944438e-06, "loss": 0.5177, "step": 1970 }, { "epoch": 0.6656534954407295, "grad_norm": 1.5785537958145142, "learning_rate": 5.322248687611407e-06, "loss": 0.4828, "step": 1971 }, { "epoch": 0.6659912191827085, "grad_norm": 2.103273630142212, "learning_rate": 5.312583351441447e-06, "loss": 0.5282, "step": 1972 }, { "epoch": 0.6663289429246876, "grad_norm": 1.6142189502716064, "learning_rate": 5.302923623999619e-06, "loss": 0.4906, "step": 1973 }, { "epoch": 0.6666666666666666, "grad_norm": 1.719846248626709, "learning_rate": 5.293269516844263e-06, "loss": 0.5267, "step": 1974 }, { "epoch": 0.6670043904086457, "grad_norm": 1.6497747898101807, "learning_rate": 5.28362104152701e-06, "loss": 0.5374, "step": 1975 }, { "epoch": 0.6673421141506248, "grad_norm": 1.7565876245498657, "learning_rate": 5.273978209592736e-06, "loss": 0.4748, "step": 1976 }, { "epoch": 0.6676798378926039, "grad_norm": 2.1178531646728516, "learning_rate": 5.264341032579574e-06, "loss": 0.5206, "step": 1977 }, { "epoch": 0.6680175616345829, "grad_norm": 1.776497483253479, "learning_rate": 5.2547095220188815e-06, "loss": 0.4837, "step": 1978 }, { "epoch": 0.668355285376562, "grad_norm": 1.7263413667678833, "learning_rate": 5.245083689435253e-06, "loss": 0.5052, "step": 1979 }, { "epoch": 0.668693009118541, "grad_norm": 1.8023885488510132, "learning_rate": 5.23546354634647e-06, "loss": 0.5089, "step": 1980 }, { "epoch": 0.6690307328605201, "grad_norm": 1.7439874410629272, "learning_rate": 5.225849104263511e-06, "loss": 0.5426, "step": 1981 }, { "epoch": 0.6693684566024991, "grad_norm": 1.8230808973312378, "learning_rate": 5.216240374690546e-06, "loss": 0.549, "step": 1982 }, { "epoch": 0.6697061803444783, "grad_norm": 1.6625638008117676, "learning_rate": 5.206637369124891e-06, "loss": 0.4629, "step": 1983 }, { "epoch": 0.6700439040864573, "grad_norm": 1.9067953824996948, "learning_rate": 5.197040099057031e-06, "loss": 0.512, "step": 1984 }, { "epoch": 0.6703816278284364, "grad_norm": 1.8568631410598755, "learning_rate": 5.187448575970573e-06, "loss": 0.5107, "step": 1985 }, { "epoch": 0.6707193515704154, "grad_norm": 1.930611491203308, "learning_rate": 5.177862811342254e-06, "loss": 0.5252, "step": 1986 }, { "epoch": 0.6710570753123944, "grad_norm": 1.8624091148376465, "learning_rate": 5.168282816641915e-06, "loss": 0.5136, "step": 1987 }, { "epoch": 0.6713947990543735, "grad_norm": 1.9143561124801636, "learning_rate": 5.158708603332508e-06, "loss": 0.4978, "step": 1988 }, { "epoch": 0.6717325227963525, "grad_norm": 1.9737093448638916, "learning_rate": 5.1491401828700534e-06, "loss": 0.4898, "step": 1989 }, { "epoch": 0.6720702465383317, "grad_norm": 1.8430389165878296, "learning_rate": 5.139577566703643e-06, "loss": 0.4913, "step": 1990 }, { "epoch": 0.6724079702803107, "grad_norm": 1.8768329620361328, "learning_rate": 5.1300207662754206e-06, "loss": 0.5035, "step": 1991 }, { "epoch": 0.6727456940222898, "grad_norm": 2.37127685546875, "learning_rate": 5.120469793020585e-06, "loss": 0.5194, "step": 1992 }, { "epoch": 0.6730834177642688, "grad_norm": 1.7988253831863403, "learning_rate": 5.110924658367346e-06, "loss": 0.4813, "step": 1993 }, { "epoch": 0.6734211415062479, "grad_norm": 2.0235636234283447, "learning_rate": 5.101385373736937e-06, "loss": 0.5118, "step": 1994 }, { "epoch": 0.6737588652482269, "grad_norm": 4.643254280090332, "learning_rate": 5.091851950543585e-06, "loss": 0.5103, "step": 1995 }, { "epoch": 0.674096588990206, "grad_norm": 1.8220232725143433, "learning_rate": 5.082324400194507e-06, "loss": 0.5073, "step": 1996 }, { "epoch": 0.6744343127321851, "grad_norm": 2.5992937088012695, "learning_rate": 5.072802734089896e-06, "loss": 0.4879, "step": 1997 }, { "epoch": 0.6747720364741642, "grad_norm": 1.7290149927139282, "learning_rate": 5.0632869636229035e-06, "loss": 0.514, "step": 1998 }, { "epoch": 0.6751097602161432, "grad_norm": 2.098435163497925, "learning_rate": 5.053777100179618e-06, "loss": 0.5149, "step": 1999 }, { "epoch": 0.6754474839581223, "grad_norm": 1.818042278289795, "learning_rate": 5.044273155139065e-06, "loss": 0.5155, "step": 2000 }, { "epoch": 0.6757852077001013, "grad_norm": 1.780055284500122, "learning_rate": 5.034775139873197e-06, "loss": 0.5149, "step": 2001 }, { "epoch": 0.6761229314420804, "grad_norm": 1.7094818353652954, "learning_rate": 5.025283065746855e-06, "loss": 0.5322, "step": 2002 }, { "epoch": 0.6764606551840594, "grad_norm": 2.7583279609680176, "learning_rate": 5.015796944117789e-06, "loss": 0.4993, "step": 2003 }, { "epoch": 0.6767983789260384, "grad_norm": 2.1549859046936035, "learning_rate": 5.006316786336612e-06, "loss": 0.5068, "step": 2004 }, { "epoch": 0.6771361026680176, "grad_norm": 2.079089403152466, "learning_rate": 4.996842603746803e-06, "loss": 0.5382, "step": 2005 }, { "epoch": 0.6774738264099966, "grad_norm": 2.241684675216675, "learning_rate": 4.987374407684703e-06, "loss": 0.5714, "step": 2006 }, { "epoch": 0.6778115501519757, "grad_norm": 1.7146687507629395, "learning_rate": 4.977912209479477e-06, "loss": 0.5227, "step": 2007 }, { "epoch": 0.6781492738939547, "grad_norm": 1.6472845077514648, "learning_rate": 4.968456020453117e-06, "loss": 0.4703, "step": 2008 }, { "epoch": 0.6784869976359338, "grad_norm": 1.616767168045044, "learning_rate": 4.959005851920423e-06, "loss": 0.4811, "step": 2009 }, { "epoch": 0.6788247213779128, "grad_norm": 1.5320994853973389, "learning_rate": 4.949561715189001e-06, "loss": 0.5038, "step": 2010 }, { "epoch": 0.6791624451198919, "grad_norm": 1.5241081714630127, "learning_rate": 4.940123621559228e-06, "loss": 0.505, "step": 2011 }, { "epoch": 0.679500168861871, "grad_norm": 1.7927095890045166, "learning_rate": 4.930691582324254e-06, "loss": 0.5152, "step": 2012 }, { "epoch": 0.6798378926038501, "grad_norm": 1.7721396684646606, "learning_rate": 4.921265608769981e-06, "loss": 0.4739, "step": 2013 }, { "epoch": 0.6801756163458291, "grad_norm": 2.301035165786743, "learning_rate": 4.911845712175067e-06, "loss": 0.5316, "step": 2014 }, { "epoch": 0.6805133400878082, "grad_norm": 2.1046128273010254, "learning_rate": 4.9024319038108825e-06, "loss": 0.4788, "step": 2015 }, { "epoch": 0.6808510638297872, "grad_norm": 1.847375750541687, "learning_rate": 4.893024194941521e-06, "loss": 0.502, "step": 2016 }, { "epoch": 0.6811887875717663, "grad_norm": 1.8270272016525269, "learning_rate": 4.883622596823771e-06, "loss": 0.5183, "step": 2017 }, { "epoch": 0.6815265113137453, "grad_norm": 2.3249831199645996, "learning_rate": 4.8742271207071226e-06, "loss": 0.5362, "step": 2018 }, { "epoch": 0.6818642350557245, "grad_norm": 1.8754801750183105, "learning_rate": 4.864837777833728e-06, "loss": 0.5019, "step": 2019 }, { "epoch": 0.6822019587977035, "grad_norm": 1.8211567401885986, "learning_rate": 4.855454579438406e-06, "loss": 0.5024, "step": 2020 }, { "epoch": 0.6825396825396826, "grad_norm": 2.005805492401123, "learning_rate": 4.846077536748616e-06, "loss": 0.4825, "step": 2021 }, { "epoch": 0.6828774062816616, "grad_norm": 1.7987070083618164, "learning_rate": 4.836706660984467e-06, "loss": 0.5164, "step": 2022 }, { "epoch": 0.6832151300236406, "grad_norm": 1.9672449827194214, "learning_rate": 4.827341963358673e-06, "loss": 0.5113, "step": 2023 }, { "epoch": 0.6835528537656197, "grad_norm": 1.8913965225219727, "learning_rate": 4.8179834550765685e-06, "loss": 0.5036, "step": 2024 }, { "epoch": 0.6838905775075987, "grad_norm": 2.037135124206543, "learning_rate": 4.808631147336073e-06, "loss": 0.5358, "step": 2025 }, { "epoch": 0.6842283012495779, "grad_norm": 2.898036479949951, "learning_rate": 4.799285051327686e-06, "loss": 0.4703, "step": 2026 }, { "epoch": 0.6845660249915569, "grad_norm": 1.9329447746276855, "learning_rate": 4.789945178234485e-06, "loss": 0.5083, "step": 2027 }, { "epoch": 0.684903748733536, "grad_norm": 1.6591123342514038, "learning_rate": 4.780611539232093e-06, "loss": 0.4879, "step": 2028 }, { "epoch": 0.685241472475515, "grad_norm": 1.6692099571228027, "learning_rate": 4.771284145488673e-06, "loss": 0.5043, "step": 2029 }, { "epoch": 0.6855791962174941, "grad_norm": 1.7597270011901855, "learning_rate": 4.761963008164918e-06, "loss": 0.4753, "step": 2030 }, { "epoch": 0.6859169199594731, "grad_norm": 1.5643599033355713, "learning_rate": 4.752648138414031e-06, "loss": 0.5047, "step": 2031 }, { "epoch": 0.6862546437014522, "grad_norm": 2.2920081615448, "learning_rate": 4.743339547381727e-06, "loss": 0.5297, "step": 2032 }, { "epoch": 0.6865923674434313, "grad_norm": 1.7003077268600464, "learning_rate": 4.734037246206195e-06, "loss": 0.5235, "step": 2033 }, { "epoch": 0.6869300911854104, "grad_norm": 1.6800249814987183, "learning_rate": 4.724741246018103e-06, "loss": 0.4792, "step": 2034 }, { "epoch": 0.6872678149273894, "grad_norm": 2.3999242782592773, "learning_rate": 4.715451557940577e-06, "loss": 0.4904, "step": 2035 }, { "epoch": 0.6876055386693685, "grad_norm": 1.633929967880249, "learning_rate": 4.7061681930891986e-06, "loss": 0.5108, "step": 2036 }, { "epoch": 0.6879432624113475, "grad_norm": 2.122833251953125, "learning_rate": 4.696891162571976e-06, "loss": 0.5362, "step": 2037 }, { "epoch": 0.6882809861533266, "grad_norm": 1.7034035921096802, "learning_rate": 4.687620477489337e-06, "loss": 0.4952, "step": 2038 }, { "epoch": 0.6886187098953056, "grad_norm": 1.960045337677002, "learning_rate": 4.678356148934118e-06, "loss": 0.4905, "step": 2039 }, { "epoch": 0.6889564336372846, "grad_norm": 1.9514847993850708, "learning_rate": 4.6690981879915565e-06, "loss": 0.5326, "step": 2040 }, { "epoch": 0.6892941573792638, "grad_norm": 2.2564642429351807, "learning_rate": 4.659846605739261e-06, "loss": 0.5479, "step": 2041 }, { "epoch": 0.6896318811212429, "grad_norm": 1.5975600481033325, "learning_rate": 4.650601413247214e-06, "loss": 0.5087, "step": 2042 }, { "epoch": 0.6899696048632219, "grad_norm": 1.9163686037063599, "learning_rate": 4.641362621577745e-06, "loss": 0.533, "step": 2043 }, { "epoch": 0.6903073286052009, "grad_norm": 1.8115285634994507, "learning_rate": 4.632130241785533e-06, "loss": 0.5007, "step": 2044 }, { "epoch": 0.69064505234718, "grad_norm": 2.132821798324585, "learning_rate": 4.622904284917585e-06, "loss": 0.5483, "step": 2045 }, { "epoch": 0.690982776089159, "grad_norm": 1.691471815109253, "learning_rate": 4.613684762013217e-06, "loss": 0.5333, "step": 2046 }, { "epoch": 0.6913204998311381, "grad_norm": 1.6003730297088623, "learning_rate": 4.604471684104049e-06, "loss": 0.5226, "step": 2047 }, { "epoch": 0.6916582235731172, "grad_norm": 2.0143561363220215, "learning_rate": 4.595265062213983e-06, "loss": 0.5315, "step": 2048 }, { "epoch": 0.6919959473150963, "grad_norm": 4.8830246925354, "learning_rate": 4.586064907359209e-06, "loss": 0.463, "step": 2049 }, { "epoch": 0.6923336710570753, "grad_norm": 1.6366463899612427, "learning_rate": 4.57687123054817e-06, "loss": 0.5055, "step": 2050 }, { "epoch": 0.6926713947990544, "grad_norm": 1.7586066722869873, "learning_rate": 4.56768404278156e-06, "loss": 0.502, "step": 2051 }, { "epoch": 0.6930091185410334, "grad_norm": 1.7524745464324951, "learning_rate": 4.558503355052302e-06, "loss": 0.5182, "step": 2052 }, { "epoch": 0.6933468422830125, "grad_norm": 1.8260644674301147, "learning_rate": 4.549329178345556e-06, "loss": 0.5408, "step": 2053 }, { "epoch": 0.6936845660249915, "grad_norm": 1.7033907175064087, "learning_rate": 4.5401615236386785e-06, "loss": 0.522, "step": 2054 }, { "epoch": 0.6940222897669707, "grad_norm": 1.6513592004776, "learning_rate": 4.531000401901227e-06, "loss": 0.48, "step": 2055 }, { "epoch": 0.6943600135089497, "grad_norm": 1.7418322563171387, "learning_rate": 4.521845824094938e-06, "loss": 0.4889, "step": 2056 }, { "epoch": 0.6946977372509288, "grad_norm": 1.5807459354400635, "learning_rate": 4.5126978011737264e-06, "loss": 0.5174, "step": 2057 }, { "epoch": 0.6950354609929078, "grad_norm": 1.5152616500854492, "learning_rate": 4.503556344083656e-06, "loss": 0.5088, "step": 2058 }, { "epoch": 0.6953731847348869, "grad_norm": 1.9005603790283203, "learning_rate": 4.494421463762937e-06, "loss": 0.4893, "step": 2059 }, { "epoch": 0.6957109084768659, "grad_norm": 1.6637988090515137, "learning_rate": 4.485293171141906e-06, "loss": 0.5137, "step": 2060 }, { "epoch": 0.6960486322188449, "grad_norm": 1.617536187171936, "learning_rate": 4.4761714771430285e-06, "loss": 0.5087, "step": 2061 }, { "epoch": 0.6963863559608241, "grad_norm": 1.6945017576217651, "learning_rate": 4.467056392680863e-06, "loss": 0.4818, "step": 2062 }, { "epoch": 0.6967240797028031, "grad_norm": 1.9258880615234375, "learning_rate": 4.457947928662063e-06, "loss": 0.5058, "step": 2063 }, { "epoch": 0.6970618034447822, "grad_norm": 1.7837566137313843, "learning_rate": 4.448846095985362e-06, "loss": 0.4789, "step": 2064 }, { "epoch": 0.6973995271867612, "grad_norm": 1.8251985311508179, "learning_rate": 4.439750905541549e-06, "loss": 0.5086, "step": 2065 }, { "epoch": 0.6977372509287403, "grad_norm": 1.8463034629821777, "learning_rate": 4.4306623682134875e-06, "loss": 0.4934, "step": 2066 }, { "epoch": 0.6980749746707193, "grad_norm": 2.962557077407837, "learning_rate": 4.421580494876061e-06, "loss": 0.5051, "step": 2067 }, { "epoch": 0.6984126984126984, "grad_norm": 1.6415141820907593, "learning_rate": 4.412505296396182e-06, "loss": 0.4871, "step": 2068 }, { "epoch": 0.6987504221546774, "grad_norm": 1.6849000453948975, "learning_rate": 4.403436783632782e-06, "loss": 0.5211, "step": 2069 }, { "epoch": 0.6990881458966566, "grad_norm": 1.6289286613464355, "learning_rate": 4.394374967436783e-06, "loss": 0.4923, "step": 2070 }, { "epoch": 0.6994258696386356, "grad_norm": 2.0792665481567383, "learning_rate": 4.385319858651109e-06, "loss": 0.4661, "step": 2071 }, { "epoch": 0.6997635933806147, "grad_norm": 1.8197475671768188, "learning_rate": 4.376271468110646e-06, "loss": 0.4707, "step": 2072 }, { "epoch": 0.7001013171225937, "grad_norm": 1.7308281660079956, "learning_rate": 4.367229806642246e-06, "loss": 0.5134, "step": 2073 }, { "epoch": 0.7004390408645728, "grad_norm": 1.8216824531555176, "learning_rate": 4.358194885064704e-06, "loss": 0.5054, "step": 2074 }, { "epoch": 0.7007767646065518, "grad_norm": 1.768997311592102, "learning_rate": 4.349166714188762e-06, "loss": 0.5055, "step": 2075 }, { "epoch": 0.7011144883485309, "grad_norm": 1.7611669301986694, "learning_rate": 4.340145304817075e-06, "loss": 0.4943, "step": 2076 }, { "epoch": 0.70145221209051, "grad_norm": 1.7716970443725586, "learning_rate": 4.3311306677442085e-06, "loss": 0.4969, "step": 2077 }, { "epoch": 0.701789935832489, "grad_norm": 1.6888922452926636, "learning_rate": 4.3221228137566225e-06, "loss": 0.5114, "step": 2078 }, { "epoch": 0.7021276595744681, "grad_norm": 1.672825574874878, "learning_rate": 4.313121753632673e-06, "loss": 0.4964, "step": 2079 }, { "epoch": 0.7024653833164471, "grad_norm": 1.788180947303772, "learning_rate": 4.304127498142573e-06, "loss": 0.5116, "step": 2080 }, { "epoch": 0.7028031070584262, "grad_norm": 1.7934706211090088, "learning_rate": 4.2951400580483995e-06, "loss": 0.4986, "step": 2081 }, { "epoch": 0.7031408308004052, "grad_norm": 1.7990649938583374, "learning_rate": 4.286159444104068e-06, "loss": 0.4896, "step": 2082 }, { "epoch": 0.7034785545423843, "grad_norm": 3.2783758640289307, "learning_rate": 4.277185667055339e-06, "loss": 0.5015, "step": 2083 }, { "epoch": 0.7038162782843634, "grad_norm": 2.0089914798736572, "learning_rate": 4.268218737639783e-06, "loss": 0.4799, "step": 2084 }, { "epoch": 0.7041540020263425, "grad_norm": 2.8087007999420166, "learning_rate": 4.2592586665867785e-06, "loss": 0.5495, "step": 2085 }, { "epoch": 0.7044917257683215, "grad_norm": 1.8814725875854492, "learning_rate": 4.250305464617494e-06, "loss": 0.5333, "step": 2086 }, { "epoch": 0.7048294495103006, "grad_norm": 1.922613263130188, "learning_rate": 4.241359142444885e-06, "loss": 0.5035, "step": 2087 }, { "epoch": 0.7051671732522796, "grad_norm": 2.174886465072632, "learning_rate": 4.232419710773678e-06, "loss": 0.5191, "step": 2088 }, { "epoch": 0.7055048969942587, "grad_norm": 1.7356986999511719, "learning_rate": 4.223487180300348e-06, "loss": 0.5051, "step": 2089 }, { "epoch": 0.7058426207362377, "grad_norm": 2.0727834701538086, "learning_rate": 4.2145615617131095e-06, "loss": 0.5068, "step": 2090 }, { "epoch": 0.7061803444782169, "grad_norm": 1.8844778537750244, "learning_rate": 4.205642865691909e-06, "loss": 0.4769, "step": 2091 }, { "epoch": 0.7065180682201959, "grad_norm": 1.9750475883483887, "learning_rate": 4.19673110290842e-06, "loss": 0.5046, "step": 2092 }, { "epoch": 0.706855791962175, "grad_norm": 1.7780669927597046, "learning_rate": 4.187826284026007e-06, "loss": 0.4662, "step": 2093 }, { "epoch": 0.707193515704154, "grad_norm": 2.016815662384033, "learning_rate": 4.178928419699731e-06, "loss": 0.5185, "step": 2094 }, { "epoch": 0.707531239446133, "grad_norm": 1.6995552778244019, "learning_rate": 4.1700375205763285e-06, "loss": 0.4706, "step": 2095 }, { "epoch": 0.7078689631881121, "grad_norm": 1.9602305889129639, "learning_rate": 4.1611535972942095e-06, "loss": 0.5076, "step": 2096 }, { "epoch": 0.7082066869300911, "grad_norm": 1.789862871170044, "learning_rate": 4.152276660483429e-06, "loss": 0.5149, "step": 2097 }, { "epoch": 0.7085444106720703, "grad_norm": 1.9291008710861206, "learning_rate": 4.143406720765687e-06, "loss": 0.4931, "step": 2098 }, { "epoch": 0.7088821344140493, "grad_norm": 1.6133100986480713, "learning_rate": 4.134543788754304e-06, "loss": 0.4996, "step": 2099 }, { "epoch": 0.7092198581560284, "grad_norm": 2.1833035945892334, "learning_rate": 4.125687875054227e-06, "loss": 0.5226, "step": 2100 }, { "epoch": 0.7095575818980074, "grad_norm": 2.0537426471710205, "learning_rate": 4.116838990261998e-06, "loss": 0.5061, "step": 2101 }, { "epoch": 0.7098953056399865, "grad_norm": 1.78389310836792, "learning_rate": 4.107997144965747e-06, "loss": 0.5055, "step": 2102 }, { "epoch": 0.7102330293819655, "grad_norm": 1.7851991653442383, "learning_rate": 4.099162349745186e-06, "loss": 0.4827, "step": 2103 }, { "epoch": 0.7105707531239446, "grad_norm": 1.7757025957107544, "learning_rate": 4.090334615171584e-06, "loss": 0.5207, "step": 2104 }, { "epoch": 0.7109084768659236, "grad_norm": 1.6912221908569336, "learning_rate": 4.081513951807773e-06, "loss": 0.4978, "step": 2105 }, { "epoch": 0.7112462006079028, "grad_norm": 1.7207895517349243, "learning_rate": 4.0727003702081146e-06, "loss": 0.504, "step": 2106 }, { "epoch": 0.7115839243498818, "grad_norm": 1.8387105464935303, "learning_rate": 4.0638938809184946e-06, "loss": 0.4612, "step": 2107 }, { "epoch": 0.7119216480918609, "grad_norm": 1.4092106819152832, "learning_rate": 4.055094494476326e-06, "loss": 0.4961, "step": 2108 }, { "epoch": 0.7122593718338399, "grad_norm": 1.9927901029586792, "learning_rate": 4.046302221410505e-06, "loss": 0.4879, "step": 2109 }, { "epoch": 0.712597095575819, "grad_norm": 1.8632848262786865, "learning_rate": 4.037517072241435e-06, "loss": 0.5329, "step": 2110 }, { "epoch": 0.712934819317798, "grad_norm": 1.7867602109909058, "learning_rate": 4.028739057480981e-06, "loss": 0.4911, "step": 2111 }, { "epoch": 0.713272543059777, "grad_norm": 2.8234429359436035, "learning_rate": 4.019968187632476e-06, "loss": 0.5011, "step": 2112 }, { "epoch": 0.7136102668017562, "grad_norm": 1.5926002264022827, "learning_rate": 4.0112044731906996e-06, "loss": 0.4933, "step": 2113 }, { "epoch": 0.7139479905437353, "grad_norm": 1.7702220678329468, "learning_rate": 4.002447924641882e-06, "loss": 0.5078, "step": 2114 }, { "epoch": 0.7142857142857143, "grad_norm": 2.0030691623687744, "learning_rate": 3.993698552463667e-06, "loss": 0.5204, "step": 2115 }, { "epoch": 0.7146234380276933, "grad_norm": 2.2087457180023193, "learning_rate": 3.984956367125116e-06, "loss": 0.5139, "step": 2116 }, { "epoch": 0.7149611617696724, "grad_norm": 2.1288602352142334, "learning_rate": 3.976221379086685e-06, "loss": 0.5107, "step": 2117 }, { "epoch": 0.7152988855116514, "grad_norm": 2.318068265914917, "learning_rate": 3.967493598800233e-06, "loss": 0.4592, "step": 2118 }, { "epoch": 0.7156366092536305, "grad_norm": 2.5733389854431152, "learning_rate": 3.958773036708979e-06, "loss": 0.5074, "step": 2119 }, { "epoch": 0.7159743329956096, "grad_norm": 1.8051981925964355, "learning_rate": 3.950059703247513e-06, "loss": 0.5069, "step": 2120 }, { "epoch": 0.7163120567375887, "grad_norm": 1.8447082042694092, "learning_rate": 3.941353608841769e-06, "loss": 0.4783, "step": 2121 }, { "epoch": 0.7166497804795677, "grad_norm": 1.7475881576538086, "learning_rate": 3.9326547639090315e-06, "loss": 0.5063, "step": 2122 }, { "epoch": 0.7169875042215468, "grad_norm": 2.5208024978637695, "learning_rate": 3.923963178857899e-06, "loss": 0.5224, "step": 2123 }, { "epoch": 0.7173252279635258, "grad_norm": 20.88259506225586, "learning_rate": 3.915278864088288e-06, "loss": 0.5285, "step": 2124 }, { "epoch": 0.7176629517055049, "grad_norm": 1.5618053674697876, "learning_rate": 3.90660182999141e-06, "loss": 0.5042, "step": 2125 }, { "epoch": 0.7180006754474839, "grad_norm": 2.351884365081787, "learning_rate": 3.897932086949778e-06, "loss": 0.4809, "step": 2126 }, { "epoch": 0.7183383991894631, "grad_norm": 1.9457887411117554, "learning_rate": 3.889269645337168e-06, "loss": 0.5386, "step": 2127 }, { "epoch": 0.7186761229314421, "grad_norm": 1.9145712852478027, "learning_rate": 3.8806145155186205e-06, "loss": 0.5157, "step": 2128 }, { "epoch": 0.7190138466734212, "grad_norm": 2.143277645111084, "learning_rate": 3.871966707850439e-06, "loss": 0.5274, "step": 2129 }, { "epoch": 0.7193515704154002, "grad_norm": 1.962355136871338, "learning_rate": 3.863326232680148e-06, "loss": 0.4777, "step": 2130 }, { "epoch": 0.7196892941573793, "grad_norm": 1.817018985748291, "learning_rate": 3.854693100346516e-06, "loss": 0.4812, "step": 2131 }, { "epoch": 0.7200270178993583, "grad_norm": 1.889114499092102, "learning_rate": 3.846067321179514e-06, "loss": 0.4804, "step": 2132 }, { "epoch": 0.7203647416413373, "grad_norm": 1.6174373626708984, "learning_rate": 3.8374489055003175e-06, "loss": 0.4777, "step": 2133 }, { "epoch": 0.7207024653833164, "grad_norm": 2.140430450439453, "learning_rate": 3.828837863621286e-06, "loss": 0.4886, "step": 2134 }, { "epoch": 0.7210401891252955, "grad_norm": 1.7526297569274902, "learning_rate": 3.82023420584597e-06, "loss": 0.5214, "step": 2135 }, { "epoch": 0.7213779128672746, "grad_norm": 1.874678134918213, "learning_rate": 3.811637942469072e-06, "loss": 0.5183, "step": 2136 }, { "epoch": 0.7217156366092536, "grad_norm": 1.957456111907959, "learning_rate": 3.80304908377645e-06, "loss": 0.5141, "step": 2137 }, { "epoch": 0.7220533603512327, "grad_norm": 1.7486165761947632, "learning_rate": 3.7944676400451017e-06, "loss": 0.5209, "step": 2138 }, { "epoch": 0.7223910840932117, "grad_norm": 1.9038984775543213, "learning_rate": 3.7858936215431506e-06, "loss": 0.4916, "step": 2139 }, { "epoch": 0.7227288078351908, "grad_norm": 1.828188419342041, "learning_rate": 3.7773270385298465e-06, "loss": 0.5043, "step": 2140 }, { "epoch": 0.7230665315771698, "grad_norm": 1.8669188022613525, "learning_rate": 3.768767901255528e-06, "loss": 0.4922, "step": 2141 }, { "epoch": 0.723404255319149, "grad_norm": 2.1780216693878174, "learning_rate": 3.76021621996163e-06, "loss": 0.5135, "step": 2142 }, { "epoch": 0.723741979061128, "grad_norm": 1.7812824249267578, "learning_rate": 3.7516720048806645e-06, "loss": 0.4732, "step": 2143 }, { "epoch": 0.7240797028031071, "grad_norm": 1.8469502925872803, "learning_rate": 3.7431352662362185e-06, "loss": 0.5156, "step": 2144 }, { "epoch": 0.7244174265450861, "grad_norm": 2.406214475631714, "learning_rate": 3.734606014242922e-06, "loss": 0.4931, "step": 2145 }, { "epoch": 0.7247551502870652, "grad_norm": 1.8207601308822632, "learning_rate": 3.7260842591064504e-06, "loss": 0.4838, "step": 2146 }, { "epoch": 0.7250928740290442, "grad_norm": 1.8894842863082886, "learning_rate": 3.717570011023507e-06, "loss": 0.5105, "step": 2147 }, { "epoch": 0.7254305977710233, "grad_norm": 1.5274699926376343, "learning_rate": 3.7090632801818207e-06, "loss": 0.5089, "step": 2148 }, { "epoch": 0.7257683215130024, "grad_norm": 1.688896894454956, "learning_rate": 3.7005640767601146e-06, "loss": 0.4972, "step": 2149 }, { "epoch": 0.7261060452549815, "grad_norm": 1.9462617635726929, "learning_rate": 3.6920724109281146e-06, "loss": 0.4901, "step": 2150 }, { "epoch": 0.7264437689969605, "grad_norm": 2.472472667694092, "learning_rate": 3.6835882928465227e-06, "loss": 0.5351, "step": 2151 }, { "epoch": 0.7267814927389395, "grad_norm": 5.320930004119873, "learning_rate": 3.6751117326670037e-06, "loss": 0.5207, "step": 2152 }, { "epoch": 0.7271192164809186, "grad_norm": 1.7794955968856812, "learning_rate": 3.6666427405321923e-06, "loss": 0.4673, "step": 2153 }, { "epoch": 0.7274569402228976, "grad_norm": 1.894355058670044, "learning_rate": 3.6581813265756595e-06, "loss": 0.5123, "step": 2154 }, { "epoch": 0.7277946639648767, "grad_norm": 1.8752503395080566, "learning_rate": 3.649727500921907e-06, "loss": 0.4749, "step": 2155 }, { "epoch": 0.7281323877068558, "grad_norm": 1.645002007484436, "learning_rate": 3.6412812736863566e-06, "loss": 0.5133, "step": 2156 }, { "epoch": 0.7284701114488349, "grad_norm": 2.858109474182129, "learning_rate": 3.6328426549753492e-06, "loss": 0.4911, "step": 2157 }, { "epoch": 0.7288078351908139, "grad_norm": 2.3915398120880127, "learning_rate": 3.6244116548861084e-06, "loss": 0.5227, "step": 2158 }, { "epoch": 0.729145558932793, "grad_norm": 2.0598652362823486, "learning_rate": 3.615988283506748e-06, "loss": 0.5133, "step": 2159 }, { "epoch": 0.729483282674772, "grad_norm": 2.4612889289855957, "learning_rate": 3.6075725509162505e-06, "loss": 0.5208, "step": 2160 }, { "epoch": 0.7298210064167511, "grad_norm": 1.6721922159194946, "learning_rate": 3.599164467184464e-06, "loss": 0.5258, "step": 2161 }, { "epoch": 0.7301587301587301, "grad_norm": 1.7870811223983765, "learning_rate": 3.590764042372079e-06, "loss": 0.5288, "step": 2162 }, { "epoch": 0.7304964539007093, "grad_norm": 2.0756685733795166, "learning_rate": 3.5823712865306225e-06, "loss": 0.4977, "step": 2163 }, { "epoch": 0.7308341776426883, "grad_norm": 1.5911080837249756, "learning_rate": 3.5739862097024434e-06, "loss": 0.4892, "step": 2164 }, { "epoch": 0.7311719013846674, "grad_norm": 1.574463963508606, "learning_rate": 3.565608821920712e-06, "loss": 0.5215, "step": 2165 }, { "epoch": 0.7315096251266464, "grad_norm": 2.5072309970855713, "learning_rate": 3.557239133209387e-06, "loss": 0.5296, "step": 2166 }, { "epoch": 0.7318473488686255, "grad_norm": 2.0090274810791016, "learning_rate": 3.54887715358322e-06, "loss": 0.5075, "step": 2167 }, { "epoch": 0.7321850726106045, "grad_norm": 1.7611583471298218, "learning_rate": 3.540522893047732e-06, "loss": 0.4894, "step": 2168 }, { "epoch": 0.7325227963525835, "grad_norm": 1.9276676177978516, "learning_rate": 3.532176361599221e-06, "loss": 0.5148, "step": 2169 }, { "epoch": 0.7328605200945626, "grad_norm": 1.9069682359695435, "learning_rate": 3.523837569224725e-06, "loss": 0.5235, "step": 2170 }, { "epoch": 0.7331982438365418, "grad_norm": 2.874405860900879, "learning_rate": 3.515506525902024e-06, "loss": 0.4984, "step": 2171 }, { "epoch": 0.7335359675785208, "grad_norm": 1.868220329284668, "learning_rate": 3.507183241599631e-06, "loss": 0.4884, "step": 2172 }, { "epoch": 0.7338736913204998, "grad_norm": 2.637437343597412, "learning_rate": 3.498867726276767e-06, "loss": 0.5017, "step": 2173 }, { "epoch": 0.7342114150624789, "grad_norm": 2.032956123352051, "learning_rate": 3.4905599898833665e-06, "loss": 0.5088, "step": 2174 }, { "epoch": 0.7345491388044579, "grad_norm": 1.945512294769287, "learning_rate": 3.4822600423600485e-06, "loss": 0.4823, "step": 2175 }, { "epoch": 0.734886862546437, "grad_norm": 2.4700024127960205, "learning_rate": 3.473967893638115e-06, "loss": 0.5115, "step": 2176 }, { "epoch": 0.735224586288416, "grad_norm": 7.452981472015381, "learning_rate": 3.465683553639536e-06, "loss": 0.4974, "step": 2177 }, { "epoch": 0.7355623100303952, "grad_norm": 2.1160449981689453, "learning_rate": 3.4574070322769347e-06, "loss": 0.5071, "step": 2178 }, { "epoch": 0.7359000337723742, "grad_norm": 1.6134315729141235, "learning_rate": 3.4491383394535883e-06, "loss": 0.485, "step": 2179 }, { "epoch": 0.7362377575143533, "grad_norm": 1.7930665016174316, "learning_rate": 3.4408774850633986e-06, "loss": 0.4959, "step": 2180 }, { "epoch": 0.7365754812563323, "grad_norm": 1.9565809965133667, "learning_rate": 3.4326244789908892e-06, "loss": 0.5251, "step": 2181 }, { "epoch": 0.7369132049983114, "grad_norm": 4.0136308670043945, "learning_rate": 3.4243793311111916e-06, "loss": 0.4823, "step": 2182 }, { "epoch": 0.7372509287402904, "grad_norm": 1.7696640491485596, "learning_rate": 3.4161420512900423e-06, "loss": 0.5299, "step": 2183 }, { "epoch": 0.7375886524822695, "grad_norm": 1.9620662927627563, "learning_rate": 3.407912649383757e-06, "loss": 0.5036, "step": 2184 }, { "epoch": 0.7379263762242486, "grad_norm": 1.704829454421997, "learning_rate": 3.399691135239225e-06, "loss": 0.4863, "step": 2185 }, { "epoch": 0.7382640999662277, "grad_norm": 1.8219783306121826, "learning_rate": 3.391477518693894e-06, "loss": 0.4687, "step": 2186 }, { "epoch": 0.7386018237082067, "grad_norm": 1.957543969154358, "learning_rate": 3.3832718095757757e-06, "loss": 0.4777, "step": 2187 }, { "epoch": 0.7389395474501858, "grad_norm": 1.714799404144287, "learning_rate": 3.3750740177034047e-06, "loss": 0.4995, "step": 2188 }, { "epoch": 0.7392772711921648, "grad_norm": 2.224867105484009, "learning_rate": 3.36688415288585e-06, "loss": 0.4968, "step": 2189 }, { "epoch": 0.7396149949341438, "grad_norm": 1.7459436655044556, "learning_rate": 3.358702224922691e-06, "loss": 0.4825, "step": 2190 }, { "epoch": 0.7399527186761229, "grad_norm": 2.0579373836517334, "learning_rate": 3.3505282436040188e-06, "loss": 0.466, "step": 2191 }, { "epoch": 0.740290442418102, "grad_norm": 1.5856072902679443, "learning_rate": 3.342362218710403e-06, "loss": 0.4665, "step": 2192 }, { "epoch": 0.7406281661600811, "grad_norm": 3.3797624111175537, "learning_rate": 3.3342041600129092e-06, "loss": 0.5038, "step": 2193 }, { "epoch": 0.7409658899020601, "grad_norm": 1.9303333759307861, "learning_rate": 3.3260540772730576e-06, "loss": 0.5097, "step": 2194 }, { "epoch": 0.7413036136440392, "grad_norm": 1.8712704181671143, "learning_rate": 3.317911980242825e-06, "loss": 0.5111, "step": 2195 }, { "epoch": 0.7416413373860182, "grad_norm": 1.7047247886657715, "learning_rate": 3.3097778786646452e-06, "loss": 0.511, "step": 2196 }, { "epoch": 0.7419790611279973, "grad_norm": 1.6833575963974, "learning_rate": 3.3016517822713723e-06, "loss": 0.4947, "step": 2197 }, { "epoch": 0.7423167848699763, "grad_norm": 3.477786064147949, "learning_rate": 3.2935337007862865e-06, "loss": 0.5106, "step": 2198 }, { "epoch": 0.7426545086119554, "grad_norm": 2.367138624191284, "learning_rate": 3.285423643923077e-06, "loss": 0.4975, "step": 2199 }, { "epoch": 0.7429922323539345, "grad_norm": 1.8999276161193848, "learning_rate": 3.277321621385835e-06, "loss": 0.5015, "step": 2200 }, { "epoch": 0.7433299560959136, "grad_norm": 1.765883445739746, "learning_rate": 3.269227642869035e-06, "loss": 0.5009, "step": 2201 }, { "epoch": 0.7436676798378926, "grad_norm": 1.9989557266235352, "learning_rate": 3.261141718057523e-06, "loss": 0.459, "step": 2202 }, { "epoch": 0.7440054035798717, "grad_norm": 1.593981385231018, "learning_rate": 3.2530638566265126e-06, "loss": 0.4797, "step": 2203 }, { "epoch": 0.7443431273218507, "grad_norm": 1.9350991249084473, "learning_rate": 3.2449940682415725e-06, "loss": 0.5046, "step": 2204 }, { "epoch": 0.7446808510638298, "grad_norm": 1.9920780658721924, "learning_rate": 3.2369323625586036e-06, "loss": 0.4858, "step": 2205 }, { "epoch": 0.7450185748058088, "grad_norm": 1.8960868120193481, "learning_rate": 3.2288787492238416e-06, "loss": 0.5118, "step": 2206 }, { "epoch": 0.745356298547788, "grad_norm": 2.652043581008911, "learning_rate": 3.220833237873836e-06, "loss": 0.4988, "step": 2207 }, { "epoch": 0.745694022289767, "grad_norm": 2.224761486053467, "learning_rate": 3.2127958381354396e-06, "loss": 0.4837, "step": 2208 }, { "epoch": 0.746031746031746, "grad_norm": 3.5017452239990234, "learning_rate": 3.2047665596258093e-06, "loss": 0.4768, "step": 2209 }, { "epoch": 0.7463694697737251, "grad_norm": 1.8929321765899658, "learning_rate": 3.1967454119523745e-06, "loss": 0.5348, "step": 2210 }, { "epoch": 0.7467071935157041, "grad_norm": 2.0355708599090576, "learning_rate": 3.18873240471284e-06, "loss": 0.5139, "step": 2211 }, { "epoch": 0.7470449172576832, "grad_norm": 2.2055165767669678, "learning_rate": 3.1807275474951636e-06, "loss": 0.5291, "step": 2212 }, { "epoch": 0.7473826409996622, "grad_norm": 1.9334121942520142, "learning_rate": 3.1727308498775623e-06, "loss": 0.5188, "step": 2213 }, { "epoch": 0.7477203647416414, "grad_norm": 1.732324242591858, "learning_rate": 3.1647423214284856e-06, "loss": 0.5082, "step": 2214 }, { "epoch": 0.7480580884836204, "grad_norm": 2.6473793983459473, "learning_rate": 3.1567619717066057e-06, "loss": 0.4929, "step": 2215 }, { "epoch": 0.7483958122255995, "grad_norm": 1.765609622001648, "learning_rate": 3.1487898102608073e-06, "loss": 0.4943, "step": 2216 }, { "epoch": 0.7487335359675785, "grad_norm": 1.7690150737762451, "learning_rate": 3.1408258466301777e-06, "loss": 0.5343, "step": 2217 }, { "epoch": 0.7490712597095576, "grad_norm": 1.822365403175354, "learning_rate": 3.1328700903440045e-06, "loss": 0.5018, "step": 2218 }, { "epoch": 0.7494089834515366, "grad_norm": 1.5885096788406372, "learning_rate": 3.124922550921742e-06, "loss": 0.4606, "step": 2219 }, { "epoch": 0.7497467071935157, "grad_norm": 1.9040411710739136, "learning_rate": 3.1169832378730204e-06, "loss": 0.4972, "step": 2220 }, { "epoch": 0.7500844309354948, "grad_norm": 1.8548741340637207, "learning_rate": 3.1090521606976186e-06, "loss": 0.4794, "step": 2221 }, { "epoch": 0.7504221546774739, "grad_norm": 1.7283499240875244, "learning_rate": 3.101129328885475e-06, "loss": 0.5028, "step": 2222 }, { "epoch": 0.7507598784194529, "grad_norm": 1.624071717262268, "learning_rate": 3.09321475191665e-06, "loss": 0.4519, "step": 2223 }, { "epoch": 0.751097602161432, "grad_norm": 2.5448992252349854, "learning_rate": 3.0853084392613297e-06, "loss": 0.5081, "step": 2224 }, { "epoch": 0.751435325903411, "grad_norm": 1.6833394765853882, "learning_rate": 3.0774104003798087e-06, "loss": 0.483, "step": 2225 }, { "epoch": 0.75177304964539, "grad_norm": 1.9350643157958984, "learning_rate": 3.0695206447224923e-06, "loss": 0.4991, "step": 2226 }, { "epoch": 0.7521107733873691, "grad_norm": 1.6217069625854492, "learning_rate": 3.061639181729863e-06, "loss": 0.509, "step": 2227 }, { "epoch": 0.7524484971293482, "grad_norm": 2.147970199584961, "learning_rate": 3.053766020832485e-06, "loss": 0.5216, "step": 2228 }, { "epoch": 0.7527862208713273, "grad_norm": 2.072295904159546, "learning_rate": 3.045901171450986e-06, "loss": 0.5076, "step": 2229 }, { "epoch": 0.7531239446133063, "grad_norm": 2.329772472381592, "learning_rate": 3.0380446429960573e-06, "loss": 0.5335, "step": 2230 }, { "epoch": 0.7534616683552854, "grad_norm": 1.9266347885131836, "learning_rate": 3.030196444868424e-06, "loss": 0.4795, "step": 2231 }, { "epoch": 0.7537993920972644, "grad_norm": 1.9186776876449585, "learning_rate": 3.022356586458848e-06, "loss": 0.4667, "step": 2232 }, { "epoch": 0.7541371158392435, "grad_norm": 1.743638515472412, "learning_rate": 3.014525077148107e-06, "loss": 0.5402, "step": 2233 }, { "epoch": 0.7544748395812225, "grad_norm": 1.9464505910873413, "learning_rate": 3.0067019263069973e-06, "loss": 0.4916, "step": 2234 }, { "epoch": 0.7548125633232016, "grad_norm": 1.847946286201477, "learning_rate": 2.998887143296312e-06, "loss": 0.5282, "step": 2235 }, { "epoch": 0.7551502870651807, "grad_norm": 2.1001274585723877, "learning_rate": 2.9910807374668273e-06, "loss": 0.5147, "step": 2236 }, { "epoch": 0.7554880108071598, "grad_norm": 1.8212521076202393, "learning_rate": 2.983282718159296e-06, "loss": 0.5208, "step": 2237 }, { "epoch": 0.7558257345491388, "grad_norm": 2.6225926876068115, "learning_rate": 2.9754930947044357e-06, "loss": 0.4893, "step": 2238 }, { "epoch": 0.7561634582911179, "grad_norm": 1.7773020267486572, "learning_rate": 2.967711876422925e-06, "loss": 0.4883, "step": 2239 }, { "epoch": 0.7565011820330969, "grad_norm": 1.6429911851882935, "learning_rate": 2.9599390726253785e-06, "loss": 0.5002, "step": 2240 }, { "epoch": 0.756838905775076, "grad_norm": 2.089493751525879, "learning_rate": 2.952174692612344e-06, "loss": 0.5384, "step": 2241 }, { "epoch": 0.757176629517055, "grad_norm": 1.8864065408706665, "learning_rate": 2.9444187456742855e-06, "loss": 0.4706, "step": 2242 }, { "epoch": 0.7575143532590342, "grad_norm": 1.9049993753433228, "learning_rate": 2.9366712410915888e-06, "loss": 0.526, "step": 2243 }, { "epoch": 0.7578520770010132, "grad_norm": 1.6322212219238281, "learning_rate": 2.9289321881345257e-06, "loss": 0.4736, "step": 2244 }, { "epoch": 0.7581898007429922, "grad_norm": 2.030985116958618, "learning_rate": 2.92120159606326e-06, "loss": 0.524, "step": 2245 }, { "epoch": 0.7585275244849713, "grad_norm": 2.8712539672851562, "learning_rate": 2.9134794741278317e-06, "loss": 0.4935, "step": 2246 }, { "epoch": 0.7588652482269503, "grad_norm": 1.8868581056594849, "learning_rate": 2.9057658315681414e-06, "loss": 0.5284, "step": 2247 }, { "epoch": 0.7592029719689294, "grad_norm": 2.3816816806793213, "learning_rate": 2.8980606776139543e-06, "loss": 0.5012, "step": 2248 }, { "epoch": 0.7595406957109084, "grad_norm": 2.3844239711761475, "learning_rate": 2.8903640214848693e-06, "loss": 0.5164, "step": 2249 }, { "epoch": 0.7598784194528876, "grad_norm": 1.9284957647323608, "learning_rate": 2.8826758723903192e-06, "loss": 0.5196, "step": 2250 }, { "epoch": 0.7602161431948666, "grad_norm": 1.5869978666305542, "learning_rate": 2.874996239529556e-06, "loss": 0.4897, "step": 2251 }, { "epoch": 0.7605538669368457, "grad_norm": 1.7282772064208984, "learning_rate": 2.8673251320916483e-06, "loss": 0.5045, "step": 2252 }, { "epoch": 0.7608915906788247, "grad_norm": 2.0375173091888428, "learning_rate": 2.8596625592554585e-06, "loss": 0.5116, "step": 2253 }, { "epoch": 0.7612293144208038, "grad_norm": 1.6889909505844116, "learning_rate": 2.8520085301896373e-06, "loss": 0.544, "step": 2254 }, { "epoch": 0.7615670381627828, "grad_norm": 1.9214400053024292, "learning_rate": 2.844363054052608e-06, "loss": 0.4986, "step": 2255 }, { "epoch": 0.7619047619047619, "grad_norm": 1.8256349563598633, "learning_rate": 2.8367261399925692e-06, "loss": 0.4794, "step": 2256 }, { "epoch": 0.762242485646741, "grad_norm": 1.641717791557312, "learning_rate": 2.829097797147473e-06, "loss": 0.4754, "step": 2257 }, { "epoch": 0.7625802093887201, "grad_norm": 1.7669825553894043, "learning_rate": 2.821478034645009e-06, "loss": 0.5164, "step": 2258 }, { "epoch": 0.7629179331306991, "grad_norm": 1.9854422807693481, "learning_rate": 2.8138668616026045e-06, "loss": 0.4858, "step": 2259 }, { "epoch": 0.7632556568726782, "grad_norm": 2.631047487258911, "learning_rate": 2.8062642871274038e-06, "loss": 0.5046, "step": 2260 }, { "epoch": 0.7635933806146572, "grad_norm": 1.741010308265686, "learning_rate": 2.7986703203162735e-06, "loss": 0.4574, "step": 2261 }, { "epoch": 0.7639311043566362, "grad_norm": 1.707816243171692, "learning_rate": 2.791084970255772e-06, "loss": 0.4725, "step": 2262 }, { "epoch": 0.7642688280986153, "grad_norm": 1.9083914756774902, "learning_rate": 2.7835082460221484e-06, "loss": 0.524, "step": 2263 }, { "epoch": 0.7646065518405943, "grad_norm": 1.5374054908752441, "learning_rate": 2.7759401566813295e-06, "loss": 0.4829, "step": 2264 }, { "epoch": 0.7649442755825735, "grad_norm": 1.7384891510009766, "learning_rate": 2.768380711288917e-06, "loss": 0.5159, "step": 2265 }, { "epoch": 0.7652819993245525, "grad_norm": 1.9054173231124878, "learning_rate": 2.7608299188901632e-06, "loss": 0.5117, "step": 2266 }, { "epoch": 0.7656197230665316, "grad_norm": 2.018125295639038, "learning_rate": 2.7532877885199683e-06, "loss": 0.4817, "step": 2267 }, { "epoch": 0.7659574468085106, "grad_norm": 2.064748764038086, "learning_rate": 2.7457543292028634e-06, "loss": 0.4724, "step": 2268 }, { "epoch": 0.7662951705504897, "grad_norm": 1.7818918228149414, "learning_rate": 2.7382295499530165e-06, "loss": 0.4499, "step": 2269 }, { "epoch": 0.7666328942924687, "grad_norm": 1.7870254516601562, "learning_rate": 2.730713459774198e-06, "loss": 0.462, "step": 2270 }, { "epoch": 0.7669706180344478, "grad_norm": 1.9399747848510742, "learning_rate": 2.723206067659786e-06, "loss": 0.4639, "step": 2271 }, { "epoch": 0.7673083417764269, "grad_norm": 2.573512315750122, "learning_rate": 2.715707382592746e-06, "loss": 0.5357, "step": 2272 }, { "epoch": 0.767646065518406, "grad_norm": 2.0000314712524414, "learning_rate": 2.708217413545635e-06, "loss": 0.5463, "step": 2273 }, { "epoch": 0.767983789260385, "grad_norm": 1.7044312953948975, "learning_rate": 2.7007361694805735e-06, "loss": 0.517, "step": 2274 }, { "epoch": 0.7683215130023641, "grad_norm": 2.4844391345977783, "learning_rate": 2.6932636593492432e-06, "loss": 0.5238, "step": 2275 }, { "epoch": 0.7686592367443431, "grad_norm": 3.5157570838928223, "learning_rate": 2.685799892092872e-06, "loss": 0.4865, "step": 2276 }, { "epoch": 0.7689969604863222, "grad_norm": 1.6371835470199585, "learning_rate": 2.678344876642234e-06, "loss": 0.4942, "step": 2277 }, { "epoch": 0.7693346842283012, "grad_norm": 2.7867941856384277, "learning_rate": 2.670898621917629e-06, "loss": 0.4655, "step": 2278 }, { "epoch": 0.7696724079702804, "grad_norm": 2.195582866668701, "learning_rate": 2.663461136828871e-06, "loss": 0.5149, "step": 2279 }, { "epoch": 0.7700101317122594, "grad_norm": 2.0157830715179443, "learning_rate": 2.6560324302752825e-06, "loss": 0.4737, "step": 2280 }, { "epoch": 0.7703478554542385, "grad_norm": 2.328758955001831, "learning_rate": 2.6486125111456796e-06, "loss": 0.4894, "step": 2281 }, { "epoch": 0.7706855791962175, "grad_norm": 2.5709142684936523, "learning_rate": 2.64120138831837e-06, "loss": 0.4767, "step": 2282 }, { "epoch": 0.7710233029381965, "grad_norm": 1.6012747287750244, "learning_rate": 2.6337990706611303e-06, "loss": 0.5065, "step": 2283 }, { "epoch": 0.7713610266801756, "grad_norm": 2.185211420059204, "learning_rate": 2.6264055670312038e-06, "loss": 0.5125, "step": 2284 }, { "epoch": 0.7716987504221546, "grad_norm": 2.315382242202759, "learning_rate": 2.6190208862752865e-06, "loss": 0.4906, "step": 2285 }, { "epoch": 0.7720364741641338, "grad_norm": 1.6207542419433594, "learning_rate": 2.6116450372295145e-06, "loss": 0.5146, "step": 2286 }, { "epoch": 0.7723741979061128, "grad_norm": 1.9739338159561157, "learning_rate": 2.604278028719465e-06, "loss": 0.5254, "step": 2287 }, { "epoch": 0.7727119216480919, "grad_norm": 1.5837104320526123, "learning_rate": 2.5969198695601306e-06, "loss": 0.5098, "step": 2288 }, { "epoch": 0.7730496453900709, "grad_norm": 2.209446907043457, "learning_rate": 2.5895705685559138e-06, "loss": 0.5137, "step": 2289 }, { "epoch": 0.77338736913205, "grad_norm": 1.6970421075820923, "learning_rate": 2.5822301345006196e-06, "loss": 0.4803, "step": 2290 }, { "epoch": 0.773725092874029, "grad_norm": 1.7166054248809814, "learning_rate": 2.574898576177448e-06, "loss": 0.5297, "step": 2291 }, { "epoch": 0.7740628166160081, "grad_norm": 3.7177491188049316, "learning_rate": 2.5675759023589732e-06, "loss": 0.4461, "step": 2292 }, { "epoch": 0.7744005403579872, "grad_norm": 1.8223044872283936, "learning_rate": 2.560262121807139e-06, "loss": 0.4878, "step": 2293 }, { "epoch": 0.7747382640999663, "grad_norm": 1.7013657093048096, "learning_rate": 2.5529572432732473e-06, "loss": 0.5286, "step": 2294 }, { "epoch": 0.7750759878419453, "grad_norm": 3.030637502670288, "learning_rate": 2.545661275497955e-06, "loss": 0.4761, "step": 2295 }, { "epoch": 0.7754137115839244, "grad_norm": 1.552181363105774, "learning_rate": 2.53837422721125e-06, "loss": 0.4655, "step": 2296 }, { "epoch": 0.7757514353259034, "grad_norm": 1.7967276573181152, "learning_rate": 2.5310961071324448e-06, "loss": 0.4457, "step": 2297 }, { "epoch": 0.7760891590678825, "grad_norm": 1.9769643545150757, "learning_rate": 2.5238269239701816e-06, "loss": 0.506, "step": 2298 }, { "epoch": 0.7764268828098615, "grad_norm": 1.860569953918457, "learning_rate": 2.5165666864223936e-06, "loss": 0.4983, "step": 2299 }, { "epoch": 0.7767646065518405, "grad_norm": 2.172823667526245, "learning_rate": 2.5093154031763247e-06, "loss": 0.5232, "step": 2300 }, { "epoch": 0.7771023302938197, "grad_norm": 2.0324599742889404, "learning_rate": 2.5020730829084938e-06, "loss": 0.4922, "step": 2301 }, { "epoch": 0.7774400540357987, "grad_norm": 1.81693434715271, "learning_rate": 2.4948397342846985e-06, "loss": 0.4881, "step": 2302 }, { "epoch": 0.7777777777777778, "grad_norm": 5.63407564163208, "learning_rate": 2.48761536596e-06, "loss": 0.5067, "step": 2303 }, { "epoch": 0.7781155015197568, "grad_norm": 1.7347174882888794, "learning_rate": 2.480399986578721e-06, "loss": 0.5014, "step": 2304 }, { "epoch": 0.7784532252617359, "grad_norm": 1.8094494342803955, "learning_rate": 2.4731936047744198e-06, "loss": 0.5161, "step": 2305 }, { "epoch": 0.7787909490037149, "grad_norm": 1.8091762065887451, "learning_rate": 2.4659962291698936e-06, "loss": 0.4666, "step": 2306 }, { "epoch": 0.779128672745694, "grad_norm": 1.7516279220581055, "learning_rate": 2.4588078683771565e-06, "loss": 0.4743, "step": 2307 }, { "epoch": 0.7794663964876731, "grad_norm": 1.9064487218856812, "learning_rate": 2.45162853099745e-06, "loss": 0.4912, "step": 2308 }, { "epoch": 0.7798041202296522, "grad_norm": 1.737441062927246, "learning_rate": 2.4444582256212034e-06, "loss": 0.5165, "step": 2309 }, { "epoch": 0.7801418439716312, "grad_norm": 2.090681314468384, "learning_rate": 2.4372969608280483e-06, "loss": 0.5133, "step": 2310 }, { "epoch": 0.7804795677136103, "grad_norm": 2.0961406230926514, "learning_rate": 2.430144745186792e-06, "loss": 0.4723, "step": 2311 }, { "epoch": 0.7808172914555893, "grad_norm": 1.7331523895263672, "learning_rate": 2.4230015872554235e-06, "loss": 0.4866, "step": 2312 }, { "epoch": 0.7811550151975684, "grad_norm": 4.873265266418457, "learning_rate": 2.4158674955810836e-06, "loss": 0.4601, "step": 2313 }, { "epoch": 0.7814927389395474, "grad_norm": 1.8993254899978638, "learning_rate": 2.408742478700071e-06, "loss": 0.5106, "step": 2314 }, { "epoch": 0.7818304626815266, "grad_norm": 4.586630344390869, "learning_rate": 2.401626545137824e-06, "loss": 0.5119, "step": 2315 }, { "epoch": 0.7821681864235056, "grad_norm": 1.6898576021194458, "learning_rate": 2.3945197034089095e-06, "loss": 0.4942, "step": 2316 }, { "epoch": 0.7825059101654847, "grad_norm": 1.5354605913162231, "learning_rate": 2.3874219620170246e-06, "loss": 0.4801, "step": 2317 }, { "epoch": 0.7828436339074637, "grad_norm": 1.9221949577331543, "learning_rate": 2.3803333294549647e-06, "loss": 0.4882, "step": 2318 }, { "epoch": 0.7831813576494427, "grad_norm": 1.9564182758331299, "learning_rate": 2.373253814204639e-06, "loss": 0.5315, "step": 2319 }, { "epoch": 0.7835190813914218, "grad_norm": 2.559972047805786, "learning_rate": 2.3661834247370384e-06, "loss": 0.536, "step": 2320 }, { "epoch": 0.7838568051334008, "grad_norm": 1.6549112796783447, "learning_rate": 2.3591221695122337e-06, "loss": 0.4648, "step": 2321 }, { "epoch": 0.78419452887538, "grad_norm": 1.7167253494262695, "learning_rate": 2.352070056979375e-06, "loss": 0.5037, "step": 2322 }, { "epoch": 0.784532252617359, "grad_norm": 1.7285021543502808, "learning_rate": 2.345027095576666e-06, "loss": 0.5047, "step": 2323 }, { "epoch": 0.7848699763593381, "grad_norm": 1.8646498918533325, "learning_rate": 2.33799329373136e-06, "loss": 0.5093, "step": 2324 }, { "epoch": 0.7852077001013171, "grad_norm": 1.6724302768707275, "learning_rate": 2.3309686598597504e-06, "loss": 0.5047, "step": 2325 }, { "epoch": 0.7855454238432962, "grad_norm": 1.6705329418182373, "learning_rate": 2.3239532023671663e-06, "loss": 0.5006, "step": 2326 }, { "epoch": 0.7858831475852752, "grad_norm": 1.7959482669830322, "learning_rate": 2.316946929647952e-06, "loss": 0.4812, "step": 2327 }, { "epoch": 0.7862208713272543, "grad_norm": 2.063241720199585, "learning_rate": 2.309949850085462e-06, "loss": 0.4835, "step": 2328 }, { "epoch": 0.7865585950692333, "grad_norm": 1.7872635126113892, "learning_rate": 2.30296197205205e-06, "loss": 0.5006, "step": 2329 }, { "epoch": 0.7868963188112125, "grad_norm": 1.9559094905853271, "learning_rate": 2.295983303909065e-06, "loss": 0.5308, "step": 2330 }, { "epoch": 0.7872340425531915, "grad_norm": 2.1158254146575928, "learning_rate": 2.2890138540068297e-06, "loss": 0.4747, "step": 2331 }, { "epoch": 0.7875717662951706, "grad_norm": 1.998735785484314, "learning_rate": 2.2820536306846384e-06, "loss": 0.4783, "step": 2332 }, { "epoch": 0.7879094900371496, "grad_norm": 1.9355530738830566, "learning_rate": 2.2751026422707444e-06, "loss": 0.4982, "step": 2333 }, { "epoch": 0.7882472137791287, "grad_norm": 1.8180084228515625, "learning_rate": 2.2681608970823567e-06, "loss": 0.5087, "step": 2334 }, { "epoch": 0.7885849375211077, "grad_norm": 3.4800007343292236, "learning_rate": 2.2612284034256182e-06, "loss": 0.5037, "step": 2335 }, { "epoch": 0.7889226612630867, "grad_norm": 1.9249281883239746, "learning_rate": 2.254305169595604e-06, "loss": 0.4895, "step": 2336 }, { "epoch": 0.7892603850050659, "grad_norm": 4.9209675788879395, "learning_rate": 2.2473912038763066e-06, "loss": 0.5079, "step": 2337 }, { "epoch": 0.789598108747045, "grad_norm": 2.0414764881134033, "learning_rate": 2.2404865145406353e-06, "loss": 0.4847, "step": 2338 }, { "epoch": 0.789935832489024, "grad_norm": 1.793156623840332, "learning_rate": 2.2335911098503947e-06, "loss": 0.4908, "step": 2339 }, { "epoch": 0.790273556231003, "grad_norm": 1.6631263494491577, "learning_rate": 2.2267049980562772e-06, "loss": 0.4935, "step": 2340 }, { "epoch": 0.7906112799729821, "grad_norm": 1.5052499771118164, "learning_rate": 2.2198281873978643e-06, "loss": 0.4741, "step": 2341 }, { "epoch": 0.7909490037149611, "grad_norm": 1.8373688459396362, "learning_rate": 2.2129606861036003e-06, "loss": 0.5096, "step": 2342 }, { "epoch": 0.7912867274569402, "grad_norm": 1.5233027935028076, "learning_rate": 2.2061025023907968e-06, "loss": 0.4969, "step": 2343 }, { "epoch": 0.7916244511989193, "grad_norm": 1.7795177698135376, "learning_rate": 2.199253644465611e-06, "loss": 0.5257, "step": 2344 }, { "epoch": 0.7919621749408984, "grad_norm": 2.2098746299743652, "learning_rate": 2.192414120523043e-06, "loss": 0.4771, "step": 2345 }, { "epoch": 0.7922998986828774, "grad_norm": 1.5862585306167603, "learning_rate": 2.1855839387469237e-06, "loss": 0.474, "step": 2346 }, { "epoch": 0.7926376224248565, "grad_norm": 1.8672394752502441, "learning_rate": 2.17876310730991e-06, "loss": 0.4851, "step": 2347 }, { "epoch": 0.7929753461668355, "grad_norm": 1.5978671312332153, "learning_rate": 2.1719516343734672e-06, "loss": 0.5001, "step": 2348 }, { "epoch": 0.7933130699088146, "grad_norm": 1.8322932720184326, "learning_rate": 2.1651495280878597e-06, "loss": 0.4651, "step": 2349 }, { "epoch": 0.7936507936507936, "grad_norm": 1.8785189390182495, "learning_rate": 2.158356796592147e-06, "loss": 0.4684, "step": 2350 }, { "epoch": 0.7939885173927728, "grad_norm": 1.9102997779846191, "learning_rate": 2.151573448014177e-06, "loss": 0.5009, "step": 2351 }, { "epoch": 0.7943262411347518, "grad_norm": 1.6709588766098022, "learning_rate": 2.1447994904705614e-06, "loss": 0.468, "step": 2352 }, { "epoch": 0.7946639648767309, "grad_norm": 1.781628131866455, "learning_rate": 2.1380349320666795e-06, "loss": 0.5036, "step": 2353 }, { "epoch": 0.7950016886187099, "grad_norm": 2.2441275119781494, "learning_rate": 2.1312797808966625e-06, "loss": 0.5047, "step": 2354 }, { "epoch": 0.795339412360689, "grad_norm": 1.7883529663085938, "learning_rate": 2.1245340450433836e-06, "loss": 0.506, "step": 2355 }, { "epoch": 0.795677136102668, "grad_norm": 3.5914323329925537, "learning_rate": 2.1177977325784584e-06, "loss": 0.4745, "step": 2356 }, { "epoch": 0.796014859844647, "grad_norm": 1.645218014717102, "learning_rate": 2.111070851562218e-06, "loss": 0.4843, "step": 2357 }, { "epoch": 0.7963525835866262, "grad_norm": 1.6152302026748657, "learning_rate": 2.1043534100437123e-06, "loss": 0.4979, "step": 2358 }, { "epoch": 0.7966903073286052, "grad_norm": 1.5301105976104736, "learning_rate": 2.097645416060693e-06, "loss": 0.4824, "step": 2359 }, { "epoch": 0.7970280310705843, "grad_norm": 2.63529372215271, "learning_rate": 2.0909468776396148e-06, "loss": 0.5068, "step": 2360 }, { "epoch": 0.7973657548125633, "grad_norm": 1.5376330614089966, "learning_rate": 2.084257802795607e-06, "loss": 0.4423, "step": 2361 }, { "epoch": 0.7977034785545424, "grad_norm": 1.753307580947876, "learning_rate": 2.0775781995324886e-06, "loss": 0.4868, "step": 2362 }, { "epoch": 0.7980412022965214, "grad_norm": 1.721227765083313, "learning_rate": 2.0709080758427356e-06, "loss": 0.4919, "step": 2363 }, { "epoch": 0.7983789260385005, "grad_norm": 1.6664161682128906, "learning_rate": 2.064247439707482e-06, "loss": 0.4826, "step": 2364 }, { "epoch": 0.7987166497804795, "grad_norm": 3.313415288925171, "learning_rate": 2.057596299096515e-06, "loss": 0.5114, "step": 2365 }, { "epoch": 0.7990543735224587, "grad_norm": 2.170578718185425, "learning_rate": 2.0509546619682553e-06, "loss": 0.5042, "step": 2366 }, { "epoch": 0.7993920972644377, "grad_norm": 1.670846939086914, "learning_rate": 2.044322536269754e-06, "loss": 0.503, "step": 2367 }, { "epoch": 0.7997298210064168, "grad_norm": 2.0124552249908447, "learning_rate": 2.037699929936676e-06, "loss": 0.5369, "step": 2368 }, { "epoch": 0.8000675447483958, "grad_norm": 1.7125941514968872, "learning_rate": 2.031086850893309e-06, "loss": 0.4744, "step": 2369 }, { "epoch": 0.8004052684903749, "grad_norm": 2.475916624069214, "learning_rate": 2.024483307052526e-06, "loss": 0.5001, "step": 2370 }, { "epoch": 0.8007429922323539, "grad_norm": 1.8545215129852295, "learning_rate": 2.0178893063158e-06, "loss": 0.4903, "step": 2371 }, { "epoch": 0.801080715974333, "grad_norm": 1.9729602336883545, "learning_rate": 2.0113048565731787e-06, "loss": 0.4887, "step": 2372 }, { "epoch": 0.8014184397163121, "grad_norm": 1.8084174394607544, "learning_rate": 2.004729965703289e-06, "loss": 0.4558, "step": 2373 }, { "epoch": 0.8017561634582911, "grad_norm": 1.7405003309249878, "learning_rate": 1.9981646415733157e-06, "loss": 0.4869, "step": 2374 }, { "epoch": 0.8020938872002702, "grad_norm": 1.855022668838501, "learning_rate": 1.9916088920389955e-06, "loss": 0.4838, "step": 2375 }, { "epoch": 0.8024316109422492, "grad_norm": 2.1681766510009766, "learning_rate": 1.9850627249446075e-06, "loss": 0.4937, "step": 2376 }, { "epoch": 0.8027693346842283, "grad_norm": 2.0090715885162354, "learning_rate": 1.9785261481229745e-06, "loss": 0.4862, "step": 2377 }, { "epoch": 0.8031070584262073, "grad_norm": 1.5903204679489136, "learning_rate": 1.971999169395432e-06, "loss": 0.4425, "step": 2378 }, { "epoch": 0.8034447821681864, "grad_norm": 2.0983035564422607, "learning_rate": 1.965481796571839e-06, "loss": 0.4776, "step": 2379 }, { "epoch": 0.8037825059101655, "grad_norm": 1.548117756843567, "learning_rate": 1.9589740374505516e-06, "loss": 0.5013, "step": 2380 }, { "epoch": 0.8041202296521446, "grad_norm": 2.0063014030456543, "learning_rate": 1.952475899818437e-06, "loss": 0.4912, "step": 2381 }, { "epoch": 0.8044579533941236, "grad_norm": 1.978198766708374, "learning_rate": 1.945987391450833e-06, "loss": 0.5355, "step": 2382 }, { "epoch": 0.8047956771361027, "grad_norm": 1.9789292812347412, "learning_rate": 1.9395085201115727e-06, "loss": 0.5033, "step": 2383 }, { "epoch": 0.8051334008780817, "grad_norm": 2.2713143825531006, "learning_rate": 1.933039293552944e-06, "loss": 0.5095, "step": 2384 }, { "epoch": 0.8054711246200608, "grad_norm": 1.7236199378967285, "learning_rate": 1.9265797195156988e-06, "loss": 0.4994, "step": 2385 }, { "epoch": 0.8058088483620398, "grad_norm": 5.58459997177124, "learning_rate": 1.920129805729043e-06, "loss": 0.4953, "step": 2386 }, { "epoch": 0.806146572104019, "grad_norm": 2.09468674659729, "learning_rate": 1.9136895599106196e-06, "loss": 0.4799, "step": 2387 }, { "epoch": 0.806484295845998, "grad_norm": 1.8735566139221191, "learning_rate": 1.907258989766505e-06, "loss": 0.5066, "step": 2388 }, { "epoch": 0.8068220195879771, "grad_norm": 1.878137469291687, "learning_rate": 1.900838102991196e-06, "loss": 0.5266, "step": 2389 }, { "epoch": 0.8071597433299561, "grad_norm": 1.7696536779403687, "learning_rate": 1.8944269072676013e-06, "loss": 0.5122, "step": 2390 }, { "epoch": 0.8074974670719351, "grad_norm": 1.7407069206237793, "learning_rate": 1.8880254102670447e-06, "loss": 0.4793, "step": 2391 }, { "epoch": 0.8078351908139142, "grad_norm": 2.2034058570861816, "learning_rate": 1.8816336196492323e-06, "loss": 0.4818, "step": 2392 }, { "epoch": 0.8081729145558932, "grad_norm": 1.9851166009902954, "learning_rate": 1.8752515430622609e-06, "loss": 0.4941, "step": 2393 }, { "epoch": 0.8085106382978723, "grad_norm": 1.8734411001205444, "learning_rate": 1.8688791881426017e-06, "loss": 0.5005, "step": 2394 }, { "epoch": 0.8088483620398514, "grad_norm": 2.012382984161377, "learning_rate": 1.862516562515101e-06, "loss": 0.5021, "step": 2395 }, { "epoch": 0.8091860857818305, "grad_norm": 1.6459431648254395, "learning_rate": 1.8561636737929556e-06, "loss": 0.4526, "step": 2396 }, { "epoch": 0.8095238095238095, "grad_norm": 4.252561569213867, "learning_rate": 1.8498205295777128e-06, "loss": 0.4919, "step": 2397 }, { "epoch": 0.8098615332657886, "grad_norm": 2.5294301509857178, "learning_rate": 1.843487137459261e-06, "loss": 0.4601, "step": 2398 }, { "epoch": 0.8101992570077676, "grad_norm": 2.0782177448272705, "learning_rate": 1.8371635050158243e-06, "loss": 0.4687, "step": 2399 }, { "epoch": 0.8105369807497467, "grad_norm": 1.9377280473709106, "learning_rate": 1.830849639813942e-06, "loss": 0.4746, "step": 2400 }, { "epoch": 0.8108747044917257, "grad_norm": 2.042790412902832, "learning_rate": 1.8245455494084696e-06, "loss": 0.4667, "step": 2401 }, { "epoch": 0.8112124282337049, "grad_norm": 2.034383773803711, "learning_rate": 1.8182512413425624e-06, "loss": 0.4953, "step": 2402 }, { "epoch": 0.8115501519756839, "grad_norm": 1.7530754804611206, "learning_rate": 1.811966723147679e-06, "loss": 0.5257, "step": 2403 }, { "epoch": 0.811887875717663, "grad_norm": 1.9195135831832886, "learning_rate": 1.8056920023435599e-06, "loss": 0.4981, "step": 2404 }, { "epoch": 0.812225599459642, "grad_norm": 1.733148455619812, "learning_rate": 1.7994270864382202e-06, "loss": 0.4988, "step": 2405 }, { "epoch": 0.8125633232016211, "grad_norm": 1.8975133895874023, "learning_rate": 1.7931719829279448e-06, "loss": 0.5166, "step": 2406 }, { "epoch": 0.8129010469436001, "grad_norm": 1.9382555484771729, "learning_rate": 1.7869266992972755e-06, "loss": 0.4873, "step": 2407 }, { "epoch": 0.8132387706855791, "grad_norm": 2.071626663208008, "learning_rate": 1.7806912430190094e-06, "loss": 0.4848, "step": 2408 }, { "epoch": 0.8135764944275583, "grad_norm": 2.5242457389831543, "learning_rate": 1.77446562155418e-06, "loss": 0.5446, "step": 2409 }, { "epoch": 0.8139142181695374, "grad_norm": 1.488741159439087, "learning_rate": 1.7682498423520545e-06, "loss": 0.4654, "step": 2410 }, { "epoch": 0.8142519419115164, "grad_norm": 1.7139415740966797, "learning_rate": 1.7620439128501198e-06, "loss": 0.5215, "step": 2411 }, { "epoch": 0.8145896656534954, "grad_norm": 1.7172082662582397, "learning_rate": 1.7558478404740876e-06, "loss": 0.4882, "step": 2412 }, { "epoch": 0.8149273893954745, "grad_norm": 1.9238184690475464, "learning_rate": 1.7496616326378623e-06, "loss": 0.523, "step": 2413 }, { "epoch": 0.8152651131374535, "grad_norm": 1.675175666809082, "learning_rate": 1.7434852967435523e-06, "loss": 0.502, "step": 2414 }, { "epoch": 0.8156028368794326, "grad_norm": 1.6569234132766724, "learning_rate": 1.73731884018145e-06, "loss": 0.4966, "step": 2415 }, { "epoch": 0.8159405606214117, "grad_norm": 1.8827682733535767, "learning_rate": 1.7311622703300324e-06, "loss": 0.4962, "step": 2416 }, { "epoch": 0.8162782843633908, "grad_norm": 4.2671003341674805, "learning_rate": 1.725015594555941e-06, "loss": 0.5155, "step": 2417 }, { "epoch": 0.8166160081053698, "grad_norm": 3.7091119289398193, "learning_rate": 1.7188788202139794e-06, "loss": 0.5057, "step": 2418 }, { "epoch": 0.8169537318473489, "grad_norm": 2.4014389514923096, "learning_rate": 1.7127519546471039e-06, "loss": 0.5208, "step": 2419 }, { "epoch": 0.8172914555893279, "grad_norm": 3.108933448791504, "learning_rate": 1.706635005186419e-06, "loss": 0.5161, "step": 2420 }, { "epoch": 0.817629179331307, "grad_norm": 1.8535352945327759, "learning_rate": 1.7005279791511575e-06, "loss": 0.4657, "step": 2421 }, { "epoch": 0.817966903073286, "grad_norm": 1.8475009202957153, "learning_rate": 1.6944308838486823e-06, "loss": 0.5056, "step": 2422 }, { "epoch": 0.8183046268152652, "grad_norm": 1.789693832397461, "learning_rate": 1.6883437265744717e-06, "loss": 0.5219, "step": 2423 }, { "epoch": 0.8186423505572442, "grad_norm": 1.743080496788025, "learning_rate": 1.6822665146121076e-06, "loss": 0.4986, "step": 2424 }, { "epoch": 0.8189800742992233, "grad_norm": 2.1451499462127686, "learning_rate": 1.6761992552332884e-06, "loss": 0.4888, "step": 2425 }, { "epoch": 0.8193177980412023, "grad_norm": 1.780124545097351, "learning_rate": 1.6701419556977882e-06, "loss": 0.5123, "step": 2426 }, { "epoch": 0.8196555217831814, "grad_norm": 1.7464144229888916, "learning_rate": 1.6640946232534682e-06, "loss": 0.4835, "step": 2427 }, { "epoch": 0.8199932455251604, "grad_norm": 1.7431195974349976, "learning_rate": 1.6580572651362636e-06, "loss": 0.4957, "step": 2428 }, { "epoch": 0.8203309692671394, "grad_norm": 1.85854971408844, "learning_rate": 1.6520298885701714e-06, "loss": 0.4902, "step": 2429 }, { "epoch": 0.8206686930091185, "grad_norm": 1.770286202430725, "learning_rate": 1.6460125007672556e-06, "loss": 0.4995, "step": 2430 }, { "epoch": 0.8210064167510976, "grad_norm": 2.941640853881836, "learning_rate": 1.640005108927618e-06, "loss": 0.5156, "step": 2431 }, { "epoch": 0.8213441404930767, "grad_norm": 1.7239917516708374, "learning_rate": 1.6340077202394033e-06, "loss": 0.5038, "step": 2432 }, { "epoch": 0.8216818642350557, "grad_norm": 1.830190896987915, "learning_rate": 1.6280203418787832e-06, "loss": 0.5025, "step": 2433 }, { "epoch": 0.8220195879770348, "grad_norm": 1.977504849433899, "learning_rate": 1.6220429810099603e-06, "loss": 0.4771, "step": 2434 }, { "epoch": 0.8223573117190138, "grad_norm": 1.7216362953186035, "learning_rate": 1.6160756447851432e-06, "loss": 0.5082, "step": 2435 }, { "epoch": 0.8226950354609929, "grad_norm": 1.7008848190307617, "learning_rate": 1.6101183403445475e-06, "loss": 0.4836, "step": 2436 }, { "epoch": 0.8230327592029719, "grad_norm": 1.9677833318710327, "learning_rate": 1.604171074816384e-06, "loss": 0.4995, "step": 2437 }, { "epoch": 0.8233704829449511, "grad_norm": 1.8812280893325806, "learning_rate": 1.5982338553168563e-06, "loss": 0.4821, "step": 2438 }, { "epoch": 0.8237082066869301, "grad_norm": 1.7746137380599976, "learning_rate": 1.5923066889501415e-06, "loss": 0.4851, "step": 2439 }, { "epoch": 0.8240459304289092, "grad_norm": 1.6402124166488647, "learning_rate": 1.586389582808392e-06, "loss": 0.4943, "step": 2440 }, { "epoch": 0.8243836541708882, "grad_norm": 2.0051774978637695, "learning_rate": 1.5804825439717165e-06, "loss": 0.4804, "step": 2441 }, { "epoch": 0.8247213779128673, "grad_norm": 1.4221009016036987, "learning_rate": 1.5745855795081889e-06, "loss": 0.4571, "step": 2442 }, { "epoch": 0.8250591016548463, "grad_norm": 1.6807363033294678, "learning_rate": 1.5686986964738171e-06, "loss": 0.4795, "step": 2443 }, { "epoch": 0.8253968253968254, "grad_norm": 1.7401728630065918, "learning_rate": 1.5628219019125512e-06, "loss": 0.4605, "step": 2444 }, { "epoch": 0.8257345491388045, "grad_norm": 1.80463707447052, "learning_rate": 1.5569552028562684e-06, "loss": 0.4592, "step": 2445 }, { "epoch": 0.8260722728807836, "grad_norm": 1.6439859867095947, "learning_rate": 1.551098606324768e-06, "loss": 0.4827, "step": 2446 }, { "epoch": 0.8264099966227626, "grad_norm": 2.8343183994293213, "learning_rate": 1.5452521193257641e-06, "loss": 0.5011, "step": 2447 }, { "epoch": 0.8267477203647416, "grad_norm": 3.17095685005188, "learning_rate": 1.5394157488548657e-06, "loss": 0.4866, "step": 2448 }, { "epoch": 0.8270854441067207, "grad_norm": 1.5953701734542847, "learning_rate": 1.5335895018955838e-06, "loss": 0.4395, "step": 2449 }, { "epoch": 0.8274231678486997, "grad_norm": 1.7234811782836914, "learning_rate": 1.527773385419311e-06, "loss": 0.5054, "step": 2450 }, { "epoch": 0.8277608915906788, "grad_norm": 1.6750421524047852, "learning_rate": 1.5219674063853252e-06, "loss": 0.4702, "step": 2451 }, { "epoch": 0.8280986153326579, "grad_norm": 1.6545888185501099, "learning_rate": 1.5161715717407687e-06, "loss": 0.4921, "step": 2452 }, { "epoch": 0.828436339074637, "grad_norm": 1.7004224061965942, "learning_rate": 1.5103858884206457e-06, "loss": 0.4905, "step": 2453 }, { "epoch": 0.828774062816616, "grad_norm": 1.7643380165100098, "learning_rate": 1.5046103633478148e-06, "loss": 0.4633, "step": 2454 }, { "epoch": 0.8291117865585951, "grad_norm": 1.792162299156189, "learning_rate": 1.4988450034329838e-06, "loss": 0.4814, "step": 2455 }, { "epoch": 0.8294495103005741, "grad_norm": 1.8866151571273804, "learning_rate": 1.4930898155746921e-06, "loss": 0.4871, "step": 2456 }, { "epoch": 0.8297872340425532, "grad_norm": 1.9470422267913818, "learning_rate": 1.4873448066593089e-06, "loss": 0.4771, "step": 2457 }, { "epoch": 0.8301249577845322, "grad_norm": 2.028930425643921, "learning_rate": 1.4816099835610209e-06, "loss": 0.492, "step": 2458 }, { "epoch": 0.8304626815265113, "grad_norm": 1.735535979270935, "learning_rate": 1.4758853531418372e-06, "loss": 0.4615, "step": 2459 }, { "epoch": 0.8308004052684904, "grad_norm": 1.8996124267578125, "learning_rate": 1.4701709222515604e-06, "loss": 0.5141, "step": 2460 }, { "epoch": 0.8311381290104695, "grad_norm": 1.819139003753662, "learning_rate": 1.4644666977277922e-06, "loss": 0.4885, "step": 2461 }, { "epoch": 0.8314758527524485, "grad_norm": 1.6537361145019531, "learning_rate": 1.4587726863959239e-06, "loss": 0.5058, "step": 2462 }, { "epoch": 0.8318135764944276, "grad_norm": 2.849712371826172, "learning_rate": 1.4530888950691202e-06, "loss": 0.4852, "step": 2463 }, { "epoch": 0.8321513002364066, "grad_norm": 1.799085021018982, "learning_rate": 1.4474153305483264e-06, "loss": 0.5279, "step": 2464 }, { "epoch": 0.8324890239783856, "grad_norm": 1.6352843046188354, "learning_rate": 1.441751999622244e-06, "loss": 0.4643, "step": 2465 }, { "epoch": 0.8328267477203647, "grad_norm": 1.6692978143692017, "learning_rate": 1.4360989090673284e-06, "loss": 0.483, "step": 2466 }, { "epoch": 0.8331644714623438, "grad_norm": 3.680781841278076, "learning_rate": 1.4304560656477895e-06, "loss": 0.4841, "step": 2467 }, { "epoch": 0.8335021952043229, "grad_norm": 1.8535411357879639, "learning_rate": 1.4248234761155676e-06, "loss": 0.4809, "step": 2468 }, { "epoch": 0.8338399189463019, "grad_norm": 1.8995721340179443, "learning_rate": 1.4192011472103407e-06, "loss": 0.4668, "step": 2469 }, { "epoch": 0.834177642688281, "grad_norm": 2.4377171993255615, "learning_rate": 1.4135890856595047e-06, "loss": 0.4938, "step": 2470 }, { "epoch": 0.83451536643026, "grad_norm": 1.6863787174224854, "learning_rate": 1.4079872981781706e-06, "loss": 0.5012, "step": 2471 }, { "epoch": 0.8348530901722391, "grad_norm": 1.677014708518982, "learning_rate": 1.4023957914691565e-06, "loss": 0.4766, "step": 2472 }, { "epoch": 0.8351908139142181, "grad_norm": 1.7441376447677612, "learning_rate": 1.396814572222982e-06, "loss": 0.4927, "step": 2473 }, { "epoch": 0.8355285376561973, "grad_norm": 2.109144449234009, "learning_rate": 1.3912436471178525e-06, "loss": 0.513, "step": 2474 }, { "epoch": 0.8358662613981763, "grad_norm": 2.2988452911376953, "learning_rate": 1.3856830228196593e-06, "loss": 0.508, "step": 2475 }, { "epoch": 0.8362039851401554, "grad_norm": 2.18215012550354, "learning_rate": 1.3801327059819647e-06, "loss": 0.4992, "step": 2476 }, { "epoch": 0.8365417088821344, "grad_norm": 1.9162951707839966, "learning_rate": 1.3745927032460038e-06, "loss": 0.534, "step": 2477 }, { "epoch": 0.8368794326241135, "grad_norm": 1.8726035356521606, "learning_rate": 1.3690630212406653e-06, "loss": 0.4648, "step": 2478 }, { "epoch": 0.8372171563660925, "grad_norm": 1.5579783916473389, "learning_rate": 1.3635436665824886e-06, "loss": 0.4701, "step": 2479 }, { "epoch": 0.8375548801080716, "grad_norm": 3.252487897872925, "learning_rate": 1.3580346458756554e-06, "loss": 0.4997, "step": 2480 }, { "epoch": 0.8378926038500507, "grad_norm": 2.0345444679260254, "learning_rate": 1.3525359657119897e-06, "loss": 0.4913, "step": 2481 }, { "epoch": 0.8382303275920298, "grad_norm": 1.746577262878418, "learning_rate": 1.3470476326709337e-06, "loss": 0.5405, "step": 2482 }, { "epoch": 0.8385680513340088, "grad_norm": 1.792932391166687, "learning_rate": 1.341569653319551e-06, "loss": 0.5357, "step": 2483 }, { "epoch": 0.8389057750759878, "grad_norm": 1.6897506713867188, "learning_rate": 1.3361020342125176e-06, "loss": 0.5001, "step": 2484 }, { "epoch": 0.8392434988179669, "grad_norm": 1.7498420476913452, "learning_rate": 1.3306447818921154e-06, "loss": 0.4884, "step": 2485 }, { "epoch": 0.8395812225599459, "grad_norm": 1.9979479312896729, "learning_rate": 1.3251979028882179e-06, "loss": 0.4942, "step": 2486 }, { "epoch": 0.839918946301925, "grad_norm": 1.800220012664795, "learning_rate": 1.319761403718286e-06, "loss": 0.4866, "step": 2487 }, { "epoch": 0.8402566700439041, "grad_norm": 1.840551495552063, "learning_rate": 1.3143352908873663e-06, "loss": 0.4757, "step": 2488 }, { "epoch": 0.8405943937858832, "grad_norm": 1.8712488412857056, "learning_rate": 1.3089195708880686e-06, "loss": 0.471, "step": 2489 }, { "epoch": 0.8409321175278622, "grad_norm": 2.0839338302612305, "learning_rate": 1.3035142502005792e-06, "loss": 0.5401, "step": 2490 }, { "epoch": 0.8412698412698413, "grad_norm": 1.7275350093841553, "learning_rate": 1.2981193352926292e-06, "loss": 0.487, "step": 2491 }, { "epoch": 0.8416075650118203, "grad_norm": 1.62874436378479, "learning_rate": 1.2927348326195066e-06, "loss": 0.4952, "step": 2492 }, { "epoch": 0.8419452887537994, "grad_norm": 1.7364107370376587, "learning_rate": 1.2873607486240325e-06, "loss": 0.4888, "step": 2493 }, { "epoch": 0.8422830124957784, "grad_norm": 1.8485337495803833, "learning_rate": 1.281997089736574e-06, "loss": 0.5051, "step": 2494 }, { "epoch": 0.8426207362377575, "grad_norm": 2.2751340866088867, "learning_rate": 1.2766438623750133e-06, "loss": 0.4577, "step": 2495 }, { "epoch": 0.8429584599797366, "grad_norm": 1.9767004251480103, "learning_rate": 1.271301072944754e-06, "loss": 0.4783, "step": 2496 }, { "epoch": 0.8432961837217157, "grad_norm": 1.7066822052001953, "learning_rate": 1.2659687278387113e-06, "loss": 0.5151, "step": 2497 }, { "epoch": 0.8436339074636947, "grad_norm": 1.7063212394714355, "learning_rate": 1.2606468334373e-06, "loss": 0.5001, "step": 2498 }, { "epoch": 0.8439716312056738, "grad_norm": 1.7792918682098389, "learning_rate": 1.2553353961084358e-06, "loss": 0.5504, "step": 2499 }, { "epoch": 0.8443093549476528, "grad_norm": 1.5470627546310425, "learning_rate": 1.2500344222075189e-06, "loss": 0.4771, "step": 2500 }, { "epoch": 0.8446470786896318, "grad_norm": 1.8461023569107056, "learning_rate": 1.244743918077428e-06, "loss": 0.4959, "step": 2501 }, { "epoch": 0.8449848024316109, "grad_norm": 1.8302757740020752, "learning_rate": 1.2394638900485124e-06, "loss": 0.5127, "step": 2502 }, { "epoch": 0.84532252617359, "grad_norm": 1.8153729438781738, "learning_rate": 1.2341943444385951e-06, "loss": 0.4987, "step": 2503 }, { "epoch": 0.8456602499155691, "grad_norm": 1.7092945575714111, "learning_rate": 1.2289352875529482e-06, "loss": 0.4989, "step": 2504 }, { "epoch": 0.8459979736575481, "grad_norm": 1.9017432928085327, "learning_rate": 1.2236867256842955e-06, "loss": 0.5138, "step": 2505 }, { "epoch": 0.8463356973995272, "grad_norm": 2.013453960418701, "learning_rate": 1.2184486651128014e-06, "loss": 0.511, "step": 2506 }, { "epoch": 0.8466734211415062, "grad_norm": 1.63774836063385, "learning_rate": 1.213221112106071e-06, "loss": 0.4989, "step": 2507 }, { "epoch": 0.8470111448834853, "grad_norm": 2.077888250350952, "learning_rate": 1.2080040729191289e-06, "loss": 0.5082, "step": 2508 }, { "epoch": 0.8473488686254643, "grad_norm": 1.891480803489685, "learning_rate": 1.202797553794428e-06, "loss": 0.5019, "step": 2509 }, { "epoch": 0.8476865923674435, "grad_norm": 2.3117117881774902, "learning_rate": 1.197601560961824e-06, "loss": 0.5001, "step": 2510 }, { "epoch": 0.8480243161094225, "grad_norm": 1.6955264806747437, "learning_rate": 1.1924161006385815e-06, "loss": 0.5009, "step": 2511 }, { "epoch": 0.8483620398514016, "grad_norm": 1.8722612857818604, "learning_rate": 1.187241179029367e-06, "loss": 0.5049, "step": 2512 }, { "epoch": 0.8486997635933806, "grad_norm": 1.6086488962173462, "learning_rate": 1.1820768023262297e-06, "loss": 0.5009, "step": 2513 }, { "epoch": 0.8490374873353597, "grad_norm": 1.647614598274231, "learning_rate": 1.1769229767086053e-06, "loss": 0.4721, "step": 2514 }, { "epoch": 0.8493752110773387, "grad_norm": 1.7849152088165283, "learning_rate": 1.1717797083433002e-06, "loss": 0.4701, "step": 2515 }, { "epoch": 0.8497129348193178, "grad_norm": 2.6170690059661865, "learning_rate": 1.1666470033844968e-06, "loss": 0.5114, "step": 2516 }, { "epoch": 0.8500506585612969, "grad_norm": 1.7553811073303223, "learning_rate": 1.1615248679737312e-06, "loss": 0.4684, "step": 2517 }, { "epoch": 0.850388382303276, "grad_norm": 1.9082300662994385, "learning_rate": 1.1564133082398942e-06, "loss": 0.5186, "step": 2518 }, { "epoch": 0.850726106045255, "grad_norm": 1.990812063217163, "learning_rate": 1.151312330299219e-06, "loss": 0.507, "step": 2519 }, { "epoch": 0.851063829787234, "grad_norm": 1.902625322341919, "learning_rate": 1.1462219402552855e-06, "loss": 0.5006, "step": 2520 }, { "epoch": 0.8514015535292131, "grad_norm": 1.863521695137024, "learning_rate": 1.1411421441989979e-06, "loss": 0.4663, "step": 2521 }, { "epoch": 0.8517392772711921, "grad_norm": 1.8939602375030518, "learning_rate": 1.1360729482085852e-06, "loss": 0.4754, "step": 2522 }, { "epoch": 0.8520770010131712, "grad_norm": 1.7583485841751099, "learning_rate": 1.1310143583495926e-06, "loss": 0.4755, "step": 2523 }, { "epoch": 0.8524147247551502, "grad_norm": 4.704648494720459, "learning_rate": 1.1259663806748776e-06, "loss": 0.5197, "step": 2524 }, { "epoch": 0.8527524484971294, "grad_norm": 1.8820457458496094, "learning_rate": 1.1209290212245972e-06, "loss": 0.4935, "step": 2525 }, { "epoch": 0.8530901722391084, "grad_norm": 4.306023120880127, "learning_rate": 1.1159022860262036e-06, "loss": 0.4799, "step": 2526 }, { "epoch": 0.8534278959810875, "grad_norm": 1.5720821619033813, "learning_rate": 1.1108861810944338e-06, "loss": 0.474, "step": 2527 }, { "epoch": 0.8537656197230665, "grad_norm": 2.139662981033325, "learning_rate": 1.1058807124313132e-06, "loss": 0.522, "step": 2528 }, { "epoch": 0.8541033434650456, "grad_norm": 1.5107022523880005, "learning_rate": 1.1008858860261307e-06, "loss": 0.5044, "step": 2529 }, { "epoch": 0.8544410672070246, "grad_norm": 1.8529784679412842, "learning_rate": 1.0959017078554458e-06, "loss": 0.4608, "step": 2530 }, { "epoch": 0.8547787909490037, "grad_norm": 2.06048846244812, "learning_rate": 1.0909281838830787e-06, "loss": 0.4805, "step": 2531 }, { "epoch": 0.8551165146909828, "grad_norm": 2.7192764282226562, "learning_rate": 1.0859653200600972e-06, "loss": 0.4837, "step": 2532 }, { "epoch": 0.8554542384329619, "grad_norm": 1.7612273693084717, "learning_rate": 1.0810131223248177e-06, "loss": 0.4556, "step": 2533 }, { "epoch": 0.8557919621749409, "grad_norm": 1.9578105211257935, "learning_rate": 1.0760715966027923e-06, "loss": 0.4578, "step": 2534 }, { "epoch": 0.85612968591692, "grad_norm": 1.8702048063278198, "learning_rate": 1.0711407488068014e-06, "loss": 0.4809, "step": 2535 }, { "epoch": 0.856467409658899, "grad_norm": 1.7851426601409912, "learning_rate": 1.0662205848368522e-06, "loss": 0.4988, "step": 2536 }, { "epoch": 0.856805133400878, "grad_norm": 2.3286616802215576, "learning_rate": 1.0613111105801633e-06, "loss": 0.4748, "step": 2537 }, { "epoch": 0.8571428571428571, "grad_norm": 2.0510294437408447, "learning_rate": 1.0564123319111708e-06, "loss": 0.4737, "step": 2538 }, { "epoch": 0.8574805808848363, "grad_norm": 1.6554745435714722, "learning_rate": 1.051524254691505e-06, "loss": 0.4693, "step": 2539 }, { "epoch": 0.8578183046268153, "grad_norm": 2.764592170715332, "learning_rate": 1.0466468847699962e-06, "loss": 0.4772, "step": 2540 }, { "epoch": 0.8581560283687943, "grad_norm": 1.5182468891143799, "learning_rate": 1.0417802279826572e-06, "loss": 0.4803, "step": 2541 }, { "epoch": 0.8584937521107734, "grad_norm": 2.047261953353882, "learning_rate": 1.036924290152691e-06, "loss": 0.4925, "step": 2542 }, { "epoch": 0.8588314758527524, "grad_norm": 2.6357977390289307, "learning_rate": 1.0320790770904676e-06, "loss": 0.4781, "step": 2543 }, { "epoch": 0.8591691995947315, "grad_norm": 1.919888973236084, "learning_rate": 1.027244594593526e-06, "loss": 0.5022, "step": 2544 }, { "epoch": 0.8595069233367105, "grad_norm": 1.5964407920837402, "learning_rate": 1.0224208484465648e-06, "loss": 0.4848, "step": 2545 }, { "epoch": 0.8598446470786897, "grad_norm": 1.9345206022262573, "learning_rate": 1.017607844421441e-06, "loss": 0.4691, "step": 2546 }, { "epoch": 0.8601823708206687, "grad_norm": 1.597398042678833, "learning_rate": 1.0128055882771515e-06, "loss": 0.4946, "step": 2547 }, { "epoch": 0.8605200945626478, "grad_norm": 1.6686010360717773, "learning_rate": 1.0080140857598352e-06, "loss": 0.4933, "step": 2548 }, { "epoch": 0.8608578183046268, "grad_norm": 1.7743228673934937, "learning_rate": 1.0032333426027641e-06, "loss": 0.4729, "step": 2549 }, { "epoch": 0.8611955420466059, "grad_norm": 1.8585344552993774, "learning_rate": 9.984633645263386e-07, "loss": 0.5514, "step": 2550 }, { "epoch": 0.8615332657885849, "grad_norm": 1.498280644416809, "learning_rate": 9.937041572380724e-07, "loss": 0.4793, "step": 2551 }, { "epoch": 0.861870989530564, "grad_norm": 1.664556622505188, "learning_rate": 9.889557264325978e-07, "loss": 0.4715, "step": 2552 }, { "epoch": 0.8622087132725431, "grad_norm": 4.9264140129089355, "learning_rate": 9.842180777916487e-07, "loss": 0.5132, "step": 2553 }, { "epoch": 0.8625464370145222, "grad_norm": 1.8148272037506104, "learning_rate": 9.794912169840564e-07, "loss": 0.4689, "step": 2554 }, { "epoch": 0.8628841607565012, "grad_norm": 2.0997962951660156, "learning_rate": 9.747751496657488e-07, "loss": 0.4983, "step": 2555 }, { "epoch": 0.8632218844984803, "grad_norm": 2.0593008995056152, "learning_rate": 9.700698814797338e-07, "loss": 0.5083, "step": 2556 }, { "epoch": 0.8635596082404593, "grad_norm": 2.4295730590820312, "learning_rate": 9.653754180561014e-07, "loss": 0.4842, "step": 2557 }, { "epoch": 0.8638973319824383, "grad_norm": 2.1231155395507812, "learning_rate": 9.606917650120084e-07, "loss": 0.5003, "step": 2558 }, { "epoch": 0.8642350557244174, "grad_norm": 2.0425164699554443, "learning_rate": 9.56018927951684e-07, "loss": 0.4756, "step": 2559 }, { "epoch": 0.8645727794663964, "grad_norm": 1.9818636178970337, "learning_rate": 9.51356912466408e-07, "loss": 0.4558, "step": 2560 }, { "epoch": 0.8649105032083756, "grad_norm": 1.8112746477127075, "learning_rate": 9.467057241345168e-07, "loss": 0.4871, "step": 2561 }, { "epoch": 0.8652482269503546, "grad_norm": 2.0262632369995117, "learning_rate": 9.420653685213854e-07, "loss": 0.4797, "step": 2562 }, { "epoch": 0.8655859506923337, "grad_norm": 1.7880816459655762, "learning_rate": 9.374358511794368e-07, "loss": 0.4877, "step": 2563 }, { "epoch": 0.8659236744343127, "grad_norm": 1.962664008140564, "learning_rate": 9.328171776481165e-07, "loss": 0.5035, "step": 2564 }, { "epoch": 0.8662613981762918, "grad_norm": 1.5527554750442505, "learning_rate": 9.282093534538994e-07, "loss": 0.4677, "step": 2565 }, { "epoch": 0.8665991219182708, "grad_norm": 1.7754064798355103, "learning_rate": 9.236123841102762e-07, "loss": 0.485, "step": 2566 }, { "epoch": 0.8669368456602499, "grad_norm": 1.7187808752059937, "learning_rate": 9.190262751177481e-07, "loss": 0.495, "step": 2567 }, { "epoch": 0.867274569402229, "grad_norm": 3.302588939666748, "learning_rate": 9.144510319638278e-07, "loss": 0.4825, "step": 2568 }, { "epoch": 0.8676122931442081, "grad_norm": 1.9861351251602173, "learning_rate": 9.098866601230216e-07, "loss": 0.5026, "step": 2569 }, { "epoch": 0.8679500168861871, "grad_norm": 1.9290409088134766, "learning_rate": 9.053331650568264e-07, "loss": 0.4769, "step": 2570 }, { "epoch": 0.8682877406281662, "grad_norm": 2.637110710144043, "learning_rate": 9.00790552213725e-07, "loss": 0.4716, "step": 2571 }, { "epoch": 0.8686254643701452, "grad_norm": 1.7755980491638184, "learning_rate": 8.962588270291839e-07, "loss": 0.5049, "step": 2572 }, { "epoch": 0.8689631881121243, "grad_norm": 3.1036875247955322, "learning_rate": 8.917379949256388e-07, "loss": 0.521, "step": 2573 }, { "epoch": 0.8693009118541033, "grad_norm": 4.43895149230957, "learning_rate": 8.872280613124895e-07, "loss": 0.4675, "step": 2574 }, { "epoch": 0.8696386355960825, "grad_norm": 1.7383559942245483, "learning_rate": 8.827290315860981e-07, "loss": 0.5139, "step": 2575 }, { "epoch": 0.8699763593380615, "grad_norm": 1.6562116146087646, "learning_rate": 8.782409111297752e-07, "loss": 0.5071, "step": 2576 }, { "epoch": 0.8703140830800405, "grad_norm": 2.228973627090454, "learning_rate": 8.737637053137849e-07, "loss": 0.5218, "step": 2577 }, { "epoch": 0.8706518068220196, "grad_norm": 1.857865810394287, "learning_rate": 8.692974194953263e-07, "loss": 0.5319, "step": 2578 }, { "epoch": 0.8709895305639986, "grad_norm": 1.6849925518035889, "learning_rate": 8.648420590185325e-07, "loss": 0.4715, "step": 2579 }, { "epoch": 0.8713272543059777, "grad_norm": 1.8772845268249512, "learning_rate": 8.603976292144644e-07, "loss": 0.4533, "step": 2580 }, { "epoch": 0.8716649780479567, "grad_norm": 1.758715033531189, "learning_rate": 8.559641354011072e-07, "loss": 0.4785, "step": 2581 }, { "epoch": 0.8720027017899359, "grad_norm": 1.8679993152618408, "learning_rate": 8.515415828833562e-07, "loss": 0.5063, "step": 2582 }, { "epoch": 0.8723404255319149, "grad_norm": 1.9027328491210938, "learning_rate": 8.47129976953015e-07, "loss": 0.5086, "step": 2583 }, { "epoch": 0.872678149273894, "grad_norm": 1.7995573282241821, "learning_rate": 8.4272932288879e-07, "loss": 0.5032, "step": 2584 }, { "epoch": 0.873015873015873, "grad_norm": 1.9042366743087769, "learning_rate": 8.38339625956286e-07, "loss": 0.4993, "step": 2585 }, { "epoch": 0.8733535967578521, "grad_norm": 30.520692825317383, "learning_rate": 8.339608914079944e-07, "loss": 0.4893, "step": 2586 }, { "epoch": 0.8736913204998311, "grad_norm": 1.9027208089828491, "learning_rate": 8.295931244832888e-07, "loss": 0.488, "step": 2587 }, { "epoch": 0.8740290442418102, "grad_norm": 1.6071308851242065, "learning_rate": 8.252363304084199e-07, "loss": 0.516, "step": 2588 }, { "epoch": 0.8743667679837892, "grad_norm": 1.700516700744629, "learning_rate": 8.208905143965107e-07, "loss": 0.4892, "step": 2589 }, { "epoch": 0.8747044917257684, "grad_norm": 1.640088438987732, "learning_rate": 8.165556816475462e-07, "loss": 0.5157, "step": 2590 }, { "epoch": 0.8750422154677474, "grad_norm": 2.074371814727783, "learning_rate": 8.122318373483717e-07, "loss": 0.4933, "step": 2591 }, { "epoch": 0.8753799392097265, "grad_norm": 1.6877079010009766, "learning_rate": 8.0791898667268e-07, "loss": 0.4886, "step": 2592 }, { "epoch": 0.8757176629517055, "grad_norm": 1.6663894653320312, "learning_rate": 8.036171347810129e-07, "loss": 0.5036, "step": 2593 }, { "epoch": 0.8760553866936845, "grad_norm": 5.5421528816223145, "learning_rate": 7.993262868207552e-07, "loss": 0.5114, "step": 2594 }, { "epoch": 0.8763931104356636, "grad_norm": 1.682059645652771, "learning_rate": 7.950464479261177e-07, "loss": 0.4864, "step": 2595 }, { "epoch": 0.8767308341776426, "grad_norm": 1.8965541124343872, "learning_rate": 7.907776232181408e-07, "loss": 0.4631, "step": 2596 }, { "epoch": 0.8770685579196218, "grad_norm": 1.5753380060195923, "learning_rate": 7.865198178046862e-07, "loss": 0.4782, "step": 2597 }, { "epoch": 0.8774062816616008, "grad_norm": 2.039400577545166, "learning_rate": 7.822730367804332e-07, "loss": 0.4827, "step": 2598 }, { "epoch": 0.8777440054035799, "grad_norm": 1.7181869745254517, "learning_rate": 7.780372852268669e-07, "loss": 0.4384, "step": 2599 }, { "epoch": 0.8780817291455589, "grad_norm": 1.6570184230804443, "learning_rate": 7.738125682122732e-07, "loss": 0.4865, "step": 2600 }, { "epoch": 0.878419452887538, "grad_norm": 1.7039506435394287, "learning_rate": 7.695988907917374e-07, "loss": 0.494, "step": 2601 }, { "epoch": 0.878757176629517, "grad_norm": 1.7144337892532349, "learning_rate": 7.653962580071384e-07, "loss": 0.4731, "step": 2602 }, { "epoch": 0.8790949003714961, "grad_norm": 1.532623529434204, "learning_rate": 7.612046748871327e-07, "loss": 0.4684, "step": 2603 }, { "epoch": 0.8794326241134752, "grad_norm": 1.55143141746521, "learning_rate": 7.570241464471606e-07, "loss": 0.4867, "step": 2604 }, { "epoch": 0.8797703478554543, "grad_norm": 2.081272840499878, "learning_rate": 7.52854677689433e-07, "loss": 0.5052, "step": 2605 }, { "epoch": 0.8801080715974333, "grad_norm": 2.5801894664764404, "learning_rate": 7.486962736029247e-07, "loss": 0.4907, "step": 2606 }, { "epoch": 0.8804457953394124, "grad_norm": 1.8188486099243164, "learning_rate": 7.445489391633775e-07, "loss": 0.4913, "step": 2607 }, { "epoch": 0.8807835190813914, "grad_norm": 1.758267879486084, "learning_rate": 7.404126793332845e-07, "loss": 0.4913, "step": 2608 }, { "epoch": 0.8811212428233705, "grad_norm": 1.7529916763305664, "learning_rate": 7.36287499061884e-07, "loss": 0.4808, "step": 2609 }, { "epoch": 0.8814589665653495, "grad_norm": 1.7938553094863892, "learning_rate": 7.321734032851613e-07, "loss": 0.4634, "step": 2610 }, { "epoch": 0.8817966903073287, "grad_norm": 3.572237014770508, "learning_rate": 7.280703969258396e-07, "loss": 0.5146, "step": 2611 }, { "epoch": 0.8821344140493077, "grad_norm": 1.659851312637329, "learning_rate": 7.239784848933685e-07, "loss": 0.4876, "step": 2612 }, { "epoch": 0.8824721377912867, "grad_norm": 1.6163403987884521, "learning_rate": 7.198976720839268e-07, "loss": 0.4635, "step": 2613 }, { "epoch": 0.8828098615332658, "grad_norm": 1.9709886312484741, "learning_rate": 7.158279633804077e-07, "loss": 0.4825, "step": 2614 }, { "epoch": 0.8831475852752448, "grad_norm": 1.7251195907592773, "learning_rate": 7.117693636524237e-07, "loss": 0.4694, "step": 2615 }, { "epoch": 0.8834853090172239, "grad_norm": 2.987093448638916, "learning_rate": 7.077218777562933e-07, "loss": 0.4435, "step": 2616 }, { "epoch": 0.8838230327592029, "grad_norm": 2.2696213722229004, "learning_rate": 7.036855105350337e-07, "loss": 0.4805, "step": 2617 }, { "epoch": 0.8841607565011821, "grad_norm": 1.9427273273468018, "learning_rate": 6.996602668183605e-07, "loss": 0.4926, "step": 2618 }, { "epoch": 0.8844984802431611, "grad_norm": 1.854352593421936, "learning_rate": 6.956461514226764e-07, "loss": 0.5421, "step": 2619 }, { "epoch": 0.8848362039851402, "grad_norm": 1.8425906896591187, "learning_rate": 6.91643169151075e-07, "loss": 0.4539, "step": 2620 }, { "epoch": 0.8851739277271192, "grad_norm": 1.8357064723968506, "learning_rate": 6.876513247933215e-07, "loss": 0.4604, "step": 2621 }, { "epoch": 0.8855116514690983, "grad_norm": 1.788055419921875, "learning_rate": 6.836706231258583e-07, "loss": 0.4182, "step": 2622 }, { "epoch": 0.8858493752110773, "grad_norm": 2.078190326690674, "learning_rate": 6.797010689117922e-07, "loss": 0.5128, "step": 2623 }, { "epoch": 0.8861870989530564, "grad_norm": 1.726401686668396, "learning_rate": 6.757426669008948e-07, "loss": 0.514, "step": 2624 }, { "epoch": 0.8865248226950354, "grad_norm": 1.9789220094680786, "learning_rate": 6.71795421829593e-07, "loss": 0.4601, "step": 2625 }, { "epoch": 0.8868625464370146, "grad_norm": 2.31406569480896, "learning_rate": 6.678593384209597e-07, "loss": 0.4448, "step": 2626 }, { "epoch": 0.8872002701789936, "grad_norm": 1.616159200668335, "learning_rate": 6.639344213847154e-07, "loss": 0.4883, "step": 2627 }, { "epoch": 0.8875379939209727, "grad_norm": 1.7183767557144165, "learning_rate": 6.600206754172222e-07, "loss": 0.5038, "step": 2628 }, { "epoch": 0.8878757176629517, "grad_norm": 1.783092975616455, "learning_rate": 6.561181052014709e-07, "loss": 0.5132, "step": 2629 }, { "epoch": 0.8882134414049307, "grad_norm": 2.0423965454101562, "learning_rate": 6.522267154070816e-07, "loss": 0.5126, "step": 2630 }, { "epoch": 0.8885511651469098, "grad_norm": 1.7453845739364624, "learning_rate": 6.483465106902953e-07, "loss": 0.4719, "step": 2631 }, { "epoch": 0.8888888888888888, "grad_norm": 2.403984785079956, "learning_rate": 6.444774956939737e-07, "loss": 0.4753, "step": 2632 }, { "epoch": 0.889226612630868, "grad_norm": 2.121732234954834, "learning_rate": 6.406196750475846e-07, "loss": 0.4879, "step": 2633 }, { "epoch": 0.889564336372847, "grad_norm": 16.308605194091797, "learning_rate": 6.367730533672035e-07, "loss": 0.5065, "step": 2634 }, { "epoch": 0.8899020601148261, "grad_norm": 2.0514862537384033, "learning_rate": 6.329376352555039e-07, "loss": 0.5058, "step": 2635 }, { "epoch": 0.8902397838568051, "grad_norm": 1.9593366384506226, "learning_rate": 6.291134253017562e-07, "loss": 0.4898, "step": 2636 }, { "epoch": 0.8905775075987842, "grad_norm": 2.003631591796875, "learning_rate": 6.253004280818209e-07, "loss": 0.51, "step": 2637 }, { "epoch": 0.8909152313407632, "grad_norm": 1.6701866388320923, "learning_rate": 6.214986481581365e-07, "loss": 0.4782, "step": 2638 }, { "epoch": 0.8912529550827423, "grad_norm": 2.206568956375122, "learning_rate": 6.17708090079725e-07, "loss": 0.5155, "step": 2639 }, { "epoch": 0.8915906788247214, "grad_norm": 2.2347238063812256, "learning_rate": 6.13928758382174e-07, "loss": 0.4844, "step": 2640 }, { "epoch": 0.8919284025667005, "grad_norm": 1.7800589799880981, "learning_rate": 6.101606575876484e-07, "loss": 0.5062, "step": 2641 }, { "epoch": 0.8922661263086795, "grad_norm": 2.0032827854156494, "learning_rate": 6.064037922048661e-07, "loss": 0.4953, "step": 2642 }, { "epoch": 0.8926038500506586, "grad_norm": 1.7379231452941895, "learning_rate": 6.026581667291043e-07, "loss": 0.4857, "step": 2643 }, { "epoch": 0.8929415737926376, "grad_norm": 1.6305956840515137, "learning_rate": 5.989237856421915e-07, "loss": 0.4799, "step": 2644 }, { "epoch": 0.8932792975346167, "grad_norm": 1.8925247192382812, "learning_rate": 5.952006534124988e-07, "loss": 0.4952, "step": 2645 }, { "epoch": 0.8936170212765957, "grad_norm": 1.9018293619155884, "learning_rate": 5.914887744949426e-07, "loss": 0.4561, "step": 2646 }, { "epoch": 0.8939547450185749, "grad_norm": 1.5096726417541504, "learning_rate": 5.877881533309715e-07, "loss": 0.4687, "step": 2647 }, { "epoch": 0.8942924687605539, "grad_norm": 1.6955339908599854, "learning_rate": 5.840987943485621e-07, "loss": 0.4871, "step": 2648 }, { "epoch": 0.894630192502533, "grad_norm": 1.713036298751831, "learning_rate": 5.804207019622165e-07, "loss": 0.4468, "step": 2649 }, { "epoch": 0.894967916244512, "grad_norm": 1.9696085453033447, "learning_rate": 5.767538805729578e-07, "loss": 0.503, "step": 2650 }, { "epoch": 0.895305639986491, "grad_norm": 1.9109938144683838, "learning_rate": 5.730983345683195e-07, "loss": 0.5267, "step": 2651 }, { "epoch": 0.8956433637284701, "grad_norm": 3.7677125930786133, "learning_rate": 5.694540683223449e-07, "loss": 0.5045, "step": 2652 }, { "epoch": 0.8959810874704491, "grad_norm": 1.5781099796295166, "learning_rate": 5.658210861955804e-07, "loss": 0.4815, "step": 2653 }, { "epoch": 0.8963188112124282, "grad_norm": 1.6196420192718506, "learning_rate": 5.621993925350722e-07, "loss": 0.4972, "step": 2654 }, { "epoch": 0.8966565349544073, "grad_norm": 2.3901636600494385, "learning_rate": 5.585889916743559e-07, "loss": 0.547, "step": 2655 }, { "epoch": 0.8969942586963864, "grad_norm": 1.6394803524017334, "learning_rate": 5.54989887933457e-07, "loss": 0.4817, "step": 2656 }, { "epoch": 0.8973319824383654, "grad_norm": 2.5358943939208984, "learning_rate": 5.514020856188829e-07, "loss": 0.4798, "step": 2657 }, { "epoch": 0.8976697061803445, "grad_norm": 1.8801624774932861, "learning_rate": 5.478255890236184e-07, "loss": 0.4833, "step": 2658 }, { "epoch": 0.8980074299223235, "grad_norm": 1.6690540313720703, "learning_rate": 5.442604024271203e-07, "loss": 0.4768, "step": 2659 }, { "epoch": 0.8983451536643026, "grad_norm": 1.6666232347488403, "learning_rate": 5.40706530095313e-07, "loss": 0.5033, "step": 2660 }, { "epoch": 0.8986828774062816, "grad_norm": 1.816752552986145, "learning_rate": 5.371639762805825e-07, "loss": 0.4692, "step": 2661 }, { "epoch": 0.8990206011482608, "grad_norm": 1.8266865015029907, "learning_rate": 5.336327452217682e-07, "loss": 0.4914, "step": 2662 }, { "epoch": 0.8993583248902398, "grad_norm": 1.5868525505065918, "learning_rate": 5.301128411441681e-07, "loss": 0.5003, "step": 2663 }, { "epoch": 0.8996960486322189, "grad_norm": 2.5163698196411133, "learning_rate": 5.266042682595218e-07, "loss": 0.5131, "step": 2664 }, { "epoch": 0.9000337723741979, "grad_norm": 1.8719478845596313, "learning_rate": 5.231070307660113e-07, "loss": 0.4953, "step": 2665 }, { "epoch": 0.900371496116177, "grad_norm": 1.8248647451400757, "learning_rate": 5.196211328482559e-07, "loss": 0.4898, "step": 2666 }, { "epoch": 0.900709219858156, "grad_norm": 1.7236604690551758, "learning_rate": 5.16146578677309e-07, "loss": 0.5015, "step": 2667 }, { "epoch": 0.901046943600135, "grad_norm": 1.6772412061691284, "learning_rate": 5.126833724106473e-07, "loss": 0.4925, "step": 2668 }, { "epoch": 0.9013846673421142, "grad_norm": 1.911204218864441, "learning_rate": 5.092315181921681e-07, "loss": 0.48, "step": 2669 }, { "epoch": 0.9017223910840932, "grad_norm": 1.858886957168579, "learning_rate": 5.057910201521876e-07, "loss": 0.5132, "step": 2670 }, { "epoch": 0.9020601148260723, "grad_norm": 1.777329683303833, "learning_rate": 5.02361882407435e-07, "loss": 0.509, "step": 2671 }, { "epoch": 0.9023978385680513, "grad_norm": 1.9690685272216797, "learning_rate": 4.989441090610447e-07, "loss": 0.4902, "step": 2672 }, { "epoch": 0.9027355623100304, "grad_norm": 3.4203267097473145, "learning_rate": 4.955377042025534e-07, "loss": 0.4719, "step": 2673 }, { "epoch": 0.9030732860520094, "grad_norm": 1.818361759185791, "learning_rate": 4.921426719078948e-07, "loss": 0.4982, "step": 2674 }, { "epoch": 0.9034110097939885, "grad_norm": 1.6049683094024658, "learning_rate": 4.887590162393907e-07, "loss": 0.4779, "step": 2675 }, { "epoch": 0.9037487335359676, "grad_norm": 1.7634241580963135, "learning_rate": 4.853867412457603e-07, "loss": 0.4854, "step": 2676 }, { "epoch": 0.9040864572779467, "grad_norm": 1.9649057388305664, "learning_rate": 4.820258509620945e-07, "loss": 0.5091, "step": 2677 }, { "epoch": 0.9044241810199257, "grad_norm": 1.8185052871704102, "learning_rate": 4.786763494098689e-07, "loss": 0.4782, "step": 2678 }, { "epoch": 0.9047619047619048, "grad_norm": 1.6820358037948608, "learning_rate": 4.7533824059692867e-07, "loss": 0.5001, "step": 2679 }, { "epoch": 0.9050996285038838, "grad_norm": 1.6839449405670166, "learning_rate": 4.7201152851748534e-07, "loss": 0.4964, "step": 2680 }, { "epoch": 0.9054373522458629, "grad_norm": 1.997916340827942, "learning_rate": 4.6869621715212034e-07, "loss": 0.5082, "step": 2681 }, { "epoch": 0.9057750759878419, "grad_norm": 1.8725696802139282, "learning_rate": 4.653923104677671e-07, "loss": 0.5267, "step": 2682 }, { "epoch": 0.9061127997298211, "grad_norm": 1.8148362636566162, "learning_rate": 4.620998124177156e-07, "loss": 0.4977, "step": 2683 }, { "epoch": 0.9064505234718001, "grad_norm": 1.8282911777496338, "learning_rate": 4.588187269416011e-07, "loss": 0.4959, "step": 2684 }, { "epoch": 0.9067882472137792, "grad_norm": 1.889120101928711, "learning_rate": 4.5554905796541093e-07, "loss": 0.4951, "step": 2685 }, { "epoch": 0.9071259709557582, "grad_norm": 1.7703927755355835, "learning_rate": 4.522908094014655e-07, "loss": 0.486, "step": 2686 }, { "epoch": 0.9074636946977372, "grad_norm": 1.6769788265228271, "learning_rate": 4.4904398514842183e-07, "loss": 0.4867, "step": 2687 }, { "epoch": 0.9078014184397163, "grad_norm": 1.9006845951080322, "learning_rate": 4.458085890912645e-07, "loss": 0.5103, "step": 2688 }, { "epoch": 0.9081391421816953, "grad_norm": 1.9312748908996582, "learning_rate": 4.4258462510131216e-07, "loss": 0.5071, "step": 2689 }, { "epoch": 0.9084768659236744, "grad_norm": 1.9690388441085815, "learning_rate": 4.3937209703619476e-07, "loss": 0.4608, "step": 2690 }, { "epoch": 0.9088145896656535, "grad_norm": 1.7834193706512451, "learning_rate": 4.36171008739863e-07, "loss": 0.4511, "step": 2691 }, { "epoch": 0.9091523134076326, "grad_norm": 1.6698566675186157, "learning_rate": 4.329813640425784e-07, "loss": 0.4506, "step": 2692 }, { "epoch": 0.9094900371496116, "grad_norm": 6.4881415367126465, "learning_rate": 4.298031667609126e-07, "loss": 0.5373, "step": 2693 }, { "epoch": 0.9098277608915907, "grad_norm": 1.690985083580017, "learning_rate": 4.2663642069773693e-07, "loss": 0.488, "step": 2694 }, { "epoch": 0.9101654846335697, "grad_norm": 1.5913342237472534, "learning_rate": 4.234811296422192e-07, "loss": 0.5129, "step": 2695 }, { "epoch": 0.9105032083755488, "grad_norm": 1.6881697177886963, "learning_rate": 4.2033729736982274e-07, "loss": 0.4663, "step": 2696 }, { "epoch": 0.9108409321175278, "grad_norm": 1.8868311643600464, "learning_rate": 4.172049276423018e-07, "loss": 0.5012, "step": 2697 }, { "epoch": 0.911178655859507, "grad_norm": 1.8296246528625488, "learning_rate": 4.140840242076927e-07, "loss": 0.4942, "step": 2698 }, { "epoch": 0.911516379601486, "grad_norm": 2.015434503555298, "learning_rate": 4.1097459080031046e-07, "loss": 0.5492, "step": 2699 }, { "epoch": 0.9118541033434651, "grad_norm": 1.7565460205078125, "learning_rate": 4.0787663114075007e-07, "loss": 0.4406, "step": 2700 }, { "epoch": 0.9121918270854441, "grad_norm": 1.7124000787734985, "learning_rate": 4.0479014893587296e-07, "loss": 0.487, "step": 2701 }, { "epoch": 0.9125295508274232, "grad_norm": 1.691266655921936, "learning_rate": 4.017151478788117e-07, "loss": 0.4699, "step": 2702 }, { "epoch": 0.9128672745694022, "grad_norm": 1.8169528245925903, "learning_rate": 3.986516316489564e-07, "loss": 0.4683, "step": 2703 }, { "epoch": 0.9132049983113812, "grad_norm": 1.7136954069137573, "learning_rate": 3.955996039119581e-07, "loss": 0.4849, "step": 2704 }, { "epoch": 0.9135427220533604, "grad_norm": 1.8831433057785034, "learning_rate": 3.925590683197189e-07, "loss": 0.4559, "step": 2705 }, { "epoch": 0.9138804457953394, "grad_norm": 1.7162784337997437, "learning_rate": 3.895300285103931e-07, "loss": 0.5251, "step": 2706 }, { "epoch": 0.9142181695373185, "grad_norm": 1.637778401374817, "learning_rate": 3.8651248810837925e-07, "loss": 0.4728, "step": 2707 }, { "epoch": 0.9145558932792975, "grad_norm": 1.8524805307388306, "learning_rate": 3.835064507243125e-07, "loss": 0.5144, "step": 2708 }, { "epoch": 0.9148936170212766, "grad_norm": 1.9005764722824097, "learning_rate": 3.8051191995506664e-07, "loss": 0.4753, "step": 2709 }, { "epoch": 0.9152313407632556, "grad_norm": 1.8677594661712646, "learning_rate": 3.7752889938375113e-07, "loss": 0.4822, "step": 2710 }, { "epoch": 0.9155690645052347, "grad_norm": 1.5014750957489014, "learning_rate": 3.7455739257969527e-07, "loss": 0.4354, "step": 2711 }, { "epoch": 0.9159067882472138, "grad_norm": 1.8064463138580322, "learning_rate": 3.7159740309845707e-07, "loss": 0.4692, "step": 2712 }, { "epoch": 0.9162445119891929, "grad_norm": 1.7456063032150269, "learning_rate": 3.6864893448181114e-07, "loss": 0.4817, "step": 2713 }, { "epoch": 0.9165822357311719, "grad_norm": 1.5560284852981567, "learning_rate": 3.657119902577466e-07, "loss": 0.4667, "step": 2714 }, { "epoch": 0.916919959473151, "grad_norm": 1.530578851699829, "learning_rate": 3.627865739404657e-07, "loss": 0.4661, "step": 2715 }, { "epoch": 0.91725768321513, "grad_norm": 2.975733757019043, "learning_rate": 3.59872689030375e-07, "loss": 0.4613, "step": 2716 }, { "epoch": 0.9175954069571091, "grad_norm": 2.097104072570801, "learning_rate": 3.569703390140822e-07, "loss": 0.4859, "step": 2717 }, { "epoch": 0.9179331306990881, "grad_norm": 1.8984113931655884, "learning_rate": 3.5407952736439266e-07, "loss": 0.5138, "step": 2718 }, { "epoch": 0.9182708544410672, "grad_norm": 4.3312554359436035, "learning_rate": 3.512002575403106e-07, "loss": 0.4578, "step": 2719 }, { "epoch": 0.9186085781830463, "grad_norm": 1.9167301654815674, "learning_rate": 3.4833253298702353e-07, "loss": 0.4906, "step": 2720 }, { "epoch": 0.9189463019250254, "grad_norm": 2.2959184646606445, "learning_rate": 3.454763571359088e-07, "loss": 0.5078, "step": 2721 }, { "epoch": 0.9192840256670044, "grad_norm": 1.827867031097412, "learning_rate": 3.426317334045226e-07, "loss": 0.5252, "step": 2722 }, { "epoch": 0.9196217494089834, "grad_norm": 2.398646593093872, "learning_rate": 3.39798665196599e-07, "loss": 0.4645, "step": 2723 }, { "epoch": 0.9199594731509625, "grad_norm": 1.7657002210617065, "learning_rate": 3.3697715590204514e-07, "loss": 0.4685, "step": 2724 }, { "epoch": 0.9202971968929415, "grad_norm": 3.7932560443878174, "learning_rate": 3.341672088969405e-07, "loss": 0.5102, "step": 2725 }, { "epoch": 0.9206349206349206, "grad_norm": 1.540090799331665, "learning_rate": 3.313688275435234e-07, "loss": 0.4521, "step": 2726 }, { "epoch": 0.9209726443768997, "grad_norm": 1.7684299945831299, "learning_rate": 3.285820151901986e-07, "loss": 0.476, "step": 2727 }, { "epoch": 0.9213103681188788, "grad_norm": 2.2957923412323, "learning_rate": 3.258067751715277e-07, "loss": 0.5019, "step": 2728 }, { "epoch": 0.9216480918608578, "grad_norm": 1.6612054109573364, "learning_rate": 3.23043110808221e-07, "loss": 0.4633, "step": 2729 }, { "epoch": 0.9219858156028369, "grad_norm": 1.7139523029327393, "learning_rate": 3.202910254071434e-07, "loss": 0.4783, "step": 2730 }, { "epoch": 0.9223235393448159, "grad_norm": 1.5852694511413574, "learning_rate": 3.175505222613007e-07, "loss": 0.4666, "step": 2731 }, { "epoch": 0.922661263086795, "grad_norm": 1.7178550958633423, "learning_rate": 3.1482160464984315e-07, "loss": 0.4938, "step": 2732 }, { "epoch": 0.922998986828774, "grad_norm": 1.5872029066085815, "learning_rate": 3.1210427583805656e-07, "loss": 0.5, "step": 2733 }, { "epoch": 0.9233367105707532, "grad_norm": 1.959344506263733, "learning_rate": 3.0939853907736126e-07, "loss": 0.4757, "step": 2734 }, { "epoch": 0.9236744343127322, "grad_norm": 1.7974895238876343, "learning_rate": 3.067043976053041e-07, "loss": 0.4869, "step": 2735 }, { "epoch": 0.9240121580547113, "grad_norm": 1.797369360923767, "learning_rate": 3.040218546455631e-07, "loss": 0.4806, "step": 2736 }, { "epoch": 0.9243498817966903, "grad_norm": 2.9545092582702637, "learning_rate": 3.0135091340793177e-07, "loss": 0.5095, "step": 2737 }, { "epoch": 0.9246876055386694, "grad_norm": 2.3257672786712646, "learning_rate": 2.9869157708832805e-07, "loss": 0.5156, "step": 2738 }, { "epoch": 0.9250253292806484, "grad_norm": 1.7860456705093384, "learning_rate": 2.960438488687767e-07, "loss": 0.4859, "step": 2739 }, { "epoch": 0.9253630530226274, "grad_norm": 1.6691478490829468, "learning_rate": 2.9340773191742e-07, "loss": 0.4861, "step": 2740 }, { "epoch": 0.9257007767646066, "grad_norm": 1.4914517402648926, "learning_rate": 2.907832293885016e-07, "loss": 0.4736, "step": 2741 }, { "epoch": 0.9260385005065856, "grad_norm": 4.55917501449585, "learning_rate": 2.881703444223716e-07, "loss": 0.4701, "step": 2742 }, { "epoch": 0.9263762242485647, "grad_norm": 2.9661624431610107, "learning_rate": 2.8556908014547577e-07, "loss": 0.4833, "step": 2743 }, { "epoch": 0.9267139479905437, "grad_norm": 1.8780628442764282, "learning_rate": 2.829794396703578e-07, "loss": 0.5188, "step": 2744 }, { "epoch": 0.9270516717325228, "grad_norm": 1.760980486869812, "learning_rate": 2.804014260956511e-07, "loss": 0.485, "step": 2745 }, { "epoch": 0.9273893954745018, "grad_norm": 1.7110624313354492, "learning_rate": 2.778350425060794e-07, "loss": 0.4903, "step": 2746 }, { "epoch": 0.9277271192164809, "grad_norm": 1.8589853048324585, "learning_rate": 2.752802919724462e-07, "loss": 0.5165, "step": 2747 }, { "epoch": 0.92806484295846, "grad_norm": 2.065229892730713, "learning_rate": 2.7273717755164076e-07, "loss": 0.4757, "step": 2748 }, { "epoch": 0.9284025667004391, "grad_norm": 7.5170183181762695, "learning_rate": 2.7020570228662336e-07, "loss": 0.4893, "step": 2749 }, { "epoch": 0.9287402904424181, "grad_norm": 1.768997073173523, "learning_rate": 2.6768586920643324e-07, "loss": 0.4854, "step": 2750 }, { "epoch": 0.9290780141843972, "grad_norm": 2.134310722351074, "learning_rate": 2.651776813261753e-07, "loss": 0.4918, "step": 2751 }, { "epoch": 0.9294157379263762, "grad_norm": 5.948333263397217, "learning_rate": 2.626811416470221e-07, "loss": 0.4893, "step": 2752 }, { "epoch": 0.9297534616683553, "grad_norm": 1.6940765380859375, "learning_rate": 2.6019625315620746e-07, "loss": 0.4868, "step": 2753 }, { "epoch": 0.9300911854103343, "grad_norm": 1.7406810522079468, "learning_rate": 2.5772301882702634e-07, "loss": 0.4916, "step": 2754 }, { "epoch": 0.9304289091523134, "grad_norm": 2.895447015762329, "learning_rate": 2.552614416188248e-07, "loss": 0.4641, "step": 2755 }, { "epoch": 0.9307666328942925, "grad_norm": 2.0383968353271484, "learning_rate": 2.528115244770046e-07, "loss": 0.5043, "step": 2756 }, { "epoch": 0.9311043566362716, "grad_norm": 2.099989652633667, "learning_rate": 2.5037327033301194e-07, "loss": 0.5215, "step": 2757 }, { "epoch": 0.9314420803782506, "grad_norm": 1.7469338178634644, "learning_rate": 2.4794668210434194e-07, "loss": 0.5032, "step": 2758 }, { "epoch": 0.9317798041202296, "grad_norm": 1.7963645458221436, "learning_rate": 2.455317626945286e-07, "loss": 0.4775, "step": 2759 }, { "epoch": 0.9321175278622087, "grad_norm": 6.28437614440918, "learning_rate": 2.4312851499314173e-07, "loss": 0.5009, "step": 2760 }, { "epoch": 0.9324552516041877, "grad_norm": 1.614643931388855, "learning_rate": 2.4073694187578764e-07, "loss": 0.4876, "step": 2761 }, { "epoch": 0.9327929753461668, "grad_norm": 1.6950392723083496, "learning_rate": 2.3835704620410294e-07, "loss": 0.5039, "step": 2762 }, { "epoch": 0.9331306990881459, "grad_norm": 2.0297329425811768, "learning_rate": 2.3598883082575408e-07, "loss": 0.4938, "step": 2763 }, { "epoch": 0.933468422830125, "grad_norm": 2.057985544204712, "learning_rate": 2.336322985744266e-07, "loss": 0.5171, "step": 2764 }, { "epoch": 0.933806146572104, "grad_norm": 1.5595030784606934, "learning_rate": 2.3128745226983052e-07, "loss": 0.475, "step": 2765 }, { "epoch": 0.9341438703140831, "grad_norm": 2.3398940563201904, "learning_rate": 2.2895429471768925e-07, "loss": 0.477, "step": 2766 }, { "epoch": 0.9344815940560621, "grad_norm": 1.7392433881759644, "learning_rate": 2.2663282870974524e-07, "loss": 0.5063, "step": 2767 }, { "epoch": 0.9348193177980412, "grad_norm": 1.9855034351348877, "learning_rate": 2.2432305702374645e-07, "loss": 0.5268, "step": 2768 }, { "epoch": 0.9351570415400202, "grad_norm": 1.5864731073379517, "learning_rate": 2.2202498242345106e-07, "loss": 0.5025, "step": 2769 }, { "epoch": 0.9354947652819994, "grad_norm": 1.7080329656600952, "learning_rate": 2.1973860765861831e-07, "loss": 0.4728, "step": 2770 }, { "epoch": 0.9358324890239784, "grad_norm": 1.5888689756393433, "learning_rate": 2.174639354650132e-07, "loss": 0.4824, "step": 2771 }, { "epoch": 0.9361702127659575, "grad_norm": 1.4566351175308228, "learning_rate": 2.1520096856439188e-07, "loss": 0.4651, "step": 2772 }, { "epoch": 0.9365079365079365, "grad_norm": 1.6341731548309326, "learning_rate": 2.1294970966450835e-07, "loss": 0.4668, "step": 2773 }, { "epoch": 0.9368456602499156, "grad_norm": 2.494537591934204, "learning_rate": 2.107101614591045e-07, "loss": 0.5075, "step": 2774 }, { "epoch": 0.9371833839918946, "grad_norm": 1.7979137897491455, "learning_rate": 2.0848232662791457e-07, "loss": 0.491, "step": 2775 }, { "epoch": 0.9375211077338736, "grad_norm": 1.679598331451416, "learning_rate": 2.0626620783665286e-07, "loss": 0.5217, "step": 2776 }, { "epoch": 0.9378588314758528, "grad_norm": 1.729161024093628, "learning_rate": 2.040618077370149e-07, "loss": 0.5149, "step": 2777 }, { "epoch": 0.9381965552178319, "grad_norm": 1.7197105884552002, "learning_rate": 2.0186912896667744e-07, "loss": 0.4839, "step": 2778 }, { "epoch": 0.9385342789598109, "grad_norm": 2.0584189891815186, "learning_rate": 1.9968817414928953e-07, "loss": 0.471, "step": 2779 }, { "epoch": 0.9388720027017899, "grad_norm": 5.794499397277832, "learning_rate": 1.975189458944726e-07, "loss": 0.4917, "step": 2780 }, { "epoch": 0.939209726443769, "grad_norm": 1.7916624546051025, "learning_rate": 1.95361446797816e-07, "loss": 0.4664, "step": 2781 }, { "epoch": 0.939547450185748, "grad_norm": 1.911803960800171, "learning_rate": 1.9321567944087573e-07, "loss": 0.4993, "step": 2782 }, { "epoch": 0.9398851739277271, "grad_norm": 1.730268955230713, "learning_rate": 1.9108164639116689e-07, "loss": 0.5163, "step": 2783 }, { "epoch": 0.9402228976697061, "grad_norm": 2.4409878253936768, "learning_rate": 1.889593502021725e-07, "loss": 0.4636, "step": 2784 }, { "epoch": 0.9405606214116853, "grad_norm": 1.5976208448410034, "learning_rate": 1.8684879341332119e-07, "loss": 0.4872, "step": 2785 }, { "epoch": 0.9408983451536643, "grad_norm": 1.7083131074905396, "learning_rate": 1.8474997855000177e-07, "loss": 0.5057, "step": 2786 }, { "epoch": 0.9412360688956434, "grad_norm": 1.888115406036377, "learning_rate": 1.826629081235498e-07, "loss": 0.4984, "step": 2787 }, { "epoch": 0.9415737926376224, "grad_norm": 1.7774463891983032, "learning_rate": 1.8058758463124882e-07, "loss": 0.5035, "step": 2788 }, { "epoch": 0.9419115163796015, "grad_norm": 1.6561592817306519, "learning_rate": 1.7852401055632907e-07, "loss": 0.4627, "step": 2789 }, { "epoch": 0.9422492401215805, "grad_norm": 1.7733944654464722, "learning_rate": 1.7647218836795878e-07, "loss": 0.4909, "step": 2790 }, { "epoch": 0.9425869638635596, "grad_norm": 1.665628433227539, "learning_rate": 1.7443212052124626e-07, "loss": 0.5097, "step": 2791 }, { "epoch": 0.9429246876055387, "grad_norm": 2.249602794647217, "learning_rate": 1.7240380945723223e-07, "loss": 0.4947, "step": 2792 }, { "epoch": 0.9432624113475178, "grad_norm": 2.084829330444336, "learning_rate": 1.7038725760289532e-07, "loss": 0.4975, "step": 2793 }, { "epoch": 0.9436001350894968, "grad_norm": 3.4347901344299316, "learning_rate": 1.6838246737113983e-07, "loss": 0.4987, "step": 2794 }, { "epoch": 0.9439378588314759, "grad_norm": 1.5871942043304443, "learning_rate": 1.6638944116079582e-07, "loss": 0.4557, "step": 2795 }, { "epoch": 0.9442755825734549, "grad_norm": 1.643478512763977, "learning_rate": 1.6440818135662006e-07, "loss": 0.4975, "step": 2796 }, { "epoch": 0.9446133063154339, "grad_norm": 1.7357096672058105, "learning_rate": 1.624386903292885e-07, "loss": 0.4795, "step": 2797 }, { "epoch": 0.944951030057413, "grad_norm": 1.4465057849884033, "learning_rate": 1.604809704353949e-07, "loss": 0.4532, "step": 2798 }, { "epoch": 0.9452887537993921, "grad_norm": 1.8089712858200073, "learning_rate": 1.5853502401745101e-07, "loss": 0.4829, "step": 2799 }, { "epoch": 0.9456264775413712, "grad_norm": 2.14365553855896, "learning_rate": 1.5660085340387543e-07, "loss": 0.5096, "step": 2800 }, { "epoch": 0.9459642012833502, "grad_norm": 2.0428807735443115, "learning_rate": 1.5467846090900352e-07, "loss": 0.4917, "step": 2801 }, { "epoch": 0.9463019250253293, "grad_norm": 1.7564663887023926, "learning_rate": 1.5276784883307084e-07, "loss": 0.5089, "step": 2802 }, { "epoch": 0.9466396487673083, "grad_norm": 2.0762979984283447, "learning_rate": 1.508690194622231e-07, "loss": 0.5383, "step": 2803 }, { "epoch": 0.9469773725092874, "grad_norm": 1.559483289718628, "learning_rate": 1.4898197506850177e-07, "loss": 0.4975, "step": 2804 }, { "epoch": 0.9473150962512664, "grad_norm": 1.881965160369873, "learning_rate": 1.4710671790985066e-07, "loss": 0.4801, "step": 2805 }, { "epoch": 0.9476528199932456, "grad_norm": 1.8089704513549805, "learning_rate": 1.4524325023010932e-07, "loss": 0.5056, "step": 2806 }, { "epoch": 0.9479905437352246, "grad_norm": 1.796294093132019, "learning_rate": 1.4339157425900974e-07, "loss": 0.4396, "step": 2807 }, { "epoch": 0.9483282674772037, "grad_norm": 1.6827421188354492, "learning_rate": 1.4155169221217513e-07, "loss": 0.485, "step": 2808 }, { "epoch": 0.9486659912191827, "grad_norm": 1.6037895679473877, "learning_rate": 1.3972360629111338e-07, "loss": 0.4778, "step": 2809 }, { "epoch": 0.9490037149611618, "grad_norm": 1.8521066904067993, "learning_rate": 1.3790731868322472e-07, "loss": 0.475, "step": 2810 }, { "epoch": 0.9493414387031408, "grad_norm": 1.7503269910812378, "learning_rate": 1.3610283156178516e-07, "loss": 0.4906, "step": 2811 }, { "epoch": 0.9496791624451199, "grad_norm": 1.5101039409637451, "learning_rate": 1.3431014708595536e-07, "loss": 0.4375, "step": 2812 }, { "epoch": 0.950016886187099, "grad_norm": 1.8909211158752441, "learning_rate": 1.3252926740076943e-07, "loss": 0.5065, "step": 2813 }, { "epoch": 0.950354609929078, "grad_norm": 2.3220860958099365, "learning_rate": 1.3076019463714173e-07, "loss": 0.4712, "step": 2814 }, { "epoch": 0.9506923336710571, "grad_norm": 1.8050901889801025, "learning_rate": 1.2900293091185457e-07, "loss": 0.4524, "step": 2815 }, { "epoch": 0.9510300574130361, "grad_norm": 2.8117897510528564, "learning_rate": 1.272574783275615e-07, "loss": 0.447, "step": 2816 }, { "epoch": 0.9513677811550152, "grad_norm": 1.7777396440505981, "learning_rate": 1.255238389727842e-07, "loss": 0.4995, "step": 2817 }, { "epoch": 0.9517055048969942, "grad_norm": 1.7858659029006958, "learning_rate": 1.238020149219099e-07, "loss": 0.4905, "step": 2818 }, { "epoch": 0.9520432286389733, "grad_norm": 1.669569730758667, "learning_rate": 1.2209200823518509e-07, "loss": 0.4864, "step": 2819 }, { "epoch": 0.9523809523809523, "grad_norm": 1.7146551609039307, "learning_rate": 1.2039382095871966e-07, "loss": 0.4852, "step": 2820 }, { "epoch": 0.9527186761229315, "grad_norm": 2.735686779022217, "learning_rate": 1.1870745512447823e-07, "loss": 0.5254, "step": 2821 }, { "epoch": 0.9530563998649105, "grad_norm": 1.7936958074569702, "learning_rate": 1.1703291275028227e-07, "loss": 0.4783, "step": 2822 }, { "epoch": 0.9533941236068896, "grad_norm": 1.7466932535171509, "learning_rate": 1.153701958398068e-07, "loss": 0.4987, "step": 2823 }, { "epoch": 0.9537318473488686, "grad_norm": 2.4114866256713867, "learning_rate": 1.1371930638257367e-07, "loss": 0.4667, "step": 2824 }, { "epoch": 0.9540695710908477, "grad_norm": 1.922289252281189, "learning_rate": 1.1208024635395498e-07, "loss": 0.4871, "step": 2825 }, { "epoch": 0.9544072948328267, "grad_norm": 1.638764500617981, "learning_rate": 1.1045301771516748e-07, "loss": 0.5075, "step": 2826 }, { "epoch": 0.9547450185748058, "grad_norm": 3.4925873279571533, "learning_rate": 1.0883762241327367e-07, "loss": 0.4975, "step": 2827 }, { "epoch": 0.9550827423167849, "grad_norm": 1.898197054862976, "learning_rate": 1.0723406238117185e-07, "loss": 0.4726, "step": 2828 }, { "epoch": 0.955420466058764, "grad_norm": 2.3836772441864014, "learning_rate": 1.0564233953760494e-07, "loss": 0.4956, "step": 2829 }, { "epoch": 0.955758189800743, "grad_norm": 2.0597951412200928, "learning_rate": 1.0406245578714613e-07, "loss": 0.5004, "step": 2830 }, { "epoch": 0.956095913542722, "grad_norm": 1.525310754776001, "learning_rate": 1.0249441302020546e-07, "loss": 0.5012, "step": 2831 }, { "epoch": 0.9564336372847011, "grad_norm": 1.5979444980621338, "learning_rate": 1.0093821311302543e-07, "loss": 0.4377, "step": 2832 }, { "epoch": 0.9567713610266801, "grad_norm": 1.931251049041748, "learning_rate": 9.939385792767653e-08, "loss": 0.5048, "step": 2833 }, { "epoch": 0.9571090847686592, "grad_norm": 1.8035855293273926, "learning_rate": 9.786134931205726e-08, "loss": 0.4076, "step": 2834 }, { "epoch": 0.9574468085106383, "grad_norm": 1.7987521886825562, "learning_rate": 9.63406890998897e-08, "loss": 0.4904, "step": 2835 }, { "epoch": 0.9577845322526174, "grad_norm": 2.129390239715576, "learning_rate": 9.483187911072167e-08, "loss": 0.5303, "step": 2836 }, { "epoch": 0.9581222559945964, "grad_norm": 1.7083660364151, "learning_rate": 9.333492114991682e-08, "loss": 0.5149, "step": 2837 }, { "epoch": 0.9584599797365755, "grad_norm": 1.9057093858718872, "learning_rate": 9.184981700866347e-08, "loss": 0.4778, "step": 2838 }, { "epoch": 0.9587977034785545, "grad_norm": 2.2769863605499268, "learning_rate": 9.037656846395904e-08, "loss": 0.5078, "step": 2839 }, { "epoch": 0.9591354272205336, "grad_norm": 1.9223194122314453, "learning_rate": 8.891517727862231e-08, "loss": 0.5209, "step": 2840 }, { "epoch": 0.9594731509625126, "grad_norm": 2.3340351581573486, "learning_rate": 8.746564520128009e-08, "loss": 0.5134, "step": 2841 }, { "epoch": 0.9598108747044918, "grad_norm": 1.7975378036499023, "learning_rate": 8.602797396636941e-08, "loss": 0.4863, "step": 2842 }, { "epoch": 0.9601485984464708, "grad_norm": 1.9000566005706787, "learning_rate": 8.460216529413422e-08, "loss": 0.5051, "step": 2843 }, { "epoch": 0.9604863221884499, "grad_norm": 3.425651788711548, "learning_rate": 8.318822089062872e-08, "loss": 0.4898, "step": 2844 }, { "epoch": 0.9608240459304289, "grad_norm": 1.7012701034545898, "learning_rate": 8.178614244770621e-08, "loss": 0.511, "step": 2845 }, { "epoch": 0.961161769672408, "grad_norm": 1.5590755939483643, "learning_rate": 8.039593164302362e-08, "loss": 0.4904, "step": 2846 }, { "epoch": 0.961499493414387, "grad_norm": 1.6353875398635864, "learning_rate": 7.901759014003807e-08, "loss": 0.4623, "step": 2847 }, { "epoch": 0.961837217156366, "grad_norm": 2.0361266136169434, "learning_rate": 7.765111958800365e-08, "loss": 0.4887, "step": 2848 }, { "epoch": 0.9621749408983451, "grad_norm": 1.818369746208191, "learning_rate": 7.629652162197132e-08, "loss": 0.4728, "step": 2849 }, { "epoch": 0.9625126646403243, "grad_norm": 2.326601028442383, "learning_rate": 7.495379786278456e-08, "loss": 0.4768, "step": 2850 }, { "epoch": 0.9628503883823033, "grad_norm": 2.0656321048736572, "learning_rate": 7.36229499170793e-08, "loss": 0.5016, "step": 2851 }, { "epoch": 0.9631881121242823, "grad_norm": 1.7879680395126343, "learning_rate": 7.230397937728174e-08, "loss": 0.4767, "step": 2852 }, { "epoch": 0.9635258358662614, "grad_norm": 2.2644503116607666, "learning_rate": 7.099688782160719e-08, "loss": 0.486, "step": 2853 }, { "epoch": 0.9638635596082404, "grad_norm": 1.7180482149124146, "learning_rate": 6.970167681405459e-08, "loss": 0.4865, "step": 2854 }, { "epoch": 0.9642012833502195, "grad_norm": 1.6256760358810425, "learning_rate": 6.841834790440871e-08, "loss": 0.4671, "step": 2855 }, { "epoch": 0.9645390070921985, "grad_norm": 2.4042160511016846, "learning_rate": 6.714690262823675e-08, "loss": 0.4779, "step": 2856 }, { "epoch": 0.9648767308341777, "grad_norm": 1.8345909118652344, "learning_rate": 6.588734250688622e-08, "loss": 0.5204, "step": 2857 }, { "epoch": 0.9652144545761567, "grad_norm": 1.7423306703567505, "learning_rate": 6.463966904748487e-08, "loss": 0.4645, "step": 2858 }, { "epoch": 0.9655521783181358, "grad_norm": 2.107717275619507, "learning_rate": 6.340388374293515e-08, "loss": 0.5192, "step": 2859 }, { "epoch": 0.9658899020601148, "grad_norm": 1.6589694023132324, "learning_rate": 6.217998807191761e-08, "loss": 0.4978, "step": 2860 }, { "epoch": 0.9662276258020939, "grad_norm": 3.8879876136779785, "learning_rate": 6.096798349888078e-08, "loss": 0.4842, "step": 2861 }, { "epoch": 0.9665653495440729, "grad_norm": 7.755805492401123, "learning_rate": 5.97678714740535e-08, "loss": 0.4928, "step": 2862 }, { "epoch": 0.966903073286052, "grad_norm": 1.7660034894943237, "learning_rate": 5.8579653433425976e-08, "loss": 0.4937, "step": 2863 }, { "epoch": 0.9672407970280311, "grad_norm": 1.6688764095306396, "learning_rate": 5.7403330798762036e-08, "loss": 0.4735, "step": 2864 }, { "epoch": 0.9675785207700102, "grad_norm": 1.9276624917984009, "learning_rate": 5.623890497759021e-08, "loss": 0.4871, "step": 2865 }, { "epoch": 0.9679162445119892, "grad_norm": 1.7076053619384766, "learning_rate": 5.508637736320488e-08, "loss": 0.5044, "step": 2866 }, { "epoch": 0.9682539682539683, "grad_norm": 1.8196897506713867, "learning_rate": 5.394574933466179e-08, "loss": 0.4865, "step": 2867 }, { "epoch": 0.9685916919959473, "grad_norm": 2.3867928981781006, "learning_rate": 5.281702225678032e-08, "loss": 0.5045, "step": 2868 }, { "epoch": 0.9689294157379263, "grad_norm": 4.3850297927856445, "learning_rate": 5.170019748013788e-08, "loss": 0.4775, "step": 2869 }, { "epoch": 0.9692671394799054, "grad_norm": 1.9987167119979858, "learning_rate": 5.0595276341071084e-08, "loss": 0.5055, "step": 2870 }, { "epoch": 0.9696048632218845, "grad_norm": 1.567685842514038, "learning_rate": 4.9502260161673475e-08, "loss": 0.4791, "step": 2871 }, { "epoch": 0.9699425869638636, "grad_norm": 1.6523592472076416, "learning_rate": 4.842115024979443e-08, "loss": 0.4945, "step": 2872 }, { "epoch": 0.9702803107058426, "grad_norm": 1.5906076431274414, "learning_rate": 4.735194789903475e-08, "loss": 0.4499, "step": 2873 }, { "epoch": 0.9706180344478217, "grad_norm": 4.702713489532471, "learning_rate": 4.62946543887488e-08, "loss": 0.4384, "step": 2874 }, { "epoch": 0.9709557581898007, "grad_norm": 4.890462398529053, "learning_rate": 4.5249270984041304e-08, "loss": 0.5057, "step": 2875 }, { "epoch": 0.9712934819317798, "grad_norm": 3.1097567081451416, "learning_rate": 4.4215798935766106e-08, "loss": 0.4481, "step": 2876 }, { "epoch": 0.9716312056737588, "grad_norm": 1.6218560934066772, "learning_rate": 4.3194239480522925e-08, "loss": 0.4535, "step": 2877 }, { "epoch": 0.971968929415738, "grad_norm": 1.5854623317718506, "learning_rate": 4.218459384065954e-08, "loss": 0.489, "step": 2878 }, { "epoch": 0.972306653157717, "grad_norm": 1.9298069477081299, "learning_rate": 4.118686322426735e-08, "loss": 0.4915, "step": 2879 }, { "epoch": 0.9726443768996961, "grad_norm": 1.906441569328308, "learning_rate": 4.020104882518139e-08, "loss": 0.4928, "step": 2880 }, { "epoch": 0.9729821006416751, "grad_norm": 2.039767026901245, "learning_rate": 3.922715182297587e-08, "loss": 0.5247, "step": 2881 }, { "epoch": 0.9733198243836542, "grad_norm": 2.0052525997161865, "learning_rate": 3.826517338296865e-08, "loss": 0.4961, "step": 2882 }, { "epoch": 0.9736575481256332, "grad_norm": 2.209313154220581, "learning_rate": 3.7315114656215625e-08, "loss": 0.4709, "step": 2883 }, { "epoch": 0.9739952718676123, "grad_norm": 1.830849528312683, "learning_rate": 3.637697677950858e-08, "loss": 0.4748, "step": 2884 }, { "epoch": 0.9743329956095913, "grad_norm": 1.8074798583984375, "learning_rate": 3.5450760875376245e-08, "loss": 0.5169, "step": 2885 }, { "epoch": 0.9746707193515705, "grad_norm": 1.8495455980300903, "learning_rate": 3.4536468052082106e-08, "loss": 0.4898, "step": 2886 }, { "epoch": 0.9750084430935495, "grad_norm": 1.646493911743164, "learning_rate": 3.363409940362328e-08, "loss": 0.4658, "step": 2887 }, { "epoch": 0.9753461668355285, "grad_norm": 1.8319928646087646, "learning_rate": 3.27436560097294e-08, "loss": 0.501, "step": 2888 }, { "epoch": 0.9756838905775076, "grad_norm": 2.107086181640625, "learning_rate": 3.1865138935860404e-08, "loss": 0.4925, "step": 2889 }, { "epoch": 0.9760216143194866, "grad_norm": 2.869595527648926, "learning_rate": 3.0998549233205446e-08, "loss": 0.5022, "step": 2890 }, { "epoch": 0.9763593380614657, "grad_norm": 1.7928413152694702, "learning_rate": 3.0143887938682834e-08, "loss": 0.5045, "step": 2891 }, { "epoch": 0.9766970618034447, "grad_norm": 1.6791882514953613, "learning_rate": 2.9301156074936775e-08, "loss": 0.4706, "step": 2892 }, { "epoch": 0.9770347855454239, "grad_norm": 1.6068544387817383, "learning_rate": 2.847035465033954e-08, "loss": 0.4686, "step": 2893 }, { "epoch": 0.9773725092874029, "grad_norm": 1.9623427391052246, "learning_rate": 2.7651484658984816e-08, "loss": 0.5417, "step": 2894 }, { "epoch": 0.977710233029382, "grad_norm": 1.7659317255020142, "learning_rate": 2.6844547080692175e-08, "loss": 0.4916, "step": 2895 }, { "epoch": 0.978047956771361, "grad_norm": 1.9257184267044067, "learning_rate": 2.6049542881001478e-08, "loss": 0.4766, "step": 2896 }, { "epoch": 0.9783856805133401, "grad_norm": 1.5955991744995117, "learning_rate": 2.526647301117735e-08, "loss": 0.471, "step": 2897 }, { "epoch": 0.9787234042553191, "grad_norm": 3.0735089778900146, "learning_rate": 2.4495338408201397e-08, "loss": 0.4957, "step": 2898 }, { "epoch": 0.9790611279972982, "grad_norm": 1.8432021141052246, "learning_rate": 2.373613999477331e-08, "loss": 0.5137, "step": 2899 }, { "epoch": 0.9793988517392773, "grad_norm": 1.594757318496704, "learning_rate": 2.298887867931199e-08, "loss": 0.484, "step": 2900 }, { "epoch": 0.9797365754812564, "grad_norm": 1.731073021888733, "learning_rate": 2.2253555355954415e-08, "loss": 0.5201, "step": 2901 }, { "epoch": 0.9800742992232354, "grad_norm": 1.7861618995666504, "learning_rate": 2.153017090455123e-08, "loss": 0.497, "step": 2902 }, { "epoch": 0.9804120229652145, "grad_norm": 1.8095918893814087, "learning_rate": 2.0818726190667826e-08, "loss": 0.4729, "step": 2903 }, { "epoch": 0.9807497467071935, "grad_norm": 1.8138346672058105, "learning_rate": 2.0119222065582146e-08, "loss": 0.4982, "step": 2904 }, { "epoch": 0.9810874704491725, "grad_norm": 1.6440869569778442, "learning_rate": 1.943165936628688e-08, "loss": 0.4666, "step": 2905 }, { "epoch": 0.9814251941911516, "grad_norm": 1.976345419883728, "learning_rate": 1.8756038915486165e-08, "loss": 0.5154, "step": 2906 }, { "epoch": 0.9817629179331308, "grad_norm": 1.64108145236969, "learning_rate": 1.8092361521592215e-08, "loss": 0.4781, "step": 2907 }, { "epoch": 0.9821006416751098, "grad_norm": 2.243189573287964, "learning_rate": 1.7440627978728698e-08, "loss": 0.494, "step": 2908 }, { "epoch": 0.9824383654170888, "grad_norm": 2.213418483734131, "learning_rate": 1.680083906672625e-08, "loss": 0.5139, "step": 2909 }, { "epoch": 0.9827760891590679, "grad_norm": 1.9171961545944214, "learning_rate": 1.6172995551125836e-08, "loss": 0.5199, "step": 2910 }, { "epoch": 0.9831138129010469, "grad_norm": 2.028578758239746, "learning_rate": 1.555709818317319e-08, "loss": 0.5083, "step": 2911 }, { "epoch": 0.983451536643026, "grad_norm": 1.5713304281234741, "learning_rate": 1.4953147699819927e-08, "loss": 0.513, "step": 2912 }, { "epoch": 0.983789260385005, "grad_norm": 1.7378904819488525, "learning_rate": 1.4361144823722417e-08, "loss": 0.46, "step": 2913 }, { "epoch": 0.9841269841269841, "grad_norm": 2.394237756729126, "learning_rate": 1.3781090263242924e-08, "loss": 0.5104, "step": 2914 }, { "epoch": 0.9844647078689632, "grad_norm": 1.7667584419250488, "learning_rate": 1.3212984712445143e-08, "loss": 0.4826, "step": 2915 }, { "epoch": 0.9848024316109423, "grad_norm": 1.786786437034607, "learning_rate": 1.2656828851096426e-08, "loss": 0.5129, "step": 2916 }, { "epoch": 0.9851401553529213, "grad_norm": 2.0469682216644287, "learning_rate": 1.2112623344663344e-08, "loss": 0.5045, "step": 2917 }, { "epoch": 0.9854778790949004, "grad_norm": 2.591304063796997, "learning_rate": 1.1580368844316125e-08, "loss": 0.5134, "step": 2918 }, { "epoch": 0.9858156028368794, "grad_norm": 1.5978221893310547, "learning_rate": 1.1060065986923107e-08, "loss": 0.4697, "step": 2919 }, { "epoch": 0.9861533265788585, "grad_norm": 1.7224928140640259, "learning_rate": 1.0551715395054063e-08, "loss": 0.4941, "step": 2920 }, { "epoch": 0.9864910503208375, "grad_norm": 1.7128952741622925, "learning_rate": 1.0055317676974652e-08, "loss": 0.4811, "step": 2921 }, { "epoch": 0.9868287740628167, "grad_norm": 4.265770435333252, "learning_rate": 9.570873426649752e-09, "loss": 0.4906, "step": 2922 }, { "epoch": 0.9871664978047957, "grad_norm": 1.8765422105789185, "learning_rate": 9.098383223741236e-09, "loss": 0.5214, "step": 2923 }, { "epoch": 0.9875042215467748, "grad_norm": 2.5328822135925293, "learning_rate": 8.637847633607976e-09, "loss": 0.5055, "step": 2924 }, { "epoch": 0.9878419452887538, "grad_norm": 1.5970348119735718, "learning_rate": 8.18926720730251e-09, "loss": 0.4898, "step": 2925 }, { "epoch": 0.9881796690307328, "grad_norm": 1.665510654449463, "learning_rate": 7.752642481573258e-09, "loss": 0.4585, "step": 2926 }, { "epoch": 0.9885173927727119, "grad_norm": 2.10377836227417, "learning_rate": 7.327973978865643e-09, "loss": 0.5508, "step": 2927 }, { "epoch": 0.9888551165146909, "grad_norm": 1.8082786798477173, "learning_rate": 6.915262207316531e-09, "loss": 0.4754, "step": 2928 }, { "epoch": 0.9891928402566701, "grad_norm": 1.8634628057479858, "learning_rate": 6.514507660754232e-09, "loss": 0.4849, "step": 2929 }, { "epoch": 0.9895305639986491, "grad_norm": 1.8286116123199463, "learning_rate": 6.125710818701836e-09, "loss": 0.4825, "step": 2930 }, { "epoch": 0.9898682877406282, "grad_norm": 1.6727182865142822, "learning_rate": 5.748872146376094e-09, "loss": 0.5313, "step": 2931 }, { "epoch": 0.9902060114826072, "grad_norm": 2.1350622177124023, "learning_rate": 5.383992094681878e-09, "loss": 0.4941, "step": 2932 }, { "epoch": 0.9905437352245863, "grad_norm": 1.8903685808181763, "learning_rate": 5.031071100216611e-09, "loss": 0.4877, "step": 2933 }, { "epoch": 0.9908814589665653, "grad_norm": 1.9301029443740845, "learning_rate": 4.690109585268054e-09, "loss": 0.5051, "step": 2934 }, { "epoch": 0.9912191827085444, "grad_norm": 1.8503081798553467, "learning_rate": 4.361107957813193e-09, "loss": 0.4747, "step": 2935 }, { "epoch": 0.9915569064505235, "grad_norm": 1.811928391456604, "learning_rate": 4.044066611519348e-09, "loss": 0.5223, "step": 2936 }, { "epoch": 0.9918946301925026, "grad_norm": 1.8204113245010376, "learning_rate": 3.738985925743066e-09, "loss": 0.4732, "step": 2937 }, { "epoch": 0.9922323539344816, "grad_norm": 1.7166372537612915, "learning_rate": 3.445866265526787e-09, "loss": 0.4868, "step": 2938 }, { "epoch": 0.9925700776764607, "grad_norm": 1.9357937574386597, "learning_rate": 3.164707981604398e-09, "loss": 0.4884, "step": 2939 }, { "epoch": 0.9929078014184397, "grad_norm": 2.0600385665893555, "learning_rate": 2.8955114103956793e-09, "loss": 0.4812, "step": 2940 }, { "epoch": 0.9932455251604188, "grad_norm": 1.9285318851470947, "learning_rate": 2.638276874006307e-09, "loss": 0.5104, "step": 2941 }, { "epoch": 0.9935832489023978, "grad_norm": 1.672191858291626, "learning_rate": 2.3930046802322914e-09, "loss": 0.51, "step": 2942 }, { "epoch": 0.993920972644377, "grad_norm": 1.6545971632003784, "learning_rate": 2.159695122553318e-09, "loss": 0.498, "step": 2943 }, { "epoch": 0.994258696386356, "grad_norm": 1.9274837970733643, "learning_rate": 1.938348480134966e-09, "loss": 0.4902, "step": 2944 }, { "epoch": 0.994596420128335, "grad_norm": 1.6314034461975098, "learning_rate": 1.7289650178309302e-09, "loss": 0.488, "step": 2945 }, { "epoch": 0.9949341438703141, "grad_norm": 1.5956038236618042, "learning_rate": 1.531544986177469e-09, "loss": 0.4942, "step": 2946 }, { "epoch": 0.9952718676122931, "grad_norm": 1.8705260753631592, "learning_rate": 1.3460886213989555e-09, "loss": 0.5066, "step": 2947 }, { "epoch": 0.9956095913542722, "grad_norm": 1.9946147203445435, "learning_rate": 1.1725961454034374e-09, "loss": 0.4838, "step": 2948 }, { "epoch": 0.9959473150962512, "grad_norm": 1.9682279825210571, "learning_rate": 1.011067765781526e-09, "loss": 0.4887, "step": 2949 }, { "epoch": 0.9962850388382303, "grad_norm": 1.6931291818618774, "learning_rate": 8.615036758108375e-10, "loss": 0.4395, "step": 2950 }, { "epoch": 0.9966227625802094, "grad_norm": 2.1473541259765625, "learning_rate": 7.239040544526621e-10, "loss": 0.4795, "step": 2951 }, { "epoch": 0.9969604863221885, "grad_norm": 1.7647171020507812, "learning_rate": 5.982690663519642e-10, "loss": 0.4486, "step": 2952 }, { "epoch": 0.9972982100641675, "grad_norm": 1.7494064569473267, "learning_rate": 4.845988618362718e-10, "loss": 0.4808, "step": 2953 }, { "epoch": 0.9976359338061466, "grad_norm": 2.115054130554199, "learning_rate": 3.8289357691900785e-10, "loss": 0.4893, "step": 2954 }, { "epoch": 0.9979736575481256, "grad_norm": 1.7712682485580444, "learning_rate": 2.931533332950487e-10, "loss": 0.5, "step": 2955 }, { "epoch": 0.9983113812901047, "grad_norm": 1.888079047203064, "learning_rate": 2.1537823834183457e-10, "loss": 0.4735, "step": 2956 }, { "epoch": 0.9986491050320837, "grad_norm": 1.8775482177734375, "learning_rate": 1.4956838512270033e-10, "loss": 0.5259, "step": 2957 }, { "epoch": 0.9989868287740629, "grad_norm": 1.9074558019638062, "learning_rate": 9.572385238243443e-11, "loss": 0.4755, "step": 2958 }, { "epoch": 0.9993245525160419, "grad_norm": 2.724679470062256, "learning_rate": 5.384470454838919e-11, "loss": 0.4919, "step": 2959 }, { "epoch": 0.999662276258021, "grad_norm": 1.8419549465179443, "learning_rate": 2.3930991730480858e-11, "loss": 0.5089, "step": 2960 }, { "epoch": 1.0, "grad_norm": 1.7753173112869263, "learning_rate": 5.98274972229973e-12, "loss": 0.4816, "step": 2961 }, { "epoch": 1.0, "step": 2961, "total_flos": 5.496832632467489e+18, "train_loss": 0.5445912595396708, "train_runtime": 8300.4353, "train_samples_per_second": 45.659, "train_steps_per_second": 0.357 } ], "logging_steps": 1.0, "max_steps": 2961, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.496832632467489e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }